In [572]:
#!pip install -U pandas

In [573]:
#!pip install -U pyLDAvis

In [574]:
## imports
import os, sys
import pprint as pp

In [575]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [576]:
## target language
## a key must be part of a file name 
target_lang_dict = {    'ar'    : 'Arabic',
                        'de'    : 'German',
                        'de_N_only' : 'German Nouns',
                        'de_non_N_only' : 'German Non-nouns',
                        'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'en_N_only' : 'English noun (WN)',
                        'en_V_only' : 'English verb (WN)',
                        'en_A_only' : 'English adj (WN)',
                        'en_R_only' : 'English adv (WN)',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
target_lang_keys = [    'ar', 'de', 'de_N_only', 'de_non_N_only',
                        'en_US', 'en_UK', 'en_N_only', 'en_V_only', 'en_A_only', 'en_R_only',
                        'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC',
                        'is', 'nl', 'ro', 'sw' ]
#
target_lang_key  = target_lang_keys[9]
print(f"target lang: {target_lang_dict[target_lang_key]} ({target_lang_key})")

target lang: English adv (WN) (en_R_only)


In [577]:
## term settings
term_classes       = [ 'spell', 'sound' ]
term_class         = term_classes[0]
ngram_is_inclusive = True
gap_mark           = "…"
term_is_skippy     = True
n_for_ngram        = 4
print(f"term_class: {term_class}")
print(f"term_is_skippy: {term_is_skippy}")
print(f"n_for_ngram: {n_for_ngram}")
## define term_type
if term_class == 'spell':
    if term_is_skippy:
        term_type = f"sp_skippy{n_for_ngram}gram"
    else:
        term_type = f"sp_{n_for_ngram}gram"
else:
    if term_is_skippy:
        term_type = f"sn_skippy{n_for_ngram}gram"
    else:
        term_type = f"sn_{n_for_ngram}gram"
## check
print(f"term_type: {term_type}")
## doc settings
max_doc_size       = 10
min_doc_size       =  5
### boundary handling
add_boundary       = True
boundary_mark      = "#"
### accent handling
suppress_accents   = True
accent_marks       = [ "ˈ", "ˌ" ] 
if term_class == 'sound':
    if suppress_accents:
        accent_status = "unaccented-"
    else:
        accent_stratus = "accented-"
else:
    accent_status = ""
print(f"accent: {accent_status}")

term_class: spell
term_is_skippy: True
n_for_ngram: 4
term_type: sp_skippy4gram
accent: 


In [578]:
## LDA/HDP
apply_term_filtering = True
## The following parameters need to be relatively large to prevent "Row sum not equal 1" error
term_minfreq         = 3
abuse_threshold      = 0.03
min_bot_size         = 3
	

In [579]:
## sampling
source_sampling          = True
source_sampling_rate     = 0.5
source_sampling_max_size = 30000
second_sampling          = False
second_sampling_rate     = 0.7

In [580]:
## set target files
import glob
data_dir1     = "data/open-dict-ipa/data1/"
data_dir2     = "data/open-dict-ipa/data1a/"
data_dir3     = "data/wn3/"
target_files = glob.glob(f"{data_dir1}/*")
target_files2 = glob.glob(f"{data_dir2}/*")
target_files.extend(target_files2)
target_files3 = glob.glob(f"{data_dir3}/*")
target_files.extend(target_files3)
target_files = sorted([ file for file in target_files if ".csv" in file ])
pp.pprint(target_files)

['data/open-dict-ipa/data1/ar.csv.gz',
 'data/open-dict-ipa/data1/de.csv.gz',
 'data/open-dict-ipa/data1/en_UK.csv.gz',
 'data/open-dict-ipa/data1/en_US.csv.gz',
 'data/open-dict-ipa/data1/eo.csv.gz',
 'data/open-dict-ipa/data1/es_ES.csv.gz',
 'data/open-dict-ipa/data1/es_MX.csv.gz',
 'data/open-dict-ipa/data1/fa.csv.gz',
 'data/open-dict-ipa/data1/fi.csv.gz',
 'data/open-dict-ipa/data1/fr_FR.csv.gz',
 'data/open-dict-ipa/data1/fr_QC.csv.gz',
 'data/open-dict-ipa/data1/is.csv.gz',
 'data/open-dict-ipa/data1/ja.csv.gz',
 'data/open-dict-ipa/data1/jam.csv.gz',
 'data/open-dict-ipa/data1/ma.csv.gz',
 'data/open-dict-ipa/data1/nb.csv.gz',
 'data/open-dict-ipa/data1/nl.csv.gz',
 'data/open-dict-ipa/data1/or.csv.gz',
 'data/open-dict-ipa/data1/sv.csv.gz',
 'data/open-dict-ipa/data1/sw.csv.gz',
 'data/open-dict-ipa/data1/vi_C.csv.gz',
 'data/open-dict-ipa/data1/vi_N.csv.gz',
 'data/open-dict-ipa/data1/vi_S.csv.gz',
 'data/open-dict-ipa/data1/yue.csv.gz',
 'data/open-dict-ipa/data1/zh_hans.csv

In [581]:
## get source data from files
import pandas as pd
import gzip

#target_language_key = "en_US" # can be changed to get other languages
file = [ f for f in target_files if target_lang_key in f ][0]
print(f"processing: {file}")

if file.endswith(".gz"):
    with gzip.open(file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['spell', 'sound'])
else:
    with open(file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['spell', 'sound'])
try:
    sounds = raw_df['sound'].apply(lambda x: x.strip('/') )
    sounds = [ x.split("/,")[0] for x in sounds ] # picks up only the first of multiple entries
    raw_df['sound'] = sounds
except AttributeError:
    pass
#
raw_df.sample(10)

processing: data/wn3/en_R_only.csv


Unnamed: 0,spell,sound
656,interestingly,ˈɪntɝˌɛstɪŋɫi
1108,soaking,ˈsoʊkɪŋ
535,haughtily,ˈhɔtɪɫi
573,hypothetically,ˌhaɪpəˈθɛtɪkɫi
1057,second,ˈsɛkənd
951,presently,ˈpɹɛzəntɫi
20,actually,ˈækˌtʃuəɫi
92,appreciatively,əˈpɹiʃiˌeɪtɪvɫi
427,fashionably,ˈfæʃənəbɫi
512,gravely,ˈɡɹeɪvɫi


In [582]:
## source sampling
len(raw_df)
if source_sampling:
	print(f"source sampling applied")
	if len(raw_df) >= source_sampling_max_size:
		raw_df = raw_df.sample(source_sampling_max_size)
	else:
		raw_df = raw_df.sample(round(len(raw_df) * source_sampling_rate))
## remove accent marking
if suppress_accents:
	raw_df['sound'] = raw_df['sound'].apply(lambda x: "".join([ y for y in list(x) if y not in accent_marks ]))
## add boudary marks
if add_boundary:
	raw_df['spell'] = raw_df['spell'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
	raw_df['sound'] = raw_df['sound'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
#
print(raw_df)

source sampling applied
                spell            sound
644         #insofar#        #ɪnsəfɑɹ#
901     #permanently#     #pɝmənəntɫi#
67      #ambitiously#      #æmbɪʃəsɫi#
1325        #upriver#         #əpɹɪvɝ#
989         #quickly#         #kwɪkɫi#
...               ...              ...
577   #ideologically#  #aɪdiəɫɑdʒɪkɫi#
148         #between#         #bitwin#
76      #angelically#     #ændʒɛɫɪkɫi#
520      #grudgingly#     #ɡɹədʒɪŋɡɫi#
1242         #thrice#          #θɹaɪs#

[698 rows x 2 columns]


In [583]:
## generate 1-grams for spell and sound
## spell
raw_df['sp_1gram'] = raw_df['spell'].apply(lambda x: list(str(x)))
# add column of size
raw_df['sp_size'] = raw_df['sp_1gram'].apply(lambda x: len(x))
# add column of count of '-' inside
raw_df['hyphen'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("-"))
# add column of count of '.' inside
raw_df['period'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("."))
## sound
# takes the first entry, removes '/' around
try:
    raw_df['sn_1gram'] = raw_df['sound'].apply(lambda x: list(x) )
except TypeError:
    pass
# add column of size
try:
    raw_df['sn_size'] = raw_df['sn_1gram'].apply(lambda x: len(x))
except KeyError:
    pass
## check
raw_df

Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
644,#insofar#,#ɪnsəfɑɹ#,"[#, i, n, s, o, f, a, r, #]",9,0,0,"[#, ɪ, n, s, ə, f, ɑ, ɹ, #]",9
901,#permanently#,#pɝmənəntɫi#,"[#, p, e, r, m, a, n, e, n, t, l, y, #]",13,0,0,"[#, p, ɝ, m, ə, n, ə, n, t, ɫ, i, #]",12
67,#ambitiously#,#æmbɪʃəsɫi#,"[#, a, m, b, i, t, i, o, u, s, l, y, #]",13,0,0,"[#, æ, m, b, ɪ, ʃ, ə, s, ɫ, i, #]",11
1325,#upriver#,#əpɹɪvɝ#,"[#, u, p, r, i, v, e, r, #]",9,0,0,"[#, ə, p, ɹ, ɪ, v, ɝ, #]",8
989,#quickly#,#kwɪkɫi#,"[#, q, u, i, c, k, l, y, #]",9,0,0,"[#, k, w, ɪ, k, ɫ, i, #]",8
...,...,...,...,...,...,...,...,...
577,#ideologically#,#aɪdiəɫɑdʒɪkɫi#,"[#, i, d, e, o, l, o, g, i, c, a, l, l, y, #]",15,0,0,"[#, a, ɪ, d, i, ə, ɫ, ɑ, d, ʒ, ɪ, k, ɫ, i, #]",15
148,#between#,#bitwin#,"[#, b, e, t, w, e, e, n, #]",9,0,0,"[#, b, i, t, w, i, n, #]",8
76,#angelically#,#ændʒɛɫɪkɫi#,"[#, a, n, g, e, l, i, c, a, l, l, y, #]",13,0,0,"[#, æ, n, d, ʒ, ɛ, ɫ, ɪ, k, ɫ, i, #]",12
520,#grudgingly#,#ɡɹədʒɪŋɡɫi#,"[#, g, r, u, d, g, i, n, g, l, y, #]",12,0,0,"[#, ɡ, ɹ, ə, d, ʒ, ɪ, ŋ, ɡ, ɫ, i, #]",12


In [584]:
## filtering raw_data by size
print(f"term_type: {term_type}")
if "sp_" in term_type:
    df_filtered = raw_df[ (raw_df['sp_size'] <= max_doc_size) & (raw_df['sp_size'] >= min_doc_size) & (raw_df['hyphen'] == 0) & (raw_df['period'] == 0) ]
else:
    df_filtered = raw_df[ (raw_df['sn_size'] <= max_doc_size) & (raw_df['sn_size'] >= min_doc_size) ]
#
df_filtered

term_type: sp_skippy4gram


Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
644,#insofar#,#ɪnsəfɑɹ#,"[#, i, n, s, o, f, a, r, #]",9,0,0,"[#, ɪ, n, s, ə, f, ɑ, ɹ, #]",9
1325,#upriver#,#əpɹɪvɝ#,"[#, u, p, r, i, v, e, r, #]",9,0,0,"[#, ə, p, ɹ, ɪ, v, ɝ, #]",8
989,#quickly#,#kwɪkɫi#,"[#, q, u, i, c, k, l, y, #]",9,0,0,"[#, k, w, ɪ, k, ɫ, i, #]",8
457,#foremost#,#fɔɹmoʊst#,"[#, f, o, r, e, m, o, s, t, #]",10,0,0,"[#, f, ɔ, ɹ, m, o, ʊ, s, t, #]",10
47,#akimbo#,#əkɪmboʊ#,"[#, a, k, i, m, b, o, #]",8,0,0,"[#, ə, k, ɪ, m, b, o, ʊ, #]",9
...,...,...,...,...,...,...,...,...
144,#benignly#,#bənaɪnɫi#,"[#, b, e, n, i, g, n, l, y, #]",10,0,0,"[#, b, ə, n, a, ɪ, n, ɫ, i, #]",10
136,#bang#,#bæŋ#,"[#, b, a, n, g, #]",6,0,0,"[#, b, æ, ŋ, #]",5
540,#heavily#,#hɛvəɫi#,"[#, h, e, a, v, i, l, y, #]",9,0,0,"[#, h, ɛ, v, ə, ɫ, i, #]",8
148,#between#,#bitwin#,"[#, b, e, t, w, e, e, n, #]",9,0,0,"[#, b, i, t, w, i, n, #]",8


In [585]:
## define df after second sampling if any
len(df_filtered)
if second_sampling:
    df = df_filtered.sample(round(len(df_filtered) * second_sampling_rate))
else:
    df = df_filtered
len(df)

297

In [586]:
## spell 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)

if term_class == 'spell':
    sp_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_2grams):
            g.extend(list(df['sp_1gram'])[i])
    ## add sp_2gram
    df['sp_2gram'] = sp_2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_2gram'] = sp_2grams


In [587]:
## spell 3grams
import ngrams
if n_for_ngram > 2 and term_class == 'spell':
    sp_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_3grams):
            g.extend(list(df['sp_2gram'])[i])
    ## add sp_2gram
    df['sp_3gram'] = sp_3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_3gram'] = sp_3grams


In [588]:
## spell 4grams
import ngrams
if n_for_ngram > 3 and term_class == 'spell':
    sp_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_4grams):
            g.extend(list(df['sp_3gram'])[i])
    ## add sp_2gram
    df['sp_4gram'] = sp_4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_4gram'] = sp_4grams


In [589]:
## spell skippy2gram
import ngrams_skippy
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams_skippy)
#
if term_class == 'spell':
    sp_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy2grams):
            g.extend(list(df['sp_1gram'])[i])
    #
    df['sp_skippy2gram'] = sp_skippy2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy2gram'] = sp_skippy2grams


In [590]:
## spell skippy3gram
import ngrams_skippy
if n_for_ngram > 2 and term_class == 'spell':
    sp_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy3grams):
            g.extend(list(df['sp_skippy2gram'])[i])
    #
    df['sp_skippy3gram'] = sp_skippy3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy3gram'] = sp_skippy3grams


In [591]:
## spell skippy4gram
import ngrams_skippy
if n_for_ngram > 3 and term_class == 'spell':
    sp_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy4grams):
            g.extend(list(df['sp_skippy3gram'])[i])
    #
    df['sp_skippy4gram'] = sp_skippy4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy4gram'] = sp_skippy4grams


In [592]:
## sound 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)
#
if term_class == 'sound':
    sn_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_2grams):
            g.extend(list(df['sn_1gram'])[i])
    ## add sn_2gram
    df['sn_2gram'] = sn_2grams

In [593]:
## sound 3grams
import ngrams
if n_for_ngram > 2 and term_class == 'sound':
    sn_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_3gram
    df['sn_3gram'] = sn_3grams

In [594]:
## sound 4grams
import ngrams
if n_for_ngram > 3 and term_class == 'sound':
    sn_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_4gram
    df['sn_4gram'] = sn_3grams

In [595]:
## sound skippy2gram
import ngrams_skippy
if term_class == 'sound':
    sn_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy2grams):
            g.extend(list(df['sn_1gram'])[i])
    #
    df['sn_skippy2gram'] = sn_skippy2grams

In [596]:
## sound skippy3gram
import ngrams_skippy
if n_for_ngram > 2 and term_class == 'sound':
    sn_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy3grams):
            g.extend(list(df['sn_skippy2gram'])[i])
    #
    df['sn_skippy3gram'] = sn_skippy3grams

In [597]:
## sound skippy4gram
import ngrams_skippy
if n_for_ngram > 3 and term_class == 'sound':
    sn_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    #
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy4grams):
            g.extend(list(df['sn_skippy3gram'])[i])
    #
    df['sn_skippy4gram'] = sn_skippy4grams

In [598]:
## check df
dropped_vars = [ 'sp_size', 'hyphen', 'period', 'sn_size' ]
if term_class == 'spell':
    extra = [ 'sn_1gram', 'sn_2gram', 'sn_3gram', 'sn_4gram',
             'sn_skippy2gram', 'sn_skippy3gram', 'sn_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]
else:
    extra = [ 'sp_1gram', 'sp_2gram', 'sp_3gram', 'sp_4gram',
             'sp_skippy2gram', 'sp_skippy3gram', 'sp_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]    
#
df[target_vars]

Unnamed: 0,spell,sound,sp_1gram,sp_2gram,sp_3gram,sp_4gram,sp_skippy2gram,sp_skippy3gram,sp_skippy4gram
644,#insofar#,#ɪnsəfɑɹ#,"[#, i, n, s, o, f, a, r, #]","[#i, in, ns, so, of, fa, ar, r#, #, i, n, s, o...","[#in, ins, nso, sof, ofa, far, ar#, #i, in, ns...","[#ins, inso, nsof, sofa, ofar, far#, #in, ins,...","[#i, #…n, #…s, #…o, #…f, #…a, #…r, #…#, in, i…...","[#in, #i…s, #i…o, #i…f, #i…a, #i…r, #i…#, #…ns...","[#ins, #in…o, #in…f, #in…a, #in…r, #in…#, #i…s..."
1325,#upriver#,#əpɹɪvɝ#,"[#, u, p, r, i, v, e, r, #]","[#u, up, pr, ri, iv, ve, er, r#, #, u, p, r, i...","[#up, upr, pri, riv, ive, ver, er#, #u, up, pr...","[#upr, upri, priv, rive, iver, ver#, #up, upr,...","[#u, #…p, #…r, #…i, #…v, #…e, #…#, up, u…r, u…...","[#up, #u…r, #u…i, #u…v, #u…e, #u…#, #…pr, #…p…...","[#upr, #up…i, #up…v, #up…e, #up…r, #up…#, #u…r..."
989,#quickly#,#kwɪkɫi#,"[#, q, u, i, c, k, l, y, #]","[#q, qu, ui, ic, ck, kl, ly, y#, #, q, u, i, c...","[#qu, qui, uic, ick, ckl, kly, ly#, #q, qu, ui...","[#qui, quic, uick, ickl, ckly, kly#, #qu, qui,...","[#q, #…u, #…i, #…c, #…k, #…l, #…y, #…#, qu, q…...","[#qu, #q…i, #q…c, #q…k, #q…l, #q…y, #q…#, #…ui...","[#qui, #qu…c, #qu…k, #qu…l, #qu…y, #qu…#, #q…i..."
457,#foremost#,#fɔɹmoʊst#,"[#, f, o, r, e, m, o, s, t, #]","[#f, fo, or, re, em, mo, os, st, t#, #, f, o, ...","[#fo, for, ore, rem, emo, mos, ost, st#, #f, f...","[#for, fore, orem, remo, emos, most, ost#, #fo...","[#f, #…o, #…r, #…e, #…m, #…s, #…t, #…#, fo, f…...","[#fo, #f…r, #f…e, #f…m, #f…o, #f…s, #f…t, #f…#...","[#for, #fo…e, #fo…m, #fo…o, #fo…s, #fo…t, #fo…..."
47,#akimbo#,#əkɪmboʊ#,"[#, a, k, i, m, b, o, #]","[#a, ak, ki, im, mb, bo, o#, #, a, k, i, m, b,...","[#ak, aki, kim, imb, mbo, bo#, #a, ak, ki, im,...","[#aki, akim, kimb, imbo, mbo#, #ak, aki, kim, ...","[#a, #…k, #…i, #…m, #…b, #…o, #…#, ak, a…i, a…...","[#ak, #a…i, #a…m, #a…b, #a…o, #a…#, #…ki, #…k…...","[#aki, #ak…m, #ak…b, #ak…o, #ak…#, #a…im, #a…i..."
...,...,...,...,...,...,...,...,...,...
144,#benignly#,#bənaɪnɫi#,"[#, b, e, n, i, g, n, l, y, #]","[#b, be, en, ni, ig, gn, nl, ly, y#, #, b, e, ...","[#be, ben, eni, nig, ign, gnl, nly, ly#, #b, b...","[#ben, beni, enig, nign, ignl, gnly, nly#, #be...","[#b, #…e, #…n, #…i, #…g, #…l, #…y, #…#, be, b…...","[#be, #b…n, #b…i, #b…g, #b…l, #b…y, #b…#, #…en...","[#ben, #be…i, #be…g, #be…n, #be…l, #be…y, #be…..."
136,#bang#,#bæŋ#,"[#, b, a, n, g, #]","[#b, ba, an, ng, g#, #, b, a, n, g, #]","[#ba, ban, ang, ng#, #b, ba, an, ng, g#, #, b,...","[#ban, bang, ang#, #ba, ban, ang, ng#, #b, ba,...","[#b, #…a, #…n, #…g, #…#, ba, b…n, b…g, b…#, an...","[#ba, #b…n, #b…g, #b…#, #…an, #…a…g, #…a…#, #…...","[#ban, #ba…g, #ba…#, #b…ng, #b…n…#, #b…g#, #…a..."
540,#heavily#,#hɛvəɫi#,"[#, h, e, a, v, i, l, y, #]","[#h, he, ea, av, vi, il, ly, y#, #, h, e, a, v...","[#he, hea, eav, avi, vil, ily, ly#, #h, he, ea...","[#hea, heav, eavi, avil, vily, ily#, #he, hea,...","[#h, #…e, #…a, #…v, #…i, #…l, #…y, #…#, he, h…...","[#he, #h…a, #h…v, #h…i, #h…l, #h…y, #h…#, #…ea...","[#hea, #he…v, #he…i, #he…l, #he…y, #he…#, #h…a..."
148,#between#,#bitwin#,"[#, b, e, t, w, e, e, n, #]","[#b, be, et, tw, we, ee, en, n#, #, b, e, t, w...","[#be, bet, etw, twe, wee, een, en#, #b, be, et...","[#bet, betw, etwe, twee, ween, een#, #be, bet,...","[#b, #…e, #…t, #…w, #…n, #…#, be, b…t, b…w, b…...","[#be, #b…t, #b…w, #b…e, #b…n, #b…#, #…et, #…e…...","[#bet, #be…w, #be…e, #be…n, #be…#, #b…tw, #b…t..."


In [599]:
## select data type and define doc_dict
import random
if "sp_" in term_type:
    base_type = "spell"
else:
    base_type = "sound"
doc_dict = { i: x for i, x in enumerate(df[base_type]) }
## check
random.sample(doc_dict.items(), 10)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(doc_dict.items(), 10)


[(93, '#sloppily#'),
 (109, '#sexually#'),
 (263, '#jokingly#'),
 (260, '#home#'),
 (43, '#little#'),
 (98, '#homeward#'),
 (207, '#legato#'),
 (20, '#recently#'),
 (91, '#inward#'),
 (251, '#pat#')]

In [600]:
## select bots for analysis
enable_term_change = False # if you want to change term_type to save time and energy
if enable_term_change:
	term_type = 'sp_skippy4gram'
print(f"(changed) term_type: {term_type}")

## bot stands for 'bag-of-terms', a generalization of 'bag-of-words'
bots = [ x for x in df[term_type] if len(x) > min_bot_size ] # Crucially
import random
random.sample(bots, 3)

(changed) term_type: sp_skippy4gram


[['#noi',
  '#no…s',
  '#no…i',
  '#no…l',
  '#no…y',
  '#no…#',
  '#n…is',
  '#n…i…i',
  '#n…i…l',
  '#n…i…y',
  '#n…i…#',
  '#n…si',
  '#n…s…l',
  '#n…s…y',
  '#n…s…#',
  '#n…il',
  '#n…ly',
  '#n…l…#',
  '#n…y#',
  '#…o…is',
  '#…oi…i',
  '#…oi…l',
  '#…oi…y',
  '#…oi…#',
  '#…o…si',
  '#…o…s…l',
  '#…o…s…y',
  '#…o…s…#',
  '#…o…il',
  '#…o…i…y',
  '#…o…i…#',
  '#…o…ly',
  '#…o…l…#',
  '#…o…y#',
  '#…i…si',
  '#…is…l',
  '#…is…y',
  '#…is…#',
  '#…i…il',
  '#…i…i…y',
  '#…i…i…#',
  '#…i…ly',
  '#…i…l…#',
  '#…i…y#',
  '#…s…il',
  '#…si…y',
  '#…si…#',
  '#…s…ly',
  '#…s…l…#',
  '#…s…y#',
  '#…il…#',
  '#…l…y#',
  'nois',
  'noi…i',
  'noi…l',
  'noi…y',
  'noi…#',
  'no…si',
  'no…s…l',
  'no…s…y',
  'no…s…#',
  'no…il',
  'no…i…y',
  'no…i…#',
  'no…ly',
  'no…l…#',
  'no…y#',
  'n…i…si',
  'n…is…l',
  'n…is…y',
  'n…is…#',
  'n…i…il',
  'n…i…i…y',
  'n…i…i…#',
  'n…i…ly',
  'n…i…l…#',
  'n…i…y#',
  'n…s…il',
  'n…si…y',
  'n…si…#',
  'n…s…ly',
  'n…s…l…#',
  'n…s…y#',
  'n…il…#',


In [601]:
## generate dictionary
from gensim.corpora import Dictionary
diction = Dictionary(bots)
print(diction)

if apply_term_filtering:
    print(f"term filtering applied")
    diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
else:
    print(f"term filtering not applied")
print(diction)

## generate DTM
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) > min_bot_size ] # Crucially

Dictionary(27164 unique tokens: ['#', '#i', '#in', '#ins', '#in…#']...)
term filtering applied
Dictionary(3801 unique tokens: ['#i', '#in', '#in…#', '#in…a', '#in…r']...)


In [602]:
## HDP (n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [603]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	vis_output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{accent_status}{term_type}.html"
	pyLDAvis.save_html(vis_data, vis_output)

In [604]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

## investigate topics
n_docs_to_show  = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    #topic_encoding = ", ".join(hdp.show_topic(topic_id))
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.002 * o…i…ng + 0.002 * s…d…# + 0.002 * #n…y + 0.002 * for…# + 0.002 * i…rd + 0.002 * m…e…# + 0.002 * gh
nonzero count:  20
	0.9911: #humanely#
	0.9874: #offshore#
	0.9864: #ashore#
	0.8895: #humbly#
	0.8091: #inshore#
	0.7229: #only#
	0.5348: #afresh#
	0.4460: #plumb#
	0.3209: #offstage#
	0.2687: #openly#
topic_id 1: 0.002 * alo + 0.002 * e…rl…# + 0.002 * e…e…l…# + 0.002 * #m…l + 0.002 * i…w…r + 0.002 * mo…# + 0.002 * #…o…a…l
nonzero count:  19
	0.9890: #jokingly#
	0.8339: #along#
	0.7118: #long#
	0.6856: #soaking#
	0.5309: #jointly#
	0.4490: #singly#
	0.3717: #alone#
	0.3697: #aloft#
	0.3219: #starkly#
	0.2530: #aloud#
topic_id 2: 0.002 * a…nt…# + 0.002 * #so…# + 0.002 * #…n…r…# + 0.002 * f…c…y + 0.002 * o…el + 0.002 * #…r…b…y + 0.002 * #l…e…#
nonzero count:  20
	0.9858: #evermore#
	0.9431: #more#
	0.6293: #well#
	0.5376: #piping#
	0.5364: #after#
	0.4823: #even#
	0.4320: #underarm#
	0.4182: #later#
	0.3570: #singly#
	0.3515: #roaring#
topic_id 3: 0.002 * h…i…y + 0.002 *

In [605]:
## HDP (n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [606]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{accent_status}{term_type}.html"
	pyLDAvis.save_html(vis_data, vis_output)

In [607]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: {len(probs.nonzero()[0])}")
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.002 * h…i…y + 0.002 * g…a…# + 0.002 * a…u…ll + 0.002 * #…vi…# + 0.002 * #q…i…# + 0.002 * #…c…k# + 0.002 * #…u…i…y
nonzero count: 29
	0.9892: #jokingly#
	0.9849: #heavily#
	0.5710: #behind#
	0.4122: #soaking#
	0.4100: #jointly#
	0.3673: #second#
	0.3656: #happily#
	0.3486: #lavishly#
	0.3137: #clumsily#
	0.3129: #little#
topic_id 1: 0.002 * t…n + 0.002 * so…i…# + 0.002 * f…d + 0.002 * #…s…a…e + 0.002 * c…tl + 0.002 * ep…y + 0.002 * #t…o…#
nonzero count: 33
	0.9897: #benignly#
	0.9829: #between#
	0.9772: #better#
	0.7859: #below#
	0.6363: #best#
	0.5378: #lamely#
	0.4924: #each#
	0.4612: #largo#
	0.4379: #enough#
	0.3120: #ninefold#
topic_id 2: 0.002 * #j…y# + 0.002 * #…of + 0.002 * #m…r…# + 0.002 * #…nt…l + 0.002 * #…r…h…# + 0.002 * ply + 0.001 * nt…ly
nonzero count: 39
	0.9830: #counter#
	0.9554: #aloft#
	0.9415: #bang#
	0.7959: #later#
	0.7930: #stark#
	0.6139: #blankly#
	0.5975: #aft#
	0.5020: #askew#
	0.5016: #far#
	0.4923: #alone#
topic_id 3: 0.002 * o…i…ng + 0.002 * 

In [608]:
## HDP (n_topics = 15)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 15
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [609]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	vis_output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{accent_status}{term_type}.html"
	pyLDAvis.save_html(vis_data, vis_output)

In [610]:
## topic investigation
import numpy as np
import HDP_helper
reload_module = True
if reload_module:
    import importlib
    importlib.reload(HDP_helper)

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.002 * no…y + 0.002 * ep…y# + 0.002 * #…n…g + 0.002 * #…r…o…# + 0.002 * #…m…el + 0.002 * eas + 0.002 * se…#
nonzero count:  101
	0.9183: #best#
	0.9001: #sic#
	0.8202: #due#
	0.7536: #between#
	0.7003: #yet#
	0.6541: #better#
	0.6506: #briskly#
	0.6184: #tenfold#
	0.5878: #live#
	0.5875: #quickly#
topic_id 1: 0.002 * i…i…l…# + 0.002 * #i…l…# + 0.002 * #ma…y + 0.002 * u…t…y + 0.002 * o…e…ly + 0.002 * #…us…# + 0.002 * i…e…l
nonzero count:  93
	0.9186: #aft#
	0.6028: #innately#
	0.5948: #dorsally#
	0.5862: #aloft#
	0.5196: #forward#
	0.4724: #normally#
	0.4564: #singly#
	0.4296: #namely#
	0.4121: #nowhere#
	0.3987: #early#
topic_id 2: 0.002 * hor + 0.002 * ply + 0.002 * #pro + 0.002 * p…o…# + 0.002 * e…i…# + 0.002 * p…il…# + 0.002 * #…r…e…y
nonzero count:  84
	0.6045: #pop#
	0.5558: #stiff#
	0.5350: #express#
	0.5340: #under#
	0.4879: #such#
	0.4384: #adrift#
	0.4104: #just#
	0.4084: #plop#
	0.3984: #behind#
	0.3886: #upstairs#
topic_id 3: 0.003 * o…t…y + 0.002 * ee…l…# + 0.0