In [1]:
#%pip install -U pandas

In [2]:
#%pip install -U pyLDAvis

In [3]:
## imports
import os, sys
import pprint as pp

In [4]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [5]:
## variables

## sampling
source_sampling      = True
source_sampling_rate = 0.2
## doc settings
max_doc_size = 10
min_doc_size =  5
## term settings
ngram_is_inclusive = True
gap_mark   = "…"
term_is_skippy  = True
term_classes    = [ 'spell', 'sound' ]
term_class      = term_classes[1]
n_for_ngram     = 4
print(f"term_is_skippy: {term_is_skippy}")
print(f"term_class: {term_class}")
print(f"n_for_ngram: {n_for_ngram}")
## define term_type
if term_class == 'spell':
    if term_is_skippy:
        term_type = f"sp_skippy{n_for_ngram}gram"
    else:
        term_type = f"sp_{n_for_ngram}gram"
else:
    if term_is_skippy:
        term_type = f"sn_skippy{n_for_ngram}gram"
    else:
        term_type = f"sn_{n_for_ngram}gram"
## check
print(f"term_type: {term_type}")

term_is_skippy: True
term_class: sound
n_for_ngram: 4
term_type: sn_skippy4gram


In [6]:
## LDA/HDP
apply_term_filtering = True
## The following parameters need to be relatively large to prevent "Row sum not equal 1" error
term_minfreq         = 3
abuse_threshold      = 0.03
min_bot_size         = 3

In [7]:
## set target files
import glob
data_dir = "data/open-dict-ipa/data1/"
target_files = glob.glob(f"{data_dir}/*")
target_files = [ file for file in target_files if ".csv" in file ]
pp.pprint(target_files)

['data/open-dict-ipa/data1/nb.csv',
 'data/open-dict-ipa/data1/yue.csv',
 'data/open-dict-ipa/data1/or.csv',
 'data/open-dict-ipa/data1/fr_FR.csv',
 'data/open-dict-ipa/data1/es_ES.csv',
 'data/open-dict-ipa/data1/jam.csv',
 'data/open-dict-ipa/data1/is.csv',
 'data/open-dict-ipa/data1/vi_S.csv',
 'data/open-dict-ipa/data1/fi.csv',
 'data/open-dict-ipa/data1/vi_C.csv',
 'data/open-dict-ipa/data1/de.csv',
 'data/open-dict-ipa/data1/en_US.csv',
 'data/open-dict-ipa/data1/es_MX.csv',
 'data/open-dict-ipa/data1/eo.csv',
 'data/open-dict-ipa/data1/ja.csv',
 'data/open-dict-ipa/data1/en_UK.csv',
 'data/open-dict-ipa/data1/sv.csv',
 'data/open-dict-ipa/data1/sw.csv',
 'data/open-dict-ipa/data1/fa.csv',
 'data/open-dict-ipa/data1/vi_N.csv',
 'data/open-dict-ipa/data1/ar.csv',
 'data/open-dict-ipa/data1/zh_hans.csv',
 'data/open-dict-ipa/data1/zh_hant.csv',
 'data/open-dict-ipa/data1/ma.csv',
 'data/open-dict-ipa/data1/nl.csv',
 'data/open-dict-ipa/data1/fr_QC.csv']


In [8]:
## get data from files
import pandas as pd

target_key = "en_US" # can be changed to get other languages
file = [ f for f in target_files if target_key in f ][0]
print(f"processing: {file}")
with open(file, "rt") as f:
    raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['spell', 'sound'])
    sounds = raw_df['sound'].apply(lambda x: x.strip('/') )
    sounds = [ x.split("/,")[0] for x in sounds ] # picks up only the first of multiple entries
    raw_df['sound'] = sounds
# 
raw_df.sample(10)

processing: data/open-dict-ipa/data1/en_US.csv


Unnamed: 0,spell,sound
73563,mihalko,mɪˈhæɫkoʊ
83676,parizek,pɝˈɪzɛk
32998,dunked,ˈdəŋkt
48706,hangouts,ˈhæˌŋaʊts
17592,caroselli,kɑɹoʊˈsɛɫi
61390,komarek,koʊˈmɑɹɛk
56836,isolationism,ˌaɪsəˈɫeɪʃəˌnɪzəm
86509,pinheiro,pinˈhɛɹoʊ
101716,session's,ˈsɛʃənz
123762,woodland's,ˈwʊdˌɫændz


In [9]:
## generate 1-grams for spell and sound
## spell
raw_df['sp_1gram'] = raw_df['spell'].apply(lambda x: list(str(x)))
# add column of size
raw_df['sp_size'] = raw_df['sp_1gram'].apply(lambda x: len(x))
# add column of count of '-' inside
raw_df['hyphen'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("-"))
# add column of count of '.' inside
raw_df['period'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("."))
## sound
# takes the first entry, removes '/' around
raw_df['sn_1gram'] = raw_df['sound'].apply(lambda x: list(x) )
# add column of size
raw_df['sn_size'] = raw_df['sn_1gram'].apply(lambda x: len(x))
## check
raw_df

Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
0,'bout,ˈbaʊt,"[', b, o, u, t]",5,0,0,"[ˈ, b, a, ʊ, t]",5
1,'cause,kəz,"[', c, a, u, s, e]",6,0,0,"[k, ə, z]",3
2,'course,ˈkɔɹs,"[', c, o, u, r, s, e]",7,0,0,"[ˈ, k, ɔ, ɹ, s]",5
3,'cuse,ˈkjuz,"[', c, u, s, e]",5,0,0,"[ˈ, k, j, u, z]",5
4,'em,əm,"[', e, m]",3,0,0,"[ə, m]",2
...,...,...,...,...,...,...,...,...
125922,zysk,ˈzaɪsk,"[z, y, s, k]",4,0,0,"[ˈ, z, a, ɪ, s, k]",6
125923,zyskowski,zɪˈskɔfski,"[z, y, s, k, o, w, s, k, i]",9,0,0,"[z, ɪ, ˈ, s, k, ɔ, f, s, k, i]",10
125924,zyuganov,ˈzjuɡɑnɑv,"[z, y, u, g, a, n, o, v]",8,0,0,"[ˈ, z, j, u, ɡ, ɑ, n, ɑ, v]",9
125925,zyuganov's,ˈzjuɡɑnɑvz,"[z, y, u, g, a, n, o, v, ', s]",10,0,0,"[ˈ, z, j, u, ɡ, ɑ, n, ɑ, v, z]",10


In [10]:
## filtering raw_data by size
print(f"term_type: {term_type}")
if "sp_" in term_type:
    df_filtered = raw_df[ (raw_df['sp_size'] <= max_doc_size) & (raw_df['sp_size'] >= min_doc_size) & (raw_df['hyphen'] == 0) & (raw_df['period'] == 0) ]
else:
    df_filtered = raw_df[ (raw_df['sn_size'] <= max_doc_size) & (raw_df['sn_size'] >= min_doc_size) ]
#
df_filtered

term_type: sn_skippy4gram


Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
0,'bout,ˈbaʊt,"[', b, o, u, t]",5,0,0,"[ˈ, b, a, ʊ, t]",5
2,'course,ˈkɔɹs,"[', c, o, u, r, s, e]",7,0,0,"[ˈ, k, ɔ, ɹ, s]",5
3,'cuse,ˈkjuz,"[', c, u, s, e]",5,0,0,"[ˈ, k, j, u, z]",5
5,'frisco,ˈfɹɪskoʊ,"[', f, r, i, s, c, o]",7,0,0,"[ˈ, f, ɹ, ɪ, s, k, o, ʊ]",8
10,'round,ˈɹaʊnd,"[', r, o, u, n, d]",6,0,0,"[ˈ, ɹ, a, ʊ, n, d]",6
...,...,...,...,...,...,...,...,...
125922,zysk,ˈzaɪsk,"[z, y, s, k]",4,0,0,"[ˈ, z, a, ɪ, s, k]",6
125923,zyskowski,zɪˈskɔfski,"[z, y, s, k, o, w, s, k, i]",9,0,0,"[z, ɪ, ˈ, s, k, ɔ, f, s, k, i]",10
125924,zyuganov,ˈzjuɡɑnɑv,"[z, y, u, g, a, n, o, v]",8,0,0,"[ˈ, z, j, u, ɡ, ɑ, n, ɑ, v]",9
125925,zyuganov's,ˈzjuɡɑnɑvz,"[z, y, u, g, a, n, o, v, ', s]",10,0,0,"[ˈ, z, j, u, ɡ, ɑ, n, ɑ, v, z]",10


In [11]:
## establish df after sampling if any
len(df_filtered)
if source_sampling:
    df = df_filtered.sample(round(len(df_filtered) * source_sampling_rate))
else:
    df = df_filtered
len(df)

20170

In [13]:
## spell 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)
sp_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sp_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sp_2grams):
        g.extend(list(df['sp_1gram'])[i])
## add sp_2gram
df['sp_2gram'] = sp_2grams

In [14]:
## spell 3grams
import ngrams
sp_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sp_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sp_3grams):
        g.extend(list(df['sp_2gram'])[i])
## add sp_2gram
df['sp_3gram'] = sp_3grams

In [15]:
## spell 4grams
import ngrams
sp_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sp_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sp_4grams):
        g.extend(list(df['sp_3gram'])[i])
## add sp_2gram
df['sp_4gram'] = sp_4grams

In [16]:
## spell skippy2gram
import ngrams_skippy
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams_skippy)
#
sp_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x) for x in df['sp_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sp_skippy2grams):
        g.extend(list(df['sp_1gram'])[i])
#
df['sp_skippy2gram'] = sp_skippy2grams

In [17]:
## spell skippy3gram
import ngrams_skippy
sp_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x) for x in df['sp_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sp_skippy3grams):
        g.extend(list(df['sp_skippy2gram'])[i])
#
df['sp_skippy3gram'] = sp_skippy3grams

In [18]:
## spell skippy4gram
import ngrams_skippy
sp_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x) for x in df['sp_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sp_skippy4grams):
        g.extend(list(df['sp_skippy3gram'])[i])
#
df['sp_skippy4gram'] = sp_skippy4grams

In [19]:
## sound 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)
#
sn_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sn_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sn_2grams):
        g.extend(list(df['sn_1gram'])[i])
## add sn_2gram
df['sn_2gram'] = sn_2grams

In [20]:
## sound 3grams
import ngrams
sn_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sn_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sn_3grams):
        g.extend(list(df['sn_2gram'])[i])
## add sn_3gram
df['sn_3gram'] = sn_3grams

In [21]:
## sound 4grams
import ngrams
sn_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sn_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sn_3grams):
        g.extend(list(df['sn_2gram'])[i])
## add sn_4gram
df['sn_4gram'] = sn_3grams

In [22]:
## sound skippy2gram
import ngrams_skippy
sn_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x) for x in df['sn_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sn_skippy2grams):
        g.extend(list(df['sn_1gram'])[i])
#
df['sn_skippy2gram'] = sn_skippy2grams

In [23]:
## sound skippy3gram
import ngrams_skippy
sn_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x) for x in df['sn_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sn_skippy3grams):
        g.extend(list(df['sn_skippy2gram'])[i])
#
df['sn_skippy3gram'] = sn_skippy3grams

In [24]:
## sound skippy4gram
import ngrams_skippy
sn_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x) for x in df['sn_1gram'] ]
#
if ngram_is_inclusive:
    for i, g in enumerate(sn_skippy4grams):
        g.extend(list(df['sn_skippy3gram'])[i])
#
df['sn_skippy4gram'] = sn_skippy4grams

In [25]:
## check df
dropped_vars = [ 'sp_size', 'hyphen', 'period', 'sn_size' ]
if "sp_" in term_type:
    extra = [ 'sn_1gram', 'sn_2gram', 'sn_3gram', 'sn_skippy2gram', 'sn_skippy3gram', 'sn_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]
else:
    extra = [ 'sp_1gram', 'sp_2gram', 'sp_3gram', 'sp_skippy2gram', 'sp_skippy3gram', 'sp_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]    
#
df[target_vars]

Unnamed: 0,spell,sound,sn_1gram,sp_4gram,sn_2gram,sn_3gram,sn_4gram,sn_skippy2gram,sn_skippy3gram,sn_skippy4gram
19277,cheapen,ˈtʃipən,"[ˈ, t, ʃ, i, p, ə, n]","[chea, heap, eape, apen, che, hea, eap, ape, p...","[ˈt, tʃ, ʃi, ip, pə, ən, ˈ, t, ʃ, i, p, ə, n]","[ˈtʃ, tʃi, ʃip, ipə, pən, ˈt, tʃ, ʃi, ip, pə, ...","[ˈtʃ, tʃi, ʃip, ipə, pən, ˈt, tʃ, ʃi, ip, pə, ...","[ˈt, ˈ_ʃ, ˈ_i, ˈ_p, ˈ_ə, ˈ_n, tʃ, t_i, t_p, t_...","[ˈtʃ, ˈt_i, ˈt_p, ˈt_ə, ˈt_n, ˈ_ʃi, ˈ_ʃ_p, ˈ_ʃ...","[ˈtʃi, ˈtʃ_p, ˈtʃ_ə, ˈtʃ_n, ˈt_ip, ˈt_i_ə, ˈt_..."
43649,gejdenson,ˈɡeɪdənsən,"[ˈ, ɡ, e, ɪ, d, ə, n, s, ə, n]","[gejd, ejde, jden, dens, enso, nson, gej, ejd,...","[ˈɡ, ɡe, eɪ, ɪd, də, ən, ns, sə, ən, ˈ, ɡ, e, ...","[ˈɡe, ɡeɪ, eɪd, ɪdə, dən, əns, nsə, sən, ˈɡ, ɡ...","[ˈɡe, ɡeɪ, eɪd, ɪdə, dən, əns, nsə, sən, ˈɡ, ɡ...","[ˈɡ, ˈ_e, ˈ_ɪ, ˈ_d, ˈ_ə, ˈ_n, ˈ_s, ɡe, ɡ_ɪ, ɡ_...","[ˈɡe, ˈɡ_ɪ, ˈɡ_d, ˈɡ_ə, ˈɡ_n, ˈɡ_s, ˈ_eɪ, ˈ_e_...","[ˈɡeɪ, ˈɡe_d, ˈɡe_ə, ˈɡe_n, ˈɡe_s, ˈɡ_ɪd, ˈɡ_ɪ..."
107060,sta,ˈɛsˈtiˈeɪ,"[ˈ, ɛ, s, ˈ, t, i, ˈ, e, ɪ]","[sta, sta, st, ta, s, t, a]","[ˈɛ, ɛs, sˈ, ˈt, ti, iˈ, ˈe, eɪ, ˈ, ɛ, s, ˈ, t...","[ˈɛs, ɛsˈ, sˈt, ˈti, tiˈ, iˈe, ˈeɪ, ˈɛ, ɛs, sˈ...","[ˈɛs, ɛsˈ, sˈt, ˈti, tiˈ, iˈe, ˈeɪ, ˈɛ, ɛs, sˈ...","[ˈɛ, ˈ_s, ˈ_ˈ, ˈ_t, ˈ_i, ˈ_e, ˈ_ɪ, ɛs, ɛ_ˈ, ɛ_...","[ˈɛs, ˈɛ_ˈ, ˈɛ_t, ˈɛ_i, ˈɛ_e, ˈɛ_ɪ, ˈ_sˈ, ˈ_s_...","[ˈɛsˈ, ˈɛs_t, ˈɛs_i, ˈɛs_ˈ, ˈɛs_e, ˈɛs_ɪ, ˈɛ_ˈ..."
14677,browned,ˈbɹaʊnd,"[ˈ, b, ɹ, a, ʊ, n, d]","[brow, rown, owne, wned, bro, row, own, wne, n...","[ˈb, bɹ, ɹa, aʊ, ʊn, nd, ˈ, b, ɹ, a, ʊ, n, d]","[ˈbɹ, bɹa, ɹaʊ, aʊn, ʊnd, ˈb, bɹ, ɹa, aʊ, ʊn, ...","[ˈbɹ, bɹa, ɹaʊ, aʊn, ʊnd, ˈb, bɹ, ɹa, aʊ, ʊn, ...","[ˈb, ˈ_ɹ, ˈ_a, ˈ_ʊ, ˈ_n, ˈ_d, bɹ, b_a, b_ʊ, b_...","[ˈbɹ, ˈb_a, ˈb_ʊ, ˈb_n, ˈb_d, ˈ_ɹa, ˈ_ɹ_ʊ, ˈ_ɹ...","[ˈbɹa, ˈbɹ_ʊ, ˈbɹ_n, ˈbɹ_d, ˈb_aʊ, ˈb_a_n, ˈb_..."
61199,kocis,ˈkoʊsɪs,"[ˈ, k, o, ʊ, s, ɪ, s]","[koci, ocis, koc, oci, cis, ko, oc, ci, is, k,...","[ˈk, ko, oʊ, ʊs, sɪ, ɪs, ˈ, k, o, ʊ, s, ɪ, s]","[ˈko, koʊ, oʊs, ʊsɪ, sɪs, ˈk, ko, oʊ, ʊs, sɪ, ...","[ˈko, koʊ, oʊs, ʊsɪ, sɪs, ˈk, ko, oʊ, ʊs, sɪ, ...","[ˈk, ˈ_o, ˈ_ʊ, ˈ_s, ˈ_ɪ, ko, k_ʊ, k_s, k_ɪ, oʊ...","[ˈko, ˈk_ʊ, ˈk_s, ˈk_ɪ, ˈ_oʊ, ˈ_o_s, ˈ_o_ɪ, ˈ_...","[ˈkoʊ, ˈko_s, ˈko_ɪ, ˈk_ʊs, ˈk_ʊ_ɪ, ˈk_ʊ_s, ˈk..."
...,...,...,...,...,...,...,...,...,...,...
41344,francies,fɹənˈsiz,"[f, ɹ, ə, n, ˈ, s, i, z]","[fran, ranc, anci, ncie, cies, fra, ran, anc, ...","[fɹ, ɹə, ən, nˈ, ˈs, si, iz, f, ɹ, ə, n, ˈ, s,...","[fɹə, ɹən, ənˈ, nˈs, ˈsi, siz, fɹ, ɹə, ən, nˈ,...","[fɹə, ɹən, ənˈ, nˈs, ˈsi, siz, fɹ, ɹə, ən, nˈ,...","[fɹ, f_ə, f_n, f_ˈ, f_s, f_i, f_z, ɹə, ɹ_n, ɹ_...","[fɹə, fɹ_n, fɹ_ˈ, fɹ_s, fɹ_i, fɹ_z, f_ən, f_ə_...","[fɹən, fɹə_ˈ, fɹə_s, fɹə_i, fɹə_z, fɹ_nˈ, fɹ_n..."
45497,goldwin,ˈɡoʊɫdwɪn,"[ˈ, ɡ, o, ʊ, ɫ, d, w, ɪ, n]","[gold, oldw, ldwi, dwin, gol, old, ldw, dwi, w...","[ˈɡ, ɡo, oʊ, ʊɫ, ɫd, dw, wɪ, ɪn, ˈ, ɡ, o, ʊ, ɫ...","[ˈɡo, ɡoʊ, oʊɫ, ʊɫd, ɫdw, dwɪ, wɪn, ˈɡ, ɡo, oʊ...","[ˈɡo, ɡoʊ, oʊɫ, ʊɫd, ɫdw, dwɪ, wɪn, ˈɡ, ɡo, oʊ...","[ˈɡ, ˈ_o, ˈ_ʊ, ˈ_ɫ, ˈ_d, ˈ_w, ˈ_ɪ, ˈ_n, ɡo, ɡ_...","[ˈɡo, ˈɡ_ʊ, ˈɡ_ɫ, ˈɡ_d, ˈɡ_w, ˈɡ_ɪ, ˈɡ_n, ˈ_oʊ...","[ˈɡoʊ, ˈɡo_ɫ, ˈɡo_d, ˈɡo_w, ˈɡo_ɪ, ˈɡo_n, ˈɡ_ʊ..."
87516,polevanov,pəˈɫɛvənɑv,"[p, ə, ˈ, ɫ, ɛ, v, ə, n, ɑ, v]","[pole, olev, leva, evan, vano, anov, pol, ole,...","[pə, əˈ, ˈɫ, ɫɛ, ɛv, və, ən, nɑ, ɑv, p, ə, ˈ, ...","[pəˈ, əˈɫ, ˈɫɛ, ɫɛv, ɛvə, vən, ənɑ, nɑv, pə, ə...","[pəˈ, əˈɫ, ˈɫɛ, ɫɛv, ɛvə, vən, ənɑ, nɑv, pə, ə...","[pə, p_ˈ, p_ɫ, p_ɛ, p_v, p_ə, p_n, p_ɑ, əˈ, ə_...","[pəˈ, pə_ɫ, pə_ɛ, pə_v, pə_ə, pə_n, pə_ɑ, p_ˈɫ...","[pəˈɫ, pəˈ_ɛ, pəˈ_v, pəˈ_ə, pəˈ_n, pəˈ_ɑ, pə_ɫ..."
54645,importers,ˌɪmˈpɔɹtɝz,"[ˌ, ɪ, m, ˈ, p, ɔ, ɹ, t, ɝ, z]","[impo, mpor, port, orte, rter, ters, imp, mpo,...","[ˌɪ, ɪm, mˈ, ˈp, pɔ, ɔɹ, ɹt, tɝ, ɝz, ˌ, ɪ, m, ...","[ˌɪm, ɪmˈ, mˈp, ˈpɔ, pɔɹ, ɔɹt, ɹtɝ, tɝz, ˌɪ, ɪ...","[ˌɪm, ɪmˈ, mˈp, ˈpɔ, pɔɹ, ɔɹt, ɹtɝ, tɝz, ˌɪ, ɪ...","[ˌɪ, ˌ_m, ˌ_ˈ, ˌ_p, ˌ_ɔ, ˌ_ɹ, ˌ_t, ˌ_ɝ, ˌ_z, ɪ...","[ˌɪm, ˌɪ_ˈ, ˌɪ_p, ˌɪ_ɔ, ˌɪ_ɹ, ˌɪ_t, ˌɪ_ɝ, ˌɪ_z...","[ˌɪmˈ, ˌɪm_p, ˌɪm_ɔ, ˌɪm_ɹ, ˌɪm_t, ˌɪm_ɝ, ˌɪm_..."


In [26]:
## select data type and define doc_dict
import random
if "sp_" in term_type:
    base_type = "spell"
else:
    base_type = "sound"
doc_dict = { i: x for i, x in enumerate(df[base_type]) }
## check
random.sample(doc_dict.items(), 10)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(doc_dict.items(), 10)


[(2279, 'ˈθimə'),
 (2832, 'kɫiˈænθə'),
 (3689, 'ˈvɪɫən'),
 (9160, 'ˈdʒækɪts'),
 (1658, 'ˈdɹək'),
 (15591, 'ˈeɪtiz'),
 (2603, 'ˈmækɪɫə'),
 (16234, 'ˈkupɝ'),
 (4731, 'ˈsɫɑmə'),
 (8740, 'ənˈdaɪɪŋ')]

In [27]:
## select bots for analysis
import random

bots = [ x for x in df[term_type] if len(x) > 1 ] # Crucially
random.sample(bots, 3)

[['ˈbɫæ',
  'ˈbɫ_n',
  'ˈbɫ_d',
  'ˈbɫ_ɝ',
  'ˈb_æn',
  'ˈb_æ_d',
  'ˈb_æ_ɝ',
  'ˈb_nd',
  'ˈb_n_ɝ',
  'ˈb_dɝ',
  'ˈ_ɫ_æn',
  'ˈ_ɫæ_d',
  'ˈ_ɫæ_ɝ',
  'ˈ_ɫ_nd',
  'ˈ_ɫ_n_ɝ',
  'ˈ_ɫ_dɝ',
  'ˈ_æ_nd',
  'ˈ_æn_ɝ',
  'ˈ_æ_dɝ',
  'ˈ_n_dɝ',
  'bɫæn',
  'bɫæ_d',
  'bɫæ_ɝ',
  'bɫ_nd',
  'bɫ_n_ɝ',
  'bɫ_dɝ',
  'b_æ_nd',
  'b_æn_ɝ',
  'b_æ_dɝ',
  'b_n_dɝ',
  'ɫænd',
  'ɫæn_ɝ',
  'ɫæ_dɝ',
  'ɫ_n_dɝ',
  'ændɝ',
  'ˈbɫ',
  'ˈb_æ',
  'ˈb_n',
  'ˈb_d',
  'ˈb_ɝ',
  'ˈ_ɫæ',
  'ˈ_ɫ_n',
  'ˈ_ɫ_d',
  'ˈ_ɫ_ɝ',
  'ˈ_æn',
  'ˈ_æ_d',
  'ˈ_æ_ɝ',
  'ˈ_nd',
  'ˈ_n_ɝ',
  'ˈ_dɝ',
  'bɫæ',
  'bɫ_n',
  'bɫ_d',
  'bɫ_ɝ',
  'b_æn',
  'b_æ_d',
  'b_æ_ɝ',
  'b_nd',
  'b_n_ɝ',
  'b_dɝ',
  'ɫæn',
  'ɫæ_d',
  'ɫæ_ɝ',
  'ɫ_nd',
  'ɫ_n_ɝ',
  'ɫ_dɝ',
  'ænd',
  'æn_ɝ',
  'æ_dɝ',
  'ndɝ',
  'ˈb',
  'ˈ_ɫ',
  'ˈ_æ',
  'ˈ_n',
  'ˈ_d',
  'ˈ_ɝ',
  'bɫ',
  'b_æ',
  'b_n',
  'b_d',
  'b_ɝ',
  'ɫæ',
  'ɫ_n',
  'ɫ_d',
  'ɫ_ɝ',
  'æn',
  'æ_d',
  'æ_ɝ',
  'nd',
  'n_ɝ',
  'dɝ',
  'ˈ',
  'b',
  'ɫ',
  'æ',
  'n',
  'd',
  'ɝ'],
 ['ˈsaɪ',
 

In [28]:
## generate dictionary
from gensim.corpora import Dictionary
diction = Dictionary(bots)
print(diction)

if apply_term_filtering:
    print(f"term filtering applied")
    diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
else:
    print(f"term filtering not applied")
print(diction)

## generate DTM
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) > min_bot_size ] # Crucially

Dictionary(570208 unique tokens: ['i', 'i_n', 'i_ə', 'i_ən', 'ip']...)
term filtering applied
Dictionary(100000 unique tokens: ['i_n', 'i_ə', 'i_ən', 'ip', 'ip_n']...)


In [29]:
## HDP (n_topics = 15)
import gensim.models
import pyLDAvis.gensim

max_n_topics = 15
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [31]:
## topic investigation
import numpy as np
import HDP_helper
reload_module = True
if reload_module:
    import importlib
    importlib.reload(HDP_helper)

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.0 * tɪ + 0.0 * ˈ_æ_ɪ + 0.0 * ˈ_tɪ + 0.0 * sɪ + 0.0 * dɪ + 0.0 * æ_t + 0.0 * ˈ_ə_ɪ
nonzero count:  5369
	0.9972: ˈmɪnəstɹiz
	0.9972: ˈɪndəstɹiz
	0.9972: ˈɪndəstɹiz
	0.9970: ɹiˈsaʊndɪŋ
	0.9970: ˈpɹɪnstənz
	0.9969: ˈbɹaɪtənɪŋ
	0.9969: ˈpɹɛsɪdənt
	0.9969: dɪˈstɹɛsɪŋ
	0.9969: dɪˈspɛnsɪz
	0.9969: ˈkɹɪstəfɝz
topic_id 1: 0.001 * ə_k + 0.0 * m_ˈ + 0.0 * ɫ_k + 0.0 * əˈ_ə + 0.0 * i_ə + 0.0 * ˈ_nz + 0.0 * əˈ_n
nonzero count:  5197
	0.9970: kəˈnɛkʃənz
	0.9970: məˈkɫɪntək
	0.9969: pɹəˈfɛʃənz
	0.9969: sməˈɫɛnski
	0.9969: əˈɹeɪbiənz
	0.9968: ɪˈtɹəskənz
	0.9968: bəˈtʃɪnski
	0.9968: məkˈdɑnəɫz
	0.9967: məˈkɔɹmɪks
	0.9967: bəˈɫɪviənz
topic_id 2: 0.0 * ˌ_t + 0.0 * s_ɹ + 0.0 * aʊ + 0.0 * ˌ_n + 0.0 * ˌ_ɹ + 0.0 * ˈ_ˌ_t + 0.0 * tɹ
nonzero count:  3279
	0.9970: kəmˈpɫeɪnz
	0.9970: ˈɹaɪnˌhɑɹt
	0.9970: ˈɹaɪnˌhɑɹt
	0.9970: ˈmaɪnˌhɑɹt
	0.9969: ˈtɛɹəˌdaɪn
	0.9969: ˈɪməˌɫeɪts
	0.9969: ˈtɛɹəˌfaɪz
	0.9969: ˈtɛɹəˌfaɪd
	0.9968: ˈwaɪnˌhɑɹt
	0.9968: ˌɪnˈsænəti
topic_id 3: 0.001 * ˈ_m_ən + 0.001 * ʊ_ə + 0.001

In [32]:
## HDP (n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [33]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.0 * ˈ_ə_t + 0.0 * ɛ_s + 0.0 * t_t + 0.0 * ˈ_ɛ_t + 0.0 * ˈ_ɛ_ɪ + 0.0 * s_s + 0.0 * ˈ_s_n
nonzero count:  9994
	0.9977: ˈkɫeɪmənts
	0.9977: ˈpɹɪnstənz
	0.9976: ˈtʃɪɫdɹənz
	0.9976: ˈtʃɪɫdɹənz
	0.9976: ˈtʃɪɫdɹənz
	0.9976: ˈtʃæmpiənz
	0.9976: ˈtʃæmpiənz
	0.9976: dɪˈpɹɛsənt
	0.9975: ˈdɛɹəˌɫɪks
	0.9974: ˈmɑɹtɪnsən
topic_id 1: 0.001 * n_ʊ + 0.001 * ɫ_ʊ + 0.001 * aʊ + 0.001 * ˌ_ʊ + 0.001 * t_ʊ + 0.001 * ˈ_aʊ + 0.001 * ɪ_ʊ
nonzero count:  2362
	0.9965: ˈɫændˌmɑɹk
	0.9965: ˈæɫkəˌtɛɫz
	0.9965: sɪˈnɛɹioʊz
	0.9964: dɪmɑˈɹinoʊ
	0.9964: ˈɪntɝˌkoʊz
	0.9963: ənˈfoʊɫdɪd
	0.9963: dɪˈpɫoʊməz
	0.9962: ˌsænˈtɪɫoʊ
	0.9962: ˈɫaɪnˌbækɝ
	0.9962: sɪnˈfoʊniə
topic_id 2: 0.001 * d_ʊ + 0.001 * ˈ_ɹ_ʊ + 0.001 * n_ʊ + 0.001 * ˌ_ʊ + 0.001 * ɑ_ʊ + 0.001 * m_ʊ + 0.001 * n_ɝ
nonzero count:  1909
	0.9967: ˈænəˌɫaɪzɝ
	0.9966: ˈpænəˌɫaɪz
	0.9966: ˈəndɝˌɫaɪn
	0.9965: ˈɫændˌɫaɪn
	0.9965: ˈɹiˌbaʊndz
	0.9964: ˈwaɪnbɝɡɝz
	0.9963: daɪˈɹɛktɪd
	0.9963: ˈdɑmɪˌnoʊz
	0.9963: ˈdɑmɪˌnoʊz
	0.9963: ˈmɑɹkˌdaʊn
topic_id 3: 0.001

In [34]:
## HDP (n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [35]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

## investigate topics
n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    #topic_encoding = ", ".join(hdp.show_topic(topic_id))
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * ˈ_ɛ_t + 0.001 * ˈ_ɛ_ɪ + 0.001 * ˈ_ə_t + 0.001 * ɛ_s + 0.001 * t_t + 0.001 * s_s + 0.0 * t_s
nonzero count:  6461
	0.9974: ˈtɹɪnˌtɛks
	0.9972: ˈtʃɛɹɪŋtən
	0.9972: ˈtɹɛndiəst
	0.9972: ˈsɑɹdʒənts
	0.9971: ˈseɪtənɪst
	0.9971: ˈmɝtʃɪnsən
	0.9970: ˈtɛɫəˌkæst
	0.9970: ˈdɪɫətɑnts
	0.9970: ˈstədˌstɪɫ
	0.9969: ˈfɑɹməsɪst
topic_id 1: 0.001 * ɹ_ˈ + 0.001 * iˈ + 0.001 * n_i + 0.001 * ɫ_i + 0.001 * k_ˈ + 0.001 * i_i + 0.001 * i_ə
nonzero count:  2680
	0.9967: pɹiˈtɛndɪŋ
	0.9967: pɹəˈfɛʃənz
	0.9966: səˈɫinəsɪz
	0.9966: əˈɫaɪənsəz
	0.9965: əˈsɫeɪniən
	0.9965: ɹiˈtɹæktɪŋ
	0.9965: ˈpɹaɪvəsiz
	0.9965: vəˈɫɛnsiəz
	0.9965: kɹiˈmeɪʃən
	0.9964: ɹiˈstɹeɪnt
topic_id 2: 0.001 * ˈ_ɫ_ə + 0.001 * ˈ_ɫ_ən + 0.001 * ˈ_m_ən + 0.001 * mən + 0.001 * ɫ_ən + 0.001 * d_ɫ + 0.001 * h_n
nonzero count:  2960
	0.9967: ˈhæməɫtənz
	0.9967: ˈbɹæməˌɫiz
	0.9967: məˈkɫɛɫənd
	0.9967: məˈkɫɛɫənd
	0.9966: məˈdʒɛɫənz
	0.9966: ˈhɑɹtˌfiɫd
	0.9965: ˈmændəˌvɪɫ
	0.9965: ˈʃækəɫtənz
	0.9965: ˈdoʊɫdɹəmz
	0.9964: ˈhɑɫdəmənz
t