In [308]:
#!pip install -U pandas

In [309]:
#!pip install -U pyLDAvis

In [310]:
## imports
import os, sys
import pprint as pp

In [311]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [312]:
## variables

## target language
## a key must be part of an open-dict-ipa file name 
target_lang_dict = {    'ar'    : 'Arabic',
                        'de'    : 'German',
                        'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
target_lang_keys = [    'ar', 'de', 'en_US', 'en_UK', 'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC', 'is', 'nl', 'ro', 'sw' ]
target_lang_key  = target_lang_keys[-1]
print(f"target: {target_lang_dict[target_lang_key]} ({target_lang_key})")

## sampling
source_sampling      = True
source_samping_rate  = 0.5
second_sampling      = False
second_sampling_rate = 0.7
## doc settings
max_doc_size       = 10
min_doc_size       =  5
## term settings
term_classes       = [ 'spell', 'sound' ]
term_class         = term_classes[1]
ngram_is_inclusive = True
gap_mark           = "…"
term_is_skippy     = True
n_for_ngram        = 4
print(f"term_is_skippy: {term_is_skippy}")
print(f"term_class: {term_class}")
print(f"n_for_ngram: {n_for_ngram}")
## define term_type
if term_class == 'spell':
    if term_is_skippy:
        term_type = f"sp_skippy{n_for_ngram}gram"
    else:
        term_type = f"sp_{n_for_ngram}gram"
else:
    if term_is_skippy:
        term_type = f"sn_skippy{n_for_ngram}gram"
    else:
        term_type = f"sn_{n_for_ngram}gram"
## check
print(f"term_type: {term_type}")

target: Swahili (sw)
term_is_skippy: True
term_class: sound
n_for_ngram: 4
term_type: sn_skippy4gram


In [313]:
## LDA/HDP
apply_term_filtering = True
## The following parameters need to be relatively large to prevent "Row sum not equal 1" error
term_minfreq         = 3
abuse_threshold      = 0.03
min_bot_size         = 3

In [315]:
## set target files
import glob
data_dir = "data/open-dict-ipa/data1/"
target_files = glob.glob(f"{data_dir}/*")
target_files = sorted([ file for file in target_files if ".csv" in file ])
pp.pprint(target_files)

['data/open-dict-ipa/data1/ar.csv',
 'data/open-dict-ipa/data1/de.csv',
 'data/open-dict-ipa/data1/en_UK.csv',
 'data/open-dict-ipa/data1/en_US.csv',
 'data/open-dict-ipa/data1/eo.csv',
 'data/open-dict-ipa/data1/es_ES.csv',
 'data/open-dict-ipa/data1/es_MX.csv',
 'data/open-dict-ipa/data1/fa.csv',
 'data/open-dict-ipa/data1/fi.csv',
 'data/open-dict-ipa/data1/fr_FR.csv',
 'data/open-dict-ipa/data1/fr_QC.csv',
 'data/open-dict-ipa/data1/is.csv',
 'data/open-dict-ipa/data1/ja.csv',
 'data/open-dict-ipa/data1/jam.csv',
 'data/open-dict-ipa/data1/ma.csv',
 'data/open-dict-ipa/data1/nb.csv',
 'data/open-dict-ipa/data1/nl.csv',
 'data/open-dict-ipa/data1/or.csv',
 'data/open-dict-ipa/data1/sv.csv',
 'data/open-dict-ipa/data1/sw.csv',
 'data/open-dict-ipa/data1/vi_C.csv',
 'data/open-dict-ipa/data1/vi_N.csv',
 'data/open-dict-ipa/data1/vi_S.csv',
 'data/open-dict-ipa/data1/yue.csv',
 'data/open-dict-ipa/data1/zh_hans.csv',
 'data/open-dict-ipa/data1/zh_hant.csv']


In [316]:
## get source data from files
import pandas as pd

#target_language_key = "en_US" # can be changed to get other languages
file = [ f for f in target_files if target_lang_key in f ][0]
print(f"processing: {file}")
with open(file, "rt") as f:
    raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['spell', 'sound'])
    sounds = raw_df['sound'].apply(lambda x: x.strip('/') )
    sounds = [ x.split("/,")[0] for x in sounds ] # picks up only the first of multiple entries
    raw_df['sound'] = sounds
#
raw_df.sample(10)


processing: data/open-dict-ipa/data1/sw.csv


Unnamed: 0,spell,sound
28263,mmemvaa,mmeᵐvaa
37432,uandamanaji,uaⁿɗamanaʄi
41922,vinavyotemwa,vinavjotemwa
37714,udodi,uɗoɗi
44100,walipambana,walipaᵐɓana
23836,linahesabik,linahesaɓik
40893,uteo,uteo
42384,vizingiti,viziᵑgiti
40732,utambuliwe,utaᵐɓuliwe
47288,yanayowahus,janajowahus


In [317]:
## source sampling
len(raw_df)
if source_sampling:
	print(f"source sampling applied")
	raw_df = raw_df.sample(round(len(raw_df) * source_sampling_rate))
print(raw_df)

source sampling applied
             spell        sound
2292      akawaaga     akawaaɠa
19656    kuepukana    kuepukana
35151   sitakutupa   sitakutupa
38942  uliyomjalia  ulijomʄalia
26339        mdimu        mɗimu
...            ...          ...
43558        wakoo        wakoo
9285     chomoleka    tʃomoleka
364        Izraili      Izɾaili
27583      mkakaya      mkakaja
2115   akanipeleka  akanipeleka

[9662 rows x 2 columns]


In [318]:
## generate 1-grams for spell and sound
## spell
raw_df['sp_1gram'] = raw_df['spell'].apply(lambda x: list(str(x)))
# add column of size
raw_df['sp_size'] = raw_df['sp_1gram'].apply(lambda x: len(x))
# add column of count of '-' inside
raw_df['hyphen'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("-"))
# add column of count of '.' inside
raw_df['period'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("."))
## sound
# takes the first entry, removes '/' around
raw_df['sn_1gram'] = raw_df['sound'].apply(lambda x: list(x) )
# add column of size
raw_df['sn_size'] = raw_df['sn_1gram'].apply(lambda x: len(x))
## check
raw_df

Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
2292,akawaaga,akawaaɠa,"[a, k, a, w, a, a, g, a]",8,0,0,"[a, k, a, w, a, a, ɠ, a]",8
19656,kuepukana,kuepukana,"[k, u, e, p, u, k, a, n, a]",9,0,0,"[k, u, e, p, u, k, a, n, a]",9
35151,sitakutupa,sitakutupa,"[s, i, t, a, k, u, t, u, p, a]",10,0,0,"[s, i, t, a, k, u, t, u, p, a]",10
38942,uliyomjalia,ulijomʄalia,"[u, l, i, y, o, m, j, a, l, i, a]",11,0,0,"[u, l, i, j, o, m, ʄ, a, l, i, a]",11
26339,mdimu,mɗimu,"[m, d, i, m, u]",5,0,0,"[m, ɗ, i, m, u]",5
...,...,...,...,...,...,...,...,...
43558,wakoo,wakoo,"[w, a, k, o, o]",5,0,0,"[w, a, k, o, o]",5
9285,chomoleka,tʃomoleka,"[c, h, o, m, o, l, e, k, a]",9,0,0,"[t, ʃ, o, m, o, l, e, k, a]",9
364,Izraili,Izɾaili,"[I, z, r, a, i, l, i]",7,0,0,"[I, z, ɾ, a, i, l, i]",7
27583,mkakaya,mkakaja,"[m, k, a, k, a, y, a]",7,0,0,"[m, k, a, k, a, j, a]",7


In [319]:
## filtering raw_data by size
print(f"term_type: {term_type}")
if "sp_" in term_type:
    df_filtered = raw_df[ (raw_df['sp_size'] <= max_doc_size) & (raw_df['sp_size'] >= min_doc_size) & (raw_df['hyphen'] == 0) & (raw_df['period'] == 0) ]
else:
    df_filtered = raw_df[ (raw_df['sn_size'] <= max_doc_size) & (raw_df['sn_size'] >= min_doc_size) ]
#
df_filtered

term_type: sn_skippy4gram


Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
2292,akawaaga,akawaaɠa,"[a, k, a, w, a, a, g, a]",8,0,0,"[a, k, a, w, a, a, ɠ, a]",8
19656,kuepukana,kuepukana,"[k, u, e, p, u, k, a, n, a]",9,0,0,"[k, u, e, p, u, k, a, n, a]",9
35151,sitakutupa,sitakutupa,"[s, i, t, a, k, u, t, u, p, a]",10,0,0,"[s, i, t, a, k, u, t, u, p, a]",10
26339,mdimu,mɗimu,"[m, d, i, m, u]",5,0,0,"[m, ɗ, i, m, u]",5
19152,komeshwa,komeʃwa,"[k, o, m, e, s, h, w, a]",8,0,0,"[k, o, m, e, ʃ, w, a]",7
...,...,...,...,...,...,...,...,...
38100,ukakafu,ukakafu,"[u, k, a, k, a, f, u]",7,0,0,"[u, k, a, k, a, f, u]",7
43558,wakoo,wakoo,"[w, a, k, o, o]",5,0,0,"[w, a, k, o, o]",5
9285,chomoleka,tʃomoleka,"[c, h, o, m, o, l, e, k, a]",9,0,0,"[t, ʃ, o, m, o, l, e, k, a]",9
364,Izraili,Izɾaili,"[I, z, r, a, i, l, i]",7,0,0,"[I, z, ɾ, a, i, l, i]",7


In [320]:
## define df after second sampling if any
len(df_filtered)
if second_sampling:
    df = df_filtered.sample(round(len(df_filtered) * second_sampling_rate))
else:
    df = df_filtered
len(df)

7427

In [321]:
## spell 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)

if term_class == 'spell':
    sp_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_2grams):
            g.extend(list(df['sp_1gram'])[i])
    ## add sp_2gram
    df['sp_2gram'] = sp_2grams

In [322]:
## spell 3grams
import ngrams
if n_for_ngram > 2 and term_class == 'spell':
    sp_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_3grams):
            g.extend(list(df['sp_2gram'])[i])
    ## add sp_2gram
    df['sp_3gram'] = sp_3grams

In [323]:
## spell 4grams
import ngrams
if n_for_ngram > 3 and term_class == 'spell':
    sp_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_4grams):
            g.extend(list(df['sp_3gram'])[i])
    ## add sp_2gram
    df['sp_4gram'] = sp_4grams

In [324]:
## spell skippy2gram
import ngrams_skippy
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams_skippy)
#
if term_class == 'spell':
    sp_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy2grams):
            g.extend(list(df['sp_1gram'])[i])
    #
    df['sp_skippy2gram'] = sp_skippy2grams

In [325]:
## spell skippy3gram
import ngrams_skippy
if n_for_ngram > 2 and term_class == 'spell':
    sp_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy3grams):
            g.extend(list(df['sp_skippy2gram'])[i])
    #
    df['sp_skippy3gram'] = sp_skippy3grams

In [326]:
## spell skippy4gram
import ngrams_skippy
if n_for_ngram > 3 and term_class == 'spell':
    sp_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy4grams):
            g.extend(list(df['sp_skippy3gram'])[i])
    #
    df['sp_skippy4gram'] = sp_skippy4grams

In [327]:
## sound 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)
#
if term_class == 'sound':
    sn_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_2grams):
            g.extend(list(df['sn_1gram'])[i])
    ## add sn_2gram
    df['sn_2gram'] = sn_2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_2gram'] = sn_2grams


In [328]:
## sound 3grams
import ngrams
if n_for_ngram > 2 and term_class == 'sound':
    sn_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_3gram
    df['sn_3gram'] = sn_3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_3gram'] = sn_3grams


In [329]:
## sound 4grams
import ngrams
if n_for_ngram > 3 and term_class == 'sound':
    sn_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_4gram
    df['sn_4gram'] = sn_3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_4gram'] = sn_3grams


In [332]:
## sound skippy2gram
import ngrams_skippy
if term_class == 'sound':
    sn_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy2grams):
            g.extend(list(df['sn_1gram'])[i])
    #
    df['sn_skippy2gram'] = sn_skippy2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy2gram'] = sn_skippy2grams


In [333]:
## sound skippy3gram
import ngrams_skippy
if n_for_ngram > 2 and term_class == 'sound':
    sn_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy3grams):
            g.extend(list(df['sn_skippy2gram'])[i])
    #
    df['sn_skippy3gram'] = sn_skippy3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy3gram'] = sn_skippy3grams


In [334]:
## sound skippy4gram
import ngrams_skippy
if n_for_ngram > 3 and term_class == 'sound':
    sn_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x) for x in df['sn_1gram'] ]
    #
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy4grams):
            g.extend(list(df['sn_skippy3gram'])[i])
    #
    df['sn_skippy4gram'] = sn_skippy4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy4gram'] = sn_skippy4grams


In [335]:
## check df
dropped_vars = [ 'sp_size', 'hyphen', 'period', 'sn_size' ]
if term_class == 'spell':
    extra = [ 'sn_1gram', 'sn_2gram', 'sn_3gram', 'sn_skippy2gram', 'sn_skippy3gram', 'sn_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]
else:
    extra = [ 'sp_1gram', 'sp_2gram', 'sp_3gram', 'sp_skippy2gram', 'sp_skippy3gram', 'sp_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]    
#
df[target_vars]

Unnamed: 0,spell,sound,sn_1gram,sn_2gram,sn_3gram,sn_4gram,sn_skippy2gram,sn_skippy3gram,sn_skippy4gram
2292,akawaaga,akawaaɠa,"[a, k, a, w, a, a, ɠ, a]","[ak, ka, aw, wa, aa, aɠ, ɠa, a, k, a, w, a, a,...","[aka, kaw, awa, waa, aaɠ, aɠa, ak, ka, aw, wa,...","[aka, kaw, awa, waa, aaɠ, aɠa, ak, ka, aw, wa,...","[ak, a_a, a_w, a_ɠ, ka, k_w, k_a, k_ɠ, aw, wa,...","[aka, ak_w, ak_a, ak_ɠ, a_aw, a_a_a, a_a_ɠ, a_...","[akaw, aka_a, aka_ɠ, ak_wa, ak_w_a, ak_w_ɠ, ak..."
19656,kuepukana,kuepukana,"[k, u, e, p, u, k, a, n, a]","[ku, ue, ep, pu, uk, ka, an, na, k, u, e, p, u...","[kue, uep, epu, puk, uka, kan, ana, ku, ue, ep...","[kue, uep, epu, puk, uka, kan, ana, ku, ue, ep...","[ku, k_e, k_p, k_u, k_k, k_a, k_n, ue, u_p, u_...","[kue, ku_p, ku_u, ku_k, ku_a, ku_n, k_ep, k_e_...","[kuep, kue_u, kue_k, kue_a, kue_n, ku_pu, ku_p..."
35151,sitakutupa,sitakutupa,"[s, i, t, a, k, u, t, u, p, a]","[si, it, ta, ak, ku, ut, tu, up, pa, s, i, t, ...","[sit, ita, tak, aku, kut, utu, tup, upa, si, i...","[sit, ita, tak, aku, kut, utu, tup, upa, si, i...","[si, s_t, s_a, s_k, s_u, s_p, it, i_a, i_k, i_...","[sit, si_a, si_k, si_u, si_t, si_p, s_ta, s_t_...","[sita, sit_k, sit_u, sit_t, sit_p, sit_a, si_a..."
26339,mdimu,mɗimu,"[m, ɗ, i, m, u]","[mɗ, ɗi, im, mu, m, ɗ, i, m, u]","[mɗi, ɗim, imu, mɗ, ɗi, im, mu, m, ɗ, i, m, u,...","[mɗi, ɗim, imu, mɗ, ɗi, im, mu, m, ɗ, i, m, u,...","[mɗ, m_i, m_m, m_u, ɗi, ɗ_m, ɗ_u, im, i_u, mu,...","[mɗi, mɗ_m, mɗ_u, m_im, m_i_u, m_mu, ɗim, ɗi_u...","[mɗim, mɗi_u, mɗ_mu, m_i_mu, ɗimu, mɗi, mɗ_m, ..."
19152,komeshwa,komeʃwa,"[k, o, m, e, ʃ, w, a]","[ko, om, me, eʃ, ʃw, wa, k, o, m, e, ʃ, w, a]","[kom, ome, meʃ, eʃw, ʃwa, ko, om, me, eʃ, ʃw, ...","[kom, ome, meʃ, eʃw, ʃwa, ko, om, me, eʃ, ʃw, ...","[ko, k_m, k_e, k_ʃ, k_w, k_a, om, o_e, o_ʃ, o_...","[kom, ko_e, ko_ʃ, ko_w, ko_a, k_me, k_m_ʃ, k_m...","[kome, kom_ʃ, kom_w, kom_a, ko_eʃ, ko_e_w, ko_..."
...,...,...,...,...,...,...,...,...,...
38100,ukakafu,ukakafu,"[u, k, a, k, a, f, u]","[uk, ka, ak, ka, af, fu, u, k, a, k, a, f, u]","[uka, kak, aka, kaf, afu, uk, ka, ak, ka, af, ...","[uka, kak, aka, kaf, afu, uk, ka, ak, ka, af, ...","[uk, u_a, u_k, u_f, u_u, ka, k_k, k_a, k_f, k_...","[uka, uk_k, uk_a, uk_f, uk_u, u_ak, u_a_a, u_a...","[ukak, uka_a, uka_f, uka_u, uk_ka, uk_k_f, uk_..."
43558,wakoo,wakoo,"[w, a, k, o, o]","[wa, ak, ko, oo, w, a, k, o, o]","[wak, ako, koo, wa, ak, ko, oo, w, a, k, o, o,...","[wak, ako, koo, wa, ak, ko, oo, w, a, k, o, o,...","[wa, w_k, w_o, ak, a_o, ko, k_o, oo, w, a, k, ...","[wak, wa_o, w_ko, w_k_o, w_oo, ako, ak_o, a_oo...","[wako, wak_o, wa_oo, w_k_oo, akoo, wak, wa_o, ..."
9285,chomoleka,tʃomoleka,"[t, ʃ, o, m, o, l, e, k, a]","[tʃ, ʃo, om, mo, ol, le, ek, ka, t, ʃ, o, m, o...","[tʃo, ʃom, omo, mol, ole, lek, eka, tʃ, ʃo, om...","[tʃo, ʃom, omo, mol, ole, lek, eka, tʃ, ʃo, om...","[tʃ, t_o, t_m, t_l, t_e, t_k, t_a, ʃo, ʃ_m, ʃ_...","[tʃo, tʃ_m, tʃ_o, tʃ_l, tʃ_e, tʃ_k, tʃ_a, t_om...","[tʃom, tʃo_o, tʃo_l, tʃo_e, tʃo_k, tʃo_a, tʃ_m..."
364,Izraili,Izɾaili,"[I, z, ɾ, a, i, l, i]","[Iz, zɾ, ɾa, ai, il, li, I, z, ɾ, a, i, l, i]","[Izɾ, zɾa, ɾai, ail, ili, Iz, zɾ, ɾa, ai, il, ...","[Izɾ, zɾa, ɾai, ail, ili, Iz, zɾ, ɾa, ai, il, ...","[Iz, I_ɾ, I_a, I_i, I_l, zɾ, z_a, z_i, z_l, ɾa...","[Izɾ, Iz_a, Iz_i, Iz_l, I_ɾa, I_ɾ_i, I_ɾ_l, I_...","[Izɾa, Izɾ_i, Izɾ_l, Iz_ai, Iz_a_l, Iz_a_i, Iz..."


In [336]:
## select data type and define doc_dict
import random
if "sp_" in term_type:
    base_type = "spell"
else:
    base_type = "sound"
doc_dict = { i: x for i, x in enumerate(df[base_type]) }
## check
random.sample(doc_dict.items(), 10)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(doc_dict.items(), 10)


[(5073, 'mifupa'),
 (3395, 'akiwaaᵐɓia'),
 (6186, 'niwapajo'),
 (1059, 'tulisifu'),
 (2472, 'ɓiⁿzaɾi'),
 (1467, 'kiseɾikali'),
 (4817, 'vikawa'),
 (4290, 'likatukia'),
 (786, 'falaɗi'),
 (5038, 'itaɓiɗi')]

In [337]:
## select bots for analysis
enable_term_change = False # if you want to change term_type to save time and energy
if enable_term_change:
	term_type = 'sp_skippy4gram'
print(f"(changed) term_type: {term_type}")

## bot stands for 'bag-of-terms', a generalization of 'bag-of-words'
bots = [ x for x in df[term_type] if len(x) > 1 ] # Crucially
import random
random.sample(bots, 3)

(changed) term_type: sn_skippy4gram


[['kise',
  'kis_j',
  'kis_e',
  'ki_ej',
  'ki_e_e',
  'ki_e_j',
  'ki_je',
  'ki_j_j',
  'ki_j_e',
  'k_s_ej',
  'k_se_e',
  'k_se_j',
  'k_s_je',
  'k_s_j_j',
  'k_s_j_e',
  'k_s_e_e',
  'k_e_je',
  'k_ej_j',
  'k_ej_e',
  'k_e_ej',
  'k_e_e_e',
  'k_j_ej',
  'k_je_e',
  'k_j_je',
  'isej',
  'ise_e',
  'ise_j',
  'is_je',
  'is_j_j',
  'is_j_e',
  'is_ej',
  'is_e_e',
  'i_e_je',
  'i_ej_j',
  'i_ej_e',
  'i_e_ej',
  'i_e_e_e',
  'i_j_ej',
  'i_je_e',
  'i_j_je',
  'seje',
  'sej_j',
  'sej_e',
  'se_ej',
  'se_e_e',
  'se_je',
  's_j_ej',
  's_je_e',
  's_j_je',
  's_e_je',
  'ejej',
  'eje_e',
  'ej_je',
  'e_e_je',
  'jeje',
  'kis',
  'ki_e',
  'ki_j',
  'k_se',
  'k_s_j',
  'k_s_e',
  'k_ej',
  'k_e_e',
  'k_e_j',
  'k_je',
  'k_j_j',
  'k_j_e',
  'ise',
  'is_j',
  'is_e',
  'i_ej',
  'i_e_e',
  'i_e_j',
  'i_je',
  'i_j_j',
  'i_j_e',
  'sej',
  'se_e',
  'se_j',
  's_je',
  's_j_j',
  's_j_e',
  's_ej',
  's_e_e',
  'eje',
  'ej_j',
  'ej_e',
  'e_ej',
  'e_e_e',
  'e_je',

In [338]:
## generate dictionary
from gensim.corpora import Dictionary
diction = Dictionary(bots)
print(diction)

if apply_term_filtering:
    print(f"term filtering applied")
    diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
else:
    print(f"term filtering not applied")
print(diction)

## generate DTM
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) > min_bot_size ] # Crucially

Dictionary<214007 unique tokens: ['a', 'a_a', 'a_a_a', 'a_a_a_a', 'a_a_a_ɠ']...>
term filtering applied
Dictionary<70226 unique tokens: ['a_a_a_a', 'a_a_aa', 'a_a_aɠ', 'a_a_wa', 'a_a_ɠ']...>


In [339]:
## HDP (n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [None]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{term_type}.html"
	pyLDAvis.save_html(vis_data, output)

In [None]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

## investigate topics
n_docs_to_show  = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    #topic_encoding = ", ".join(hdp.show_topic(topic_id))
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * i_y + 0.001 * t_w + 0.001 * li_o + 0.001 * yo + 0.001 * n_t + 0.001 * fa + 0.001 * t_a_a
nonzero count:  761
	0.9970: uliyofanya
	0.9970: aliyofanya
	0.9968: kuyapoteza
	0.9968: mkusanyiko
	0.9968: walimokuwa
	0.9968: inayosukwa
	0.9967: unaofanywa
	0.9967: aliyoiteka
	0.9967: litakiwalo
	0.9967: alivyokuwa
topic_id 1: 0.001 * p_n + 0.001 * pe + 0.001 * a_nd + 0.001 * p_i + 0.001 * p_n_a + 0.001 * en_a + 0.001 * nda
nonzero count:  757
	0.9969: akipunguza
	0.9968: waniongoza
	0.9968: alitupenda
	0.9968: nilimpenda
	0.9968: kuniongoza
	0.9967: unaopindwa
	0.9967: kuwaongoza
	0.9967: tupendavyo
	0.9967: akinipenda
	0.9967: uliopandwa
topic_id 2: 0.001 * a_sh + 0.001 * a_i_h + 0.001 * a_is + 0.001 * a_s_a + 0.001 * i_ha + 0.001 * a_i_sh + 0.001 * a_hi
nonzero count:  882
	0.9968: wanaoshika
	0.9968: kunawishia
	0.9967: dhamirisha
	0.9966: dhaminisha
	0.9966: hatamshika
	0.9966: kuinamisha
	0.9966: kuahirisha
	0.9966: aminishika
	0.9965: wamekwisha
	0.9965: akadhurika
t

In [None]:
## HDP (n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [None]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: {len(probs.nonzero()[0])}")
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.0 * w_n_a + 0.0 * a_za + 0.0 * n_z + 0.0 * w_i_a + 0.0 * m_u_a + 0.0 * w_z + 0.0 * a_ng
nonzero count: 1808
	0.9973: walifungwa
	0.9973: limefungwa
	0.9973: wakifungwa
	0.9972: wamefungwa
	0.9972: walimfunga
	0.9971: waniongoza
	0.9971: changuliwa
	0.9970: wakaongeza
	0.9970: wameanguka
	0.9970: wakafungua
topic_id 1: 0.001 * fa + 0.001 * f_n + 0.0 * fan + 0.0 * any + 0.0 * i_y + 0.0 * fa_a + 0.0 * f_y
nonzero count: 1133
	0.9971: uliyofanya
	0.9970: aliyofanya
	0.9969: kinatumiwa
	0.9968: unaofanywa
	0.9968: yakitumika
	0.9967: inayofanya
	0.9967: hatamshika
	0.9967: kubadilika
	0.9966: hakutakuwa
	0.9966: kayatuliza
topic_id 2: 0.001 * k_ka + 0.001 * u_m_a + 0.0 * k_ta + 0.0 * i_b + 0.0 * ku_i_a + 0.0 * v_a + 0.0 * ku_m
nonzero count: 1176
	0.9969: hutandikwa
	0.9968: ulikomweka
	0.9968: kutumbukia
	0.9968: litakiwalo
	0.9967: kusumbukia
	0.9966: kuwasumbua
	0.9966: wanakimbia
	0.9966: watakimbia
	0.9965: litamkwalo
	0.9965: mtandikaji
topic_id 3: 0.001 * ir + 0.001 * h

In [None]:
## HDP (n_topics = 15)
import gensim.models
import pyLDAvis.gensim

max_n_topics = 15
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [None]:
## topic investigation
import numpy as np
import HDP_helper
reload_module = True
if reload_module:
    import importlib
    importlib.reload(HDP_helper)

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.0 * ata + 0.0 * ut_a + 0.0 * t_t + 0.0 * a_at + 0.0 * el + 0.0 * k_ka + 0.0 * h_a_a
nonzero count:  3054
	0.9974: kutobolewa
	0.9973: kuambuliwa
	0.9972: kuyapoteza
	0.9972: watampokea
	0.9972: wakaupokea
	0.9972: hutobolewa
	0.9971: wakubalike
	0.9971: hakujiweka
	0.9971: hulazimika
	0.9970: kutembelea


topic_id 1: 0.0 * ku_i_a + 0.0 * u_k_a + 0.0 * uk_a + 0.0 * ku_k + 0.0 * k_ia + 0.0 * u_m_a + 0.0 * ku_n
nonzero count:  2472
	0.9972: hutandikwa
	0.9972: wakichukua
	0.9971: kuambukiza
	0.9971: ulikomweka
	0.9971: mkusanyiko
	0.9970: akimtukuza
	0.9970: kutumbukia
	0.9970: kusindikia
	0.9970: kujifanyia
	0.9970: kusumbukia
topic_id 2: 0.0 * n_z + 0.0 * f_n + 0.0 * w_n_a + 0.0 * wa_n + 0.0 * n_n_a + 0.0 * f_n_a + 0.0 * ana_a
nonzero count:  1485
	0.9972: hupendezwa
	0.9972: walifungwa
	0.9971: limefungwa
	0.9971: wakifungwa
	0.9971: wamefungwa
	0.9970: walimfunga
	0.9969: hupendezwi
	0.9969: kumwongeza
	0.9969: waniongoza
	0.9969: amefunikwa
topic_id 3: 0.0 * f_i + 0.0 * n_a_i + 0.0 * m_d + 0.0 * h_h + 0.0 * ad + 0.0 * a_a_a_i + 0.0 * i_ha
nonzero count:  1305
	0.9968: unaopindwa
	0.9967: shindikiza
	0.9966: uchikichia
	0.9966: hukipuliza
	0.9966: hufundisha
	0.9966: ukanijalia
	0.9966: unaopandia
	0.9966: mtandikaji
	0.9965: tunafanywa
	0.9964: naufurahia
topic_id 4: 0.0 * li_o + 0.0 