In [116]:
#!pip install -U pandas

In [117]:
#!pip install -U pyLDAvis

In [118]:
## imports
import os, sys
import pprint as pp

In [119]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [120]:
## target language
## a key must be part of an open-dict-ipa file name 
target_lang_dict = {    'ar'    : 'Arabic',
                        'de'    : 'German',
                        'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
target_lang_keys = [    'ar', 'de', 'en_US', 'en_UK', 'eo',
                        'es_ES', 'es_MX', 'fi', 'fr_FR', 'fr_QC',
                        'is', 'nl', 'ro', 'sw' ]
target_lang_key  = target_lang_keys[5]
print(f"target lang: {target_lang_dict[target_lang_key]} ({target_lang_key})")

target lang: Spanish (Spain) (es_ES)


In [121]:
## term settings
term_classes       = [ 'spell', 'sound' ]
term_class         = term_classes[0]
ngram_is_inclusive = True
gap_mark           = "…"
term_is_skippy     = True
n_for_ngram        = 4
print(f"term_class: {term_class}")
print(f"term_is_skippy: {term_is_skippy}")
print(f"n_for_ngram: {n_for_ngram}")
## define term_type
if term_class == 'spell':
    if term_is_skippy:
        term_type = f"sp_skippy{n_for_ngram}gram"
    else:
        term_type = f"sp_{n_for_ngram}gram"
else:
    if term_is_skippy:
        term_type = f"sn_skippy{n_for_ngram}gram"
    else:
        term_type = f"sn_{n_for_ngram}gram"
## check
print(f"term_type: {term_type}")
## doc settings
max_doc_size       = 10
min_doc_size       =  5

term_class: spell
term_is_skippy: True
n_for_ngram: 4
term_type: sp_skippy4gram


In [122]:
## LDA/HDP
apply_term_filtering = True
## The following parameters need to be relatively large to prevent "Row sum not equal 1" error
term_minfreq         = 3
abuse_threshold      = 0.03
min_bot_size         = 3

In [123]:
## sampling
source_sampling          = True
source_sampling_rate     = 0.5
source_sampling_max_size = 30000
second_sampling          = False
second_sampling_rate     = 0.7

In [124]:
## set target files
import glob
data_dir     = "data/open-dict-ipa/data1/"
target_files = glob.glob(f"{data_dir}/*")
target_files = sorted([ file for file in target_files if ".csv" in file ])
pp.pprint(target_files)

['data/open-dict-ipa/data1/ar.csv',
 'data/open-dict-ipa/data1/de.csv',
 'data/open-dict-ipa/data1/en_UK.csv',
 'data/open-dict-ipa/data1/en_US.csv',
 'data/open-dict-ipa/data1/eo.csv',
 'data/open-dict-ipa/data1/es_ES.csv',
 'data/open-dict-ipa/data1/es_MX.csv',
 'data/open-dict-ipa/data1/fa.csv',
 'data/open-dict-ipa/data1/fi.csv',
 'data/open-dict-ipa/data1/fr_FR.csv',
 'data/open-dict-ipa/data1/fr_QC.csv',
 'data/open-dict-ipa/data1/is.csv',
 'data/open-dict-ipa/data1/ja.csv',
 'data/open-dict-ipa/data1/jam.csv',
 'data/open-dict-ipa/data1/ma.csv',
 'data/open-dict-ipa/data1/nb.csv',
 'data/open-dict-ipa/data1/nl.csv',
 'data/open-dict-ipa/data1/or.csv',
 'data/open-dict-ipa/data1/sv.csv',
 'data/open-dict-ipa/data1/sw.csv',
 'data/open-dict-ipa/data1/vi_C.csv',
 'data/open-dict-ipa/data1/vi_N.csv',
 'data/open-dict-ipa/data1/vi_S.csv',
 'data/open-dict-ipa/data1/yue.csv',
 'data/open-dict-ipa/data1/zh_hans.csv',
 'data/open-dict-ipa/data1/zh_hant.csv']


In [125]:
## get source data from files
import pandas as pd
import gzip

#target_language_key = "en_US" # can be changed to get other languages
file = [ f for f in target_files if target_lang_key in f ][0]
print(f"processing: {file}")
with gzip.open(file, "rt") as f:
    raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['spell', 'sound'])
    sounds = raw_df['sound'].apply(lambda x: x.strip('/') )
    sounds = [ x.split("/,")[0] for x in sounds ] # picks up only the first of multiple entries
    raw_df['sound'] = sounds
#
raw_df.sample(10)


processing: data/open-dict-ipa/data1/es_ES.csv


Unnamed: 0,spell,sound
184580,desencapillaron,deseŋkapiʎaɾon
94541,bombarda,bombaɾða
407485,maqueadas,makeaðas
75426,atizo,atiθo
22194,aeronato,aeɾonato
271886,engruesad,eŋgɾwesað
62294,apurrió,apuˈrjo
131689,condensarías,kondensaˈɾias
351393,guarecería,gwaɾeθeˈɾia
555525,sustituyéndomeles,sustituˈʝendomeles


In [126]:
## source sampling
len(raw_df)
if source_sampling:
	print(f"source sampling applied")
	if len(raw_df) >= source_sampling_max_size:
		raw_df = raw_df.sample(source_sampling_max_size)
	else:
		raw_df = raw_df.sample(round(len(raw_df) * source_sampling_rate))
print(raw_df)

source sampling applied
               spell          sound
445434  pastelearían  pasteleaˈɾian
430375       nevisca        neβiska
207388      despeada       despeaða
396481  liquidadoras    likiðaðoɾas
61056       aproarán      apɾoaˈɾan
...              ...            ...
223756     dilusivos      dilusiβos
364926   ilustrísima   iluˈstɾisima
277696     enmatemos      emmatemos
558350    taraceares     taɾaθeaɾes
349362    gradaríais    gɾaðaˈɾiais

[30000 rows x 2 columns]


In [127]:
## generate 1-grams for spell and sound
## spell
raw_df['sp_1gram'] = raw_df['spell'].apply(lambda x: list(str(x)))
# add column of size
raw_df['sp_size'] = raw_df['sp_1gram'].apply(lambda x: len(x))
# add column of count of '-' inside
raw_df['hyphen'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("-"))
# add column of count of '.' inside
raw_df['period'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("."))
## sound
# takes the first entry, removes '/' around
raw_df['sn_1gram'] = raw_df['sound'].apply(lambda x: list(x) )
# add column of size
raw_df['sn_size'] = raw_df['sn_1gram'].apply(lambda x: len(x))
## check
raw_df

Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
445434,pastelearían,pasteleaˈɾian,"[p, a, s, t, e, l, e, a, r, í, a, n]",12,0,0,"[p, a, s, t, e, l, e, a, ˈ, ɾ, i, a, n]",13
430375,nevisca,neβiska,"[n, e, v, i, s, c, a]",7,0,0,"[n, e, β, i, s, k, a]",7
207388,despeada,despeaða,"[d, e, s, p, e, a, d, a]",8,0,0,"[d, e, s, p, e, a, ð, a]",8
396481,liquidadoras,likiðaðoɾas,"[l, i, q, u, i, d, a, d, o, r, a, s]",12,0,0,"[l, i, k, i, ð, a, ð, o, ɾ, a, s]",11
61056,aproarán,apɾoaˈɾan,"[a, p, r, o, a, r, á, n]",8,0,0,"[a, p, ɾ, o, a, ˈ, ɾ, a, n]",9
...,...,...,...,...,...,...,...,...
223756,dilusivos,dilusiβos,"[d, i, l, u, s, i, v, o, s]",9,0,0,"[d, i, l, u, s, i, β, o, s]",9
364926,ilustrísima,iluˈstɾisima,"[i, l, u, s, t, r, í, s, i, m, a]",11,0,0,"[i, l, u, ˈ, s, t, ɾ, i, s, i, m, a]",12
277696,enmatemos,emmatemos,"[e, n, m, a, t, e, m, o, s]",9,0,0,"[e, m, m, a, t, e, m, o, s]",9
558350,taraceares,taɾaθeaɾes,"[t, a, r, a, c, e, a, r, e, s]",10,0,0,"[t, a, ɾ, a, θ, e, a, ɾ, e, s]",10


In [128]:
## filtering raw_data by size
print(f"term_type: {term_type}")
if "sp_" in term_type:
    df_filtered = raw_df[ (raw_df['sp_size'] <= max_doc_size) & (raw_df['sp_size'] >= min_doc_size) & (raw_df['hyphen'] == 0) & (raw_df['period'] == 0) ]
else:
    df_filtered = raw_df[ (raw_df['sn_size'] <= max_doc_size) & (raw_df['sn_size'] >= min_doc_size) ]
#
df_filtered

term_type: sp_skippy4gram


Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
430375,nevisca,neβiska,"[n, e, v, i, s, c, a]",7,0,0,"[n, e, β, i, s, k, a]",7
207388,despeada,despeaða,"[d, e, s, p, e, a, d, a]",8,0,0,"[d, e, s, p, e, a, ð, a]",8
61056,aproarán,apɾoaˈɾan,"[a, p, r, o, a, r, á, n]",8,0,0,"[a, p, ɾ, o, a, ˈ, ɾ, a, n]",9
59012,apoplejías,apopleˈxias,"[a, p, o, p, l, e, j, í, a, s]",10,0,0,"[a, p, o, p, l, e, ˈ, x, i, a, s]",11
96218,botillera,botiʎeɾa,"[b, o, t, i, l, l, e, r, a]",9,0,0,"[b, o, t, i, ʎ, e, ɾ, a]",8
...,...,...,...,...,...,...,...,...
173062,descarté,deskaˈɾte,"[d, e, s, c, a, r, t, é]",8,0,0,"[d, e, s, k, a, ˈ, ɾ, t, e]",9
223756,dilusivos,dilusiβos,"[d, i, l, u, s, i, v, o, s]",9,0,0,"[d, i, l, u, s, i, β, o, s]",9
277696,enmatemos,emmatemos,"[e, n, m, a, t, e, m, o, s]",9,0,0,"[e, m, m, a, t, e, m, o, s]",9
558350,taraceares,taɾaθeaɾes,"[t, a, r, a, c, e, a, r, e, s]",10,0,0,"[t, a, ɾ, a, θ, e, a, ɾ, e, s]",10


In [129]:
## define df after second sampling if any
len(df_filtered)
if second_sampling:
    df = df_filtered.sample(round(len(df_filtered) * second_sampling_rate))
else:
    df = df_filtered
len(df)

15604

In [130]:
## spell 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)

if term_class == 'spell':
    sp_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_2grams):
            g.extend(list(df['sp_1gram'])[i])
    ## add sp_2gram
    df['sp_2gram'] = sp_2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_2gram'] = sp_2grams


In [131]:
## spell 3grams
import ngrams
if n_for_ngram > 2 and term_class == 'spell':
    sp_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_3grams):
            g.extend(list(df['sp_2gram'])[i])
    ## add sp_2gram
    df['sp_3gram'] = sp_3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_3gram'] = sp_3grams


In [132]:
## spell 4grams
import ngrams
if n_for_ngram > 3 and term_class == 'spell':
    sp_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_4grams):
            g.extend(list(df['sp_3gram'])[i])
    ## add sp_2gram
    df['sp_4gram'] = sp_4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_4gram'] = sp_4grams


In [133]:
## spell skippy2gram
import ngrams_skippy
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams_skippy)
#
if term_class == 'spell':
    sp_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy2grams):
            g.extend(list(df['sp_1gram'])[i])
    #
    df['sp_skippy2gram'] = sp_skippy2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy2gram'] = sp_skippy2grams


In [134]:
## spell skippy3gram
import ngrams_skippy
if n_for_ngram > 2 and term_class == 'spell':
    sp_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy3grams):
            g.extend(list(df['sp_skippy2gram'])[i])
    #
    df['sp_skippy3gram'] = sp_skippy3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy3gram'] = sp_skippy3grams


In [135]:
## spell skippy4gram
import ngrams_skippy
if n_for_ngram > 3 and term_class == 'spell':
    sp_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy4grams):
            g.extend(list(df['sp_skippy3gram'])[i])
    #
    df['sp_skippy4gram'] = sp_skippy4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy4gram'] = sp_skippy4grams


In [136]:
## sound 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)
#
if term_class == 'sound':
    sn_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_2grams):
            g.extend(list(df['sn_1gram'])[i])
    ## add sn_2gram
    df['sn_2gram'] = sn_2grams

In [137]:
## sound 3grams
import ngrams
if n_for_ngram > 2 and term_class == 'sound':
    sn_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_3gram
    df['sn_3gram'] = sn_3grams

In [138]:
## sound 4grams
import ngrams
if n_for_ngram > 3 and term_class == 'sound':
    sn_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_4gram
    df['sn_4gram'] = sn_3grams

In [139]:
## sound skippy2gram
import ngrams_skippy
if term_class == 'sound':
    sn_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy2grams):
            g.extend(list(df['sn_1gram'])[i])
    #
    df['sn_skippy2gram'] = sn_skippy2grams

In [140]:
## sound skippy3gram
import ngrams_skippy
if n_for_ngram > 2 and term_class == 'sound':
    sn_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy3grams):
            g.extend(list(df['sn_skippy2gram'])[i])
    #
    df['sn_skippy3gram'] = sn_skippy3grams

In [141]:
## sound skippy4gram
import ngrams_skippy
if n_for_ngram > 3 and term_class == 'sound':
    sn_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, check = False) for x in df['sn_1gram'] ]
    #
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy4grams):
            g.extend(list(df['sn_skippy3gram'])[i])
    #
    df['sn_skippy4gram'] = sn_skippy4grams

In [142]:
## check df
dropped_vars = [ 'sp_size', 'hyphen', 'period', 'sn_size' ]
if term_class == 'spell':
    extra = [ 'sn_1gram', 'sn_2gram', 'sn_3gram', 'sn_4gram',
             'sn_skippy2gram', 'sn_skippy3gram', 'sn_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]
else:
    extra = [ 'sp_1gram', 'sp_2gram', 'sp_3gram', 'sp_4gram',
             'sp_skippy2gram', 'sp_skippy3gram', 'sp_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]    
#
df[target_vars]

Unnamed: 0,spell,sound,sp_1gram,sp_2gram,sp_3gram,sp_4gram,sp_skippy2gram,sp_skippy3gram,sp_skippy4gram
430375,nevisca,neβiska,"[n, e, v, i, s, c, a]","[ne, ev, vi, is, sc, ca, n, e, v, i, s, c, a]","[nev, evi, vis, isc, sca, ne, ev, vi, is, sc, ...","[nevi, evis, visc, isca, nev, evi, vis, isc, s...","[ne, n_v, n_i, n_s, n_c, n_a, ev, e_i, e_s, e_...","[nev, ne_i, ne_s, ne_c, ne_a, n_vi, n_v_s, n_v...","[nevi, nev_s, nev_c, nev_a, ne_is, ne_i_c, ne_..."
207388,despeada,despeaða,"[d, e, s, p, e, a, d, a]","[de, es, sp, pe, ea, ad, da, d, e, s, p, e, a,...","[des, esp, spe, pea, ead, ada, de, es, sp, pe,...","[desp, espe, spea, pead, eada, des, esp, spe, ...","[de, d_s, d_p, d_e, d_a, d_d, es, e_p, e_e, e_...","[des, de_p, de_e, de_a, de_d, d_sp, d_s_e, d_s...","[desp, des_e, des_a, des_d, de_pe, de_p_a, de_..."
61056,aproarán,apɾoaˈɾan,"[a, p, r, o, a, r, á, n]","[ap, pr, ro, oa, ar, rá, án, a, p, r, o, a, r,...","[apr, pro, roa, oar, ará, rán, ap, pr, ro, oa,...","[apro, proa, roar, oará, arán, apr, pro, roa, ...","[ap, a_r, a_o, a_a, a_á, a_n, pr, p_o, p_a, p_...","[apr, ap_o, ap_a, ap_r, ap_á, ap_n, a_ro, a_r_...","[apro, apr_a, apr_r, apr_á, apr_n, ap_oa, ap_o..."
59012,apoplejías,apopleˈxias,"[a, p, o, p, l, e, j, í, a, s]","[ap, po, op, pl, le, ej, jí, ía, as, a, p, o, ...","[apo, pop, opl, ple, lej, ejí, jía, ías, ap, p...","[apop, popl, ople, plej, lejí, ejía, jías, apo...","[ap, a_o, a_p, a_l, a_e, a_j, a_í, a_a, a_s, p...","[apo, ap_p, ap_l, ap_e, ap_j, ap_í, ap_a, ap_s...","[apop, apo_l, apo_e, apo_j, apo_í, apo_a, apo_..."
96218,botillera,botiʎeɾa,"[b, o, t, i, l, l, e, r, a]","[bo, ot, ti, il, ll, le, er, ra, b, o, t, i, l...","[bot, oti, til, ill, lle, ler, era, bo, ot, ti...","[boti, otil, till, ille, ller, lera, bot, oti,...","[bo, b_t, b_i, b_l, b_e, b_r, b_a, ot, o_i, o_...","[bot, bo_i, bo_l, bo_e, bo_r, bo_a, b_ti, b_t_...","[boti, bot_l, bot_e, bot_r, bot_a, bo_il, bo_i..."
...,...,...,...,...,...,...,...,...,...
173062,descarté,deskaˈɾte,"[d, e, s, c, a, r, t, é]","[de, es, sc, ca, ar, rt, té, d, e, s, c, a, r,...","[des, esc, sca, car, art, rté, de, es, sc, ca,...","[desc, esca, scar, cart, arté, des, esc, sca, ...","[de, d_s, d_c, d_a, d_r, d_t, d_é, es, e_c, e_...","[des, de_c, de_a, de_r, de_t, de_é, d_sc, d_s_...","[desc, des_a, des_r, des_t, des_é, de_ca, de_c..."
223756,dilusivos,dilusiβos,"[d, i, l, u, s, i, v, o, s]","[di, il, lu, us, si, iv, vo, os, d, i, l, u, s...","[dil, ilu, lus, usi, siv, ivo, vos, di, il, lu...","[dilu, ilus, lusi, usiv, sivo, ivos, dil, ilu,...","[di, d_l, d_u, d_s, d_i, d_v, d_o, il, i_u, i_...","[dil, di_u, di_s, di_i, di_v, di_o, d_lu, d_l_...","[dilu, dil_s, dil_i, dil_v, dil_o, di_us, di_u..."
277696,enmatemos,emmatemos,"[e, n, m, a, t, e, m, o, s]","[en, nm, ma, at, te, em, mo, os, e, n, m, a, t...","[enm, nma, mat, ate, tem, emo, mos, en, nm, ma...","[enma, nmat, mate, atem, temo, emos, enm, nma,...","[en, e_m, e_a, e_t, e_e, e_o, e_s, nm, n_a, n_...","[enm, en_a, en_t, en_e, en_m, en_o, en_s, e_ma...","[enma, enm_t, enm_e, enm_m, enm_o, enm_s, en_a..."
558350,taraceares,taɾaθeaɾes,"[t, a, r, a, c, e, a, r, e, s]","[ta, ar, ra, ac, ce, ea, ar, re, es, t, a, r, ...","[tar, ara, rac, ace, cea, ear, are, res, ta, a...","[tara, arac, race, acea, cear, eare, ares, tar...","[ta, t_r, t_a, t_c, t_e, t_s, ar, a_a, a_c, a_...","[tar, ta_a, ta_c, ta_e, ta_r, ta_s, t_ra, t_r_...","[tara, tar_c, tar_e, tar_a, tar_r, tar_s, ta_a..."


In [143]:
## select data type and define doc_dict
import random
if "sp_" in term_type:
    base_type = "spell"
else:
    base_type = "sound"
doc_dict = { i: x for i, x in enumerate(df[base_type]) }
## check
random.sample(doc_dict.items(), 10)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(doc_dict.items(), 10)


[(2259, 'baladren'),
 (6335, 'atortujas'),
 (1497, 'augurasen'),
 (14081, 'ejercían'),
 (6916, 'saqueríos'),
 (9935, 'irradiaste'),
 (4744, 'hablaremos'),
 (2895, 'dislocaban'),
 (3586, 'arraigara'),
 (13649, 'enrubies')]

In [144]:
## select bots for analysis
enable_term_change = False # if you want to change term_type to save time and energy
if enable_term_change:
	term_type = 'sp_skippy4gram'
print(f"(changed) term_type: {term_type}")

## bot stands for 'bag-of-terms', a generalization of 'bag-of-words'
bots = [ x for x in df[term_type] if len(x) > 1 ] # Crucially
import random
random.sample(bots, 3)

(changed) term_type: sp_skippy4gram


[['angu',
  'ang_l',
  'ang_o',
  'ang_s',
  'ang_a',
  'an_ul',
  'an_u_o',
  'an_u_s',
  'an_u_a',
  'an_lo',
  'an_l_s',
  'an_l_a',
  'an_os',
  'an_o_a',
  'an_sa',
  'a_g_ul',
  'a_gu_o',
  'a_gu_s',
  'a_gu_a',
  'a_g_lo',
  'a_g_l_s',
  'a_g_l_a',
  'a_g_os',
  'a_g_o_a',
  'a_g_sa',
  'a_u_lo',
  'a_ul_s',
  'a_ul_a',
  'a_u_os',
  'a_u_o_a',
  'a_u_sa',
  'a_l_os',
  'a_lo_a',
  'a_l_sa',
  'a_o_sa',
  'ngul',
  'ngu_o',
  'ngu_s',
  'ngu_a',
  'ng_lo',
  'ng_l_s',
  'ng_l_a',
  'ng_os',
  'ng_o_a',
  'ng_sa',
  'n_u_lo',
  'n_ul_s',
  'n_ul_a',
  'n_u_os',
  'n_u_o_a',
  'n_u_sa',
  'n_l_os',
  'n_lo_a',
  'n_l_sa',
  'n_o_sa',
  'gulo',
  'gul_s',
  'gul_a',
  'gu_os',
  'gu_o_a',
  'gu_sa',
  'g_l_os',
  'g_lo_a',
  'g_l_sa',
  'g_o_sa',
  'ulos',
  'ulo_a',
  'ul_sa',
  'u_o_sa',
  'losa',
  'ang',
  'an_u',
  'an_l',
  'an_o',
  'an_s',
  'an_a',
  'a_gu',
  'a_g_l',
  'a_g_o',
  'a_g_s',
  'a_g_a',
  'a_ul',
  'a_u_o',
  'a_u_s',
  'a_u_a',
  'a_lo',
  'a_l_s',
  'a_l_a

In [145]:
## generate dictionary
from gensim.corpora import Dictionary
diction = Dictionary(bots)
print(diction)

if apply_term_filtering:
    print(f"term filtering applied")
    diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
else:
    print(f"term filtering not applied")
print(diction)

## generate DTM
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) > min_bot_size ] # Crucially

Dictionary<328471 unique tokens: ['a', 'c', 'ca', 'e', 'e_a']...>
term filtering applied
Dictionary<100000 unique tokens: ['e_ca', 'e_i_c', 'e_i_ca', 'e_i_sc', 'e_is_a']...>


In [146]:
## HDP (n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [147]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{term_type}.html"
	pyLDAvis.save_html(vis_data, output)

In [148]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

## investigate topics
n_docs_to_show  = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    #topic_encoding = ", ".join(hdp.show_topic(topic_id))
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

In [None]:
## HDP (n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [None]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{term_type}.html"
	pyLDAvis.save_html(vis_data, output)

In [None]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: {len(probs.nonzero()[0])}")
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.0 * éc + 0.0 * é_es + 0.0 * dé_r + 0.0 * o_z + 0.0 * d_l + 0.0 * i_i_e + 0.0 * o_ez
nonzero count: 9959
	0.9984: découlâtes
	0.9983: déplantiez
	0.9983: décapotées
	0.9983: spéculeras
	0.9983: décentrais
	0.9983: compostiez
	0.9983: décintrais
	0.9983: décornâtes
	0.9983: déplierais
	0.9983: décapotais
topic_id 1: 0.001 * c_on + 0.001 * a_ns + 0.001 * l_o + 0.001 * a_o_ns + 0.001 * u_nt + 0.0 * u_on + 0.0 * s_on
nonzero count: 4192
	0.9971: boucherons
	0.9970: aboutirons
	0.9970: chaulerons
	0.9970: chauleront
	0.9970: abouchions
	0.9970: fracturons
	0.9970: croulerons
	0.9969: combleront
	0.9969: blaguèrent
	0.9969: brouteront
topic_id 2: 0.001 * ions + 0.001 * r_o_ns + 0.001 * i_o_ns + 0.001 * e_o_ns + 0.001 * eron + 0.001 * ero + 0.001 * d_on
nonzero count: 2199
	0.9967: évolutions
	0.9967: répandions
	0.9967: détaleront
	0.9967: dragueront
	0.9967: étalerions
	0.9967: dévoueront
	0.9966: prendrions
	0.9966: pivoterons
	0.9966: évoluerons
	0.9966: épilerions
topic_id 3

In [None]:
## HDP (n_topics = 15)
import gensim.models
import pyLDAvis.gensim

max_n_topics = 15
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [None]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{term_type}.html"
	pyLDAvis.save_html(vis_data, output)

In [None]:
## topic investigation
import numpy as np
import HDP_helper
reload_module = True
if reload_module:
    import importlib
    importlib.reload(HDP_helper)

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.0 * r_o_ns + 0.0 * a_ns + 0.0 * a_o_ns + 0.0 * e_o_ns + 0.0 * ions + 0.0 * r_i_n + 0.0 * i_o_ns
nonzero count:  6361
	0.9975: détaleront
	0.9975: évolutions
	0.9975: défigurais
	0.9975: répandions
	0.9975: dévouaient
	0.9975: pondraient
	0.9975: aboutirons
	0.9975: plongeants
	0.9975: députerons
	0.9975: étalerions
topic_id 1: 0.001 * t_ra + 0.0 * l_ra + 0.0 * e_ra + 0.0 * u_ra + 0.0 * a_r_i + 0.0 * t_ai + 0.0 * r_e_ra
nonzero count:  5703
	0.9973: compileras
	0.9973: spéculeras
	0.9973: empilerais
	0.9973: postuleras
	0.9973: épouserait
	0.9973: combineras
	0.9973: rempilerai
	0.9973: comblerait
	0.9972: croupirait
	0.9972: enculerait
topic_id 2: 0.001 * q_e + 0.0 * l_u + 0.0 * a_d + 0.0 * que + 0.0 * h_r + 0.0 * eu + 0.0 * h_i
nonzero count:  4842
	0.9973: décentrais
	0.9972: cloquerais
	0.9972: déplierais
	0.9972: cliquerais
	0.9972: démangeais
	0.9972: éplucherai
	0.9972: enticheras
	0.9972: raplatirez
	0.9972: échangeais
	0.9971: élancerais
topic_id 3: 0.0 * mes + 0.