In [383]:
#!pip install -U pandas

In [384]:
#!pip install -U pyLDAvis

In [385]:
## imports
import os, sys
import pprint as pp

In [386]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [387]:
## target language
## a key must be part of a file name 
target_lang_dict = {    'ar'    : 'Arabic',
                        'de'    : 'German',
                        'de_N_only' : 'German Nouns',
                        'de_non_N_only' : 'German Non-nouns',
                        'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'en_N_only' : 'English noun (WN)',
                        'en_V_only' : 'English verb (WN)',
                        'en_A_only' : 'English adj (WN)',
                        'en_R_only' : 'English adv (WN)',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
target_lang_keys = [    'ar', 'de', 'de_N_only', 'de_non_N_only',
                        'en_US', 'en_UK', 'en_N_only', 'en_V_only', 'en_A_only', 'en_R_only',
                        'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC',
                        'is', 'nl', 'ro', 'sw' ]
#
target_lang_key  = target_lang_keys[8]
print(f"target lang: {target_lang_dict[target_lang_key]} ({target_lang_key})")

target lang: English adj (WN) (en_A_only)


In [388]:
## term settings
term_classes       = [ 'spell', 'sound' ]
term_class         = term_classes[0]
ngram_is_inclusive = True
gap_mark           = "…"
term_is_skippy     = True
n_for_ngram        = 4
print(f"term_class: {term_class}")
print(f"term_is_skippy: {term_is_skippy}")
print(f"n_for_ngram: {n_for_ngram}")
## define term_type
if term_class == 'spell':
    if term_is_skippy:
        term_type = f"sp_skippy{n_for_ngram}gram"
    else:
        term_type = f"sp_{n_for_ngram}gram"
else:
    if term_is_skippy:
        term_type = f"sn_skippy{n_for_ngram}gram"
    else:
        term_type = f"sn_{n_for_ngram}gram"
## check
print(f"term_type: {term_type}")
## doc settings
max_doc_size       = 10
min_doc_size       =  5

term_class: spell
term_is_skippy: True
n_for_ngram: 4
term_type: sp_skippy4gram


In [389]:
## LDA/HDP
apply_term_filtering = True
## The following parameters need to be relatively large to prevent "Row sum not equal 1" error
term_minfreq         = 3
abuse_threshold      = 0.03
min_bot_size         = 3

In [390]:
## sampling
source_sampling          = True
source_sampling_rate     = 0.5
source_sampling_max_size = 30000
second_sampling          = False
second_sampling_rate     = 0.7

In [391]:
## set target files
import glob
data_dir1     = "data/open-dict-ipa/data1/"
data_dir2     = "data/open-dict-ipa/data1a/"
data_dir3     = "data/wn3/"
target_files = glob.glob(f"{data_dir1}/*")
target_files2 = glob.glob(f"{data_dir2}/*")
target_files.extend(target_files2)
target_files3 = glob.glob(f"{data_dir3}/*")
target_files.extend(target_files3)
target_files = sorted([ file for file in target_files if ".csv" in file ])
pp.pprint(target_files)

['data/open-dict-ipa/data1/ar.csv.gz',
 'data/open-dict-ipa/data1/de.csv.gz',
 'data/open-dict-ipa/data1/en_UK.csv.gz',
 'data/open-dict-ipa/data1/en_US.csv.gz',
 'data/open-dict-ipa/data1/eo.csv.gz',
 'data/open-dict-ipa/data1/es_ES.csv.gz',
 'data/open-dict-ipa/data1/es_MX.csv.gz',
 'data/open-dict-ipa/data1/fa.csv.gz',
 'data/open-dict-ipa/data1/fi.csv.gz',
 'data/open-dict-ipa/data1/fr_FR.csv.gz',
 'data/open-dict-ipa/data1/fr_QC.csv.gz',
 'data/open-dict-ipa/data1/is.csv.gz',
 'data/open-dict-ipa/data1/ja.csv.gz',
 'data/open-dict-ipa/data1/jam.csv.gz',
 'data/open-dict-ipa/data1/ma.csv.gz',
 'data/open-dict-ipa/data1/nb.csv.gz',
 'data/open-dict-ipa/data1/nl.csv.gz',
 'data/open-dict-ipa/data1/or.csv.gz',
 'data/open-dict-ipa/data1/sv.csv.gz',
 'data/open-dict-ipa/data1/sw.csv.gz',
 'data/open-dict-ipa/data1/vi_C.csv.gz',
 'data/open-dict-ipa/data1/vi_N.csv.gz',
 'data/open-dict-ipa/data1/vi_S.csv.gz',
 'data/open-dict-ipa/data1/yue.csv.gz',
 'data/open-dict-ipa/data1/zh_hans.csv

In [392]:
## get source data from files
import pandas as pd
import gzip

#target_language_key = "en_US" # can be changed to get other languages
file = [ f for f in target_files if target_lang_key in f ][0]
print(f"processing: {file}")

if file.endswith(".gz"):
    with gzip.open(file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['spell', 'sound'])
else:
    with open(file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['spell', 'sound'])
try:
    sounds = raw_df['sound'].apply(lambda x: x.strip('/') )
    sounds = [ x.split("/,")[0] for x in sounds ] # picks up only the first of multiple entries
    raw_df['sound'] = sounds
except AttributeError:
    pass
#
raw_df.sample(10)

processing: data/wn3/en_A_only.csv


Unnamed: 0,spell,sound
10528,varying,
587,anecdotic,
6669,normative,
7161,perennial,
1640,bosky,
10416,untucked,
8269,scabrous,
3602,elfin,
10632,volant,
10764,workmanlike,


In [393]:
## source sampling
len(raw_df)
if source_sampling:
	print(f"source sampling applied")
	if len(raw_df) >= source_sampling_max_size:
		raw_df = raw_df.sample(source_sampling_max_size)
	else:
		raw_df = raw_df.sample(round(len(raw_df) * source_sampling_rate))
print(raw_df)

source sampling applied
              spell  sound
8261        saurian    NaN
10776         woven    NaN
10611        virile    NaN
7850       ratified    NaN
2604   contaminated    NaN
...             ...    ...
771    apocynaceous    NaN
7002        paneled    NaN
1385           best    NaN
6990     palmatifid    NaN
986         assumed    NaN

[5415 rows x 2 columns]


In [394]:
## generate 1-grams for spell and sound
## spell
raw_df['sp_1gram'] = raw_df['spell'].apply(lambda x: list(str(x)))
# add column of size
raw_df['sp_size'] = raw_df['sp_1gram'].apply(lambda x: len(x))
# add column of count of '-' inside
raw_df['hyphen'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("-"))
# add column of count of '.' inside
raw_df['period'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("."))
## sound
# takes the first entry, removes '/' around
try:
    raw_df['sn_1gram'] = raw_df['sound'].apply(lambda x: list(x) )
except TypeError:
    pass
# add column of size
try:
    raw_df['sn_size'] = raw_df['sn_1gram'].apply(lambda x: len(x))
except KeyError:
    pass
## check
raw_df

Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period
8261,saurian,,"[s, a, u, r, i, a, n]",7,0,0
10776,woven,,"[w, o, v, e, n]",5,0,0
10611,virile,,"[v, i, r, i, l, e]",6,0,0
7850,ratified,,"[r, a, t, i, f, i, e, d]",8,0,0
2604,contaminated,,"[c, o, n, t, a, m, i, n, a, t, e, d]",12,0,0
...,...,...,...,...,...,...
771,apocynaceous,,"[a, p, o, c, y, n, a, c, e, o, u, s]",12,0,0
7002,paneled,,"[p, a, n, e, l, e, d]",7,0,0
1385,best,,"[b, e, s, t]",4,0,0
6990,palmatifid,,"[p, a, l, m, a, t, i, f, i, d]",10,0,0


In [395]:
## filtering raw_data by size
print(f"term_type: {term_type}")
if "sp_" in term_type:
    df_filtered = raw_df[ (raw_df['sp_size'] <= max_doc_size) & (raw_df['sp_size'] >= min_doc_size) & (raw_df['hyphen'] == 0) & (raw_df['period'] == 0) ]
else:
    df_filtered = raw_df[ (raw_df['sn_size'] <= max_doc_size) & (raw_df['sn_size'] >= min_doc_size) ]
#
df_filtered

term_type: sp_skippy4gram


Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period
8261,saurian,,"[s, a, u, r, i, a, n]",7,0,0
10776,woven,,"[w, o, v, e, n]",5,0,0
10611,virile,,"[v, i, r, i, l, e]",6,0,0
7850,ratified,,"[r, a, t, i, f, i, e, d]",8,0,0
6333,mysophobic,,"[m, y, s, o, p, h, o, b, i, c]",10,0,0
...,...,...,...,...,...,...
1315,beatified,,"[b, e, a, t, i, f, i, e, d]",9,0,0
4701,holophytic,,"[h, o, l, o, p, h, y, t, i, c]",10,0,0
7002,paneled,,"[p, a, n, e, l, e, d]",7,0,0
6990,palmatifid,,"[p, a, l, m, a, t, i, f, i, d]",10,0,0


In [396]:
## define df after second sampling if any
len(df_filtered)
if second_sampling:
    df = df_filtered.sample(round(len(df_filtered) * second_sampling_rate))
else:
    df = df_filtered
len(df)

3838

In [397]:
## spell 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)

if term_class == 'spell':
    sp_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_2grams):
            g.extend(list(df['sp_1gram'])[i])
    ## add sp_2gram
    df['sp_2gram'] = sp_2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_2gram'] = sp_2grams


In [398]:
## spell 3grams
import ngrams
if n_for_ngram > 2 and term_class == 'spell':
    sp_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_3grams):
            g.extend(list(df['sp_2gram'])[i])
    ## add sp_2gram
    df['sp_3gram'] = sp_3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_3gram'] = sp_3grams


In [399]:
## spell 4grams
import ngrams
if n_for_ngram > 3 and term_class == 'spell':
    sp_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_4grams):
            g.extend(list(df['sp_3gram'])[i])
    ## add sp_2gram
    df['sp_4gram'] = sp_4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_4gram'] = sp_4grams


In [400]:
## spell skippy2gram
import ngrams_skippy
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams_skippy)
#
if term_class == 'spell':
    sp_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy2grams):
            g.extend(list(df['sp_1gram'])[i])
    #
    df['sp_skippy2gram'] = sp_skippy2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy2gram'] = sp_skippy2grams


In [401]:
## spell skippy3gram
import ngrams_skippy
if n_for_ngram > 2 and term_class == 'spell':
    sp_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy3grams):
            g.extend(list(df['sp_skippy2gram'])[i])
    #
    df['sp_skippy3gram'] = sp_skippy3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy3gram'] = sp_skippy3grams


In [402]:
## spell skippy4gram
import ngrams_skippy
if n_for_ngram > 3 and term_class == 'spell':
    sp_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy4grams):
            g.extend(list(df['sp_skippy3gram'])[i])
    #
    df['sp_skippy4gram'] = sp_skippy4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy4gram'] = sp_skippy4grams


In [403]:
## sound 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)
#
if term_class == 'sound':
    sn_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_2grams):
            g.extend(list(df['sn_1gram'])[i])
    ## add sn_2gram
    df['sn_2gram'] = sn_2grams

In [404]:
## sound 3grams
import ngrams
if n_for_ngram > 2 and term_class == 'sound':
    sn_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_3gram
    df['sn_3gram'] = sn_3grams

In [405]:
## sound 4grams
import ngrams
if n_for_ngram > 3 and term_class == 'sound':
    sn_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_4gram
    df['sn_4gram'] = sn_3grams

In [406]:
## sound skippy2gram
import ngrams_skippy
if term_class == 'sound':
    sn_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy2grams):
            g.extend(list(df['sn_1gram'])[i])
    #
    df['sn_skippy2gram'] = sn_skippy2grams

In [407]:
## sound skippy3gram
import ngrams_skippy
if n_for_ngram > 2 and term_class == 'sound':
    sn_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy3grams):
            g.extend(list(df['sn_skippy2gram'])[i])
    #
    df['sn_skippy3gram'] = sn_skippy3grams

In [408]:
## sound skippy4gram
import ngrams_skippy
if n_for_ngram > 3 and term_class == 'sound':
    sn_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, check = False) for x in df['sn_1gram'] ]
    #
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy4grams):
            g.extend(list(df['sn_skippy3gram'])[i])
    #
    df['sn_skippy4gram'] = sn_skippy4grams

In [409]:
## check df
dropped_vars = [ 'sp_size', 'hyphen', 'period', 'sn_size' ]
if term_class == 'spell':
    extra = [ 'sn_1gram', 'sn_2gram', 'sn_3gram', 'sn_4gram',
             'sn_skippy2gram', 'sn_skippy3gram', 'sn_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]
else:
    extra = [ 'sp_1gram', 'sp_2gram', 'sp_3gram', 'sp_4gram',
             'sp_skippy2gram', 'sp_skippy3gram', 'sp_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]    
#
df[target_vars]

Unnamed: 0,spell,sound,sp_1gram,sp_2gram,sp_3gram,sp_4gram,sp_skippy2gram,sp_skippy3gram,sp_skippy4gram
8261,saurian,,"[s, a, u, r, i, a, n]","[sa, au, ur, ri, ia, an, s, a, u, r, i, a, n]","[sau, aur, uri, ria, ian, sa, au, ur, ri, ia, ...","[saur, auri, uria, rian, sau, aur, uri, ria, i...","[sa, s_u, s_r, s_i, s_a, s_n, au, a_r, a_i, a_...","[sau, sa_r, sa_i, sa_a, sa_n, s_ur, s_u_i, s_u...","[saur, sau_i, sau_a, sau_n, sa_ri, sa_r_a, sa_..."
10776,woven,,"[w, o, v, e, n]","[wo, ov, ve, en, w, o, v, e, n]","[wov, ove, ven, wo, ov, ve, en, w, o, v, e, n]","[wove, oven, wov, ove, ven, wo, ov, ve, en, w,...","[wo, w_v, w_e, w_n, ov, o_e, o_n, ve, v_n, en,...","[wov, wo_e, wo_n, w_ve, w_v_n, w_en, ove, ov_n...","[wove, wov_n, wo_en, w_v_en, oven, wov, wo_e, ..."
10611,virile,,"[v, i, r, i, l, e]","[vi, ir, ri, il, le, v, i, r, i, l, e]","[vir, iri, ril, ile, vi, ir, ri, il, le, v, i,...","[viri, iril, rile, vir, iri, ril, ile, vi, ir,...","[vi, v_r, v_i, v_l, v_e, ir, i_i, i_l, i_e, ri...","[vir, vi_i, vi_l, vi_e, v_ri, v_r_l, v_r_e, v_...","[viri, vir_l, vir_e, vi_il, vi_i_e, vi_le, v_r..."
7850,ratified,,"[r, a, t, i, f, i, e, d]","[ra, at, ti, if, fi, ie, ed, r, a, t, i, f, i,...","[rat, ati, tif, ifi, fie, ied, ra, at, ti, if,...","[rati, atif, tifi, ifie, fied, rat, ati, tif, ...","[ra, r_t, r_i, r_f, r_e, r_d, at, a_i, a_f, a_...","[rat, ra_i, ra_f, ra_e, ra_d, r_ti, r_t_f, r_t...","[rati, rat_f, rat_i, rat_e, rat_d, ra_if, ra_i..."
6333,mysophobic,,"[m, y, s, o, p, h, o, b, i, c]","[my, ys, so, op, ph, ho, ob, bi, ic, m, y, s, ...","[mys, yso, sop, oph, pho, hob, obi, bic, my, y...","[myso, ysop, soph, opho, phob, hobi, obic, mys...","[my, m_s, m_o, m_p, m_h, m_b, m_i, m_c, ys, y_...","[mys, my_o, my_p, my_h, my_b, my_i, my_c, m_so...","[myso, mys_p, mys_h, mys_o, mys_b, mys_i, mys_..."
...,...,...,...,...,...,...,...,...,...
1315,beatified,,"[b, e, a, t, i, f, i, e, d]","[be, ea, at, ti, if, fi, ie, ed, b, e, a, t, i...","[bea, eat, ati, tif, ifi, fie, ied, be, ea, at...","[beat, eati, atif, tifi, ifie, fied, bea, eat,...","[be, b_a, b_t, b_i, b_f, b_e, b_d, ea, e_t, e_...","[bea, be_t, be_i, be_f, be_e, be_d, b_at, b_a_...","[beat, bea_i, bea_f, bea_e, bea_d, be_ti, be_t..."
4701,holophytic,,"[h, o, l, o, p, h, y, t, i, c]","[ho, ol, lo, op, ph, hy, yt, ti, ic, h, o, l, ...","[hol, olo, lop, oph, phy, hyt, yti, tic, ho, o...","[holo, olop, loph, ophy, phyt, hyti, ytic, hol...","[ho, h_l, h_o, h_p, h_h, h_y, h_t, h_i, h_c, o...","[hol, ho_o, ho_p, ho_h, ho_y, ho_t, ho_i, ho_c...","[holo, hol_p, hol_h, hol_y, hol_t, hol_i, hol_..."
7002,paneled,,"[p, a, n, e, l, e, d]","[pa, an, ne, el, le, ed, p, a, n, e, l, e, d]","[pan, ane, nel, ele, led, pa, an, ne, el, le, ...","[pane, anel, nele, eled, pan, ane, nel, ele, l...","[pa, p_n, p_e, p_l, p_d, an, a_e, a_l, a_d, ne...","[pan, pa_e, pa_l, pa_d, p_ne, p_n_l, p_n_e, p_...","[pane, pan_l, pan_e, pan_d, pa_el, pa_e_e, pa_..."
6990,palmatifid,,"[p, a, l, m, a, t, i, f, i, d]","[pa, al, lm, ma, at, ti, if, fi, id, p, a, l, ...","[pal, alm, lma, mat, ati, tif, ifi, fid, pa, a...","[palm, alma, lmat, mati, atif, tifi, ifid, pal...","[pa, p_l, p_m, p_a, p_t, p_i, p_f, p_d, al, a_...","[pal, pa_m, pa_a, pa_t, pa_i, pa_f, pa_d, p_lm...","[palm, pal_a, pal_t, pal_i, pal_f, pal_d, pa_m..."


In [410]:
## select data type and define doc_dict
import random
if "sp_" in term_type:
    base_type = "spell"
else:
    base_type = "sound"
doc_dict = { i: x for i, x in enumerate(df[base_type]) }
## check
random.sample(doc_dict.items(), 10)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(doc_dict.items(), 10)


[(1528, 'resistant'),
 (2152, 'unclean'),
 (3091, 'flatulent'),
 (3148, 'floating'),
 (1964, 'running'),
 (3831, 'rightful'),
 (938, 'combinable'),
 (3614, 'offish'),
 (2157, 'disclosed'),
 (3354, 'implicated')]

In [411]:
## select bots for analysis
enable_term_change = False # if you want to change term_type to save time and energy
if enable_term_change:
	term_type = 'sp_skippy4gram'
print(f"(changed) term_type: {term_type}")

## bot stands for 'bag-of-terms', a generalization of 'bag-of-words'
bots = [ x for x in df[term_type] if len(x) > min_bot_size ] # Crucially
import random
random.sample(bots, 3)

(changed) term_type: sp_skippy4gram


[['cros',
  'cro_s',
  'cro_e',
  'cro_d',
  'cr_ss',
  'cr_s_e',
  'cr_s_d',
  'cr_se',
  'cr_ed',
  'c_o_ss',
  'c_os_e',
  'c_os_d',
  'c_o_se',
  'c_o_s_d',
  'c_o_ed',
  'c_s_se',
  'c_ss_d',
  'c_s_ed',
  'ross',
  'ros_e',
  'ros_d',
  'ro_se',
  'ro_s_d',
  'ro_ed',
  'r_s_se',
  'r_ss_d',
  'r_s_ed',
  'osse',
  'oss_d',
  'os_ed',
  'o_s_ed',
  'ssed',
  'cro',
  'cr_s',
  'cr_e',
  'cr_d',
  'c_os',
  'c_o_s',
  'c_o_e',
  'c_o_d',
  'c_ss',
  'c_s_e',
  'c_s_d',
  'c_se',
  'c_ed',
  'ros',
  'ro_s',
  'ro_e',
  'ro_d',
  'r_ss',
  'r_s_e',
  'r_s_d',
  'r_se',
  'r_ed',
  'oss',
  'os_e',
  'os_d',
  'o_se',
  'o_s_d',
  'o_ed',
  'sse',
  'ss_d',
  's_ed',
  'sed',
  'cr',
  'c_o',
  'c_s',
  'c_e',
  'c_d',
  'ro',
  'r_s',
  'r_e',
  'r_d',
  'os',
  'o_s',
  'o_e',
  'o_d',
  'ss',
  's_e',
  's_d',
  'se',
  'ed',
  'c',
  'r',
  'o',
  's',
  's',
  'e',
  'd'],
 ['down',
  'dow_s',
  'dow_t',
  'dow_a',
  'dow_i',
  'dow_r',
  'do_ns',
  'do_n_t',
  'do_n_a',
  'do_

In [412]:
## generate dictionary
from gensim.corpora import Dictionary
diction = Dictionary(bots)
print(diction)

if apply_term_filtering:
    print(f"term filtering applied")
    diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
else:
    print(f"term filtering not applied")
print(diction)

## generate DTM
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) > min_bot_size ] # Crucially

Dictionary(180745 unique tokens: ['a', 'a_a', 'a_an', 'a_i', 'a_i_an']...)
term filtering applied
Dictionary(47353 unique tokens: ['a_i_an', 'a_i_n', 'a_ia', 'a_r_a', 'a_r_an']...)


In [413]:
## HDP (n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [414]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{term_type}.html"
	pyLDAvis.save_html(vis_data, output)

In [415]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

## investigate topics
n_docs_to_show  = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    #topic_encoding = ", ".join(hdp.show_topic(topic_id))
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * e_t_l + 0.001 * t_le + 0.001 * a_t_l + 0.001 * en_l + 0.0 * n_i_l + 0.0 * e_b + 0.0 * t_bl
nonzero count:  262
	0.9966: terminable
	0.9966: invertible
	0.9964: penetrable
	0.9963: tangential
	0.9963: prudential
	0.9963: intangible
	0.9962: sentential
	0.9962: negotiable
	0.9962: revertible
	0.9961: unbeatable
topic_id 1: 0.0 * p_al + 0.0 * a_p + 0.0 * y_i + 0.0 * s_ic + 0.0 * a_o_i + 0.0 * c_ic + 0.0 * a_a_i
nonzero count:  266
	0.9965: candescent
	0.9965: chimerical
	0.9962: anachronic
	0.9961: decreasing
	0.9961: caulescent
	0.9960: cheliceral
	0.9960: aldermanic
	0.9959: pleonastic
	0.9959: incidental
	0.9959: depressing
topic_id 2: 0.0 * a_nt + 0.0 * an_e + 0.0 * n_le + 0.0 * a_e_t + 0.0 * co_e + 0.0 * z + 0.0 * u_i_d
nonzero count:  277
	0.9965: unsociable
	0.9964: untraveled
	0.9964: uncombined
	0.9963: combinable
	0.9962: parturient
	0.9961: lubricated
	0.9961: adulterate
	0.9961: unsilenced
	0.9961: ambivalent
	0.9961: surmounted
topic_id 3: 0.001 * le_s + 0

In [416]:
## HDP (n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [417]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{term_type}.html"
	pyLDAvis.save_html(vis_data, output)

In [418]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: {len(probs.nonzero()[0])}")
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.0 * le_s + 0.0 * l_ss + 0.0 * less + 0.0 * les + 0.0 * ea_e + 0.0 * a_e_e + 0.0 * e_es
nonzero count: 746
	0.9969: changeable
	0.9968: chargeable
	0.9968: relational
	0.9966: formulated
	0.9966: bimestrial
	0.9965: thunderous
	0.9965: histrionic
	0.9964: inarguable
	0.9964: decreasing
	0.9964: cautionary
topic_id 1: 0.0 * pr + 0.0 * es_e + 0.0 * un_i + 0.0 * n_e_e + 0.0 * e_v + 0.0 * pr_e + 0.0 * p_e_e
nonzero count: 327
	0.9965: unoriented
	0.9965: unreserved
	0.9965: undesigned
	0.9964: undeserved
	0.9963: intangible
	0.9962: sentential
	0.9962: parturient
	0.9962: plundering
	0.9961: propertied
	0.9961: connatural
topic_id 2: 0.0 * e_an + 0.0 * r_an + 0.0 * s_an + 0.0 * m_ic + 0.0 * a_i_n + 0.0 * so + 0.0 * i_in
nonzero count: 324
	0.9965: candescent
	0.9964: mechanical
	0.9964: penetrable
	0.9962: implacable
	0.9962: duplicable
	0.9961: catalectic
	0.9961: cataleptic
	0.9960: eviscerate
	0.9960: ammoniated
	0.9960: pleonastic
topic_id 3: 0.0 * c_le + 0.0 * e_t_l + 0.0

In [419]:
## HDP (n_topics = 15)
import gensim.models
import pyLDAvis.gensim

max_n_topics = 15
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [420]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{term_type}.html"
	pyLDAvis.save_html(vis_data, output)

In [421]:
## topic investigation
import numpy as np
import HDP_helper
reload_module = True
if reload_module:
    import importlib
    importlib.reload(HDP_helper)

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.0 * a_a_i + 0.0 * ct + 0.0 * ati + 0.0 * e_e_t + 0.0 * l_ic + 0.0 * a_at + 0.0 * i_i_e
nonzero count:  1596
	0.9970: indelicate
	0.9970: implicated
	0.9969: separative
	0.9969: deflective
	0.9969: unimpaired
	0.9969: operculate
	0.9969: adjunctive
	0.9968: delineated
	0.9968: disjointed
	0.9968: regulative
topic_id 1: 0.0 * e_ss + 0.0 * le_s + 0.0 * e_s_e + 0.0 * les + 0.0 * less + 0.0 * e_e_ss + 0.0 * l_ss
nonzero count:  930
	0.9969: terminable
	0.9967: actionable
	0.9967: reportable
	0.9966: unilateral
	0.9966: intangible
	0.9966: measurable
	0.9966: impressive
	0.9964: consolable
	0.9964: consistent
	0.9964: comparable
topic_id 2: 0.0 * s_ic + 0.0 * p_t_c + 0.0 * p_o_i + 0.0 * h_ic + 0.0 * a_a_i + 0.0 * i_ti + 0.0 * a_o_i
nonzero count:  789
	0.9967: contracted
	0.9966: antimonial
	0.9966: antisocial
	0.9965: estrogenic
	0.9965: scholastic
	0.9963: tramontane
	0.9962: prosthetic
	0.9962: antiseptic
	0.9961: controlled
	0.9961: pentatonic
topic_id 3: 0.0 * r_us + 0.0 *