In [1]:
#%pip install -U pandas

In [2]:
#%pip install -U pyLDAvis

In [3]:
## imports
import os, sys
import pprint as pp

In [4]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [5]:
## variables

## sampling
source_sampling      = True
source_sampling_rate = 0.2
## doc settings
max_doc_size = 10
min_doc_size =  5
## term settings
ngram_is_inclusive = True
gap_mark   = "…"
term_is_skippy  = True
term_classes    = [ 'spell', 'sound' ]
term_class      = term_classes[0]
n_for_ngram     = 4
print(f"term_is_skippy: {term_is_skippy}")
print(f"term_class: {term_class}")
print(f"n_for_ngram: {n_for_ngram}")
## define term_type
if term_class == 'spell':
    if term_is_skippy:
        term_type = f"sp_skippy{n_for_ngram}gram"
    else:
        term_type = f"sp_{n_for_ngram}gram"
else:
    if term_is_skippy:
        term_type = f"sn_skippy{n_for_ngram}gram"
    else:
        term_type = f"sn_{n_for_ngram}gram"
## check
print(f"term_type: {term_type}")

term_is_skippy: True
term_class: spell
n_for_ngram: 4
term_type: sp_skippy4gram


In [6]:
## LDA/HDP
apply_term_filtering = True
## The following parameters need to be relatively large to prevent "Row sum not equal 1" error
term_minfreq         = 3
abuse_threshold      = 0.03
min_bot_size         = 3

In [7]:
## set target files
import glob
data_dir = "data/open-dict-ipa/data1/"
target_files = glob.glob(f"{data_dir}/*")
target_files = [ file for file in target_files if ".csv" in file ]
pp.pprint(target_files)

['data/open-dict-ipa/data1/nb.csv',
 'data/open-dict-ipa/data1/yue.csv',
 'data/open-dict-ipa/data1/or.csv',
 'data/open-dict-ipa/data1/fr_FR.csv',
 'data/open-dict-ipa/data1/es_ES.csv',
 'data/open-dict-ipa/data1/jam.csv',
 'data/open-dict-ipa/data1/is.csv',
 'data/open-dict-ipa/data1/vi_S.csv',
 'data/open-dict-ipa/data1/fi.csv',
 'data/open-dict-ipa/data1/vi_C.csv',
 'data/open-dict-ipa/data1/de.csv',
 'data/open-dict-ipa/data1/en_US.csv',
 'data/open-dict-ipa/data1/es_MX.csv',
 'data/open-dict-ipa/data1/eo.csv',
 'data/open-dict-ipa/data1/ja.csv',
 'data/open-dict-ipa/data1/en_UK.csv',
 'data/open-dict-ipa/data1/sv.csv',
 'data/open-dict-ipa/data1/sw.csv',
 'data/open-dict-ipa/data1/fa.csv',
 'data/open-dict-ipa/data1/vi_N.csv',
 'data/open-dict-ipa/data1/ar.csv',
 'data/open-dict-ipa/data1/zh_hans.csv',
 'data/open-dict-ipa/data1/zh_hant.csv',
 'data/open-dict-ipa/data1/ma.csv',
 'data/open-dict-ipa/data1/nl.csv',
 'data/open-dict-ipa/data1/fr_QC.csv']


In [8]:
## get data from files
import pandas as pd

target_key = "en_US" # can be changed to get other languages
file = [ f for f in target_files if target_key in f ][0]
print(f"processing: {file}")
with open(file, "rt") as f:
    raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['spell', 'sound'])
    sounds = raw_df['sound'].apply(lambda x: x.strip('/') )
    sounds = [ x.split("/,")[0] for x in sounds ] # picks up only the first of multiple entries
    raw_df['sound'] = sounds
# 
raw_df.sample(10)

processing: data/open-dict-ipa/data1/en_US.csv


Unnamed: 0,spell,sound
96122,roebuck's,ˈɹoʊˌbəks
113750,tingen,ˈtɪŋən
104989,snaggers,ˈsnæɡɝz
27074,davos,ˈdɑvoʊs
42520,gabriel's,ˈɡeɪbɹiəɫz
61812,kraeutler,ˈkɹaʊtɫɝ
81477,orchids,ˈɔɹkədz
35486,engross,ɪnˈɡɹoʊs
113628,tillery,ˈtɪɫɝi
88443,pragma,ˈpɹæɡmə


In [9]:
## generate 1-grams for spell and sound
## spell
raw_df['sp_1gram'] = raw_df['spell'].apply(lambda x: list(str(x)))
# add column of size
raw_df['sp_size'] = raw_df['sp_1gram'].apply(lambda x: len(x))
# add column of count of '-' inside
raw_df['hyphen'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("-"))
# add column of count of '.' inside
raw_df['period'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("."))
## sound
# takes the first entry, removes '/' around
raw_df['sn_1gram'] = raw_df['sound'].apply(lambda x: list(x) )
# add column of size
raw_df['sn_size'] = raw_df['sn_1gram'].apply(lambda x: len(x))
## check
raw_df

Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
0,'bout,ˈbaʊt,"[', b, o, u, t]",5,0,0,"[ˈ, b, a, ʊ, t]",5
1,'cause,kəz,"[', c, a, u, s, e]",6,0,0,"[k, ə, z]",3
2,'course,ˈkɔɹs,"[', c, o, u, r, s, e]",7,0,0,"[ˈ, k, ɔ, ɹ, s]",5
3,'cuse,ˈkjuz,"[', c, u, s, e]",5,0,0,"[ˈ, k, j, u, z]",5
4,'em,əm,"[', e, m]",3,0,0,"[ə, m]",2
...,...,...,...,...,...,...,...,...
125922,zysk,ˈzaɪsk,"[z, y, s, k]",4,0,0,"[ˈ, z, a, ɪ, s, k]",6
125923,zyskowski,zɪˈskɔfski,"[z, y, s, k, o, w, s, k, i]",9,0,0,"[z, ɪ, ˈ, s, k, ɔ, f, s, k, i]",10
125924,zyuganov,ˈzjuɡɑnɑv,"[z, y, u, g, a, n, o, v]",8,0,0,"[ˈ, z, j, u, ɡ, ɑ, n, ɑ, v]",9
125925,zyuganov's,ˈzjuɡɑnɑvz,"[z, y, u, g, a, n, o, v, ', s]",10,0,0,"[ˈ, z, j, u, ɡ, ɑ, n, ɑ, v, z]",10


In [10]:
## filtering raw_data by size
print(f"term_type: {term_type}")
if "sp_" in term_type:
    df_filtered = raw_df[ (raw_df['sp_size'] <= max_doc_size) & (raw_df['sp_size'] >= min_doc_size) & (raw_df['hyphen'] == 0) & (raw_df['period'] == 0) ]
else:
    df_filtered = raw_df[ (raw_df['sn_size'] <= max_doc_size) & (raw_df['sn_size'] >= min_doc_size) ]
#
df_filtered

term_type: sp_skippy4gram


Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
0,'bout,ˈbaʊt,"[', b, o, u, t]",5,0,0,"[ˈ, b, a, ʊ, t]",5
1,'cause,kəz,"[', c, a, u, s, e]",6,0,0,"[k, ə, z]",3
2,'course,ˈkɔɹs,"[', c, o, u, r, s, e]",7,0,0,"[ˈ, k, ɔ, ɹ, s]",5
3,'cuse,ˈkjuz,"[', c, u, s, e]",5,0,0,"[ˈ, k, j, u, z]",5
5,'frisco,ˈfɹɪskoʊ,"[', f, r, i, s, c, o]",7,0,0,"[ˈ, f, ɹ, ɪ, s, k, o, ʊ]",8
...,...,...,...,...,...,...,...,...
125921,zynda,ˈzɪndə,"[z, y, n, d, a]",5,0,0,"[ˈ, z, ɪ, n, d, ə]",6
125923,zyskowski,zɪˈskɔfski,"[z, y, s, k, o, w, s, k, i]",9,0,0,"[z, ɪ, ˈ, s, k, ɔ, f, s, k, i]",10
125924,zyuganov,ˈzjuɡɑnɑv,"[z, y, u, g, a, n, o, v]",8,0,0,"[ˈ, z, j, u, ɡ, ɑ, n, ɑ, v]",9
125925,zyuganov's,ˈzjuɡɑnɑvz,"[z, y, u, g, a, n, o, v, ', s]",10,0,0,"[ˈ, z, j, u, ɡ, ɑ, n, ɑ, v, z]",10


In [11]:
## establish df after sampling if any
len(df_filtered)
if source_sampling:
    df = df_filtered.sample(round(len(df_filtered) * source_sampling_rate))
else:
    df = df_filtered
len(df)

20844

In [12]:
## spell 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)
sp_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sp_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sp_2grams):
        g.extend(list(df['sp_1gram'])[i])
## add sp_2gram
df['sp_2gram'] = sp_2grams

In [13]:
## spell 3grams
import ngrams
sp_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sp_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sp_3grams):
        g.extend(list(df['sp_2gram'])[i])
## add sp_2gram
df['sp_3gram'] = sp_3grams

In [14]:
## spell 4grams
import ngrams
sp_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sp_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sp_4grams):
        g.extend(list(df['sp_3gram'])[i])
## add sp_2gram
df['sp_4gram'] = sp_4grams

In [15]:
## spell skippy2gram
import ngrams_skippy
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams_skippy)
#
sp_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x) for x in df['sp_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sp_skippy2grams):
        g.extend(list(df['sp_1gram'])[i])
#
df['sp_skippy2gram'] = sp_skippy2grams

In [16]:
## spell skippy3gram
import ngrams_skippy
sp_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x) for x in df['sp_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sp_skippy3grams):
        g.extend(list(df['sp_skippy2gram'])[i])
#
df['sp_skippy3gram'] = sp_skippy3grams

In [17]:
## spell skippy4gram
import ngrams_skippy
sp_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x) for x in df['sp_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sp_skippy4grams):
        g.extend(list(df['sp_skippy3gram'])[i])
#
df['sp_skippy4gram'] = sp_skippy4grams

In [18]:
## sound 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)
#
sn_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sn_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sn_2grams):
        g.extend(list(df['sn_1gram'])[i])
## add sn_2gram
df['sn_2gram'] = sn_2grams

In [19]:
## sound 3grams
import ngrams
sn_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sn_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sn_3grams):
        g.extend(list(df['sn_2gram'])[i])
## add sn_3gram
df['sn_3gram'] = sn_3grams

In [20]:
## sound 4grams
import ngrams
sn_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sn_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sn_3grams):
        g.extend(list(df['sn_2gram'])[i])
## add sn_4gram
df['sn_4gram'] = sn_3grams

In [21]:
## sound skippy2gram
import ngrams_skippy
sn_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x) for x in df['sn_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sn_skippy2grams):
        g.extend(list(df['sn_1gram'])[i])
#
df['sn_skippy2gram'] = sn_skippy2grams

In [22]:
## sound skippy3gram
import ngrams_skippy
sn_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x) for x in df['sn_1gram'] ]
if ngram_is_inclusive:
    for i, g in enumerate(sn_skippy3grams):
        g.extend(list(df['sn_skippy2gram'])[i])
#
df['sn_skippy3gram'] = sn_skippy3grams

In [23]:
## sound skippy4gram
import ngrams_skippy
sn_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x) for x in df['sn_1gram'] ]
#
if ngram_is_inclusive:
    for i, g in enumerate(sn_skippy4grams):
        g.extend(list(df['sn_skippy3gram'])[i])
#
df['sn_skippy4gram'] = sn_skippy4grams

In [24]:
## check df
dropped_vars = [ 'sp_size', 'hyphen', 'period', 'sn_size' ]
if "sp_" in term_type:
    extra = [ 'sn_1gram', 'sn_2gram', 'sn_3gram', 'sn_skippy2gram', 'sn_skippy3gram', 'sn_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]
else:
    extra = [ 'sp_1gram', 'sp_2gram', 'sp_3gram', 'sp_skippy2gram', 'sp_skippy3gram', 'sp_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]    
#
df[target_vars]

Unnamed: 0,spell,sound,sp_1gram,sp_2gram,sp_3gram,sp_4gram,sp_skippy2gram,sp_skippy3gram,sp_skippy4gram,sn_4gram
93606,relearning,ɹiˈɫɛɹnɪŋ,"[r, e, l, e, a, r, n, i, n, g]","[re, el, le, ea, ar, rn, ni, in, ng, r, e, l, ...","[rel, ele, lea, ear, arn, rni, nin, ing, re, e...","[rele, elea, lear, earn, arni, rnin, ning, rel...","[re, r_l, r_e, r_a, r_r, r_n, r_i, r_g, el, e_...","[rel, re_e, re_a, re_r, re_n, re_i, re_g, r_le...","[rele, rel_a, rel_r, rel_n, rel_i, rel_g, re_e...","[ɹiˈ, iˈɫ, ˈɫɛ, ɫɛɹ, ɛɹn, ɹnɪ, nɪŋ, ɹi, iˈ, ˈɫ..."
123179,wingett,ˈwɪŋɡɪt,"[w, i, n, g, e, t, t]","[wi, in, ng, ge, et, tt, w, i, n, g, e, t, t]","[win, ing, nge, get, ett, wi, in, ng, ge, et, ...","[wing, inge, nget, gett, win, ing, nge, get, e...","[wi, w_n, w_g, w_e, w_t, in, i_g, i_e, i_t, ng...","[win, wi_g, wi_e, wi_t, w_ng, w_n_e, w_n_t, w_...","[wing, win_e, win_t, wi_ge, wi_g_t, wi_et, wi_...","[ˈwɪ, wɪŋ, ɪŋɡ, ŋɡɪ, ɡɪt, ˈw, wɪ, ɪŋ, ŋɡ, ɡɪ, ..."
85655,pettet,ˈpɛtɪt,"[p, e, t, t, e, t]","[pe, et, tt, te, et, p, e, t, t, e, t]","[pet, ett, tte, tet, pe, et, tt, te, et, p, e,...","[pett, ette, ttet, pet, ett, tte, tet, pe, et,...","[pe, p_t, p_e, et, e_t, e_e, tt, t_e, t_t, te,...","[pet, pe_t, pe_e, p_tt, p_t_e, p_t_t, p_te, p_...","[pett, pet_e, pet_t, pe_te, pe_t_t, pe_et, p_t...","[ˈpɛ, pɛt, ɛtɪ, tɪt, ˈp, pɛ, ɛt, tɪ, ɪt, ˈ, p,..."
52484,honoured,ˈɑnɝd,"[h, o, n, o, u, r, e, d]","[ho, on, no, ou, ur, re, ed, h, o, n, o, u, r,...","[hon, ono, nou, our, ure, red, ho, on, no, ou,...","[hono, onou, nour, oure, ured, hon, ono, nou, ...","[ho, h_n, h_o, h_u, h_r, h_e, h_d, on, o_o, o_...","[hon, ho_o, ho_u, ho_r, ho_e, ho_d, h_no, h_n_...","[hono, hon_u, hon_r, hon_e, hon_d, ho_ou, ho_o...","[ˈɑn, ɑnɝ, nɝd, ˈɑ, ɑn, nɝ, ɝd, ˈ, ɑ, n, ɝ, d,..."
17790,carvey,ˈkɑɹvi,"[c, a, r, v, e, y]","[ca, ar, rv, ve, ey, c, a, r, v, e, y]","[car, arv, rve, vey, ca, ar, rv, ve, ey, c, a,...","[carv, arve, rvey, car, arv, rve, vey, ca, ar,...","[ca, c_r, c_v, c_e, c_y, ar, a_v, a_e, a_y, rv...","[car, ca_v, ca_e, ca_y, c_rv, c_r_e, c_r_y, c_...","[carv, car_e, car_y, ca_ve, ca_v_y, ca_ey, c_r...","[ˈkɑ, kɑɹ, ɑɹv, ɹvi, ˈk, kɑ, ɑɹ, ɹv, vi, ˈ, k,..."
...,...,...,...,...,...,...,...,...,...,...
69110,maniac,ˈmeɪniˌæk,"[m, a, n, i, a, c]","[ma, an, ni, ia, ac, m, a, n, i, a, c]","[man, ani, nia, iac, ma, an, ni, ia, ac, m, a,...","[mani, ania, niac, man, ani, nia, iac, ma, an,...","[ma, m_n, m_i, m_a, m_c, an, a_i, a_a, a_c, ni...","[man, ma_i, ma_a, ma_c, m_ni, m_n_a, m_n_c, m_...","[mani, man_a, man_c, ma_ia, ma_i_c, ma_ac, m_n...","[ˈme, meɪ, eɪn, ɪni, niˌ, iˌæ, ˌæk, ˈm, me, eɪ..."
38549,feedings,ˈfidɪŋz,"[f, e, e, d, i, n, g, s]","[fe, ee, ed, di, in, ng, gs, f, e, e, d, i, n,...","[fee, eed, edi, din, ing, ngs, fe, ee, ed, di,...","[feed, eedi, edin, ding, ings, fee, eed, edi, ...","[fe, f_e, f_d, f_i, f_n, f_g, f_s, ee, e_d, e_...","[fee, fe_d, fe_i, fe_n, fe_g, fe_s, f_ed, f_e_...","[feed, fee_i, fee_n, fee_g, fee_s, fe_di, fe_d...","[ˈfi, fid, idɪ, dɪŋ, ɪŋz, ˈf, fi, id, dɪ, ɪŋ, ..."
118253,valade,vɑˈɫɑdeɪ,"[v, a, l, a, d, e]","[va, al, la, ad, de, v, a, l, a, d, e]","[val, ala, lad, ade, va, al, la, ad, de, v, a,...","[vala, alad, lade, val, ala, lad, ade, va, al,...","[va, v_l, v_a, v_d, v_e, al, a_a, a_d, a_e, la...","[val, va_a, va_d, va_e, v_la, v_l_d, v_l_e, v_...","[vala, val_d, val_e, va_ad, va_a_e, va_de, v_l...","[vɑˈ, ɑˈɫ, ˈɫɑ, ɫɑd, ɑde, deɪ, vɑ, ɑˈ, ˈɫ, ɫɑ,..."
44647,giovannini,dʒoʊvɑˈnini,"[g, i, o, v, a, n, n, i, n, i]","[gi, io, ov, va, an, nn, ni, in, ni, g, i, o, ...","[gio, iov, ova, van, ann, nni, nin, ini, gi, i...","[giov, iova, ovan, vann, anni, nnin, nini, gio...","[gi, g_o, g_v, g_a, g_n, g_i, io, i_v, i_a, i_...","[gio, gi_v, gi_a, gi_n, gi_i, g_ov, g_o_a, g_o...","[giov, gio_a, gio_n, gio_i, gi_va, gi_v_n, gi_...","[dʒo, ʒoʊ, oʊv, ʊvɑ, vɑˈ, ɑˈn, ˈni, nin, ini, ..."


In [25]:
## select data type and define doc_dict
import random
if "sp_" in term_type:
    base_type = "spell"
else:
    base_type = "sound"
doc_dict = { i: x for i, x in enumerate(df[base_type]) }
## check
random.sample(doc_dict.items(), 10)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(doc_dict.items(), 10)


[(7008, 'misch'),
 (9105, 'paules'),
 (20706, 'krejci'),
 (6350, 'fashioned'),
 (2626, 'mother'),
 (1188, 'bilotta'),
 (14774, 'gunst'),
 (19679, 'polcyn'),
 (3995, 'biehl'),
 (16368, 'kostic')]

In [26]:
## select bots for analysis
import random

bots = [ x for x in df[term_type] if len(x) > 1 ] # Crucially
random.sample(bots, 3)

[['chis',
  'chi_o',
  'chi_m',
  'ch_so',
  'ch_s_m',
  'ch_om',
  'c_i_so',
  'c_is_m',
  'c_i_om',
  'c_s_om',
  'hiso',
  'his_m',
  'hi_om',
  'h_s_om',
  'isom',
  'chi',
  'ch_s',
  'ch_o',
  'ch_m',
  'c_is',
  'c_i_o',
  'c_i_m',
  'c_so',
  'c_s_m',
  'c_om',
  'his',
  'hi_o',
  'hi_m',
  'h_so',
  'h_s_m',
  'h_om',
  'iso',
  'is_m',
  'i_om',
  'som',
  'ch',
  'c_i',
  'c_s',
  'c_o',
  'c_m',
  'hi',
  'h_s',
  'h_o',
  'h_m',
  'is',
  'i_o',
  'i_m',
  'so',
  's_m',
  'om',
  'c',
  'h',
  'i',
  's',
  'o',
  'm'],
 ['negl',
  'neg_i',
  'neg_g',
  'neg_b',
  'neg_l',
  'neg_e',
  'ne_li',
  'ne_l_g',
  'ne_l_i',
  'ne_l_b',
  'ne_l_l',
  'ne_l_e',
  'ne_ig',
  'ne_i_i',
  'ne_i_b',
  'ne_i_l',
  'ne_i_e',
  'ne_gi',
  'ne_g_b',
  'ne_g_l',
  'ne_g_e',
  'ne_ib',
  'ne_bl',
  'ne_b_e',
  'ne_le',
  'n_g_li',
  'n_gl_g',
  'n_gl_i',
  'n_gl_b',
  'n_gl_l',
  'n_gl_e',
  'n_g_ig',
  'n_g_i_i',
  'n_g_i_b',
  'n_g_i_l',
  'n_g_i_e',
  'n_g_gi',
  'n_g_g_b',
  'n_g_g_l'

In [27]:
## generate dictionary
from gensim.corpora import Dictionary
diction = Dictionary(bots)
print(diction)

if apply_term_filtering:
    print(f"term filtering applied")
    diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
else:
    print(f"term filtering not applied")
print(diction)

## generate DTM
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) > min_bot_size ] # Crucially

Dictionary(418268 unique tokens: ['a', 'a_g', 'a_i', 'a_i_g', 'a_i_ng']...)
term filtering applied
Dictionary(100000 unique tokens: ['a_i_g', 'a_i_ng', 'a_n_g', 'a_n_in', 'a_n_n']...)


In [28]:
## HDP (n_topics = 15)
import gensim.models
import pyLDAvis.gensim

max_n_topics = 15
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [29]:
## topic investigation
import numpy as np
import HDP_helper
reload_module = True
if reload_module:
    import importlib
    importlib.reload(HDP_helper)

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * ve + 0.0 * e_m + 0.0 * re_e + 0.0 * v_n + 0.0 * s_h + 0.0 * r_in + 0.0 * v_e
nonzero count:  12008
	0.9977: charleston
	0.9977: casterline
	0.9977: ameliorate
	0.9977: chamberlin
	0.9976: indelicate
	0.9976: partisaned
	0.9976: terminated
	0.9976: coverstone
	0.9976: estimating
	0.9976: dispersing
topic_id 1: 0.001 * a_' + 0.001 * e_' + 0.001 * a_'s + 0.001 * r_' + 0.001 * e_'s + 0.001 * r_'s + 0.0 * o_'
nonzero count:  8634
	0.9974: seminaries
	0.9973: comedian's
	0.9973: coltrane's
	0.9973: stimulants
	0.9973: cordiant's
	0.9972: reasoner's
	0.9972: americans'
	0.9972: ordinaries
	0.9972: samaritans
	0.9972: ministries
topic_id 2: 0.0 * a_u + 0.0 * ur + 0.0 * e_u + 0.0 * ou + 0.0 * ho + 0.0 * ba + 0.0 * us
nonzero count:  3821
	0.9969: cornelious
	0.9969: blackstone
	0.9968: composites
	0.9967: submarines
	0.9967: alligators
	0.9967: burglaries
	0.9966: bankshares
	0.9966: racehorses
	0.9965: allocators
	0.9965: schoeneman
topic_id 3: 0.0 * s_g + 0.0 * c_d + 0.0 *

In [30]:
## HDP (n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [31]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.0 * a_m + 0.0 * a_' + 0.0 * k_n + 0.0 * t_d + 0.0 * d_o + 0.0 * e_u + 0.0 * ve
nonzero count:  17025
	0.9984: catherines
	0.9983: discretion
	0.9983: americans'
	0.9983: patricians
	0.9983: waterstone
	0.9983: calamine's
	0.9982: receptions
	0.9982: variations
	0.9982: gradations
	0.9982: hinterland
topic_id 1: 0.0 * co_e + 0.0 * ts + 0.0 * ss + 0.0 * in_e + 0.0 * d_l + 0.0 * ell + 0.0 * or_e
nonzero count:  2882
	0.9965: cornelious
	0.9965: composites
	0.9964: complicate
	0.9962: confidants
	0.9962: sacraments
	0.9961: indentures
	0.9961: condensate
	0.9960: demolishes
	0.9960: conditions
	0.9960: contribute
topic_id 2: 0.001 * or_e + 0.001 * ur + 0.001 * rt + 0.001 * f_r + 0.001 * e_' + 0.001 * or_s + 0.001 * mo
nonzero count:  3223
	0.9966: seminaries
	0.9965: enumerates
	0.9964: distrusted
	0.9964: consumer's
	0.9963: residents'
	0.9963: correlates
	0.9963: informants
	0.9962: enumerated
	0.9962: ordinaries
	0.9962: correlated
topic_id 3: 0.001 * i_k + 0.001 * r_in + 

In [32]:
## HDP (n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [33]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

## investigate topics
n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    #topic_encoding = ", ".join(hdp.show_topic(topic_id))
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.0 * e_m + 0.0 * t_d + 0.0 * n_er + 0.0 * e_u + 0.0 * a_m + 0.0 * n_d + 0.0 * f_r
nonzero count:  17434
	0.9985: predicated
	0.9984: palminteri
	0.9982: unwavering
	0.9980: lionberger
	0.9979: hinderman
	0.9979: halterman
	0.9978: bensinger
	0.9977: hermansen
	0.9977: bessinger
	0.9977: recreates
topic_id 1: 0.001 * e_' + 0.001 * r_' + 0.001 * c_' + 0.001 * a_' + 0.001 * e_'s + 0.001 * c_'s + 0.001 * r_'s
nonzero count:  2442
	0.9964: composites
	0.9962: comedian's
	0.9961: racehorses
	0.9960: musician's
	0.9960: reasoner's
	0.9957: secularist
	0.9956: deflectors
	0.9955: treasury's
	0.9955: christmas'
	0.9954: ballerinas
topic_id 2: 0.001 * ba + 0.001 * ga + 0.001 * a_' + 0.001 * a_'s + 0.001 * al_a + 0.001 * o_an + 0.0 * a_a_s
nonzero count:  1881
	0.9956: castagnola
	0.9954: colebreath
	0.9952: croissants
	0.9950: prestowitz
	0.9949: batholiths
	0.9949: sebastiani
	0.9947: razorbacks
	0.9947: relativism
	0.9945: tolerants
	0.9945: altzheimer
topic_id 3: 0.001 * o_k + 0.