HDP-base spell-sound analyzer
developed by Kow Kuroda (kow.kuroda@gmail.com)

History
2024/08/19 Cython を使った cy_gen_ngrams を利用可能にした
2024/09/02 n-gram 生成の generic function を導入し，コードを簡略化

In [51]:
#!pip install -U pyLDAvis
#!pip install -U pandas

In [52]:
## imports
import os, sys
import pprint as pp
import unicodedata

In [53]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [54]:
## Cython module の生成 (必要に応じて)
#!python clean setup.py build_ext --inplace

In [55]:
## Cython を使うかどうか
use_Cython = False
if use_Cython:
    %load_ext Cython

In [56]:
## term settings
term_classes        = [ 'spell', 'sound' ]
term_class          = term_classes[0]
ngram_is_inclusive  = True
ngram_inclusiveness = 2
## doc settings
max_doc_size        = 11
min_doc_size        =  3
print(f"max_doc_size: {max_doc_size}")
print(f"min_doc_size: {min_doc_size}")
### boundary handling
add_boundary        = False
boundary_mark       = "#"
## term setting
seg_joint           = ""
gap_mark            = "…"
term_is_skippy      = True
n_for_ngram         = 4
max_gap_ratio       = 0.8
max_gap_size        = round(max_doc_size * max_gap_ratio)
print(f"term_class: {term_class}")
print(f"term_is_skippy: {term_is_skippy}")
print(f"max_gap_size: {max_gap_size}")
print(f"n_for_ngram: {n_for_ngram}")
### accent handling
suppress_accents    = True
accent_marks        = [ "ˈ", "ˌ" ] 
if term_class == 'sound':
    if suppress_accents:
        accent_status = "-unaccented"
    else:
        accent_stratus = "-accented"
else:
    accent_status = ""
print(f"accent_status: {accent_status}")
## define term_type
if term_class == 'spell':
    if term_is_skippy:
        term_type = f"sp_skippy{n_for_ngram}gram"
    else:
        term_type = f"sp_{n_for_ngram}gram"
else:
    if term_is_skippy:
        term_type = f"sn_skippy{n_for_ngram}gram"
    else:
        term_type = f"sn_{n_for_ngram}gram"
## check
print(f"term_type: {term_type}")

max_doc_size: 11
min_doc_size: 3
term_class: spell
term_is_skippy: True
max_gap_size: 9
n_for_ngram: 4
accent_status: 
term_type: sp_skippy4gram


In [57]:
## target language
## a key must be part of a file name 
target_lang_dict = {    'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'en_N_only' : 'English noun (WN)',
                        'en_V_only' : 'English verb (WN)',
                        'en_A_only' : 'English adj (WN)',
                        'en_R_only' : 'English adv (WN)',
                        'ar'    : 'Arabic',
                        'de'    : 'German',
                        'de_N_only' : 'German Nouns',
                        'de_non_N_only' : 'German Non-nouns',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'ir'    : 'Irish',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
## proper language selection
target_lang_keys = [    'en_US', # 0
                        'en_UK', # 1
                        'en_N_only', # 2
                        'en_V_only', # 3
                        'en_A_only', # 4
                        'en_R_only', # 5
                        'ar', # 6
                        'de', # 7
                        'de_N_only', # 8
                        'de_non_N_only', # 9
                        'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC',
                        'is', 'nl', 'ro', 'sw',
                        'ir' # This lacks sound
                    ]
## check
target_lang_key  = target_lang_keys[4]
print(f"target_lang_key: {target_lang_key}")
print(f"target lang: {target_lang_dict[target_lang_key]} [{target_lang_key}]")
## target_attr [effective only for Irish]
target_class = ""
#target_class = None # This causes an unrediable error
if target_lang_key == "ir":
    target_classes = [ 'adjectives', 'nouns', 'verbs' ]
    target_class = f"-{target_classes[3]}"
print(f"target_class: {target_class}")

target_lang_key: en_A_only
target lang: English adj (WN) [en_A_only]
target_class: 


In [58]:
## LDA/HDP
apply_term_filtering = True
## The following parameters need to be relatively large to prevent "Row sum not equal 1" error
term_minfreq       = 2
## The following value is crucial to prevent "Row sum not equal 1" error
min_bot_size       = min_doc_size # 3
abuse_threshold    = 0.05 # larger value selects shorter units, smaller value selects longer units
# number of terms listed for a given topic
n_docs_to_show     = 15
n_terms_to_show    = 20
n_terms_to_save    = 120
#
sanitize_DTM = True

In [59]:
## sampling
source_sampling          = True
source_sampling_rate     = 0.5
source_sampling_max_size = 5000
second_sampling          = False
second_sampling_rate     = 0.7

In [60]:
## set target files
import glob
data_dir1     = "data/open-dict-ipa/data1"
data_dir2     = "data/open-dict-ipa/data1a"
data_dir3     = "data/wn3"
data_dir4     = "data/irish"
target_files  = glob.glob(f"{data_dir1}/*")
target_files2 = glob.glob(f"{data_dir2}/*")
target_files.extend(target_files2)
target_files3 = glob.glob(f"{data_dir3}/*")
target_files.extend(target_files3)
target_files4 = glob.glob(f"{data_dir4}/*")
target_files.extend(target_files4)
#
target_files = sorted([ file for file in target_files if ".csv" in file ])
pp.pprint(target_files)

['data/irish/word-irish-adjectives-spell.csv',
 'data/irish/word-irish-noun-phrases-spell.csv',
 'data/irish/word-irish-nouns-spell.csv',
 'data/irish/word-irish-possessives-spell.csv',
 'data/irish/word-irish-prepositions-spell.csv',
 'data/irish/word-irish-verbs-spell.csv',
 'data/open-dict-ipa/data1/ar.csv.gz',
 'data/open-dict-ipa/data1/de.csv.gz',
 'data/open-dict-ipa/data1/en_UK.csv.gz',
 'data/open-dict-ipa/data1/en_US.csv.gz',
 'data/open-dict-ipa/data1/eo.csv.gz',
 'data/open-dict-ipa/data1/es_ES.csv.gz',
 'data/open-dict-ipa/data1/es_MX.csv.gz',
 'data/open-dict-ipa/data1/fa.csv.gz',
 'data/open-dict-ipa/data1/fi.csv.gz',
 'data/open-dict-ipa/data1/fr_FR.csv.gz',
 'data/open-dict-ipa/data1/fr_QC.csv.gz',
 'data/open-dict-ipa/data1/is.csv.gz',
 'data/open-dict-ipa/data1/ja.csv.gz',
 'data/open-dict-ipa/data1/jam.csv.gz',
 'data/open-dict-ipa/data1/ma.csv.gz',
 'data/open-dict-ipa/data1/nb.csv.gz',
 'data/open-dict-ipa/data1/nl.csv.gz',
 'data/open-dict-ipa/data1/or.csv.gz',
 '

In [61]:
## get source data from files
import pandas as pd
import gzip
#target_language_key = "en_US" # can be changed to get other languages
#if target_class != "" or target_class is not None:
if target_class != "":
    target_file = [ f for f in target_files if target_lang_key in f and target_class in f ][0]
else:
    target_file = [ f for f in target_files if target_lang_key in f ][0]
print(f"processing: {target_file}")
##
if target_lang_key == "ir":
    col_names = ['spell', 'POS']
else:
    col_names = ['spell', 'sound']
#
if target_file.endswith(".gz"):
    with gzip.open(target_file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = col_names )
else:
    with open(target_file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = col_names )
## normalize characters
raw_df['spell'] = raw_df['spell'].apply(lambda x: unicodedata.normalize('NFC', str(x)))
## modify sound
try:
    sounds = raw_df['sound'].apply(lambda x: x.strip('/') )
    sounds = [ x.split("/,")[0] for x in sounds ] # picks up only the first of multiple entries
    raw_df['sound'] = sounds
except (AttributeError, KeyError):
    pass
#
raw_df.sample(10)

processing: data/wn3/en_A_only.csv


Unnamed: 0,spell,sound
3016,isotopic,ˌaɪsəˈtɑpɪk
3953,pharmacological,ˌfɑɹməkəˈɫɑdʒɪkəɫ
2366,glistening,ˈɡɫɪsənɪŋ
2812,indistinct,ˌɪndɪˈstɪŋkt
2814,individual,ˌɪndəˈvɪdʒəwəɫ
101,admiring,ædˈmaɪɹɪŋ
5070,symptomatic,ˌsɪmptəˈmætɪk
2267,frank,ˈfɹæŋk
5418,undefeated,ˌəndɪˈfitɪd
1697,distributive,dɪˈstɹɪbjutɪv


In [62]:
## source sampling
len(raw_df)
if source_sampling:
	print(f"source sampling applied")
	if len(raw_df) >= source_sampling_max_size:
		raw_df = raw_df.sample(source_sampling_max_size)
	else:
		raw_df = raw_df.sample(round(len(raw_df) * source_sampling_rate))
## remove accent marking
if suppress_accents:
	try:
		raw_df['sound'] = raw_df['sound'].apply(lambda x: "".join([ y for y in list(x) if y not in accent_marks ]))
	except KeyError:
		pass
## add boudary marks
if add_boundary:
	raw_df['spell'] = raw_df['spell'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
	try:
		raw_df['sound'] = raw_df['sound'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
	except KeyError:
		pass
#
print(raw_df)

source sampling applied
             spell          sound
491     automotive      ɔtəmoʊtɪv
425      assertive         əsɝtɪv
5431  undetectable   əndɪtɛktəbəɫ
2117       fateful        feɪtfəɫ
3344    membranous      mɛmbɹənəs
...            ...            ...
2752  incalculable  ɪnkæɫkjəɫəbəɫ
2555        hollow          hɑɫoʊ
5884      wrongful         ɹɔŋfəɫ
2484         harsh           hɑɹʃ
2991      ionizing      aɪənaɪzɪŋ

[5000 rows x 2 columns]


In [63]:
## generate 1-grams for spell and sound
## spell
raw_df['sp_1gram'] = raw_df['spell'].apply(lambda x: list(str(x)))
# add column of size
raw_df['sp_size'] = raw_df['sp_1gram'].apply(lambda x: len(x))
# add column of count of '-' inside
raw_df['hyphen'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("-"))
# add column of count of '.' inside
raw_df['period'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("."))
## sound
# takes the first entry, removes '/' around
try:
    raw_df['sn_1gram'] = raw_df['sound'].apply(lambda x: list(x) )
except (TypeError, KeyError):
    pass
# add column of size
try:
    raw_df['sn_size'] = raw_df['sn_1gram'].apply(lambda x: len(x))
except KeyError:
    pass
## check
raw_df

Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
491,automotive,ɔtəmoʊtɪv,"[a, u, t, o, m, o, t, i, v, e]",10,0,0,"[ɔ, t, ə, m, o, ʊ, t, ɪ, v]",9
425,assertive,əsɝtɪv,"[a, s, s, e, r, t, i, v, e]",9,0,0,"[ə, s, ɝ, t, ɪ, v]",6
5431,undetectable,əndɪtɛktəbəɫ,"[u, n, d, e, t, e, c, t, a, b, l, e]",12,0,0,"[ə, n, d, ɪ, t, ɛ, k, t, ə, b, ə, ɫ]",12
2117,fateful,feɪtfəɫ,"[f, a, t, e, f, u, l]",7,0,0,"[f, e, ɪ, t, f, ə, ɫ]",7
3344,membranous,mɛmbɹənəs,"[m, e, m, b, r, a, n, o, u, s]",10,0,0,"[m, ɛ, m, b, ɹ, ə, n, ə, s]",9
...,...,...,...,...,...,...,...,...
2752,incalculable,ɪnkæɫkjəɫəbəɫ,"[i, n, c, a, l, c, u, l, a, b, l, e]",12,0,0,"[ɪ, n, k, æ, ɫ, k, j, ə, ɫ, ə, b, ə, ɫ]",13
2555,hollow,hɑɫoʊ,"[h, o, l, l, o, w]",6,0,0,"[h, ɑ, ɫ, o, ʊ]",5
5884,wrongful,ɹɔŋfəɫ,"[w, r, o, n, g, f, u, l]",8,0,0,"[ɹ, ɔ, ŋ, f, ə, ɫ]",6
2484,harsh,hɑɹʃ,"[h, a, r, s, h]",5,0,0,"[h, ɑ, ɹ, ʃ]",4


In [64]:
## filtering raw_data by size
print(f"term_type: {term_type}")
if "sp_" in term_type:
    df_filtered = raw_df[ (raw_df['sp_size'] <= max_doc_size) & (raw_df['sp_size'] >= min_doc_size) & (raw_df['hyphen'] == 0) & (raw_df['period'] == 0) ]
else:
    df_filtered = raw_df[ (raw_df['sn_size'] <= max_doc_size) & (raw_df['sn_size'] >= min_doc_size) ]
#
df_filtered

term_type: sp_skippy4gram


Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
491,automotive,ɔtəmoʊtɪv,"[a, u, t, o, m, o, t, i, v, e]",10,0,0,"[ɔ, t, ə, m, o, ʊ, t, ɪ, v]",9
425,assertive,əsɝtɪv,"[a, s, s, e, r, t, i, v, e]",9,0,0,"[ə, s, ɝ, t, ɪ, v]",6
2117,fateful,feɪtfəɫ,"[f, a, t, e, f, u, l]",7,0,0,"[f, e, ɪ, t, f, ə, ɫ]",7
3344,membranous,mɛmbɹənəs,"[m, e, m, b, r, a, n, o, u, s]",10,0,0,"[m, ɛ, m, b, ɹ, ə, n, ə, s]",9
1514,dependent,dɪpɛndənt,"[d, e, p, e, n, d, e, n, t]",9,0,0,"[d, ɪ, p, ɛ, n, d, ə, n, t]",9
...,...,...,...,...,...,...,...,...
2274,freehand,fɹihænd,"[f, r, e, e, h, a, n, d]",8,0,0,"[f, ɹ, i, h, æ, n, d]",7
2555,hollow,hɑɫoʊ,"[h, o, l, l, o, w]",6,0,0,"[h, ɑ, ɫ, o, ʊ]",5
5884,wrongful,ɹɔŋfəɫ,"[w, r, o, n, g, f, u, l]",8,0,0,"[ɹ, ɔ, ŋ, f, ə, ɫ]",6
2484,harsh,hɑɹʃ,"[h, a, r, s, h]",5,0,0,"[h, ɑ, ɹ, ʃ]",4


In [65]:
## define df after second sampling if any
len(df_filtered)
if second_sampling:
    df = df_filtered.sample(round(len(df_filtered) * second_sampling_rate))
else:
    df = df_filtered
len(df)

4464

In [66]:
## generic function for n-gram generation
def add_ngram_to_df(dfx, n_for_ngram: int, var_prefix: str = "", ngram_is_skippy: bool = False, ngram_is_inclusive: bool = ngram_is_inclusive, ngram_inclusiveness: int = ngram_inclusiveness, seg_joint: str = "", use_Cython: bool = False, check: bool = False):
    """
    generic function for adding n-gram column to df with a specified n for ngram
    """
    inclusion_size = (n_for_ngram - ngram_inclusiveness)
    print(f"inclusion_size: {inclusion_size}")
    assert inclusion_size >= 0
    source_var = f"{var_prefix}1gram"
    print(f"source_var: {source_var}")
    unigrams = df[source_var]
    if use_Cython:
        import cy_gen_ngrams
        if ngram_is_skippy:
            ngrams = [ [seg_joint.join(x) for x in cy_gen_ngrams.cy_gen_skippy_ngrams(x, n = n_for_ngram, check = False)] for x in unigrams ]
        else:
            ngrams = [ [seg_joint.join(x) for x in cy_gen_ngrams.cy_gen_ngrams(x, n = n_for_ngram, check = False)] for x in unigrams ]
    else:
        import gen_ngrams
        if ngram_is_skippy:
            ngrams = [ gen_ngrams.gen_skippy_ngrams(x, n = n_for_ngram, sep = "", check = False) for x in unigrams ]
        else:
            ngrams = [ gen_ngrams.gen_ngrams(x, n = n_for_ngram, sep = "", check = False) for x in unigrams ]
    ## 包括的 2gramの生成
    if ngram_is_inclusive:
        assert (n_for_ngram - 1) > 0
        if ngram_is_skippy and n_for_ngram > 2:
            supplement_var = f"{var_prefix}skippy{n_for_ngram - 1}gram"
        else:
            supplement_var = f"{var_prefix}{n_for_ngram - 1}gram"
        print(f"supplement_var: {supplement_var}")
        for i, g in enumerate(ngrams):
            included = [ x for x in list(dfx[supplement_var])[i] if len(x) >= inclusion_size ]
            if len(included) > 0:
                g.extend(included)
    ## 変数の追加
    if ngram_is_skippy:
        added_var = f"{var_prefix}skippy{n_for_ngram}gram"
    else:
        added_var = f"{var_prefix}{n_for_ngram}gram"
    print(f"added_var: {added_var}")
    dfx[added_var] = ngrams
    ## check result
    print(dfx[added_var])

# Spell

In [67]:
## spell 2grams
add_ngram_to_df(df, n_for_ngram = 2, var_prefix = "sp_", ngram_is_skippy = False, ngram_is_inclusive = True, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 0
source_var: sp_1gram
supplement_var: sp_1gram
added_var: sp_2gram
491     [au, ut, to, om, mo, ot, ti, iv, ve, a, u, t, ...
425     [as, ss, se, er, rt, ti, iv, ve, a, s, s, e, r...
2117        [fa, at, te, ef, fu, ul, f, a, t, e, f, u, l]
3344    [me, em, mb, br, ra, an, no, ou, us, m, e, m, ...
1514    [de, ep, pe, en, nd, de, en, nt, d, e, p, e, n...
                              ...                        
2274    [fr, re, ee, eh, ha, an, nd, f, r, e, e, h, a,...
2555               [ho, ol, ll, lo, ow, h, o, l, l, o, w]
5884    [wr, ro, on, ng, gf, fu, ul, w, r, o, n, g, f,...
2484                      [ha, ar, rs, sh, h, a, r, s, h]
2991    [io, on, ni, iz, zi, in, ng, i, o, n, i, z, i,...
Name: sp_2gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [68]:
## spell 3grams
add_ngram_to_df(df, n_for_ngram = 3, var_prefix = "sp_", ngram_is_skippy = False, ngram_is_inclusive = True, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 1
source_var: sp_1gram
supplement_var: sp_2gram
added_var: sp_3gram
491     [aut, uto, tom, omo, mot, oti, tiv, ive, au, u...
425     [ass, sse, ser, ert, rti, tiv, ive, as, ss, se...
2117    [fat, ate, tef, efu, ful, fa, at, te, ef, fu, ...
3344    [mem, emb, mbr, bra, ran, ano, nou, ous, me, e...
1514    [dep, epe, pen, end, nde, den, ent, de, ep, pe...
                              ...                        
2274    [fre, ree, eeh, eha, han, and, fr, re, ee, eh,...
2555    [hol, oll, llo, low, ho, ol, ll, lo, ow, h, o,...
5884    [wro, ron, ong, ngf, gfu, ful, wr, ro, on, ng,...
2484       [har, ars, rsh, ha, ar, rs, sh, h, a, r, s, h]
2991    [ion, oni, niz, izi, zin, ing, io, on, ni, iz,...
Name: sp_3gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [69]:
## spell 4grams
add_ngram_to_df(df, n_for_ngram = 4, var_prefix = "sp_", ngram_is_skippy = False, ngram_is_inclusive = True, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 2
source_var: sp_1gram
supplement_var: sp_3gram
added_var: sp_4gram
491     [auto, utom, tomo, omot, moti, otiv, tive, aut...
425     [asse, sser, sert, erti, rtiv, tive, ass, sse,...
2117    [fate, atef, tefu, eful, fat, ate, tef, efu, f...
3344    [memb, embr, mbra, bran, rano, anou, nous, mem...
1514    [depe, epen, pend, ende, nden, dent, dep, epe,...
                              ...                        
2274    [free, reeh, eeha, ehan, hand, fre, ree, eeh, ...
2555    [holl, ollo, llow, hol, oll, llo, low, ho, ol,...
5884    [wron, rong, ongf, ngfu, gful, wro, ron, ong, ...
2484          [hars, arsh, har, ars, rsh, ha, ar, rs, sh]
2991    [ioni, oniz, nizi, izin, zing, ion, oni, niz, ...
Name: sp_4gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [70]:
## spell 5grams
add_ngram_to_df(df, n_for_ngram = 5, var_prefix = "sp_", ngram_is_skippy = False, ngram_is_inclusive = True, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 3
source_var: sp_1gram
supplement_var: sp_4gram
added_var: sp_5gram
491     [autom, utomo, tomot, omoti, motiv, otive, aut...
425     [asser, ssert, serti, ertiv, rtive, asse, sser...
2117    [fatef, atefu, teful, fate, atef, tefu, eful, ...
3344    [membr, embra, mbran, brano, ranou, anous, mem...
1514    [depen, epend, pende, enden, ndent, depe, epen...
                              ...                        
2274    [freeh, reeha, eehan, ehand, free, reeh, eeha,...
2555    [hollo, ollow, holl, ollo, llow, hol, oll, llo...
5884    [wrong, rongf, ongfu, ngful, wron, rong, ongf,...
2484                   [harsh, hars, arsh, har, ars, rsh]
2991    [ioniz, onizi, nizin, izing, ioni, oniz, nizi,...
Name: sp_5gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [71]:
## spell skippy 2grams
add_ngram_to_df(df, n_for_ngram = 2, var_prefix = "sp_", ngram_is_skippy = True, ngram_is_inclusive = ngram_is_inclusive, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 0
source_var: sp_1gram
supplement_var: sp_1gram
added_var: sp_skippy2gram
491     [au, a…t, a…o, a…m, a…o, a…t, a…i, a…v, a…e, u...
425     [as, a…s, a…e, a…r, a…t, a…i, a…v, a…e, ss, s…...
2117    [fa, f…t, f…e, f…f, f…u, f…l, at, a…e, a…f, a…...
3344    [me, m…m, m…b, m…r, m…a, m…n, m…o, m…u, m…s, e...
1514    [de, d…p, d…e, d…n, d…d, d…e, d…n, d…t, ep, e…...
                              ...                        
2274    [fr, f…e, f…e, f…h, f…a, f…n, f…d, re, r…e, r…...
2555    [ho, h…l, h…l, h…o, h…w, ol, o…l, o…o, o…w, ll...
5884    [wr, w…o, w…n, w…g, w…f, w…u, w…l, ro, r…n, r…...
2484    [ha, h…r, h…s, h…h, ar, a…s, a…h, rs, r…h, sh,...
2991    [io, i…n, i…i, i…z, i…i, i…n, i…g, on, o…i, o…...
Name: sp_skippy2gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [72]:
## spell skippy 3grams
add_ngram_to_df(df, n_for_ngram = 3, var_prefix = "sp_", ngram_is_skippy = True, ngram_is_inclusive = ngram_is_inclusive, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 1
source_var: sp_1gram
supplement_var: sp_skippy2gram
added_var: sp_skippy3gram
491     [aut, au…o, au…m, au…o, au…t, au…i, au…v, au…e...
425     [ass, as…e, as…r, as…t, as…i, as…v, as…e, a…se...
2117    [fat, fa…e, fa…f, fa…u, fa…l, f…te, f…t…f, f…t...
3344    [mem, me…b, me…r, me…a, me…n, me…o, me…u, me…s...
1514    [dep, de…e, de…n, de…d, de…e, de…n, de…t, d…pe...
                              ...                        
2274    [fre, fr…e, fr…h, fr…a, fr…n, fr…d, f…ee, f…e…...
2555    [hol, ho…l, ho…o, ho…w, h…ll, h…l…o, h…l…w, h…...
5884    [wro, wr…n, wr…g, wr…f, wr…u, wr…l, w…on, w…o…...
2484    [har, ha…s, ha…h, h…rs, h…r…h, h…sh, ars, ar…h...
2991    [ion, io…i, io…z, io…i, io…n, io…g, i…ni, i…n…...
Name: sp_skippy3gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [73]:
## spell skippy 4grams
add_ngram_to_df(df, n_for_ngram = 4, var_prefix = "sp_", ngram_is_skippy = True, ngram_is_inclusive = ngram_is_inclusive, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 2
source_var: sp_1gram
supplement_var: sp_skippy3gram
added_var: sp_skippy4gram
491     [auto, aut…m, aut…o, aut…t, aut…i, aut…v, aut…...
425     [asse, ass…r, ass…t, ass…i, ass…v, ass…e, as…e...
2117    [fate, fat…f, fat…u, fat…l, fa…ef, fa…e…u, fa…...
3344    [memb, mem…r, mem…a, mem…n, mem…o, mem…u, mem…...
1514    [depe, dep…n, dep…d, dep…e, dep…n, dep…t, de…e...
                              ...                        
2274    [free, fre…h, fre…a, fre…n, fre…d, fr…eh, fr…e...
2555    [holl, hol…o, hol…w, ho…lo, ho…l…w, ho…ow, h…l...
5884    [wron, wro…g, wro…f, wro…u, wro…l, wr…ng, wr…n...
2484    [hars, har…h, ha…sh, h…rsh, arsh, har, ha…s, h...
2991    [ioni, ion…z, ion…i, ion…n, ion…g, io…iz, io…i...
Name: sp_skippy4gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [74]:
## spell skippy 5grams
add_ngram_to_df(df, n_for_ngram = 5, var_prefix = "sp_", ngram_is_skippy = True, ngram_is_inclusive = ngram_is_inclusive, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 3
source_var: sp_1gram
supplement_var: sp_skippy4gram
added_var: sp_skippy5gram
491     [autom, auto…o, auto…t, auto…i, auto…v, auto…e...
425     [asser, asse…t, asse…i, asse…v, asse…e, ass…rt...
2117    [fatef, fate…u, fate…l, fat…fu, fat…f…l, fat…u...
3344    [membr, memb…a, memb…n, memb…o, memb…u, memb…s...
1514    [depen, depe…d, depe…e, depe…n, depe…t, dep…nd...
                              ...                        
2274    [freeh, free…a, free…n, free…d, fre…ha, fre…h…...
2555    [hollo, holl…w, hol…ow, ho…low, h…llow, ollow,...
5884    [wrong, wron…f, wron…u, wron…l, wro…gf, wro…g…...
2484    [harsh, hars, har…h, ha…sh, h…rsh, arsh, har, ...
2991    [ioniz, ioni…i, ioni…n, ioni…g, ion…zi, ion…z…...
Name: sp_skippy5gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [75]:
## check result
target_vars = [ 'spell' ]
target_vars.extend([ f"sp_{i}gram" for i in range(1,6) ])
target_vars.extend([ f"sp_skippy{i}gram" for i in range(2,6) ])
df[target_vars]

Unnamed: 0,spell,sp_1gram,sp_2gram,sp_3gram,sp_4gram,sp_5gram,sp_skippy2gram,sp_skippy3gram,sp_skippy4gram,sp_skippy5gram
491,automotive,"[a, u, t, o, m, o, t, i, v, e]","[au, ut, to, om, mo, ot, ti, iv, ve, a, u, t, ...","[aut, uto, tom, omo, mot, oti, tiv, ive, au, u...","[auto, utom, tomo, omot, moti, otiv, tive, aut...","[autom, utomo, tomot, omoti, motiv, otive, aut...","[au, a…t, a…o, a…m, a…o, a…t, a…i, a…v, a…e, u...","[aut, au…o, au…m, au…o, au…t, au…i, au…v, au…e...","[auto, aut…m, aut…o, aut…t, aut…i, aut…v, aut…...","[autom, auto…o, auto…t, auto…i, auto…v, auto…e..."
425,assertive,"[a, s, s, e, r, t, i, v, e]","[as, ss, se, er, rt, ti, iv, ve, a, s, s, e, r...","[ass, sse, ser, ert, rti, tiv, ive, as, ss, se...","[asse, sser, sert, erti, rtiv, tive, ass, sse,...","[asser, ssert, serti, ertiv, rtive, asse, sser...","[as, a…s, a…e, a…r, a…t, a…i, a…v, a…e, ss, s…...","[ass, as…e, as…r, as…t, as…i, as…v, as…e, a…se...","[asse, ass…r, ass…t, ass…i, ass…v, ass…e, as…e...","[asser, asse…t, asse…i, asse…v, asse…e, ass…rt..."
2117,fateful,"[f, a, t, e, f, u, l]","[fa, at, te, ef, fu, ul, f, a, t, e, f, u, l]","[fat, ate, tef, efu, ful, fa, at, te, ef, fu, ...","[fate, atef, tefu, eful, fat, ate, tef, efu, f...","[fatef, atefu, teful, fate, atef, tefu, eful, ...","[fa, f…t, f…e, f…f, f…u, f…l, at, a…e, a…f, a…...","[fat, fa…e, fa…f, fa…u, fa…l, f…te, f…t…f, f…t...","[fate, fat…f, fat…u, fat…l, fa…ef, fa…e…u, fa…...","[fatef, fate…u, fate…l, fat…fu, fat…f…l, fat…u..."
3344,membranous,"[m, e, m, b, r, a, n, o, u, s]","[me, em, mb, br, ra, an, no, ou, us, m, e, m, ...","[mem, emb, mbr, bra, ran, ano, nou, ous, me, e...","[memb, embr, mbra, bran, rano, anou, nous, mem...","[membr, embra, mbran, brano, ranou, anous, mem...","[me, m…m, m…b, m…r, m…a, m…n, m…o, m…u, m…s, e...","[mem, me…b, me…r, me…a, me…n, me…o, me…u, me…s...","[memb, mem…r, mem…a, mem…n, mem…o, mem…u, mem…...","[membr, memb…a, memb…n, memb…o, memb…u, memb…s..."
1514,dependent,"[d, e, p, e, n, d, e, n, t]","[de, ep, pe, en, nd, de, en, nt, d, e, p, e, n...","[dep, epe, pen, end, nde, den, ent, de, ep, pe...","[depe, epen, pend, ende, nden, dent, dep, epe,...","[depen, epend, pende, enden, ndent, depe, epen...","[de, d…p, d…e, d…n, d…d, d…e, d…n, d…t, ep, e…...","[dep, de…e, de…n, de…d, de…e, de…n, de…t, d…pe...","[depe, dep…n, dep…d, dep…e, dep…n, dep…t, de…e...","[depen, depe…d, depe…e, depe…n, depe…t, dep…nd..."
...,...,...,...,...,...,...,...,...,...,...
2274,freehand,"[f, r, e, e, h, a, n, d]","[fr, re, ee, eh, ha, an, nd, f, r, e, e, h, a,...","[fre, ree, eeh, eha, han, and, fr, re, ee, eh,...","[free, reeh, eeha, ehan, hand, fre, ree, eeh, ...","[freeh, reeha, eehan, ehand, free, reeh, eeha,...","[fr, f…e, f…e, f…h, f…a, f…n, f…d, re, r…e, r…...","[fre, fr…e, fr…h, fr…a, fr…n, fr…d, f…ee, f…e…...","[free, fre…h, fre…a, fre…n, fre…d, fr…eh, fr…e...","[freeh, free…a, free…n, free…d, fre…ha, fre…h…..."
2555,hollow,"[h, o, l, l, o, w]","[ho, ol, ll, lo, ow, h, o, l, l, o, w]","[hol, oll, llo, low, ho, ol, ll, lo, ow, h, o,...","[holl, ollo, llow, hol, oll, llo, low, ho, ol,...","[hollo, ollow, holl, ollo, llow, hol, oll, llo...","[ho, h…l, h…l, h…o, h…w, ol, o…l, o…o, o…w, ll...","[hol, ho…l, ho…o, ho…w, h…ll, h…l…o, h…l…w, h…...","[holl, hol…o, hol…w, ho…lo, ho…l…w, ho…ow, h…l...","[hollo, holl…w, hol…ow, ho…low, h…llow, ollow,..."
5884,wrongful,"[w, r, o, n, g, f, u, l]","[wr, ro, on, ng, gf, fu, ul, w, r, o, n, g, f,...","[wro, ron, ong, ngf, gfu, ful, wr, ro, on, ng,...","[wron, rong, ongf, ngfu, gful, wro, ron, ong, ...","[wrong, rongf, ongfu, ngful, wron, rong, ongf,...","[wr, w…o, w…n, w…g, w…f, w…u, w…l, ro, r…n, r…...","[wro, wr…n, wr…g, wr…f, wr…u, wr…l, w…on, w…o…...","[wron, wro…g, wro…f, wro…u, wro…l, wr…ng, wr…n...","[wrong, wron…f, wron…u, wron…l, wro…gf, wro…g…..."
2484,harsh,"[h, a, r, s, h]","[ha, ar, rs, sh, h, a, r, s, h]","[har, ars, rsh, ha, ar, rs, sh, h, a, r, s, h]","[hars, arsh, har, ars, rsh, ha, ar, rs, sh]","[harsh, hars, arsh, har, ars, rsh]","[ha, h…r, h…s, h…h, ar, a…s, a…h, rs, r…h, sh,...","[har, ha…s, ha…h, h…rs, h…r…h, h…sh, ars, ar…h...","[hars, har…h, ha…sh, h…rsh, arsh, har, ha…s, h...","[harsh, hars, har…h, ha…sh, h…rsh, arsh, har, ..."


# Sound

In [76]:
## sound 2grams
add_ngram_to_df(df, n_for_ngram = 2, var_prefix = "sn_", ngram_is_skippy = False, ngram_is_inclusive = True, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 0
source_var: sn_1gram
supplement_var: sn_1gram
added_var: sn_2gram
491     [ɔt, tə, əm, mo, oʊ, ʊt, tɪ, ɪv, ɔ, t, ə, m, o...
425                [əs, sɝ, ɝt, tɪ, ɪv, ə, s, ɝ, t, ɪ, v]
2117        [fe, eɪ, ɪt, tf, fə, əɫ, f, e, ɪ, t, f, ə, ɫ]
3344    [mɛ, ɛm, mb, bɹ, ɹə, ən, nə, əs, m, ɛ, m, b, ɹ...
1514    [dɪ, ɪp, pɛ, ɛn, nd, də, ən, nt, d, ɪ, p, ɛ, n...
                              ...                        
2274        [fɹ, ɹi, ih, hæ, æn, nd, f, ɹ, i, h, æ, n, d]
2555                      [hɑ, ɑɫ, ɫo, oʊ, h, ɑ, ɫ, o, ʊ]
5884               [ɹɔ, ɔŋ, ŋf, fə, əɫ, ɹ, ɔ, ŋ, f, ə, ɫ]
2484                             [hɑ, ɑɹ, ɹʃ, h, ɑ, ɹ, ʃ]
2991    [aɪ, ɪə, ən, na, aɪ, ɪz, zɪ, ɪŋ, a, ɪ, ə, n, a...
Name: sn_2gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [77]:
## sound 3grams
add_ngram_to_df(df, n_for_ngram = 3, var_prefix = "sn_", ngram_is_skippy = False, ngram_is_inclusive = True, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 1
source_var: sn_1gram
supplement_var: sn_2gram
added_var: sn_3gram
491     [ɔtə, təm, əmo, moʊ, oʊt, ʊtɪ, tɪv, ɔt, tə, əm...
425     [əsɝ, sɝt, ɝtɪ, tɪv, əs, sɝ, ɝt, tɪ, ɪv, ə, s,...
2117    [feɪ, eɪt, ɪtf, tfə, fəɫ, fe, eɪ, ɪt, tf, fə, ...
3344    [mɛm, ɛmb, mbɹ, bɹə, ɹən, ənə, nəs, mɛ, ɛm, mb...
1514    [dɪp, ɪpɛ, pɛn, ɛnd, ndə, dən, ənt, dɪ, ɪp, pɛ...
                              ...                        
2274    [fɹi, ɹih, ihæ, hæn, ænd, fɹ, ɹi, ih, hæ, æn, ...
2555       [hɑɫ, ɑɫo, ɫoʊ, hɑ, ɑɫ, ɫo, oʊ, h, ɑ, ɫ, o, ʊ]
5884    [ɹɔŋ, ɔŋf, ŋfə, fəɫ, ɹɔ, ɔŋ, ŋf, fə, əɫ, ɹ, ɔ,...
2484                   [hɑɹ, ɑɹʃ, hɑ, ɑɹ, ɹʃ, h, ɑ, ɹ, ʃ]
2991    [aɪə, ɪən, əna, naɪ, aɪz, ɪzɪ, zɪŋ, aɪ, ɪə, ən...
Name: sn_3gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [78]:
## sound 4grams
add_ngram_to_df(df, n_for_ngram = 4, var_prefix = "sn_", ngram_is_skippy = False, ngram_is_inclusive = True, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 2
source_var: sn_1gram
supplement_var: sn_3gram
added_var: sn_4gram
491     [ɔtəm, təmo, əmoʊ, moʊt, oʊtɪ, ʊtɪv, ɔtə, təm,...
425     [əsɝt, sɝtɪ, ɝtɪv, əsɝ, sɝt, ɝtɪ, tɪv, əs, sɝ,...
2117    [feɪt, eɪtf, ɪtfə, tfəɫ, feɪ, eɪt, ɪtf, tfə, f...
3344    [mɛmb, ɛmbɹ, mbɹə, bɹən, ɹənə, ənəs, mɛm, ɛmb,...
1514    [dɪpɛ, ɪpɛn, pɛnd, ɛndə, ndən, dənt, dɪp, ɪpɛ,...
                              ...                        
2274    [fɹih, ɹihæ, ihæn, hænd, fɹi, ɹih, ihæ, hæn, æ...
2555          [hɑɫo, ɑɫoʊ, hɑɫ, ɑɫo, ɫoʊ, hɑ, ɑɫ, ɫo, oʊ]
5884    [ɹɔŋf, ɔŋfə, ŋfəɫ, ɹɔŋ, ɔŋf, ŋfə, fəɫ, ɹɔ, ɔŋ,...
2484                         [hɑɹʃ, hɑɹ, ɑɹʃ, hɑ, ɑɹ, ɹʃ]
2991    [aɪən, ɪəna, ənaɪ, naɪz, aɪzɪ, ɪzɪŋ, aɪə, ɪən,...
Name: sn_4gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [79]:
## sound 5grams
add_ngram_to_df(df, n_for_ngram = 4, var_prefix = "sn_", ngram_is_skippy = False, ngram_is_inclusive = True, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 2
source_var: sn_1gram
supplement_var: sn_3gram
added_var: sn_4gram
491     [ɔtəm, təmo, əmoʊ, moʊt, oʊtɪ, ʊtɪv, ɔtə, təm,...
425     [əsɝt, sɝtɪ, ɝtɪv, əsɝ, sɝt, ɝtɪ, tɪv, əs, sɝ,...
2117    [feɪt, eɪtf, ɪtfə, tfəɫ, feɪ, eɪt, ɪtf, tfə, f...
3344    [mɛmb, ɛmbɹ, mbɹə, bɹən, ɹənə, ənəs, mɛm, ɛmb,...
1514    [dɪpɛ, ɪpɛn, pɛnd, ɛndə, ndən, dənt, dɪp, ɪpɛ,...
                              ...                        
2274    [fɹih, ɹihæ, ihæn, hænd, fɹi, ɹih, ihæ, hæn, æ...
2555          [hɑɫo, ɑɫoʊ, hɑɫ, ɑɫo, ɫoʊ, hɑ, ɑɫ, ɫo, oʊ]
5884    [ɹɔŋf, ɔŋfə, ŋfəɫ, ɹɔŋ, ɔŋf, ŋfə, fəɫ, ɹɔ, ɔŋ,...
2484                         [hɑɹʃ, hɑɹ, ɑɹʃ, hɑ, ɑɹ, ɹʃ]
2991    [aɪən, ɪəna, ənaɪ, naɪz, aɪzɪ, ɪzɪŋ, aɪə, ɪən,...
Name: sn_4gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [80]:
## sound skippy 2grams
add_ngram_to_df(df, n_for_ngram = 2, var_prefix = "sn_", ngram_is_skippy = True, ngram_is_inclusive = True, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 0
source_var: sn_1gram
supplement_var: sn_1gram
added_var: sn_skippy2gram
491     [ɔt, ɔ…ə, ɔ…m, ɔ…o, ɔ…ʊ, ɔ…t, ɔ…ɪ, ɔ…v, tə, t…...
425     [əs, ə…ɝ, ə…t, ə…ɪ, ə…v, sɝ, s…t, s…ɪ, s…v, ɝt...
2117    [fe, f…ɪ, f…t, f…f, f…ə, f…ɫ, eɪ, e…t, e…f, e…...
3344    [mɛ, m…m, m…b, m…ɹ, m…ə, m…n, m…ə, m…s, ɛm, ɛ…...
1514    [dɪ, d…p, d…ɛ, d…n, d…d, d…ə, d…n, d…t, ɪp, ɪ…...
                              ...                        
2274    [fɹ, f…i, f…h, f…æ, f…n, f…d, ɹi, ɹ…h, ɹ…æ, ɹ…...
2555    [hɑ, h…ɫ, h…o, h…ʊ, ɑɫ, ɑ…o, ɑ…ʊ, ɫo, ɫ…ʊ, oʊ,...
5884    [ɹɔ, ɹ…ŋ, ɹ…f, ɹ…ə, ɹ…ɫ, ɔŋ, ɔ…f, ɔ…ə, ɔ…ɫ, ŋf...
2484              [hɑ, h…ɹ, h…ʃ, ɑɹ, ɑ…ʃ, ɹʃ, h, ɑ, ɹ, ʃ]
2991    [aɪ, a…ə, a…n, a…a, a…ɪ, a…z, a…ɪ, a…ŋ, ɪə, ɪ…...
Name: sn_skippy2gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [81]:
## sound skippy 3grams
add_ngram_to_df(df, n_for_ngram = 3, var_prefix = "sn_", ngram_is_skippy = True, ngram_is_inclusive = True, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 1
source_var: sn_1gram
supplement_var: sn_skippy2gram
added_var: sn_skippy3gram
491     [ɔtə, ɔt…m, ɔt…o, ɔt…ʊ, ɔt…t, ɔt…ɪ, ɔt…v, ɔ…əm...
425     [əsɝ, əs…t, əs…ɪ, əs…v, ə…ɝt, ə…ɝ…ɪ, ə…ɝ…v, ə…...
2117    [feɪ, fe…t, fe…f, fe…ə, fe…ɫ, f…ɪt, f…ɪ…f, f…ɪ...
3344    [mɛm, mɛ…b, mɛ…ɹ, mɛ…ə, mɛ…n, mɛ…ə, mɛ…s, m…mb...
1514    [dɪp, dɪ…ɛ, dɪ…n, dɪ…d, dɪ…ə, dɪ…n, dɪ…t, d…pɛ...
                              ...                        
2274    [fɹi, fɹ…h, fɹ…æ, fɹ…n, fɹ…d, f…ih, f…i…æ, f…i...
2555    [hɑɫ, hɑ…o, hɑ…ʊ, h…ɫo, h…ɫ…ʊ, h…oʊ, ɑɫo, ɑɫ…ʊ...
5884    [ɹɔŋ, ɹɔ…f, ɹɔ…ə, ɹɔ…ɫ, ɹ…ŋf, ɹ…ŋ…ə, ɹ…ŋ…ɫ, ɹ…...
2484    [hɑɹ, hɑ…ʃ, h…ɹʃ, ɑɹʃ, hɑ, h…ɹ, h…ʃ, ɑɹ, ɑ…ʃ, ...
2991    [aɪə, aɪ…n, aɪ…a, aɪ…ɪ, aɪ…z, aɪ…ɪ, aɪ…ŋ, a…ən...
Name: sn_skippy3gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [82]:
## sound skippy 4grams
add_ngram_to_df(df, n_for_ngram = 4, var_prefix = "sn_", ngram_is_skippy = True, ngram_is_inclusive = True, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 2
source_var: sn_1gram
supplement_var: sn_skippy3gram
added_var: sn_skippy4gram
491     [ɔtəm, ɔtə…o, ɔtə…ʊ, ɔtə…t, ɔtə…ɪ, ɔtə…v, ɔt…m...
425     [əsɝt, əsɝ…ɪ, əsɝ…v, əs…tɪ, əs…t…v, əs…ɪv, ə…ɝ...
2117    [feɪt, feɪ…f, feɪ…ə, feɪ…ɫ, fe…tf, fe…t…ə, fe…...
3344    [mɛmb, mɛm…ɹ, mɛm…ə, mɛm…n, mɛm…ə, mɛm…s, mɛ…b...
1514    [dɪpɛ, dɪp…n, dɪp…d, dɪp…ə, dɪp…n, dɪp…t, dɪ…ɛ...
                              ...                        
2274    [fɹih, fɹi…æ, fɹi…n, fɹi…d, fɹ…hæ, fɹ…h…n, fɹ…...
2555    [hɑɫo, hɑɫ…ʊ, hɑ…oʊ, h…ɫoʊ, ɑɫoʊ, hɑɫ, hɑ…o, h...
5884    [ɹɔŋf, ɹɔŋ…ə, ɹɔŋ…ɫ, ɹɔ…fə, ɹɔ…f…ɫ, ɹɔ…əɫ, ɹ…ŋ...
2484    [hɑɹʃ, hɑɹ, hɑ…ʃ, h…ɹʃ, ɑɹʃ, hɑ, h…ɹ, h…ʃ, ɑɹ,...
2991    [aɪən, aɪə…a, aɪə…ɪ, aɪə…z, aɪə…ɪ, aɪə…ŋ, aɪ…n...
Name: sn_skippy4gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [83]:
## sound skippy 5grams
add_ngram_to_df(df, n_for_ngram = 5, var_prefix = "sn_", ngram_is_skippy = True, ngram_is_inclusive = True, ngram_inclusiveness = ngram_inclusiveness, check = True)

inclusion_size: 3
source_var: sn_1gram
supplement_var: sn_skippy4gram
added_var: sn_skippy5gram
491     [ɔtəmo, ɔtəm…ʊ, ɔtəm…t, ɔtəm…ɪ, ɔtəm…v, ɔtə…oʊ...
425     [əsɝtɪ, əsɝt…v, əsɝ…ɪv, əs…tɪv, ə…ɝtɪv, sɝtɪv,...
2117    [feɪtf, feɪt…ə, feɪt…ɫ, feɪ…fə, feɪ…f…ɫ, feɪ…ə...
3344    [mɛmbɹ, mɛmb…ə, mɛmb…n, mɛmb…ə, mɛmb…s, mɛm…ɹə...
1514    [dɪpɛn, dɪpɛ…d, dɪpɛ…ə, dɪpɛ…n, dɪpɛ…t, dɪp…nd...
                              ...                        
2274    [fɹihæ, fɹih…n, fɹih…d, fɹi…æn, fɹi…æ…d, fɹi…n...
2555    [hɑɫoʊ, hɑɫo, hɑɫ…ʊ, hɑ…oʊ, h…ɫoʊ, ɑɫoʊ, hɑɫ, ...
5884    [ɹɔŋfə, ɹɔŋf…ɫ, ɹɔŋ…əɫ, ɹɔ…fəɫ, ɹ…ŋfəɫ, ɔŋfəɫ,...
2484          [hɑɹʃ, hɑɹ, hɑ…ʃ, h…ɹʃ, ɑɹʃ, h…ɹ, h…ʃ, ɑ…ʃ]
2991    [aɪəna, aɪən…ɪ, aɪən…z, aɪən…ɪ, aɪən…ŋ, aɪə…aɪ...
Name: sn_skippy5gram, Length: 4464, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [84]:
## check df
dropped_vars = [ 'sp_size', 'hyphen', 'period', 'sn_size' ]
if term_class == 'spell':
    extra = [ 'sn_1gram', 'sn_2gram', 'sn_3gram', 'sn_4gram',
             'sn_skippy2gram', 'sn_skippy3gram', 'sn_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]
else:
    extra = [ 'sp_1gram', 'sp_2gram', 'sp_3gram', 'sp_4gram',
             'sp_skippy2gram', 'sp_skippy3gram', 'sp_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]    
#
df[target_vars]

Unnamed: 0,spell,sound,sp_1gram,sp_2gram,sp_3gram,sp_4gram,sp_5gram,sp_skippy2gram,sp_skippy3gram,sp_skippy4gram,sp_skippy5gram,sn_skippy5gram
491,automotive,ɔtəmoʊtɪv,"[a, u, t, o, m, o, t, i, v, e]","[au, ut, to, om, mo, ot, ti, iv, ve, a, u, t, ...","[aut, uto, tom, omo, mot, oti, tiv, ive, au, u...","[auto, utom, tomo, omot, moti, otiv, tive, aut...","[autom, utomo, tomot, omoti, motiv, otive, aut...","[au, a…t, a…o, a…m, a…o, a…t, a…i, a…v, a…e, u...","[aut, au…o, au…m, au…o, au…t, au…i, au…v, au…e...","[auto, aut…m, aut…o, aut…t, aut…i, aut…v, aut…...","[autom, auto…o, auto…t, auto…i, auto…v, auto…e...","[ɔtəmo, ɔtəm…ʊ, ɔtəm…t, ɔtəm…ɪ, ɔtəm…v, ɔtə…oʊ..."
425,assertive,əsɝtɪv,"[a, s, s, e, r, t, i, v, e]","[as, ss, se, er, rt, ti, iv, ve, a, s, s, e, r...","[ass, sse, ser, ert, rti, tiv, ive, as, ss, se...","[asse, sser, sert, erti, rtiv, tive, ass, sse,...","[asser, ssert, serti, ertiv, rtive, asse, sser...","[as, a…s, a…e, a…r, a…t, a…i, a…v, a…e, ss, s…...","[ass, as…e, as…r, as…t, as…i, as…v, as…e, a…se...","[asse, ass…r, ass…t, ass…i, ass…v, ass…e, as…e...","[asser, asse…t, asse…i, asse…v, asse…e, ass…rt...","[əsɝtɪ, əsɝt…v, əsɝ…ɪv, əs…tɪv, ə…ɝtɪv, sɝtɪv,..."
2117,fateful,feɪtfəɫ,"[f, a, t, e, f, u, l]","[fa, at, te, ef, fu, ul, f, a, t, e, f, u, l]","[fat, ate, tef, efu, ful, fa, at, te, ef, fu, ...","[fate, atef, tefu, eful, fat, ate, tef, efu, f...","[fatef, atefu, teful, fate, atef, tefu, eful, ...","[fa, f…t, f…e, f…f, f…u, f…l, at, a…e, a…f, a…...","[fat, fa…e, fa…f, fa…u, fa…l, f…te, f…t…f, f…t...","[fate, fat…f, fat…u, fat…l, fa…ef, fa…e…u, fa…...","[fatef, fate…u, fate…l, fat…fu, fat…f…l, fat…u...","[feɪtf, feɪt…ə, feɪt…ɫ, feɪ…fə, feɪ…f…ɫ, feɪ…ə..."
3344,membranous,mɛmbɹənəs,"[m, e, m, b, r, a, n, o, u, s]","[me, em, mb, br, ra, an, no, ou, us, m, e, m, ...","[mem, emb, mbr, bra, ran, ano, nou, ous, me, e...","[memb, embr, mbra, bran, rano, anou, nous, mem...","[membr, embra, mbran, brano, ranou, anous, mem...","[me, m…m, m…b, m…r, m…a, m…n, m…o, m…u, m…s, e...","[mem, me…b, me…r, me…a, me…n, me…o, me…u, me…s...","[memb, mem…r, mem…a, mem…n, mem…o, mem…u, mem…...","[membr, memb…a, memb…n, memb…o, memb…u, memb…s...","[mɛmbɹ, mɛmb…ə, mɛmb…n, mɛmb…ə, mɛmb…s, mɛm…ɹə..."
1514,dependent,dɪpɛndənt,"[d, e, p, e, n, d, e, n, t]","[de, ep, pe, en, nd, de, en, nt, d, e, p, e, n...","[dep, epe, pen, end, nde, den, ent, de, ep, pe...","[depe, epen, pend, ende, nden, dent, dep, epe,...","[depen, epend, pende, enden, ndent, depe, epen...","[de, d…p, d…e, d…n, d…d, d…e, d…n, d…t, ep, e…...","[dep, de…e, de…n, de…d, de…e, de…n, de…t, d…pe...","[depe, dep…n, dep…d, dep…e, dep…n, dep…t, de…e...","[depen, depe…d, depe…e, depe…n, depe…t, dep…nd...","[dɪpɛn, dɪpɛ…d, dɪpɛ…ə, dɪpɛ…n, dɪpɛ…t, dɪp…nd..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2274,freehand,fɹihænd,"[f, r, e, e, h, a, n, d]","[fr, re, ee, eh, ha, an, nd, f, r, e, e, h, a,...","[fre, ree, eeh, eha, han, and, fr, re, ee, eh,...","[free, reeh, eeha, ehan, hand, fre, ree, eeh, ...","[freeh, reeha, eehan, ehand, free, reeh, eeha,...","[fr, f…e, f…e, f…h, f…a, f…n, f…d, re, r…e, r…...","[fre, fr…e, fr…h, fr…a, fr…n, fr…d, f…ee, f…e…...","[free, fre…h, fre…a, fre…n, fre…d, fr…eh, fr…e...","[freeh, free…a, free…n, free…d, fre…ha, fre…h…...","[fɹihæ, fɹih…n, fɹih…d, fɹi…æn, fɹi…æ…d, fɹi…n..."
2555,hollow,hɑɫoʊ,"[h, o, l, l, o, w]","[ho, ol, ll, lo, ow, h, o, l, l, o, w]","[hol, oll, llo, low, ho, ol, ll, lo, ow, h, o,...","[holl, ollo, llow, hol, oll, llo, low, ho, ol,...","[hollo, ollow, holl, ollo, llow, hol, oll, llo...","[ho, h…l, h…l, h…o, h…w, ol, o…l, o…o, o…w, ll...","[hol, ho…l, ho…o, ho…w, h…ll, h…l…o, h…l…w, h…...","[holl, hol…o, hol…w, ho…lo, ho…l…w, ho…ow, h…l...","[hollo, holl…w, hol…ow, ho…low, h…llow, ollow,...","[hɑɫoʊ, hɑɫo, hɑɫ…ʊ, hɑ…oʊ, h…ɫoʊ, ɑɫoʊ, hɑɫ, ..."
5884,wrongful,ɹɔŋfəɫ,"[w, r, o, n, g, f, u, l]","[wr, ro, on, ng, gf, fu, ul, w, r, o, n, g, f,...","[wro, ron, ong, ngf, gfu, ful, wr, ro, on, ng,...","[wron, rong, ongf, ngfu, gful, wro, ron, ong, ...","[wrong, rongf, ongfu, ngful, wron, rong, ongf,...","[wr, w…o, w…n, w…g, w…f, w…u, w…l, ro, r…n, r…...","[wro, wr…n, wr…g, wr…f, wr…u, wr…l, w…on, w…o…...","[wron, wro…g, wro…f, wro…u, wro…l, wr…ng, wr…n...","[wrong, wron…f, wron…u, wron…l, wro…gf, wro…g…...","[ɹɔŋfə, ɹɔŋf…ɫ, ɹɔŋ…əɫ, ɹɔ…fəɫ, ɹ…ŋfəɫ, ɔŋfəɫ,..."
2484,harsh,hɑɹʃ,"[h, a, r, s, h]","[ha, ar, rs, sh, h, a, r, s, h]","[har, ars, rsh, ha, ar, rs, sh, h, a, r, s, h]","[hars, arsh, har, ars, rsh, ha, ar, rs, sh]","[harsh, hars, arsh, har, ars, rsh]","[ha, h…r, h…s, h…h, ar, a…s, a…h, rs, r…h, sh,...","[har, ha…s, ha…h, h…rs, h…r…h, h…sh, ars, ar…h...","[hars, har…h, ha…sh, h…rsh, arsh, har, ha…s, h...","[harsh, hars, har…h, ha…sh, h…rsh, arsh, har, ...","[hɑɹʃ, hɑɹ, hɑ…ʃ, h…ɹʃ, ɑɹʃ, h…ɹ, h…ʃ, ɑ…ʃ]"


# Analysis

In [85]:
## select data type and define doc_dict
import random
if "sp_" in term_type:
    base_type = "spell"
else:
    base_type = "sound"
doc_dict = { i: x for i, x in enumerate(df[base_type]) }
## check
random.sample(doc_dict.items(), 10)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(doc_dict.items(), 10)


[(3887, 'addressed'),
 (4040, 'mortal'),
 (3878, 'tufted'),
 (2416, 'veterinary'),
 (341, 'flatulent'),
 (2276, 'humble'),
 (1768, 'ratified'),
 (418, 'solvent'),
 (177, 'unsettled'),
 (3557, 'piano')]

In [86]:
## select bots for analysis
enable_term_change = False # if you want to change term_type to save time and energy
if enable_term_change:
	term_type = 'sp_skippy4gram'
	print(f"(changed) term_type: {term_type}")

## bot stands for 'bag-of-terms', a generalization of 'bag-of-words'
bots = [ x for x in df[term_type] if len(x) > min_bot_size ] # Crucially
import random
random.sample(bots, 1)

[['spon',
  'spo…t',
  'spo…a',
  'spo…n',
  'spo…e',
  'spo…o',
  'spo…u',
  'spo…s',
  'sp…nt',
  'sp…n…a',
  'sp…n…n',
  'sp…n…e',
  'sp…n…o',
  'sp…n…u',
  'sp…n…s',
  'sp…ta',
  'sp…t…n',
  'sp…t…e',
  'sp…t…o',
  'sp…t…u',
  'sp…t…s',
  'sp…an',
  'sp…a…e',
  'sp…a…o',
  'sp…a…u',
  'sp…a…s',
  'sp…ne',
  'sp…n…o',
  'sp…n…u',
  'sp…n…s',
  'sp…eo',
  'sp…e…u',
  'sp…e…s',
  'sp…ou',
  'sp…o…s',
  'sp…us',
  's…ont',
  's…on…a',
  's…on…n',
  's…on…e',
  's…on…o',
  's…on…u',
  's…on…s',
  's…o…ta',
  's…o…t…n',
  's…o…t…e',
  's…o…t…o',
  's…o…t…u',
  's…o…t…s',
  's…o…an',
  's…o…a…e',
  's…o…a…o',
  's…o…a…u',
  's…o…a…s',
  's…o…ne',
  's…o…n…o',
  's…o…n…u',
  's…o…n…s',
  's…o…eo',
  's…o…e…u',
  's…o…e…s',
  's…o…ou',
  's…o…o…s',
  's…o…us',
  's…nta',
  's…nt…n',
  's…nt…e',
  's…nt…o',
  's…nt…u',
  's…nt…s',
  's…n…an',
  's…n…a…e',
  's…n…a…o',
  's…n…a…u',
  's…n…a…s',
  's…n…ne',
  's…n…n…o',
  's…n…n…u',
  's…n…n…s',
  's…n…eo',
  's…n…e…u',
  's…n…e…s',
  's…n…ou'

In [87]:
## generate dictionary
from gensim.corpora import Dictionary
diction = Dictionary(bots)
print(diction)

#apply_term_filtering = False
if apply_term_filtering:
    print(f"term filtering applied")
    diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
else:
    print(f"term filtering not applied")
## check
print(diction)
## generate DTM
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) >= min_bot_size ] # Crucially

Dictionary<198041 unique tokens: ['au', 'aut', 'auto', 'aut…e', 'aut…i']...>
term filtering applied
Dictionary<96733 unique tokens: ['au', 'aut', 'auto', 'aut…e', 'aut…i']...>


In [88]:
## sanitize corpus
corpus = [ doc for doc in corpus if len(doc) > 0 ] # filter out empty rows 

In [89]:
## HDP (n_topics = 15)
import gensim.models
import pyLDAvis.gensim

max_n_topics = 15
hdp15 = gensim.models.HdpModel(corpus, diction, T = max_n_topics, random_state = 1)
vis_data15 = pyLDAvis.gensim.prepare(hdp15, corpus, diction)
pyLDAvis.display(vis_data15)

In [90]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	lang_dir_name = target_lang_dict[target_lang_key].split()[0]
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{term_type}{accent_status}.html"
	pyLDAvis.save_html(vis_data15, vis_output)

In [91]:
## topic investigation
import numpy as np
import HDP_helper
reload_module = False
if reload_module:
    import importlib
    importlib.reload(HDP_helper)

target_hdp = hdp15
documents_topics = np.zeros([target_hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in target_hdp[c]:
        documents_topics[topic_id][doc_id] = prob

target_hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = target_hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.0 * a…y + 0.0 * i…i…e + 0.0 * e…g + 0.0 * l…r + 0.0 * i…te + 0.0 * m…r + 0.0 * e…in + 0.0 * n…i…e + 0.0 * i…a…e + 0.0 * l…g + 0.0 * t…y + 0.0 * t…g + 0.0 * i…nt + 0.0 * i…le + 0.0 * it + 0.0 * t…ed + 0.0 * e…i…g + 0.0 * g…e + 0.0 * v…e + 0.0 * l…ed
nonzero count:  1583
	0.9984: uninhabited
	0.9983: uninhibited
	0.9983: seventeenth
	0.9983: understated
	0.9983: intemperate
	0.9983: impertinent
	0.9983: respiratory
	0.9982: indifferent
	0.9982: sedimentary
	0.9982: alternating
	0.9982: sensational
	0.9982: sentimental
	0.9982: invalidated
	0.9982: stimulating
	0.9982: disinclined
topic_id 1: 0.001 * a…ic + 0.0 * tic + 0.0 * p…c + 0.0 * a…ti + 0.0 * e…ic + 0.0 * a…t…c + 0.0 * t…al + 0.0 * o…ic + 0.0 * e…ti + 0.0 * a…tic + 0.0 * s…ic + 0.0 * h…i + 0.0 * i…ti + 0.0 * r…ic + 0.0 * r…ti + 0.0 * l…c + 0.0 * t…ic + 0.0 * h…c + 0.0 * m…c + 0.0 * d…c
nonzero count:  1110
	0.9982: distressing
	0.9981: distinctive
	0.9981: ministerial
	0.9981: patriarchal
	0.9981: matriarchal
	0.9981:

In [92]:
## save topic structures
#hdp.get_topics() # =/= show_topics()
#hdp.print_topics()
hdp_topics = hdp15.show_topics(num_topics = max_n_topics,
                               num_words = n_terms_to_save, formatted = False)
hdp_dict = { tid: values for tid, values in hdp_topics }
## convert to Pandas dataframe
topics_df = pd.DataFrame.from_dict(hdp_dict)
hdp15_topics_out = f"results/terms-by-topics-raw/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-topics{max_n_topics}-{term_type}{accent_status}.csv"
topics_df.to_csv(hdp15_topics_out, header = False, index = None)

In [93]:
## HDP (n_topics = 45)
import gensim.models
import pyLDAvis.gensim

max_n_topics = 45
hdp45 = gensim.models.HdpModel(corpus, diction, T = max_n_topics, random_state = 1)
vis_data45 = pyLDAvis.gensim.prepare(hdp45, corpus, diction)
pyLDAvis.display(vis_data45)

In [94]:
## save LDAvis output as a html file
lang_dir_name = target_lang_dict[target_lang_key].split()[0]
save_LDAvis = True
if save_LDAvis:
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{term_type}{accent_status}.html"
	pyLDAvis.save_html(vis_data45, vis_output)

In [95]:
## save topic structures
#hdp.get_topics() # =/= show_topics()
#hdp.print_topics()
hdp_topics = hdp45.show_topics(num_topics = max_n_topics,
                               num_words = n_terms_to_save, formatted = False)
hdp_dict = { tid: values for tid, values in hdp_topics }
## convert to Pandas dataframe
topics_df = pd.DataFrame.from_dict(hdp_dict)
#hdp45_topics_out = f"results/terms-by-topics-raw/hdp{max_n_topics}_topics_raw.csv"
hdp45_topics_out = f"results/terms-by-topics-raw/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-topics{max_n_topics}-{term_type}{accent_status}.csv"
topics_df.to_csv(hdp45_topics_out, header = False, index = None)

In [96]:
## topic investigation
import numpy as np
import HDP_helper

target_hdp = hdp45
documents_topics = np.zeros([target_hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in target_hdp[c]:
        documents_topics[topic_id][doc_id] = prob
#
target_hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = target_hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: {len(probs.nonzero()[0])}")
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * a…ic + 0.001 * r…al + 0.001 * p…c + 0.001 * t…al + 0.001 * e…ic + 0.0 * o…ic + 0.0 * e…al + 0.0 * tic + 0.0 * h…l + 0.0 * h…i + 0.0 * h…t + 0.0 * un…e + 0.0 * a…t…c + 0.0 * a…al + 0.0 * h…r + 0.0 * a…y + 0.0 * l…c + 0.0 * h…e + 0.0 * ia + 0.0 * a…ti
nonzero count: 1216
	0.9983: unhealthful
	0.9983: ministerial
	0.9982: interracial
	0.9982: patriarchal
	0.9982: secretarial
	0.9982: matriarchal
	0.9982: terrestrial
	0.9982: mechanistic
	0.9982: unrealistic
	0.9982: masochistic
	0.9981: unregulated
	0.9981: magisterial
	0.9981: subordinate
	0.9981: sacrificial
	0.9981: peripatetic
topic_id 1: 0.001 * e…a…e + 0.001 * p…n + 0.0 * na + 0.0 * a…a…e + 0.0 * ab…e + 0.0 * e…g + 0.0 * g…e + 0.0 * ble + 0.0 * h…e + 0.0 * e…le + 0.0 * n…i…e + 0.0 * i…i…e + 0.0 * e…in + 0.0 * n…le + 0.0 * i…a…e + 0.0 * ni + 0.0 * abl + 0.0 * able + 0.0 * m…d + 0.0 * o…al
nonzero count: 975
	0.9982: particulate
	0.9982: patrilineal
	0.9982: operational
	0.9982: impertinent
	0.9982: matrilineal
	0.

In [97]:
## HDP (n_topics = 90)
import gensim.models
import pyLDAvis.gensim

max_n_topics = 90
hdp90 = gensim.models.HdpModel(corpus, diction, T = max_n_topics,
                               random_state = 1)
vis_data90 = pyLDAvis.gensim.prepare(hdp90, corpus, diction)
pyLDAvis.display(vis_data90)

In [98]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	lang_dir_name = target_lang_dict[target_lang_key].split()[0]
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{term_type}{accent_status}.html"
	pyLDAvis.save_html(vis_data90, vis_output)

In [99]:
## save topic structures
hdp_topics = hdp90.show_topics(num_topics = max_n_topics,
                               num_words = n_terms_to_save, formatted = False)
hdp_dict = { tid: values for tid, values in hdp_topics }

## convert to Pandas dataframe
topics_df = pd.DataFrame.from_dict(hdp_dict)
hdp90_topics_out = f"results/terms-by-topics-raw/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-topics{max_n_topics}-{term_type}{accent_status}.csv"
topics_df.to_csv(hdp90_topics_out, header = False, index = None)

In [100]:
## topic investigation
import numpy as np
import HDP_helper

target_hdp = hdp90
documents_topics = np.zeros([target_hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in target_hdp[c]:
        documents_topics[topic_id][doc_id] = prob

## investigate topics
target_hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    #topic_encoding = ", ".join(hdp.show_topic(topic_id))
    topic_t = target_hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * ble + 0.001 * e…a…e + 0.001 * ab…e + 0.001 * abl + 0.001 * able + 0.001 * i…a…e + 0.001 * re…e + 0.001 * in…e + 0.001 * i…le + 0.001 * e…le + 0.001 * n…a…e + 0.001 * un…e + 0.001 * r…le + 0.001 * a…a…e + 0.001 * p…ed + 0.001 * r…a…e + 0.001 * e…b + 0.001 * i…b + 0.001 * i…b…e + 0.001 * e…te
nonzero count:  1048
	0.9983: intractable
	0.9983: respectable
	0.9983: retractable
	0.9983: uninsurable
	0.9983: untraceable
	0.9983: understated
	0.9983: intemperate
	0.9983: inseparable
	0.9983: appreciable
	0.9983: indivisible
	0.9983: inheritable
	0.9982: recoverable
	0.9982: irreparable
	0.9982: unsupported
	0.9982: indifferent
topic_id 1: 0.001 * p…n + 0.001 * e…al + 0.001 * e…a…e + 0.001 * n…al + 0.001 * e…g + 0.001 * b…n + 0.001 * b…d + 0.0 * r…i…e + 0.0 * l…ed + 0.0 * g…e + 0.0 * na + 0.0 * a…i…d + 0.0 * ze + 0.0 * a…i…ed + 0.0 * t…al + 0.0 * o…i…e + 0.0 * z…d + 0.0 * zed + 0.0 * r…a…e + 0.0 * iz
nonzero count:  501
	0.9981: patrilineal
	0.9981: conditioned
	0.9981: mat