In [79]:
#!pip install -U pyLDAvis
#!pip install -U pandas

In [80]:
## imports
import os, sys
import pprint as pp
import unicodedata

In [81]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [82]:
## target language
## a key must be part of a file name 
target_lang_dict = {    'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'en_N_only' : 'English noun (WN)',
                        'en_V_only' : 'English verb (WN)',
                        'en_A_only' : 'English adj (WN)',
                        'en_R_only' : 'English adv (WN)',
                        'ar'    : 'Arabic',
                        'de'    : 'German',
                        'de_N_only' : 'German Nouns',
                        'de_non_N_only' : 'German Non-nouns',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'ir'    : 'Irish',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
## proper language selection
target_lang_keys = [    'en_US', # 0
                        'en_UK', # 1
                        'en_N_only', # 2
                        'en_V_only', # 3
                        'en_A_only', # 4
                        'en_R_only', # 5
                        'ar', # 6
                        'de', # 7
                        'de_N_only', # 8
                        'de_non_N_only', # 9
                        'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC',
                        'is', 'nl', 'ro', 'sw',
                        'ir' # This lacks sound
                    ]
## check
target_lang_key  = target_lang_keys[3]
print(f"target_lang_key: {target_lang_key}")
print(f"target lang: {target_lang_dict[target_lang_key]} [{target_lang_key}]")
## target_attr [effective only for Irish]
target_class = ""
#target_class = None # This causes an unrediable error
if target_lang_key == "ir":
    target_classes = [ 'adjectives', 'nouns', 'verbs' ]
    target_class = f"-{target_classes[3]}"
print(f"target_class: {target_class}")

target_lang_key: en_V_only
target lang: English verb (WN) [en_V_only]
target_class: 


In [83]:
## LDA/HDP
apply_term_filtering = True
## The following parameters need to be relatively large to prevent "Row sum not equal 1" error
term_minfreq         = 2
abuse_threshold      = 0.05 # larger value selects shorter units, smaller value selects longer units
min_bot_size         = 3
# number of terms listed for a given topic
n_terms_to_show = 60

In [84]:
## term settings
term_classes        = [ 'spell', 'sound' ]
term_class          = term_classes[1]
ngram_is_inclusive  = True
## doc settings
max_doc_size        = 11
min_doc_size        =  3
print(f"max_doc_size: {max_doc_size}")
print(f"min_doc_size: {min_doc_size}")
### boundary handling
add_boundary       = True
boundary_mark      = "#"
## term setting
gap_marker           = "…"
term_is_skippy     = True
n_for_ngram        = 4
max_gap_ratio      = 1.0
max_gap_size       = round(max_doc_size * max_gap_ratio)
print(f"term_class: {term_class}")
print(f"term_is_skippy: {term_is_skippy}")
print(f"max_gap_size: {max_gap_size}")
print(f"n_for_ngram: {n_for_ngram}")
### accent handling
suppress_accents   = True
accent_marks       = [ "ˈ", "ˌ" ] 
if term_class == 'sound':
    if suppress_accents:
        accent_status = "-unaccented"
    else:
        accent_stratus = "-accented"
else:
    accent_status = ""
print(f"accent_status: {accent_status}")
## define term_type
if term_class == 'spell':
    if term_is_skippy:
        term_type = f"sp_skippy{n_for_ngram}gram"
    else:
        term_type = f"sp_{n_for_ngram}gram"
else:
    if term_is_skippy:
        term_type = f"sn_skippy{n_for_ngram}gram"
    else:
        term_type = f"sn_{n_for_ngram}gram"
## check
print(f"term_type: {term_type}")

max_doc_size: 11
min_doc_size: 3
term_class: sound
term_is_skippy: True
max_gap_size: 11
n_for_ngram: 4
accent_status: -unaccented
term_type: sn_skippy4gram


In [85]:
## sampling
source_sampling          = True
source_sampling_rate     = 0.5
source_sampling_max_size = 5000
second_sampling          = False
second_sampling_rate     = 0.7

In [86]:
## set target files
import glob
data_dir1     = "data/open-dict-ipa/data1"
data_dir2     = "data/open-dict-ipa/data1a"
data_dir3     = "data/wn3"
data_dir4     = "data/irish"
target_files = glob.glob(f"{data_dir1}/*")
target_files2 = glob.glob(f"{data_dir2}/*")
target_files.extend(target_files2)
target_files3 = glob.glob(f"{data_dir3}/*")
target_files.extend(target_files3)
target_files4 = glob.glob(f"{data_dir4}/*")
target_files.extend(target_files4)
#
target_files = sorted([ file for file in target_files if ".csv" in file ])
pp.pprint(target_files)

['data/irish/word-irish-adjectives-spell.csv',
 'data/irish/word-irish-noun-phrases-spell.csv',
 'data/irish/word-irish-nouns-spell.csv',
 'data/irish/word-irish-possessives-spell.csv',
 'data/irish/word-irish-prepositions-spell.csv',
 'data/irish/word-irish-verbs-spell.csv',
 'data/open-dict-ipa/data1/ar.csv.gz',
 'data/open-dict-ipa/data1/de.csv.gz',
 'data/open-dict-ipa/data1/en_UK.csv.gz',
 'data/open-dict-ipa/data1/en_US.csv.gz',
 'data/open-dict-ipa/data1/eo.csv.gz',
 'data/open-dict-ipa/data1/es_ES.csv.gz',
 'data/open-dict-ipa/data1/es_MX.csv.gz',
 'data/open-dict-ipa/data1/fa.csv.gz',
 'data/open-dict-ipa/data1/fi.csv.gz',
 'data/open-dict-ipa/data1/fr_FR.csv.gz',
 'data/open-dict-ipa/data1/fr_QC.csv.gz',
 'data/open-dict-ipa/data1/is.csv.gz',
 'data/open-dict-ipa/data1/ja.csv.gz',
 'data/open-dict-ipa/data1/jam.csv.gz',
 'data/open-dict-ipa/data1/ma.csv.gz',
 'data/open-dict-ipa/data1/nb.csv.gz',
 'data/open-dict-ipa/data1/nl.csv.gz',
 'data/open-dict-ipa/data1/or.csv.gz',
 '

In [87]:
#print(target_files)

In [88]:
## get source data from files
import pandas as pd
import gzip
#target_language_key = "en_US" # can be changed to get other languages
#if target_class != "" or target_class is not None:
if target_class != "":
    target_file = [ f for f in target_files if target_lang_key in f and target_class in f ][0]
else:
    target_file = [ f for f in target_files if target_lang_key in f ][0]
print(f"processing: {target_file}")
##
if target_lang_key == "ir":
    col_names = ['spell', 'POS']
else:
    col_names = ['spell', 'sound']
#
if target_file.endswith(".gz"):
    with gzip.open(target_file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = col_names )
else:
    with open(target_file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = col_names )
## normalize characters
raw_df['spell'] = raw_df['spell'].apply(lambda x: unicodedata.normalize('NFC', str(x)))
## modify sound
try:
    sounds = raw_df['sound'].apply(lambda x: x.strip('/') )
    sounds = [ x.split("/,")[0] for x in sounds ] # picks up only the first of multiple entries
    raw_df['sound'] = sounds
except (AttributeError, KeyError):
    pass
#
raw_df.sample(10)

processing: data/wn3/en_V_only.csv


Unnamed: 0,spell,sound
4381,totter,ˈtɑtɝ
1751,forewarn,fɔɹˈwɔɹn
4555,vacation,veɪˈkeɪʃən
2305,jog,ˈdʒɑɡ
4170,suburbanize,səˈbɝbəˌnaɪz
1552,exclaim,ɪkˈskɫeɪm
861,confect,kənˈfɛkt
2889,pass,ˈpæs
826,compensate,ˈkɑmpənˌseɪt
723,clamber,ˈkɫæmbɝ


In [89]:
## source sampling
len(raw_df)
if source_sampling:
	print(f"source sampling applied")
	if len(raw_df) >= source_sampling_max_size:
		raw_df = raw_df.sample(source_sampling_max_size)
	else:
		raw_df = raw_df.sample(round(len(raw_df) * source_sampling_rate))
## remove accent marking
if suppress_accents:
	try:
		raw_df['sound'] = raw_df['sound'].apply(lambda x: "".join([ y for y in list(x) if y not in accent_marks ]))
	except KeyError:
		pass
## add boudary marks
if add_boundary:
	raw_df['spell'] = raw_df['spell'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
	try:
		raw_df['sound'] = raw_df['sound'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
	except KeyError:
		pass
#
print(raw_df)

source sampling applied
            spell         sound
3987  #speculate#  #spɛkjəɫeɪt#
1481   #engender#    #ɛndʒɛndɝ#
4011      #spook#        #spuk#
3839     #sketch#       #skɛtʃ#
2825  #oversleep#    #oʊvɝsɫip#
...           ...           ...
1369     #drench#      #dɹɛntʃ#
1885       #gnaw#          #nɔ#
3843       #skid#        #skɪd#
2636      #mound#       #maʊnd#
3938       #soap#        #soʊp#

[2374 rows x 2 columns]


In [90]:
## generate 1-grams for spell and sound
## spell
raw_df['sp_1gram'] = raw_df['spell'].apply(lambda x: list(str(x)))
# add column of size
raw_df['sp_size'] = raw_df['sp_1gram'].apply(lambda x: len(x))
# add column of count of '-' inside
raw_df['hyphen'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("-"))
# add column of count of '.' inside
raw_df['period'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("."))
## sound
# takes the first entry, removes '/' around
try:
    raw_df['sn_1gram'] = raw_df['sound'].apply(lambda x: list(x) )
except (TypeError, KeyError):
    pass
# add column of size
try:
    raw_df['sn_size'] = raw_df['sn_1gram'].apply(lambda x: len(x))
except KeyError:
    pass
## check
raw_df

Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
3987,#speculate#,#spɛkjəɫeɪt#,"[#, s, p, e, c, u, l, a, t, e, #]",11,0,0,"[#, s, p, ɛ, k, j, ə, ɫ, e, ɪ, t, #]",12
1481,#engender#,#ɛndʒɛndɝ#,"[#, e, n, g, e, n, d, e, r, #]",10,0,0,"[#, ɛ, n, d, ʒ, ɛ, n, d, ɝ, #]",10
4011,#spook#,#spuk#,"[#, s, p, o, o, k, #]",7,0,0,"[#, s, p, u, k, #]",6
3839,#sketch#,#skɛtʃ#,"[#, s, k, e, t, c, h, #]",8,0,0,"[#, s, k, ɛ, t, ʃ, #]",7
2825,#oversleep#,#oʊvɝsɫip#,"[#, o, v, e, r, s, l, e, e, p, #]",11,0,0,"[#, o, ʊ, v, ɝ, s, ɫ, i, p, #]",10
...,...,...,...,...,...,...,...,...
1369,#drench#,#dɹɛntʃ#,"[#, d, r, e, n, c, h, #]",8,0,0,"[#, d, ɹ, ɛ, n, t, ʃ, #]",8
1885,#gnaw#,#nɔ#,"[#, g, n, a, w, #]",6,0,0,"[#, n, ɔ, #]",4
3843,#skid#,#skɪd#,"[#, s, k, i, d, #]",6,0,0,"[#, s, k, ɪ, d, #]",6
2636,#mound#,#maʊnd#,"[#, m, o, u, n, d, #]",7,0,0,"[#, m, a, ʊ, n, d, #]",7


In [91]:
## filtering raw_data by size
print(f"term_type: {term_type}")
if "sp_" in term_type:
    df_filtered = raw_df[ (raw_df['sp_size'] <= max_doc_size) & (raw_df['sp_size'] >= min_doc_size) & (raw_df['hyphen'] == 0) & (raw_df['period'] == 0) ]
else:
    df_filtered = raw_df[ (raw_df['sn_size'] <= max_doc_size) & (raw_df['sn_size'] >= min_doc_size) ]
#
df_filtered

term_type: sn_skippy4gram


Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
1481,#engender#,#ɛndʒɛndɝ#,"[#, e, n, g, e, n, d, e, r, #]",10,0,0,"[#, ɛ, n, d, ʒ, ɛ, n, d, ɝ, #]",10
4011,#spook#,#spuk#,"[#, s, p, o, o, k, #]",7,0,0,"[#, s, p, u, k, #]",6
3839,#sketch#,#skɛtʃ#,"[#, s, k, e, t, c, h, #]",8,0,0,"[#, s, k, ɛ, t, ʃ, #]",7
2825,#oversleep#,#oʊvɝsɫip#,"[#, o, v, e, r, s, l, e, e, p, #]",11,0,0,"[#, o, ʊ, v, ɝ, s, ɫ, i, p, #]",10
1492,#enmesh#,#ɛnmɛʃ#,"[#, e, n, m, e, s, h, #]",8,0,0,"[#, ɛ, n, m, ɛ, ʃ, #]",7
...,...,...,...,...,...,...,...,...
1369,#drench#,#dɹɛntʃ#,"[#, d, r, e, n, c, h, #]",8,0,0,"[#, d, ɹ, ɛ, n, t, ʃ, #]",8
1885,#gnaw#,#nɔ#,"[#, g, n, a, w, #]",6,0,0,"[#, n, ɔ, #]",4
3843,#skid#,#skɪd#,"[#, s, k, i, d, #]",6,0,0,"[#, s, k, ɪ, d, #]",6
2636,#mound#,#maʊnd#,"[#, m, o, u, n, d, #]",7,0,0,"[#, m, a, ʊ, n, d, #]",7


In [92]:
## define df after second sampling if any
len(df_filtered)
if second_sampling:
    df = df_filtered.sample(round(len(df_filtered) * second_sampling_rate))
else:
    df = df_filtered
len(df)

2220

In [93]:
## spell 2grams
#import ngrams
import gen_ngrams
module_name = "gen_ngrams"
reload_module = False
if reload_module:
    import importlib
    importlib.reload(module_name)

if term_class == 'spell':
    #sp_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sp_1gram'] ]
    sp_2grams = [ gen_ngrams.gen_ngrams (x, n = 2, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_2grams):
            g.extend(list(df['sp_1gram'])[i])
    ## add sp_2gram
    df['sp_2gram'] = sp_2grams

In [94]:
## spell 3grams
#import ngrams
import gen_ngrams
if n_for_ngram > 2 and term_class == 'spell':
    #sp_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sp_1gram'] ]
    sp_3grams = [ gen_ngrams.gen_ngrams (x, n = 3, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_3grams):
            g.extend(list(df['sp_2gram'])[i])
    ## add sp_3gram
    df['sp_3gram'] = sp_3grams

In [95]:
## spell 4grams
#import ngrams
import gen_ngrams
if n_for_ngram > 3 and term_class == 'spell':
    #sp_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sp_1gram'] ]
    sp_4grams = [ gen_ngrams.gen_ngrams (x, n = 4, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_4grams):
            g.extend(list(df['sp_3gram'])[i])
    ## add sp_4gram
    df['sp_4gram'] = sp_4grams

In [96]:
## spell 5grams
#import ngrams
import gen_ngrams
if n_for_ngram > 4 and term_class == 'spell':
    sp_5grams = [ gen_ngrams.gen_ngrams (x, n = 5, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_5grams):
            g.extend(list(df['sp_4gram'])[i])
    ## add sp_5gram
    df['sp_5gram'] = sp_5grams

In [97]:
## spell skippy 2gram
#import ngrams_skippy
import gen_ngrams
reload_module = False
module_name = "gen_ngrams"
if reload_module:
    import importlib
    importlib.reload(module_name)
#
if term_class == 'spell':
    #sp_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy2grams = [ gen_ngrams.gen_skippy_ngrams(x, 2, sep = "", max_distance = max_gap_size, missing_mark = gap_marker, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy2grams):
            g.extend(list(df['sp_1gram'])[i])
    #
    df['sp_skippy2gram'] = sp_skippy2grams

In [98]:
## spell skippy 3gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 2 and term_class == 'spell':
    #sp_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy3grams = [ gen_ngrams.gen_skippy_ngrams(x, 3, sep = "", max_distance = max_gap_size, missing_mark = gap_marker, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy3grams):
            g.extend(list(df['sp_skippy2gram'])[i])
    #
    df['sp_skippy3gram'] = sp_skippy3grams

In [99]:
## spell skippy 4gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 3 and term_class == 'spell':
    #sp_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy4grams = [ gen_ngrams.gen_skippy_ngrams(x, 4, sep = "", max_distance = max_gap_size, missing_mark = gap_marker, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy4grams):
            g.extend(list(df['sp_skippy3gram'])[i])
    #
    df['sp_skippy4gram'] = sp_skippy4grams

In [100]:
## spell skippy 5gram
import gen_ngrams
if n_for_ngram > 4 and term_class == 'spell':
    sp_skippy5grams = [ gen_ngrams.gen_skippy_ngrams(x, 5, sep = "", max_distance = max_gap_size, missing_mark = gap_marker, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy5grams):
            g.extend(list(df['sp_skippy4gram'])[i])
    #
    df['sp_skippy5gram'] = sp_skippy5grams

In [101]:
## sound 2grams
#import ngrams
import gen_ngrams
module_name = "gen_ngrams"
reload_module = False
if reload_module:
    import importlib
    importlib.reload(module_name)
#
if term_class == 'sound':
    #sn_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sn_1gram'] ]
    sn_2grams = [ gen_ngrams.gen_ngrams (x, n = 2, sep ="", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_2grams):
            g.extend(list(df['sn_1gram'])[i])
    ## add sn_2gram
    df['sn_2gram'] = sn_2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_2gram'] = sn_2grams


In [102]:
## sound 3grams
#import ngrams
import gen_ngrams
if n_for_ngram > 2 and term_class == 'sound':
    #sn_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sn_1gram'] ]
    sn_3grams = [ gen_ngrams.gen_ngrams (x, n = 3, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_3gram
    df['sn_3gram'] = sn_3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_3gram'] = sn_3grams


In [103]:
## sound 4grams
#import ngrams
import gen_ngrams
if n_for_ngram > 3 and term_class == 'sound':
    #sn_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sn_1gram'] ]
    sn_4grams = [ gen_ngrams.gen_ngrams (x, n = 4, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_4grams):
            g.extend(list(df['sn_3gram'])[i])
    ## add sn_4gram
    df['sn_4gram'] = sn_4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_4gram'] = sn_4grams


In [104]:
## sound 5grams
import gen_ngrams
if n_for_ngram > 4 and term_class == 'sound':
    sn_5grams = [ gen_ngrams.gen_ngrams (x, n = 5, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_5grams):
            g.extend(list(df['sn_4gram'])[i])
    ## add sn_4gram
    df['sn_5gram'] = sn_5grams

In [105]:
## sound skippy 2gram
#import ngrams_skippy
import gen_ngrams
if term_class == 'sound':
    #sn_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy2grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 2, sep = "", max_distance = max_gap_size, missing_mark = gap_marker, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy2grams):
            g.extend(list(df['sn_1gram'])[i])
    #
    df['sn_skippy2gram'] = sn_skippy2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy2gram'] = sn_skippy2grams


In [106]:
## sound skippy 3gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 2 and term_class == 'sound':
    #sn_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy3grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 3, sep = "", max_distance = max_gap_size, missing_mark = gap_marker, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy3grams):
            g.extend(list(df['sn_skippy2gram'])[i])
    #
    df['sn_skippy3gram'] = sn_skippy3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy3gram'] = sn_skippy3grams


In [107]:
## sound skippy 4gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 3 and term_class == 'sound':
    #sn_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy4grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 4, sep = "", max_distance = max_gap_size, missing_mark = gap_marker, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy4grams):
            g.extend(list(df['sn_skippy3gram'])[i])
    #
    df['sn_skippy4gram'] = sn_skippy4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy4gram'] = sn_skippy4grams


In [108]:
## sound skippy 5gram
import gen_ngrams
if n_for_ngram > 4 and term_class == 'sound':
    sn_skippy5grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 5, sep = "", max_distance = max_gap_size, missing_mark = gap_marker, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy5grams):
            g.extend(list(df['sn_skippy4gram'])[i])
    #
    df['sn_skippy5gram'] = sn_skippy5grams

In [109]:
## check df
dropped_vars = [ 'sp_size', 'hyphen', 'period', 'sn_size' ]
if term_class == 'spell':
    extra = [ 'sn_1gram', 'sn_2gram', 'sn_3gram', 'sn_4gram',
             'sn_skippy2gram', 'sn_skippy3gram', 'sn_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]
else:
    extra = [ 'sp_1gram', 'sp_2gram', 'sp_3gram', 'sp_4gram',
             'sp_skippy2gram', 'sp_skippy3gram', 'sp_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]    
#
df[target_vars]

Unnamed: 0,spell,sound,sn_1gram,sn_2gram,sn_3gram,sn_4gram,sn_skippy2gram,sn_skippy3gram,sn_skippy4gram
1481,#engender#,#ɛndʒɛndɝ#,"[#, ɛ, n, d, ʒ, ɛ, n, d, ɝ, #]","[#ɛ, ɛn, nd, dʒ, ʒɛ, ɛn, nd, dɝ, ɝ#, #, ɛ, n, ...","[#ɛn, ɛnd, ndʒ, dʒɛ, ʒɛn, ɛnd, ndɝ, dɝ#, #ɛ, ɛ...","[#ɛnd, ɛndʒ, ndʒɛ, dʒɛn, ʒɛnd, ɛndɝ, ndɝ#, #ɛn...","[#ɛ, #…n, #…d, #…ʒ, #…ɛ, #…n, #…d, #…ɝ, #…#, ɛ...","[#ɛn, #ɛ…d, #ɛ…ʒ, #ɛ…ɛ, #ɛ…n, #ɛ…d, #ɛ…ɝ, #ɛ…#...","[#ɛnd, #ɛn…ʒ, #ɛn…ɛ, #ɛn…n, #ɛn…d, #ɛn…ɝ, #ɛn…..."
4011,#spook#,#spuk#,"[#, s, p, u, k, #]","[#s, sp, pu, uk, k#, #, s, p, u, k, #]","[#sp, spu, puk, uk#, #s, sp, pu, uk, k#, #, s,...","[#spu, spuk, puk#, #sp, spu, puk, uk#, #s, sp,...","[#s, #…p, #…u, #…k, #…#, sp, s…u, s…k, s…#, pu...","[#sp, #s…u, #s…k, #s…#, #…pu, #…p…k, #…p…#, #…...","[#spu, #sp…k, #sp…#, #s…uk, #s…u…#, #s…k#, #…p..."
3839,#sketch#,#skɛtʃ#,"[#, s, k, ɛ, t, ʃ, #]","[#s, sk, kɛ, ɛt, tʃ, ʃ#, #, s, k, ɛ, t, ʃ, #]","[#sk, skɛ, kɛt, ɛtʃ, tʃ#, #s, sk, kɛ, ɛt, tʃ, ...","[#skɛ, skɛt, kɛtʃ, ɛtʃ#, #sk, skɛ, kɛt, ɛtʃ, t...","[#s, #…k, #…ɛ, #…t, #…ʃ, #…#, sk, s…ɛ, s…t, s…...","[#sk, #s…ɛ, #s…t, #s…ʃ, #s…#, #…kɛ, #…k…t, #…k...","[#skɛ, #sk…t, #sk…ʃ, #sk…#, #s…ɛt, #s…ɛ…ʃ, #s…..."
2825,#oversleep#,#oʊvɝsɫip#,"[#, o, ʊ, v, ɝ, s, ɫ, i, p, #]","[#o, oʊ, ʊv, vɝ, ɝs, sɫ, ɫi, ip, p#, #, o, ʊ, ...","[#oʊ, oʊv, ʊvɝ, vɝs, ɝsɫ, sɫi, ɫip, ip#, #o, o...","[#oʊv, oʊvɝ, ʊvɝs, vɝsɫ, ɝsɫi, sɫip, ɫip#, #oʊ...","[#o, #…ʊ, #…v, #…ɝ, #…s, #…ɫ, #…i, #…p, #…#, o...","[#oʊ, #o…v, #o…ɝ, #o…s, #o…ɫ, #o…i, #o…p, #o…#...","[#oʊv, #oʊ…ɝ, #oʊ…s, #oʊ…ɫ, #oʊ…i, #oʊ…p, #oʊ…..."
1492,#enmesh#,#ɛnmɛʃ#,"[#, ɛ, n, m, ɛ, ʃ, #]","[#ɛ, ɛn, nm, mɛ, ɛʃ, ʃ#, #, ɛ, n, m, ɛ, ʃ, #]","[#ɛn, ɛnm, nmɛ, mɛʃ, ɛʃ#, #ɛ, ɛn, nm, mɛ, ɛʃ, ...","[#ɛnm, ɛnmɛ, nmɛʃ, mɛʃ#, #ɛn, ɛnm, nmɛ, mɛʃ, ɛ...","[#ɛ, #…n, #…m, #…ɛ, #…ʃ, #…#, ɛn, ɛ…m, ɛ…ɛ, ɛ…...","[#ɛn, #ɛ…m, #ɛ…ɛ, #ɛ…ʃ, #ɛ…#, #…nm, #…n…ɛ, #…n...","[#ɛnm, #ɛn…ɛ, #ɛn…ʃ, #ɛn…#, #ɛ…mɛ, #ɛ…m…ʃ, #ɛ…..."
...,...,...,...,...,...,...,...,...,...
1369,#drench#,#dɹɛntʃ#,"[#, d, ɹ, ɛ, n, t, ʃ, #]","[#d, dɹ, ɹɛ, ɛn, nt, tʃ, ʃ#, #, d, ɹ, ɛ, n, t,...","[#dɹ, dɹɛ, ɹɛn, ɛnt, ntʃ, tʃ#, #d, dɹ, ɹɛ, ɛn,...","[#dɹɛ, dɹɛn, ɹɛnt, ɛntʃ, ntʃ#, #dɹ, dɹɛ, ɹɛn, ...","[#d, #…ɹ, #…ɛ, #…n, #…t, #…ʃ, #…#, dɹ, d…ɛ, d…...","[#dɹ, #d…ɛ, #d…n, #d…t, #d…ʃ, #d…#, #…ɹɛ, #…ɹ…...","[#dɹɛ, #dɹ…n, #dɹ…t, #dɹ…ʃ, #dɹ…#, #d…ɛn, #d…ɛ..."
1885,#gnaw#,#nɔ#,"[#, n, ɔ, #]","[#n, nɔ, ɔ#, #, n, ɔ, #]","[#nɔ, nɔ#, #n, nɔ, ɔ#, #, n, ɔ, #]","[#nɔ#, #nɔ, nɔ#, #n, nɔ, ɔ#, #, n, ɔ, #]","[#n, #…ɔ, #…#, nɔ, n…#, ɔ#, nɔ, n…#, ɔ#, ɔ#, #...","[#nɔ, #n…#, #…ɔ#, nɔ#, nɔ#, #n, #…ɔ, #…#, nɔ, ...","[#nɔ#, #nɔ, #n…#, #…ɔ#, nɔ#, nɔ#, #n, #…ɔ, #…#..."
3843,#skid#,#skɪd#,"[#, s, k, ɪ, d, #]","[#s, sk, kɪ, ɪd, d#, #, s, k, ɪ, d, #]","[#sk, skɪ, kɪd, ɪd#, #s, sk, kɪ, ɪd, d#, #, s,...","[#skɪ, skɪd, kɪd#, #sk, skɪ, kɪd, ɪd#, #s, sk,...","[#s, #…k, #…ɪ, #…d, #…#, sk, s…ɪ, s…d, s…#, kɪ...","[#sk, #s…ɪ, #s…d, #s…#, #…kɪ, #…k…d, #…k…#, #…...","[#skɪ, #sk…d, #sk…#, #s…ɪd, #s…ɪ…#, #s…d#, #…k..."
2636,#mound#,#maʊnd#,"[#, m, a, ʊ, n, d, #]","[#m, ma, aʊ, ʊn, nd, d#, #, m, a, ʊ, n, d, #]","[#ma, maʊ, aʊn, ʊnd, nd#, #m, ma, aʊ, ʊn, nd, ...","[#maʊ, maʊn, aʊnd, ʊnd#, #ma, maʊ, aʊn, ʊnd, n...","[#m, #…a, #…ʊ, #…n, #…d, #…#, ma, m…ʊ, m…n, m…...","[#ma, #m…ʊ, #m…n, #m…d, #m…#, #…aʊ, #…a…n, #…a...","[#maʊ, #ma…n, #ma…d, #ma…#, #m…ʊn, #m…ʊ…d, #m…..."


In [110]:
## select data type and define doc_dict
import random
if "sp_" in term_type:
    base_type = "spell"
else:
    base_type = "sound"
doc_dict = { i: x for i, x in enumerate(df[base_type]) }
## check
random.sample(doc_dict.items(), 10)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(doc_dict.items(), 10)


[(1305, '#hoʊm#'),
 (1972, '#mɪsəpɫaɪ#'),
 (1729, '#hæk#'),
 (1588, '#spɪɹət#'),
 (119, '#sɝaʊnd#'),
 (2065, '#ædvaɪz#'),
 (874, '#fɑɫ#'),
 (338, '#ɹɪkjuz#'),
 (2050, '#ɹaɪd#'),
 (781, '#oʊvɝdɹaɪv#')]

In [111]:
## select bots for analysis
enable_term_change = False # if you want to change term_type to save time and energy
if enable_term_change:
	term_type = 'sp_skippy4gram'
print(f"(changed) term_type: {term_type}")

## bot stands for 'bag-of-terms', a generalization of 'bag-of-words'
bots = [ x for x in df[term_type] if len(x) > min_bot_size ] # Crucially
import random
random.sample(bots, 1)

(changed) term_type: sn_skippy4gram


[['#ɪɫu',
  '#ɪɫ…d',
  '#ɪɫ…#',
  '#ɪ…ud',
  '#ɪ…u…#',
  '#ɪ…d#',
  '#…ɫud',
  '#…ɫu…#',
  '#…ɫ…d#',
  '#…ud#',
  'ɪɫud',
  'ɪɫu…#',
  'ɪɫ…d#',
  'ɪ…ud#',
  'ɫud#',
  'ɪɫud',
  'ɪɫu…#',
  'ɪɫ…d#',
  'ɪ…ud#',
  'ɫud#',
  'ɫud#',
  '#ɪɫ',
  '#ɪ…u',
  '#ɪ…d',
  '#ɪ…#',
  '#…ɫu',
  '#…ɫ…d',
  '#…ɫ…#',
  '#…ud',
  '#…u…#',
  '#…d#',
  'ɪɫu',
  'ɪɫ…d',
  'ɪɫ…#',
  'ɪ…ud',
  'ɪ…u…#',
  'ɪ…d#',
  'ɫud',
  'ɫu…#',
  'ɫ…d#',
  'ud#',
  'ɪɫu',
  'ɪɫ…d',
  'ɪɫ…#',
  'ɪ…ud',
  'ɪ…u…#',
  'ɪ…d#',
  'ɫud',
  'ɫu…#',
  'ɫ…d#',
  'ud#',
  'ɫud',
  'ɫu…#',
  'ɫ…d#',
  'ud#',
  'ud#',
  '#ɪ',
  '#…ɫ',
  '#…u',
  '#…d',
  '#…#',
  'ɪɫ',
  'ɪ…u',
  'ɪ…d',
  'ɪ…#',
  'ɫu',
  'ɫ…d',
  'ɫ…#',
  'ud',
  'u…#',
  'd#',
  'ɪɫ',
  'ɪ…u',
  'ɪ…d',
  'ɪ…#',
  'ɫu',
  'ɫ…d',
  'ɫ…#',
  'ud',
  'u…#',
  'd#',
  'ɫu',
  'ɫ…d',
  'ɫ…#',
  'ud',
  'u…#',
  'd#',
  'ud',
  'u…#',
  'd#',
  'd#',
  '#',
  'ɪ',
  'ɫ',
  'u',
  'd',
  '#']]

In [112]:
## generate dictionary
from gensim.corpora import Dictionary
diction = Dictionary(bots)
print(diction)
#apply_term_filtering = False
if apply_term_filtering:
    print(f"term filtering applied")
    diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
else:
    print(f"term filtering not applied")
## check
print(diction)
## generate DTM
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) > min_bot_size ] # Crucially

Dictionary<111746 unique tokens: ['#', '#ɛ', '#ɛn', '#ɛnd', '#ɛn…#']...>
term filtering applied
Dictionary<41456 unique tokens: ['#ɛ', '#ɛn', '#ɛnd', '#ɛn…#', '#ɛn…d']...>


In [113]:
## HDP (n_topics = 15)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 15
hdp15 = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data15 = pyLDAvis.gensim.prepare(hdp15, corpus, diction)
pyLDAvis.display(vis_data15)

In [114]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	lang_dir_name = target_lang_dict[target_lang_key].split()[0]
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{term_type}{accent_status}.html"
	pyLDAvis.save_html(vis_data15, vis_output)

In [116]:
## topic investigation
import numpy as np
import HDP_helper
reload_module = False
if reload_module:
    import importlib
    importlib.reload(HDP_helper)

documents_topics = np.zeros([hdp15.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp15[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp15.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp15.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * ɪ# + 0.001 * aɪ# + 0.001 * f…ɪ + 0.001 * nt + 0.001 * eɪt + 0.001 * e…t# + 0.001 * eɪt#
nonzero count:  608
	0.9992: #kɑnsəmeɪt#
	0.9992: #ɹɛpɹɪzɛnt#
	0.9992: #ɛmbɫeɪzən#
	0.9992: #pɹidɛstən#
	0.9992: #oʊvɝdɹaɪv#
	0.9991: #pɹoʊkɫeɪm#
	0.9991: #fʊɫməneɪt#
	0.9991: #ɪntɝtwaɪn#
	0.9991: #fɪɫəbəstɝ#
	0.9991: #ɪntɹoʊvɝt#
topic_id 1: 0.001 * z# + 0.001 * n…t + 0.001 * n…ɪ + 0.001 * n…ɪ…# + 0.001 * t…t + 0.001 * ɪz + 0.001 * s…t#
nonzero count:  398
	0.9993: #kɑnstɹəkt#
	0.9993: #kənstɹɪkt#
	0.9992: #kɑnstətut#
	0.9992: #əsæsəneɪt#
	0.9992: #ɹidɪskəvɝ#
	0.9992: #pɹɑstətut#
	0.9992: #ɹiskɛdʒuɫ#
	0.9991: #dɪskənɛkt#
	0.9991: #dɪsɔɹiɛnt#
	0.9991: #əɡɫɑmɝeɪt#
topic_id 2: 0.001 * z# + 0.001 * ɪz# + 0.001 * ɪz + 0.001 * a…z# + 0.001 * aɪz# + 0.001 * a…z + 0.001 * aɪz
nonzero count:  253
	0.9992: #kɑɹbənaɪz#
	0.9992: #mɪɫətɝaɪz#
	0.9991: #pɑɫɪmɝaɪz#
	0.9991: #æksɛɫɝeɪt#
	0.9991: #hɪpnətaɪz#
	0.9990: #mæstɝbeɪt#
	0.9990: #stæɫɪnaɪz#
	0.9990: #kɛɹəkətʃɝ#
	0.9990: #ɫɪkwɪdeɪt#
	0.998

In [115]:
## save topic structures
#hdp.get_topics() # =/= show_topics()
#hdp.print_topics()
hdp_topics = hdp15.show_topics(num_topics = max_n_topics,
                               num_words = n_terms_to_show, formatted = False)
hdp_dict = { tid: values for tid, values in hdp_topics }
## convert to Pandas dataframe
topics_df = pd.DataFrame.from_dict(hdp_dict)
hdp15_topics_out = "results/terms-by-topics-raw/hdp15_topics_raw.csv"
topics_df.to_csv(hdp15_topics_out)

In [118]:
## reduce terms: under implementation
import term_handlers
reload_modules = False
if reload_modules:
    import importlib
    importlib.reload(term_handlers)

#
check = False
for topic in hdp_topics:
    if check:
        print(topic)
    topic_id, topic_matrix = topic[0], topic[1]
    terms =  [ x[0] for x in topic_matrix ]
    extended_terms = terms.copy()
    print(f"topic id: {topic_id}")
    print(terms)
    reduced_terms = term_handlers.reduce_by_superposition(terms, min_overlap = 2, check = check)
    #reduced_terms = term_handlers.reduce_by_superposition_under_gap(terms, min_overlap = 2, gap_marker = gap_marker, check = check)
    print(reduced_terms)
    if len(reduced_terms) > 0:
        for term in reduced_terms:
            if not term in extended_terms:
                extended_terms.extend(term)
        extended_terms = sorted(extended_terms, key = lambda x: len(x), reverse = True)
    print(extended_terms)


topic id: 0
['ɪ#', 'aɪ#', 'f…ɪ', 'nt', 'eɪt', 'e…t#', 'eɪt#', 'əf', 'ɔɹ', 'm#', 'ɫ…t', 'ɔɹ…#', 'ʃ#', 'nd', 'fa…#', 'əf…#', 'fa', 'k#', 'faɪ', 'ɪn…#', 'faɪ#', 'f…ɪ#', 'ə…a…#', 'ə…ɪ#', 'dʒ', 'ə…a', 'ə…aɪ', 'st…#', 'n…ɪ', 'ə…aɪ#', 'ɹ…n', 'tʃ', 'əf…ɪ', 'ən#', 'n…t', 'v#', 'ɪs', 'nt#', 'əfa…#', 'əfa', 'əs', 'əfaɪ', 'əf…ɪ#', 'm…ɪ', 'm…t', 'ʒ#', 'tʃ#', 'ɪ…n', 'ɹ…f', 's…ə', 'f#', 'tɝ', 's…t#', 'ɫ…t#', 's…ə…#', 'n…t#', 'p…ɪ', 'ɪ…ɪ…#', 't…ɪ…#', 'n…ɪ…#']
['f…ɪ#', 'nt#', 'eɪt#', 'əf…#', 'əf…ɪ', 'əfa…#', 'əfa', 'əfaɪ', 'əf…ɪ#', 'ɔɹ…#', 'ɫ…t#', 'faɪ', 'faɪ#', 'faɪ#', 'ə…aɪ', 'ə…aɪ#', 'ə…aɪ#', 'n…ɪ…#', 'tʃ#', 'əf…ɪ#', 'n…t#', 'əfaɪ', 's…ə…#']
['ə…a…#', 'ə…aɪ#', 'əfa…#', 'əf…ɪ#', 's…ə…#', 'ɪ…ɪ…#', 't…ɪ…#', 'n…ɪ…#', 'e…t#', 'eɪt#', 'ɔɹ…#', 'fa…#', 'əf…#', 'ɪn…#', 'faɪ#', 'f…ɪ#', 'ə…ɪ#', 'ə…aɪ', 'st…#', 'əf…ɪ', 'əfaɪ', 's…t#', 'ɫ…t#', 'n…t#', 'aɪ#', 'f…ɪ', 'eɪt', 'ɫ…t', 'faɪ', 'ə…a', 'n…ɪ', 'ɹ…n', 'ən#', 'n…t', 'nt#', 'əfa', 'm…ɪ', 'm…t', 'tʃ#', 'ɪ…n', 'ɹ…f', 's…ə', 'p…ɪ', 'ɪ#', 'nt', 'əf', 'ɔɹ', 'm#', 

In [120]:
## HDP (n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp45 = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data45 = pyLDAvis.gensim.prepare(hdp45, corpus, diction)
pyLDAvis.display(vis_data45)

In [121]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	lang_dir_name = target_lang_dict[target_lang_key].split()[0]
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{term_type}{accent_status}.html"
	pyLDAvis.save_html(vis_data45, vis_output)

In [122]:
## save topic structures
#hdp.get_topics() # =/= show_topics()
#hdp.print_topics()
hdp_topics = hdp45.show_topics(num_topics = max_n_topics,
                               num_words = n_terms_to_show, formatted = False)
hdp_dict = { tid: values for tid, values in hdp_topics }
## convert to Pandas dataframe
topics_df = pd.DataFrame.from_dict(hdp_dict)
hdp45_topics_out = "results/terms-by-topics-raw/hdp45_topics_raw.csv"
topics_df.to_csv(hdp45_topics_out)

In [123]:
## topic investigation
import numpy as np
import HDP_helper
documents_topics = np.zeros([hdp45.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp45[c]:
        documents_topics[topic_id][doc_id] = prob
#
n_docs_to_show = 10
n_terms_to_show = 7
hdp45.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp45.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: {len(probs.nonzero()[0])}")
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.003 * ɪ# + 0.002 * f…ɪ + 0.002 * aɪ# + 0.002 * fa…# + 0.002 * fa + 0.002 * faɪ + 0.002 * f…ɪ#
nonzero count: 256
	0.9993: #sɪmpɫəfaɪ#
	0.9993: #stɹætəfaɪ#
	0.9992: #əsæsəneɪt#
	0.9992: #kɫɔɹəneɪt#
	0.9992: #dʒəstəfaɪ#
	0.9992: #disɝtəfaɪ#
	0.9992: #kɑnsəmeɪt#
	0.9991: #sækɹəfaɪs#
	0.9991: #ɹɛɡjəɫeɪt#
	0.9991: #oʊvɝdɹaɪv#
topic_id 1: 0.001 * nd + 0.001 * t…t + 0.001 * nt + 0.001 * nd# + 0.001 * nt# + 0.001 * t…t# + 0.001 * kə
nonzero count: 208
	0.9992: #kɑnstɹəkt#
	0.9992: #kənstɹɪkt#
	0.9992: #kɑnstətut#
	0.9991: #ɹidɪskəvɝ#
	0.9991: #pɹɑstətut#
	0.9991: #ɹɛpɹɪzɛnt#
	0.9991: #dɪskənɛkt#
	0.9991: #kɑnskɹɪpt#
	0.9989: #mɪsdɪɹɛkt#
	0.9988: #dɪstɹəkt#
topic_id 2: 0.003 * z# + 0.003 * ɪz + 0.003 * ɪz# + 0.003 * aɪz# + 0.003 * a…z + 0.003 * a…z# + 0.003 * aɪz
nonzero count: 137
	0.9992: #sɪmbəɫaɪz#
	0.9992: #hɑɹmənaɪz#
	0.9992: #dɹæmətaɪz#
	0.9992: #bɹutəɫaɪz#
	0.9992: #sɪɹiəɫaɪz#
	0.9992: #ɪnɪʃəɫaɪz#
	0.9992: #kɑɹbənaɪz#
	0.9992: #pɹɪvətaɪz#
	0.9992: #fɔɹməɫaɪz#
	0.9991: #ɪmp

In [124]:
## HDP (n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp90 = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics,
                             var_converge = 0.001 # Effective to prevent "Not all row sum equatl to 1" error?
                             )
vis_data90 = pyLDAvis.gensim.prepare(hdp90, corpus, diction)
pyLDAvis.display(vis_data90)

In [125]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	lang_dir_name = target_lang_dict[target_lang_key].split()[0]
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{term_type}{accent_status}.html"
	pyLDAvis.save_html(vis_data90, vis_output)

In [126]:
## save topic structures
hdp_topics = hdp90.show_topics(num_topics = max_n_topics,
                               num_words = n_terms_to_show, formatted = False)
hdp_dict = { tid: values for tid, values in hdp_topics }
## convert to Pandas dataframe
topics_df = pd.DataFrame.from_dict(hdp_dict)
hdp90_topics_out = "results/terms-by-topics-raw/hdp90_topics_raw.csv"
topics_df.to_csv(hdp90_topics_out)

In [128]:
## topic investigation
import numpy as np
import HDP_helper
documents_topics = np.zeros([hdp90.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp90[c]:
        documents_topics[topic_id][doc_id] = prob
## investigate topics
n_docs_to_show  = 10
n_terms_to_show = 7
hdp90.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    #topic_encoding = ", ".join(hdp.show_topic(topic_id))
    topic_t = hdp90.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.002 * eɪt + 0.002 * eɪt# + 0.002 * e…t# + 0.002 * ən# + 0.002 * ɝ…ɪ + 0.002 * ɝ…ɪ…# + 0.001 * t…ɪ…#
nonzero count:  140
	0.9993: #oʊvɝsteɪt#
	0.9992: #əndɝsteɪt#
	0.9992: #dɛvəsteɪt#
	0.9992: #ɹiɫoʊkeɪt#
	0.9991: #pɹidɛstən#
	0.9990: #əpɹiʃieɪt#
	0.9990: #ɪntɝtwaɪn#
	0.9990: #vəsɪfɝeɪt#
	0.9990: #veɪkeɪʃən#
	0.9989: #oʊvɝsteɪ#
topic_id 1: 0.002 * n…ɪ + 0.002 * eɪt# + 0.002 * eɪt + 0.002 * e…t# + 0.002 * n…ɪ…# + 0.002 * n…t + 0.002 * n…t#
nonzero count:  122
	0.9991: #kɑnsəmeɪt#
	0.9991: #ɹɛpɹɪzɛnt#
	0.9990: #ɪnkɑɹneɪt#
	0.9989: #haɪbɝneɪt#
	0.9988: #faɪnænsɪɹ#
	0.9987: #ɹipɹoʊtʃ#
	0.9986: #mækɹəmeɪ#
	0.9985: #hɛzəteɪt#
	0.9983: #ɪnfɫuəns#
	0.9979: #əpɹoʊtʃ#
topic_id 2: 0.003 * z# + 0.003 * ɪz + 0.003 * ɪz# + 0.002 * a…z + 0.002 * a…z# + 0.002 * aɪz# + 0.002 * aɪz
nonzero count:  149
	0.9992: #ɫoʊkəɫaɪz#
	0.9992: #sɪmbəɫaɪz#
	0.9992: #voʊkəɫaɪz#
	0.9992: #sɪɹiəɫaɪz#
	0.9992: #ɪnɪʃəɫaɪz#
	0.9992: #pɫʊɹəɫaɪz#
	0.9992: #ɪmpɹəvaɪz#
	0.9991: #soʊʃəɫaɪz#
	0.9991: #kɹioʊɫaɪz#
	0.