In [386]:
#!pip install -U pyLDAvis
#!pip install -U pandas

In [387]:
## imports
import os, sys
import pprint as pp
import unicodedata

In [388]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [389]:
## target language
## a key must be part of a file name 
target_lang_dict = {    'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'en_N_only' : 'English noun (WN)',
                        'en_V_only' : 'English verb (WN)',
                        'en_A_only' : 'English adj (WN)',
                        'en_R_only' : 'English adv (WN)',
                        'ar'    : 'Arabic',
                        'de'    : 'German',
                        'de_N_only' : 'German Nouns',
                        'de_non_N_only' : 'German Non-nouns',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'ir'    : 'Irish',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
## proper language selection
target_lang_keys = [    'en_US', # 0
                        'en_UK', # 1
                        'en_N_only', # 2
                        'en_V_only', # 3
                        'en_A_only', # 4
                        'en_R_only', # 5
                        'ar', # 6
                        'de', # 7
                        'de_N_only', # 8
                        'de_non_N_only', # 9
                        'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC',
                        'is', 'nl', 'ro', 'sw',
                        'ir' # This lacks sound
                    ]
## check
target_lang_key  = target_lang_keys[3]
print(f"target_lang_key: {target_lang_key}")
print(f"target lang: {target_lang_dict[target_lang_key]} [{target_lang_key}]")
## target_attr [effective only for Irish]
target_class = ""
#target_class = None # This causes an unrediable error
if target_lang_key == "ir":
    target_classes = [ 'adjectives', 'nouns', 'verbs' ]
    target_class = f"-{target_classes[3]}"
print(f"target_class: {target_class}")

target_lang_key: en_V_only
target lang: English verb (WN) [en_V_only]
target_class: 


In [390]:
## term settings
term_classes        = [ 'spell', 'sound' ]
term_class          = term_classes[1]
ngram_is_inclusive  = True
## doc settings
max_doc_size        = 11
min_doc_size        =  3
print(f"max_doc_size: {max_doc_size}")
print(f"min_doc_size: {min_doc_size}")
### boundary handling
add_boundary       = True
boundary_mark      = "#"
## term setting
gap_mark           = "…"
term_is_skippy     = True
n_for_ngram        = 5
max_gap_ratio      = 0.8
max_gap_size       = round(max_doc_size * max_gap_ratio)
print(f"term_class: {term_class}")
print(f"term_is_skippy: {term_is_skippy}")
print(f"max_gap_size: {max_gap_size}")
print(f"n_for_ngram: {n_for_ngram}")
### accent handling
suppress_accents   = True
accent_marks       = [ "ˈ", "ˌ" ] 
if term_class == 'sound':
    if suppress_accents:
        accent_status = "-unaccented"
    else:
        accent_stratus = "-accented"
else:
    accent_status = ""
print(f"accent_status: {accent_status}")
## define term_type
if term_class == 'spell':
    if term_is_skippy:
        term_type = f"sp_skippy{n_for_ngram}gram"
    else:
        term_type = f"sp_{n_for_ngram}gram"
else:
    if term_is_skippy:
        term_type = f"sn_skippy{n_for_ngram}gram"
    else:
        term_type = f"sn_{n_for_ngram}gram"
## check
print(f"term_type: {term_type}")

max_doc_size: 11
min_doc_size: 3
term_class: sound
term_is_skippy: True
max_gap_size: 9
n_for_ngram: 5
accent_status: -unaccented
term_type: sn_skippy5gram


In [391]:
## LDA/HDP
apply_term_filtering = True
## The following parameters need to be relatively large to prevent "Row sum not equal 1" error
term_minfreq         = 2
abuse_threshold      = 0.05 # larger value selects shorter units, smaller value selects longer units
min_bot_size         = 3

In [392]:
## sampling
source_sampling          = True
source_sampling_rate     = 0.5
source_sampling_max_size = 5000
second_sampling          = False
second_sampling_rate     = 0.7

In [393]:
## set target files
import glob
data_dir1     = "data/open-dict-ipa/data1"
data_dir2     = "data/open-dict-ipa/data1a"
data_dir3     = "data/wn3"
data_dir4     = "data/irish"
target_files = glob.glob(f"{data_dir1}/*")
target_files2 = glob.glob(f"{data_dir2}/*")
target_files.extend(target_files2)
target_files3 = glob.glob(f"{data_dir3}/*")
target_files.extend(target_files3)
target_files4 = glob.glob(f"{data_dir4}/*")
target_files.extend(target_files4)
#
target_files = sorted([ file for file in target_files if ".csv" in file ])
pp.pprint(target_files)

['data/irish/word-irish-adjectives-spell.csv',
 'data/irish/word-irish-noun-phrases-spell.csv',
 'data/irish/word-irish-nouns-spell.csv',
 'data/irish/word-irish-possessives-spell.csv',
 'data/irish/word-irish-prepositions-spell.csv',
 'data/irish/word-irish-verbs-spell.csv',
 'data/open-dict-ipa/data1/ar.csv.gz',
 'data/open-dict-ipa/data1/de.csv.gz',
 'data/open-dict-ipa/data1/en_UK.csv.gz',
 'data/open-dict-ipa/data1/en_US.csv.gz',
 'data/open-dict-ipa/data1/eo.csv.gz',
 'data/open-dict-ipa/data1/es_ES.csv.gz',
 'data/open-dict-ipa/data1/es_MX.csv.gz',
 'data/open-dict-ipa/data1/fa.csv.gz',
 'data/open-dict-ipa/data1/fi.csv.gz',
 'data/open-dict-ipa/data1/fr_FR.csv.gz',
 'data/open-dict-ipa/data1/fr_QC.csv.gz',
 'data/open-dict-ipa/data1/is.csv.gz',
 'data/open-dict-ipa/data1/ja.csv.gz',
 'data/open-dict-ipa/data1/jam.csv.gz',
 'data/open-dict-ipa/data1/ma.csv.gz',
 'data/open-dict-ipa/data1/nb.csv.gz',
 'data/open-dict-ipa/data1/nl.csv.gz',
 'data/open-dict-ipa/data1/or.csv.gz',
 '

In [394]:
#print(target_files)

In [395]:
## get source data from files
import pandas as pd
import gzip
#target_language_key = "en_US" # can be changed to get other languages
#if target_class != "" or target_class is not None:
if target_class != "":
    target_file = [ f for f in target_files if target_lang_key in f and target_class in f ][0]
else:
    target_file = [ f for f in target_files if target_lang_key in f ][0]
print(f"processing: {target_file}")
##
if target_lang_key == "ir":
    col_names = ['spell', 'POS']
else:
    col_names = ['spell', 'sound']
#
if target_file.endswith(".gz"):
    with gzip.open(target_file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = col_names )
else:
    with open(target_file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = col_names )
## normalize characters
raw_df['spell'] = raw_df['spell'].apply(lambda x: unicodedata.normalize('NFC', str(x)))
## modify sound
try:
    sounds = raw_df['sound'].apply(lambda x: x.strip('/') )
    sounds = [ x.split("/,")[0] for x in sounds ] # picks up only the first of multiple entries
    raw_df['sound'] = sounds
except (AttributeError, KeyError):
    pass
#
raw_df.sample(10)

processing: data/wn3/en_V_only.csv


Unnamed: 0,spell,sound
1531,etch,ˈɛtʃ
3073,precook,ˈpɹiˈkʊk
3526,revert,ɹiˈvɝt
903,contemplate,ˈkɑntəmˌpɫeɪt
684,chew,ˈtʃu
764,clump,ˈkɫəmp
1989,harry,ˈhɛɹi
2652,mug,ˈməɡ
1859,ginger,ˈdʒɪndʒɝ
1263,discontent,dɪskənˈtɛnt


In [396]:
## source sampling
len(raw_df)
if source_sampling:
	print(f"source sampling applied")
	if len(raw_df) >= source_sampling_max_size:
		raw_df = raw_df.sample(source_sampling_max_size)
	else:
		raw_df = raw_df.sample(round(len(raw_df) * source_sampling_rate))
## remove accent marking
if suppress_accents:
	try:
		raw_df['sound'] = raw_df['sound'].apply(lambda x: "".join([ y for y in list(x) if y not in accent_marks ]))
	except KeyError:
		pass
## add boudary marks
if add_boundary:
	raw_df['spell'] = raw_df['spell'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
	try:
		raw_df['sound'] = raw_df['sound'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
	except KeyError:
		pass
#
print(raw_df)

source sampling applied
            spell         sound
3432   #renounce#     #ɹɪnaʊns#
376    #blockade#     #bɫɑkeɪd#
1304    #disrupt#     #dɪsɹəpt#
1102  #decontrol#  #dikəntɹoʊɫ#
511    #bulletin#     #bʊɫɪtən#
...           ...           ...
1675     #finish#       #fɪnɪʃ#
1606     #faggot#       #fæɡət#
1398       #dust#        #dəst#
4452       #tuck#         #tək#
3597   #ruminate#    #ɹumɪneɪt#

[2374 rows x 2 columns]


In [397]:
## generate 1-grams for spell and sound
## spell
raw_df['sp_1gram'] = raw_df['spell'].apply(lambda x: list(str(x)))
# add column of size
raw_df['sp_size'] = raw_df['sp_1gram'].apply(lambda x: len(x))
# add column of count of '-' inside
raw_df['hyphen'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("-"))
# add column of count of '.' inside
raw_df['period'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("."))
## sound
# takes the first entry, removes '/' around
try:
    raw_df['sn_1gram'] = raw_df['sound'].apply(lambda x: list(x) )
except (TypeError, KeyError):
    pass
# add column of size
try:
    raw_df['sn_size'] = raw_df['sn_1gram'].apply(lambda x: len(x))
except KeyError:
    pass
## check
raw_df

Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
3432,#renounce#,#ɹɪnaʊns#,"[#, r, e, n, o, u, n, c, e, #]",10,0,0,"[#, ɹ, ɪ, n, a, ʊ, n, s, #]",9
376,#blockade#,#bɫɑkeɪd#,"[#, b, l, o, c, k, a, d, e, #]",10,0,0,"[#, b, ɫ, ɑ, k, e, ɪ, d, #]",9
1304,#disrupt#,#dɪsɹəpt#,"[#, d, i, s, r, u, p, t, #]",9,0,0,"[#, d, ɪ, s, ɹ, ə, p, t, #]",9
1102,#decontrol#,#dikəntɹoʊɫ#,"[#, d, e, c, o, n, t, r, o, l, #]",11,0,0,"[#, d, i, k, ə, n, t, ɹ, o, ʊ, ɫ, #]",12
511,#bulletin#,#bʊɫɪtən#,"[#, b, u, l, l, e, t, i, n, #]",10,0,0,"[#, b, ʊ, ɫ, ɪ, t, ə, n, #]",9
...,...,...,...,...,...,...,...,...
1675,#finish#,#fɪnɪʃ#,"[#, f, i, n, i, s, h, #]",8,0,0,"[#, f, ɪ, n, ɪ, ʃ, #]",7
1606,#faggot#,#fæɡət#,"[#, f, a, g, g, o, t, #]",8,0,0,"[#, f, æ, ɡ, ə, t, #]",7
1398,#dust#,#dəst#,"[#, d, u, s, t, #]",6,0,0,"[#, d, ə, s, t, #]",6
4452,#tuck#,#tək#,"[#, t, u, c, k, #]",6,0,0,"[#, t, ə, k, #]",5


In [398]:
## filtering raw_data by size
print(f"term_type: {term_type}")
if "sp_" in term_type:
    df_filtered = raw_df[ (raw_df['sp_size'] <= max_doc_size) & (raw_df['sp_size'] >= min_doc_size) & (raw_df['hyphen'] == 0) & (raw_df['period'] == 0) ]
else:
    df_filtered = raw_df[ (raw_df['sn_size'] <= max_doc_size) & (raw_df['sn_size'] >= min_doc_size) ]
#
df_filtered

term_type: sn_skippy5gram


Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
3432,#renounce#,#ɹɪnaʊns#,"[#, r, e, n, o, u, n, c, e, #]",10,0,0,"[#, ɹ, ɪ, n, a, ʊ, n, s, #]",9
376,#blockade#,#bɫɑkeɪd#,"[#, b, l, o, c, k, a, d, e, #]",10,0,0,"[#, b, ɫ, ɑ, k, e, ɪ, d, #]",9
1304,#disrupt#,#dɪsɹəpt#,"[#, d, i, s, r, u, p, t, #]",9,0,0,"[#, d, ɪ, s, ɹ, ə, p, t, #]",9
511,#bulletin#,#bʊɫɪtən#,"[#, b, u, l, l, e, t, i, n, #]",10,0,0,"[#, b, ʊ, ɫ, ɪ, t, ə, n, #]",9
1844,#geminate#,#dʒɛməneɪt#,"[#, g, e, m, i, n, a, t, e, #]",10,0,0,"[#, d, ʒ, ɛ, m, ə, n, e, ɪ, t, #]",11
...,...,...,...,...,...,...,...,...
1675,#finish#,#fɪnɪʃ#,"[#, f, i, n, i, s, h, #]",8,0,0,"[#, f, ɪ, n, ɪ, ʃ, #]",7
1606,#faggot#,#fæɡət#,"[#, f, a, g, g, o, t, #]",8,0,0,"[#, f, æ, ɡ, ə, t, #]",7
1398,#dust#,#dəst#,"[#, d, u, s, t, #]",6,0,0,"[#, d, ə, s, t, #]",6
4452,#tuck#,#tək#,"[#, t, u, c, k, #]",6,0,0,"[#, t, ə, k, #]",5


In [399]:
## define df after second sampling if any
len(df_filtered)
if second_sampling:
    df = df_filtered.sample(round(len(df_filtered) * second_sampling_rate))
else:
    df = df_filtered
len(df)

2230

In [400]:
## spell 2grams
#import ngrams
import gen_ngrams
module_name = "gen_ngrams"
reload_module = False
if reload_module:
    import importlib
    importlib.reload(module_name)

if term_class == 'spell':
    #sp_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sp_1gram'] ]
    sp_2grams = [ gen_ngrams.gen_ngrams (x, n = 2, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_2grams):
            g.extend(list(df['sp_1gram'])[i])
    ## add sp_2gram
    df['sp_2gram'] = sp_2grams

In [401]:
## spell 3grams
#import ngrams
import gen_ngrams
if n_for_ngram > 2 and term_class == 'spell':
    #sp_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sp_1gram'] ]
    sp_3grams = [ gen_ngrams.gen_ngrams (x, n = 3, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_3grams):
            g.extend(list(df['sp_2gram'])[i])
    ## add sp_3gram
    df['sp_3gram'] = sp_3grams

In [402]:
## spell 4grams
#import ngrams
import gen_ngrams
if n_for_ngram > 3 and term_class == 'spell':
    #sp_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sp_1gram'] ]
    sp_4grams = [ gen_ngrams.gen_ngrams (x, n = 4, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_4grams):
            g.extend(list(df['sp_3gram'])[i])
    ## add sp_4gram
    df['sp_4gram'] = sp_4grams

In [403]:
## spell 5grams
#import ngrams
import gen_ngrams
if n_for_ngram > 4 and term_class == 'spell':
    sp_5grams = [ gen_ngrams.gen_ngrams (x, n = 5, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_5grams):
            g.extend(list(df['sp_4gram'])[i])
    ## add sp_5gram
    df['sp_5gram'] = sp_5grams

In [404]:
## spell skippy 2gram
#import ngrams_skippy
import gen_ngrams
reload_module = False
module_name = "gen_ngrams"
if reload_module:
    import importlib
    importlib.reload(module_name)
#
if term_class == 'spell':
    #sp_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy2grams = [ gen_ngrams.gen_skippy_ngrams(x, 2, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy2grams):
            g.extend(list(df['sp_1gram'])[i])
    #
    df['sp_skippy2gram'] = sp_skippy2grams

In [405]:
## spell skippy 3gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 2 and term_class == 'spell':
    #sp_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy3grams = [ gen_ngrams.gen_skippy_ngrams(x, 3, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy3grams):
            g.extend(list(df['sp_skippy2gram'])[i])
    #
    df['sp_skippy3gram'] = sp_skippy3grams

In [406]:
## spell skippy 4gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 3 and term_class == 'spell':
    #sp_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy4grams = [ gen_ngrams.gen_skippy_ngrams(x, 4, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy4grams):
            g.extend(list(df['sp_skippy3gram'])[i])
    #
    df['sp_skippy4gram'] = sp_skippy4grams

In [407]:
## spell skippy 5gram
import gen_ngrams
if n_for_ngram > 4 and term_class == 'spell':
    sp_skippy5grams = [ gen_ngrams.gen_skippy_ngrams(x, 5, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy5grams):
            g.extend(list(df['sp_skippy4gram'])[i])
    #
    df['sp_skippy5gram'] = sp_skippy5grams

In [408]:
## sound 2grams
#import ngrams
import gen_ngrams
module_name = "gen_ngrams"
reload_module = False
if reload_module:
    import importlib
    importlib.reload(module_name)
#
if term_class == 'sound':
    #sn_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sn_1gram'] ]
    sn_2grams = [ gen_ngrams.gen_ngrams (x, n = 2, sep ="", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_2grams):
            g.extend(list(df['sn_1gram'])[i])
    ## add sn_2gram
    df['sn_2gram'] = sn_2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_2gram'] = sn_2grams


In [409]:
## sound 3grams
#import ngrams
import gen_ngrams
if n_for_ngram > 2 and term_class == 'sound':
    #sn_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sn_1gram'] ]
    sn_3grams = [ gen_ngrams.gen_ngrams (x, n = 3, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_3gram
    df['sn_3gram'] = sn_3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_3gram'] = sn_3grams


In [410]:
## sound 4grams
#import ngrams
import gen_ngrams
if n_for_ngram > 3 and term_class == 'sound':
    #sn_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sn_1gram'] ]
    sn_4grams = [ gen_ngrams.gen_ngrams (x, n = 4, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_4grams):
            g.extend(list(df['sn_3gram'])[i])
    ## add sn_4gram
    df['sn_4gram'] = sn_4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_4gram'] = sn_4grams


In [411]:
## sound 5grams
import gen_ngrams
if n_for_ngram > 4 and term_class == 'sound':
    sn_5grams = [ gen_ngrams.gen_ngrams (x, n = 5, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_5grams):
            g.extend(list(df['sn_4gram'])[i])
    ## add sn_4gram
    df['sn_5gram'] = sn_5grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_5gram'] = sn_5grams


In [412]:
## sound skippy 2gram
#import ngrams_skippy
import gen_ngrams
if term_class == 'sound':
    #sn_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy2grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 2, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy2grams):
            g.extend(list(df['sn_1gram'])[i])
    #
    df['sn_skippy2gram'] = sn_skippy2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy2gram'] = sn_skippy2grams


In [413]:
## sound skippy 3gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 2 and term_class == 'sound':
    #sn_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy3grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 3, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy3grams):
            g.extend(list(df['sn_skippy2gram'])[i])
    #
    df['sn_skippy3gram'] = sn_skippy3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy3gram'] = sn_skippy3grams


In [414]:
## sound skippy 4gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 3 and term_class == 'sound':
    #sn_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy4grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 4, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy4grams):
            g.extend(list(df['sn_skippy3gram'])[i])
    #
    df['sn_skippy4gram'] = sn_skippy4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy4gram'] = sn_skippy4grams


In [415]:
## sound skippy 5gram
import gen_ngrams
if n_for_ngram > 4 and term_class == 'sound':
    sn_skippy5grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 5, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy5grams):
            g.extend(list(df['sn_skippy4gram'])[i])
    #
    df['sn_skippy5gram'] = sn_skippy5grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy5gram'] = sn_skippy5grams


In [416]:
## check df
dropped_vars = [ 'sp_size', 'hyphen', 'period', 'sn_size' ]
if term_class == 'spell':
    extra = [ 'sn_1gram', 'sn_2gram', 'sn_3gram', 'sn_4gram',
             'sn_skippy2gram', 'sn_skippy3gram', 'sn_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]
else:
    extra = [ 'sp_1gram', 'sp_2gram', 'sp_3gram', 'sp_4gram',
             'sp_skippy2gram', 'sp_skippy3gram', 'sp_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]    
#
df[target_vars]

Unnamed: 0,spell,sound,sn_1gram,sn_2gram,sn_3gram,sn_4gram,sn_5gram,sn_skippy2gram,sn_skippy3gram,sn_skippy4gram,sn_skippy5gram
3432,#renounce#,#ɹɪnaʊns#,"[#, ɹ, ɪ, n, a, ʊ, n, s, #]","[#ɹ, ɹɪ, ɪn, na, aʊ, ʊn, ns, s#, #, ɹ, ɪ, n, a...","[#ɹɪ, ɹɪn, ɪna, naʊ, aʊn, ʊns, ns#, #ɹ, ɹɪ, ɪn...","[#ɹɪn, ɹɪna, ɪnaʊ, naʊn, aʊns, ʊns#, #ɹɪ, ɹɪn,...","[#ɹɪna, ɹɪnaʊ, ɪnaʊn, naʊns, aʊns#, #ɹɪn, ɹɪna...","[#ɹ, #…ɪ, #…n, #…a, #…ʊ, #…n, #…s, #…#, ɹɪ, ɹ…...","[#ɹɪ, #ɹ…n, #ɹ…a, #ɹ…ʊ, #ɹ…n, #ɹ…s, #ɹ…#, #…ɪn...","[#ɹɪn, #ɹɪ…a, #ɹɪ…ʊ, #ɹɪ…n, #ɹɪ…s, #ɹɪ…#, #ɹ…n...","[#ɹɪna, #ɹɪn…ʊ, #ɹɪn…n, #ɹɪn…s, #ɹɪn…#, #ɹɪ…aʊ..."
376,#blockade#,#bɫɑkeɪd#,"[#, b, ɫ, ɑ, k, e, ɪ, d, #]","[#b, bɫ, ɫɑ, ɑk, ke, eɪ, ɪd, d#, #, b, ɫ, ɑ, k...","[#bɫ, bɫɑ, ɫɑk, ɑke, keɪ, eɪd, ɪd#, #b, bɫ, ɫɑ...","[#bɫɑ, bɫɑk, ɫɑke, ɑkeɪ, keɪd, eɪd#, #bɫ, bɫɑ,...","[#bɫɑk, bɫɑke, ɫɑkeɪ, ɑkeɪd, keɪd#, #bɫɑ, bɫɑk...","[#b, #…ɫ, #…ɑ, #…k, #…e, #…ɪ, #…d, #…#, bɫ, b…...","[#bɫ, #b…ɑ, #b…k, #b…e, #b…ɪ, #b…d, #b…#, #…ɫɑ...","[#bɫɑ, #bɫ…k, #bɫ…e, #bɫ…ɪ, #bɫ…d, #bɫ…#, #b…ɑ...","[#bɫɑk, #bɫɑ…e, #bɫɑ…ɪ, #bɫɑ…d, #bɫɑ…#, #bɫ…ke..."
1304,#disrupt#,#dɪsɹəpt#,"[#, d, ɪ, s, ɹ, ə, p, t, #]","[#d, dɪ, ɪs, sɹ, ɹə, əp, pt, t#, #, d, ɪ, s, ɹ...","[#dɪ, dɪs, ɪsɹ, sɹə, ɹəp, əpt, pt#, #d, dɪ, ɪs...","[#dɪs, dɪsɹ, ɪsɹə, sɹəp, ɹəpt, əpt#, #dɪ, dɪs,...","[#dɪsɹ, dɪsɹə, ɪsɹəp, sɹəpt, ɹəpt#, #dɪs, dɪsɹ...","[#d, #…ɪ, #…s, #…ɹ, #…ə, #…p, #…t, #…#, dɪ, d…...","[#dɪ, #d…s, #d…ɹ, #d…ə, #d…p, #d…t, #d…#, #…ɪs...","[#dɪs, #dɪ…ɹ, #dɪ…ə, #dɪ…p, #dɪ…t, #dɪ…#, #d…s...","[#dɪsɹ, #dɪs…ə, #dɪs…p, #dɪs…t, #dɪs…#, #dɪ…ɹə..."
511,#bulletin#,#bʊɫɪtən#,"[#, b, ʊ, ɫ, ɪ, t, ə, n, #]","[#b, bʊ, ʊɫ, ɫɪ, ɪt, tə, ən, n#, #, b, ʊ, ɫ, ɪ...","[#bʊ, bʊɫ, ʊɫɪ, ɫɪt, ɪtə, tən, ən#, #b, bʊ, ʊɫ...","[#bʊɫ, bʊɫɪ, ʊɫɪt, ɫɪtə, ɪtən, tən#, #bʊ, bʊɫ,...","[#bʊɫɪ, bʊɫɪt, ʊɫɪtə, ɫɪtən, ɪtən#, #bʊɫ, bʊɫɪ...","[#b, #…ʊ, #…ɫ, #…ɪ, #…t, #…ə, #…n, #…#, bʊ, b…...","[#bʊ, #b…ɫ, #b…ɪ, #b…t, #b…ə, #b…n, #b…#, #…ʊɫ...","[#bʊɫ, #bʊ…ɪ, #bʊ…t, #bʊ…ə, #bʊ…n, #bʊ…#, #b…ɫ...","[#bʊɫɪ, #bʊɫ…t, #bʊɫ…ə, #bʊɫ…n, #bʊɫ…#, #bʊ…ɪt..."
1844,#geminate#,#dʒɛməneɪt#,"[#, d, ʒ, ɛ, m, ə, n, e, ɪ, t, #]","[#d, dʒ, ʒɛ, ɛm, mə, ən, ne, eɪ, ɪt, t#, #, d,...","[#dʒ, dʒɛ, ʒɛm, ɛmə, mən, əne, neɪ, eɪt, ɪt#, ...","[#dʒɛ, dʒɛm, ʒɛmə, ɛmən, məne, əneɪ, neɪt, eɪt...","[#dʒɛm, dʒɛmə, ʒɛmən, ɛməne, məneɪ, əneɪt, neɪ...","[#d, #…ʒ, #…ɛ, #…m, #…ə, #…n, #…e, #…ɪ, dʒ, d…...","[#dʒ, #d…ɛ, #d…m, #d…ə, #d…n, #d…e, #d…ɪ, #…ʒɛ...","[#dʒɛ, #dʒ…m, #dʒ…ə, #dʒ…n, #dʒ…e, #dʒ…ɪ, #d…ɛ...","[#dʒɛm, #dʒɛ…ə, #dʒɛ…n, #dʒɛ…e, #dʒɛ…ɪ, #dʒ…mə..."
...,...,...,...,...,...,...,...,...,...,...,...
1675,#finish#,#fɪnɪʃ#,"[#, f, ɪ, n, ɪ, ʃ, #]","[#f, fɪ, ɪn, nɪ, ɪʃ, ʃ#, #, f, ɪ, n, ɪ, ʃ, #]","[#fɪ, fɪn, ɪnɪ, nɪʃ, ɪʃ#, #f, fɪ, ɪn, nɪ, ɪʃ, ...","[#fɪn, fɪnɪ, ɪnɪʃ, nɪʃ#, #fɪ, fɪn, ɪnɪ, nɪʃ, ɪ...","[#fɪnɪ, fɪnɪʃ, ɪnɪʃ#, #fɪn, fɪnɪ, ɪnɪʃ, nɪʃ#, ...","[#f, #…ɪ, #…n, #…ɪ, #…ʃ, #…#, fɪ, f…n, f…ɪ, f…...","[#fɪ, #f…n, #f…ɪ, #f…ʃ, #f…#, #…ɪn, #…ɪ…ɪ, #…ɪ...","[#fɪn, #fɪ…ɪ, #fɪ…ʃ, #fɪ…#, #f…nɪ, #f…n…ʃ, #f…...","[#fɪnɪ, #fɪn…ʃ, #fɪn…#, #fɪ…ɪʃ, #fɪ…ɪ…#, #fɪ…ʃ..."
1606,#faggot#,#fæɡət#,"[#, f, æ, ɡ, ə, t, #]","[#f, fæ, æɡ, ɡə, ət, t#, #, f, æ, ɡ, ə, t, #]","[#fæ, fæɡ, æɡə, ɡət, ət#, #f, fæ, æɡ, ɡə, ət, ...","[#fæɡ, fæɡə, æɡət, ɡət#, #fæ, fæɡ, æɡə, ɡət, ə...","[#fæɡə, fæɡət, æɡət#, #fæɡ, fæɡə, æɡət, ɡət#, ...","[#f, #…æ, #…ɡ, #…ə, #…t, #…#, fæ, f…ɡ, f…ə, f…...","[#fæ, #f…ɡ, #f…ə, #f…t, #f…#, #…æɡ, #…æ…ə, #…æ...","[#fæɡ, #fæ…ə, #fæ…t, #fæ…#, #f…ɡə, #f…ɡ…t, #f…...","[#fæɡə, #fæɡ…t, #fæɡ…#, #fæ…ət, #fæ…ə…#, #fæ…t..."
1398,#dust#,#dəst#,"[#, d, ə, s, t, #]","[#d, də, əs, st, t#, #, d, ə, s, t, #]","[#də, dəs, əst, st#, #d, də, əs, st, t#, #, d,...","[#dəs, dəst, əst#, #də, dəs, əst, st#, #d, də,...","[#dəst, dəst#, #dəs, dəst, əst#, #də, dəs, əst...","[#d, #…ə, #…s, #…t, #…#, də, d…s, d…t, d…#, əs...","[#də, #d…s, #d…t, #d…#, #…əs, #…ə…t, #…ə…#, #…...","[#dəs, #də…t, #də…#, #d…st, #d…s…#, #d…t#, #…ə...","[#dəst, #dəs…#, #də…t#, #d…st#, #…əst#, dəst#,..."
4452,#tuck#,#tək#,"[#, t, ə, k, #]","[#t, tə, ək, k#, #, t, ə, k, #]","[#tə, tək, ək#, #t, tə, ək, k#, #, t, ə, k, #]","[#tək, tək#, #tə, tək, ək#, #t, tə, ək, k#, #,...","[#tək#, #tək, tək#, #tə, tək, ək#, #t, tə, ək,...","[#t, #…ə, #…k, #…#, tə, t…k, t…#, ək, ə…#, k#,...","[#tə, #t…k, #t…#, #…ək, #…ə…#, #…k#, tək, tə…#...","[#tək, #tə…#, #t…k#, #…ək#, tək#, tək#, #tə, #...","[#tək#, #tək, #tə…#, #t…k#, #…ək#, tək#, tək#,..."


In [417]:
## select data type and define doc_dict
import random
if "sp_" in term_type:
    base_type = "spell"
else:
    base_type = "sound"
doc_dict = { i: x for i, x in enumerate(df[base_type]) }
## check
random.sample(doc_dict.items(), 10)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(doc_dict.items(), 10)


[(1152, '#sɝveɪɫ#'),
 (1930, '#ɑdɪʃən#'),
 (1382, '#mɑɹkət#'),
 (542, '#swɑɫoʊ#'),
 (1990, '#mæp#'),
 (1086, '#skɹupəɫ#'),
 (1757, '#səmən#'),
 (2068, '#kæp#'),
 (2089, '#əsɛs#'),
 (267, '#dɪsaɪd#')]

In [418]:
## select bots for analysis
enable_term_change = False # if you want to change term_type to save time and energy
if enable_term_change:
	term_type = 'sp_skippy4gram'
print(f"(changed) term_type: {term_type}")

## bot stands for 'bag-of-terms', a generalization of 'bag-of-words'
bots = [ x for x in df[term_type] if len(x) > min_bot_size ] # Crucially
import random
random.sample(bots, 3)

(changed) term_type: sn_skippy5gram


[['#pəɫs',
  '#pəɫ…#',
  '#pə…s#',
  '#p…ɫs#',
  '#…əɫs#',
  'pəɫs#',
  'pəɫs#',
  '#pəɫ',
  '#pə…s',
  '#pə…#',
  '#p…ɫs',
  '#p…ɫ…#',
  '#p…s#',
  '#…əɫs',
  '#…əɫ…#',
  '#…ə…s#',
  '#…ɫs#',
  'pəɫs',
  'pəɫ…#',
  'pə…s#',
  'p…ɫs#',
  'əɫs#',
  'pəɫs',
  'pəɫ…#',
  'pə…s#',
  'p…ɫs#',
  'əɫs#',
  'əɫs#',
  '#pə',
  '#p…ɫ',
  '#p…s',
  '#p…#',
  '#…əɫ',
  '#…ə…s',
  '#…ə…#',
  '#…ɫs',
  '#…ɫ…#',
  '#…s#',
  'pəɫ',
  'pə…s',
  'pə…#',
  'p…ɫs',
  'p…ɫ…#',
  'p…s#',
  'əɫs',
  'əɫ…#',
  'ə…s#',
  'ɫs#',
  'pəɫ',
  'pə…s',
  'pə…#',
  'p…ɫs',
  'p…ɫ…#',
  'p…s#',
  'əɫs',
  'əɫ…#',
  'ə…s#',
  'ɫs#',
  'əɫs',
  'əɫ…#',
  'ə…s#',
  'ɫs#',
  'ɫs#',
  '#p',
  '#…ə',
  '#…ɫ',
  '#…s',
  '#…#',
  'pə',
  'p…ɫ',
  'p…s',
  'p…#',
  'əɫ',
  'ə…s',
  'ə…#',
  'ɫs',
  'ɫ…#',
  's#',
  'pə',
  'p…ɫ',
  'p…s',
  'p…#',
  'əɫ',
  'ə…s',
  'ə…#',
  'ɫs',
  'ɫ…#',
  's#',
  'əɫ',
  'ə…s',
  'ə…#',
  'ɫs',
  'ɫ…#',
  's#',
  'ɫs',
  'ɫ…#',
  's#',
  's#',
  '#',
  'p',
  'ə',
  'ɫ',
  's',
  '#'],
 ['

In [419]:
## generate dictionary
from gensim.corpora import Dictionary
diction = Dictionary(bots)
print(diction)

#apply_term_filtering = False
if apply_term_filtering:
    print(f"term filtering applied")
    diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
else:
    print(f"term filtering not applied")
## check
print(diction)
## generate DTM
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) > min_bot_size ] # Crucially

Dictionary<228169 unique tokens: ['#', '#ɹ', '#ɹɪ', '#ɹɪn', '#ɹɪna']...>
term filtering applied
Dictionary<56070 unique tokens: ['#ɹɪ', '#ɹɪn', '#ɹɪn…#', '#ɹɪ…#', '#ɹɪ…a']...>


In [420]:
## HDP (n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [421]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	lang_dir_name = target_lang_dict[target_lang_key].split()[0]
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{term_type}{accent_status}.html"
	pyLDAvis.save_html(vis_data, vis_output)

In [422]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

## investigate topics
n_docs_to_show  = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    #topic_encoding = ", ".join(hdp.show_topic(topic_id))
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.004 * z# + 0.004 * ɪz + 0.004 * ɪz# + 0.004 * a…z + 0.004 * aɪz + 0.003 * a…z# + 0.003 * aɪz#
nonzero count:  117
	0.9992: #vændəɫaɪz#
	0.9991: #ɹændəmaɪz#
	0.9991: #pætɹənaɪz#
	0.9991: #sɪmbəɫaɪz#
	0.9991: #dɹæmətaɪz#
	0.9991: #ɪnɪʃəɫaɪz#
	0.9990: #nɔɹməɫaɪz#
	0.9990: #aɪdəɫaɪz#
	0.9990: #ɪmpɹəvaɪz#
	0.9990: #ɔstɹəsaɪz#
topic_id 1: 0.002 * ɪ# + 0.002 * i…ɪ + 0.001 * i…ɪ…# + 0.001 * i…t + 0.001 * i…ɪt + 0.001 * ən# + 0.001 * dɪ
nonzero count:  120
	0.9990: #əbɹivieɪt#
	0.9989: #ɪmeɪʃieɪt#
	0.9989: #dɪsəpɔɪnt#
	0.9989: #əpɹiʃieɪt#
	0.9987: #ɹidɪpɫɔɪ#
	0.9986: #ɹidɪfaɪn#
	0.9986: #pɑɹtɪʃən#
	0.9986: #əfɪʃieɪt#
	0.9984: #dɪspɫeɪ#
	0.9984: #ɪkskɫud#
topic_id 2: 0.003 * kt + 0.003 * kt# + 0.002 * ɹ…k + 0.002 * ɹ…k…# + 0.002 * ɹ…kt + 0.002 * t…t + 0.002 * æk
nonzero count:  119
	0.9992: #kɑnstɹəkt#
	0.9991: #kənstɹɪkt#
	0.9989: #dɪstɹækt#
	0.9989: #əbstɹəkt#
	0.9988: #æbstɹækt#
	0.9988: #ɛkstɹækt#
	0.9987: #kɑntɹækt#
	0.9987: #tɹɪpɫɪkət#
	0.9987: #ɹistɹɪkt#
	0.9987: #səbtɹækt#


In [423]:
## HDP (n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [424]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	lang_dir_name = target_lang_dict[target_lang_key].split()[0]
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{term_type}{accent_status}.html"
	pyLDAvis.save_html(vis_data, vis_output)

In [425]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: {len(probs.nonzero()[0])}")
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * p…ɪ + 0.001 * ɝ…ɪ + 0.001 * p# + 0.001 * dʒ + 0.001 * ɪ# + 0.001 * ɪ…ɝ + 0.001 * ɫa
nonzero count: 187
	0.9989: #ɹɪvɝbɝeɪt#
	0.9989: #əbɫɪtɝeɪt#
	0.9989: #mɪsəpɫaɪ#
	0.9989: #əndɝsteɪt#
	0.9988: #əndɝɫaɪn#
	0.9988: #dʒɛsteɪt#
	0.9988: #ɪɫæbɝeɪt#
	0.9987: #ɪvæpɝeɪt#
	0.9987: #məɫtəpɫaɪ#
	0.9987: #ɹiəpɹeɪz#
topic_id 1: 0.002 * ɪ# + 0.001 * ə…a + 0.001 * aɪ# + 0.001 * ə…aɪ + 0.001 * fa + 0.001 * f…ɪ + 0.001 * faɪ
nonzero count: 121
	0.9991: #sɛɡɹəɡeɪt#
	0.9990: #səɫɪdəfaɪ#
	0.9990: #kæstəɡeɪt#
	0.9990: #ɹiɪnsteɪt#
	0.9990: #kɑnsəmeɪt#
	0.9989: #tɹænsɫeɪt#
	0.9989: #pɹɑstɹeɪt#
	0.9989: #əsɪdəfaɪ#
	0.9989: #æɡɹəɡeɪt#
	0.9988: #kɫɛɹəfaɪ#
topic_id 2: 0.002 * kt + 0.002 * kt# + 0.001 * kə + 0.001 * kə…# + 0.001 * nt + 0.001 * ɛ…t + 0.001 * ɹ…k
nonzero count: 242
	0.9992: #kɑnstɹəkt#
	0.9991: #kənstɹɪkt#
	0.9990: #dɪstɹækt#
	0.9989: #əbstɹəkt#
	0.9988: #æbstɹækt#
	0.9988: #ɛkstɹækt#
	0.9988: #kɑntɹækt#
	0.9988: #tɹɪpɫɪkət#
	0.9987: #ɹistɹɪkt#
	0.9987: #səbtɹækt#
topic_id 3: 

In [426]:
## HDP (n_topics = 15)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 15
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [427]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	lang_dir_name = target_lang_dict[target_lang_key].split()[0]
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{term_type}{accent_status}.html"
	pyLDAvis.save_html(vis_data, vis_output)

In [428]:
## topic investigation
import numpy as np
import HDP_helper
reload_module = True
if reload_module:
    import importlib
    importlib.reload(HDP_helper)

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * p# + 0.001 * t…t + 0.001 * dʒ + 0.001 * nt + 0.001 * t…t# + 0.001 * ʃ# + 0.001 * tʃ
nonzero count:  587
	0.9992: #sɛɡɹəɡeɪt#
	0.9991: #dɪskɫeɪm#
	0.9991: #dɪstɹækt#
	0.9991: #ɹɛdʒəmənt#
	0.9990: #kɑnskɹɪpt#
	0.9990: #pɹɑstɹeɪt#
	0.9990: #æɡɹəɡeɪt#
	0.9990: #ɪkskɫeɪm#
	0.9990: #sætəɫaɪt#
	0.9990: #mɪsdɪɹɛkt#
topic_id 1: 0.001 * z# + 0.001 * ɪz + 0.001 * ə…a + 0.001 * ə…aɪ + 0.001 * ɫ…ɪ…# + 0.001 * ɪz# + 0.001 * ɹə
nonzero count:  391
	0.9992: #sɪɹiəɫaɪz#
	0.9992: #sɪmbəɫaɪz#
	0.9991: #ɪnɪʃəɫaɪz#
	0.9991: #ɪmpɹəvaɪz#
	0.9990: #ɹiɪnsteɪt#
	0.9990: #fɛdɝəɫaɪz#
	0.9990: #ɪmeɪʃieɪt#
	0.9990: #ɪmpɫəmənt#
	0.9990: #ɫɪbɝəɫaɪz#
	0.9990: #tɹænsɫeɪt#
topic_id 2: 0.001 * m# + 0.001 * kt + 0.001 * ɪn# + 0.001 * kt# + 0.001 * kə + 0.001 * ʃ# + 0.001 * nt
nonzero count:  520
	0.9992: #kɑnstɹəkt#
	0.9992: #kənstɹɪkt#
	0.9990: #əbstɹəkt#
	0.9989: #əndɝɫaɪn#
	0.9989: #kɑntɹækt#
	0.9989: #tɹɪpɫɪkət#
	0.9988: #ɹistɹɪkt#
	0.9988: #kɛɹəkətʃɝ#
	0.9988: #stɹəktʃɝ#
	0.9987: #dɪfɫɛkt#
topic_i