In [638]:
#!pip install -U pyLDAvis
#!pip install -U pandas

In [639]:
## imports
import os, sys
import pprint as pp
import unicodedata

In [640]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [641]:
## term settings
term_classes        = [ 'spell', 'sound' ]
term_class          = term_classes[1]
ngram_is_inclusive  = True
## doc settings
max_doc_size        = 11
min_doc_size        =  3
print(f"max_doc_size: {max_doc_size}")
print(f"min_doc_size: {min_doc_size}")
### boundary handling
add_boundary       = False
boundary_mark      = "#"
## term setting
gap_mark           = "…"
term_is_skippy     = True
n_for_ngram        = 5
max_gap_ratio      = 0.8
max_gap_size       = round(max_doc_size * max_gap_ratio)
print(f"term_class: {term_class}")
print(f"term_is_skippy: {term_is_skippy}")
print(f"max_gap_size: {max_gap_size}")
print(f"n_for_ngram: {n_for_ngram}")
### accent handling
suppress_accents   = True
accent_marks       = [ "ˈ", "ˌ" ] 
if term_class == 'sound':
    if suppress_accents:
        accent_status = "-unaccented"
    else:
        accent_stratus = "-accented"
else:
    accent_status = ""
print(f"accent_status: {accent_status}")
## define term_type
if term_class == 'spell':
    if term_is_skippy:
        term_type = f"sp_skippy{n_for_ngram}gram"
    else:
        term_type = f"sp_{n_for_ngram}gram"
else:
    if term_is_skippy:
        term_type = f"sn_skippy{n_for_ngram}gram"
    else:
        term_type = f"sn_{n_for_ngram}gram"
## check
print(f"term_type: {term_type}")

max_doc_size: 11
min_doc_size: 3
term_class: sound
term_is_skippy: True
max_gap_size: 9
n_for_ngram: 5
accent_status: -unaccented
term_type: sn_skippy5gram


In [642]:
## target language
## a key must be part of a file name 
target_lang_dict = {    'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'en_N_only' : 'English noun (WN)',
                        'en_V_only' : 'English verb (WN)',
                        'en_A_only' : 'English adj (WN)',
                        'en_R_only' : 'English adv (WN)',
                        'ar'    : 'Arabic',
                        'de'    : 'German',
                        'de_N_only' : 'German Nouns',
                        'de_non_N_only' : 'German Non-nouns',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'ir'    : 'Irish',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
## proper language selection
target_lang_keys = [    'en_US', # 0
                        'en_UK', # 1
                        'en_N_only', # 2
                        'en_V_only', # 3
                        'en_A_only', # 4
                        'en_R_only', # 5
                        'ar', # 6
                        'de', # 7
                        'de_N_only', # 8
                        'de_non_N_only', # 9
                        'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC',
                        'is', 'nl', 'ro', 'sw',
                        'ir' # This lacks sound
                    ]
## check
target_lang_key  = target_lang_keys[4]
print(f"target_lang_key: {target_lang_key}")
print(f"target lang: {target_lang_dict[target_lang_key]} [{target_lang_key}]")
## target_attr [effective only for Irish]
target_class = ""
#target_class = None # This causes an unrediable error
if target_lang_key == "ir":
    target_classes = [ 'adjectives', 'nouns', 'verbs' ]
    target_class = f"-{target_classes[3]}"
print(f"target_class: {target_class}")

target_lang_key: en_A_only
target lang: English adj (WN) [en_A_only]
target_class: 


In [643]:
## LDA/HDP
apply_term_filtering = True
## The following parameters need to be relatively large to prevent "Row sum not equal 1" error
term_minfreq       = 2
## The following value is crucial to prevent "Row sum not equal 1" error
abuse_threshold    = 0.04 # larger value selects shorter units, smaller value selects longer units
min_bot_size       = 3
# number of terms listed for a given topic
n_terms_to_show    = 50

In [644]:
## sampling
source_sampling          = True
source_sampling_rate     = 0.5
source_sampling_max_size = 5000
second_sampling          = False
second_sampling_rate     = 0.7

In [645]:
## set target files
import glob
data_dir1     = "data/open-dict-ipa/data1"
data_dir2     = "data/open-dict-ipa/data1a"
data_dir3     = "data/wn3"
data_dir4     = "data/irish"
target_files = glob.glob(f"{data_dir1}/*")
target_files2 = glob.glob(f"{data_dir2}/*")
target_files.extend(target_files2)
target_files3 = glob.glob(f"{data_dir3}/*")
target_files.extend(target_files3)
target_files4 = glob.glob(f"{data_dir4}/*")
target_files.extend(target_files4)
#
target_files = sorted([ file for file in target_files if ".csv" in file ])
pp.pprint(target_files)

['data/irish/word-irish-adjectives-spell.csv',
 'data/irish/word-irish-noun-phrases-spell.csv',
 'data/irish/word-irish-nouns-spell.csv',
 'data/irish/word-irish-possessives-spell.csv',
 'data/irish/word-irish-prepositions-spell.csv',
 'data/irish/word-irish-verbs-spell.csv',
 'data/open-dict-ipa/data1/ar.csv.gz',
 'data/open-dict-ipa/data1/de.csv.gz',
 'data/open-dict-ipa/data1/en_UK.csv.gz',
 'data/open-dict-ipa/data1/en_US.csv.gz',
 'data/open-dict-ipa/data1/eo.csv.gz',
 'data/open-dict-ipa/data1/es_ES.csv.gz',
 'data/open-dict-ipa/data1/es_MX.csv.gz',
 'data/open-dict-ipa/data1/fa.csv.gz',
 'data/open-dict-ipa/data1/fi.csv.gz',
 'data/open-dict-ipa/data1/fr_FR.csv.gz',
 'data/open-dict-ipa/data1/fr_QC.csv.gz',
 'data/open-dict-ipa/data1/is.csv.gz',
 'data/open-dict-ipa/data1/ja.csv.gz',
 'data/open-dict-ipa/data1/jam.csv.gz',
 'data/open-dict-ipa/data1/ma.csv.gz',
 'data/open-dict-ipa/data1/nb.csv.gz',
 'data/open-dict-ipa/data1/nl.csv.gz',
 'data/open-dict-ipa/data1/or.csv.gz',
 '

In [646]:
## get source data from files
import pandas as pd
import gzip
#target_language_key = "en_US" # can be changed to get other languages
#if target_class != "" or target_class is not None:
if target_class != "":
    target_file = [ f for f in target_files if target_lang_key in f and target_class in f ][0]
else:
    target_file = [ f for f in target_files if target_lang_key in f ][0]
print(f"processing: {target_file}")
##
if target_lang_key == "ir":
    col_names = ['spell', 'POS']
else:
    col_names = ['spell', 'sound']
#
if target_file.endswith(".gz"):
    with gzip.open(target_file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = col_names )
else:
    with open(target_file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = col_names )
## normalize characters
raw_df['spell'] = raw_df['spell'].apply(lambda x: unicodedata.normalize('NFC', str(x)))
## modify sound
try:
    sounds = raw_df['sound'].apply(lambda x: x.strip('/') )
    sounds = [ x.split("/,")[0] for x in sounds ] # picks up only the first of multiple entries
    raw_df['sound'] = sounds
except (AttributeError, KeyError):
    pass
#
raw_df.sample(10)

processing: data/wn3/en_A_only.csv


Unnamed: 0,spell,sound
3220,loving,ˈɫəvɪŋ
5174,thickening,ˈθɪkənɪŋ
3541,national,ˈnæʃənəɫ
1494,deliverable,dɪˈɫɪvɝəbəɫ
1719,dogged,ˈdɔɡd
726,bookish,ˈbʊkɪʃ
2982,invigorating,ˌɪnˈvɪɡɝˌeɪtɪŋ
4308,rare,ˈɹɛɹ
471,auditory,ˈɔdɪˌtɔɹi
1708,divine,dɪˈvaɪn


In [647]:
## source sampling
len(raw_df)
if source_sampling:
	print(f"source sampling applied")
	if len(raw_df) >= source_sampling_max_size:
		raw_df = raw_df.sample(source_sampling_max_size)
	else:
		raw_df = raw_df.sample(round(len(raw_df) * source_sampling_rate))
## remove accent marking
if suppress_accents:
	try:
		raw_df['sound'] = raw_df['sound'].apply(lambda x: "".join([ y for y in list(x) if y not in accent_marks ]))
	except KeyError:
		pass
## add boudary marks
if add_boundary:
	raw_df['spell'] = raw_df['spell'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
	try:
		raw_df['sound'] = raw_df['sound'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
	except KeyError:
		pass
#
print(raw_df)

source sampling applied
            spell         sound
4617       secret        sikɹət
5626       unsold       ənsoʊɫd
714        bodily        bɑdəɫi
399     arthritic      ɑɹθɹɪtɪk
529        baking        beɪkɪŋ
...           ...           ...
4668  seventeenth     sɛvəntinθ
1461    deductive      dɪdəktəv
2548   highflying     haɪfɫaɪɪŋ
2823    inductive      ɪndəktɪv
419   asphyxiated  æsfɪksieɪtɪd

[5000 rows x 2 columns]


In [648]:
## generate 1-grams for spell and sound
## spell
raw_df['sp_1gram'] = raw_df['spell'].apply(lambda x: list(str(x)))
# add column of size
raw_df['sp_size'] = raw_df['sp_1gram'].apply(lambda x: len(x))
# add column of count of '-' inside
raw_df['hyphen'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("-"))
# add column of count of '.' inside
raw_df['period'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("."))
## sound
# takes the first entry, removes '/' around
try:
    raw_df['sn_1gram'] = raw_df['sound'].apply(lambda x: list(x) )
except (TypeError, KeyError):
    pass
# add column of size
try:
    raw_df['sn_size'] = raw_df['sn_1gram'].apply(lambda x: len(x))
except KeyError:
    pass
## check
raw_df

Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
4617,secret,sikɹət,"[s, e, c, r, e, t]",6,0,0,"[s, i, k, ɹ, ə, t]",6
5626,unsold,ənsoʊɫd,"[u, n, s, o, l, d]",6,0,0,"[ə, n, s, o, ʊ, ɫ, d]",7
714,bodily,bɑdəɫi,"[b, o, d, i, l, y]",6,0,0,"[b, ɑ, d, ə, ɫ, i]",6
399,arthritic,ɑɹθɹɪtɪk,"[a, r, t, h, r, i, t, i, c]",9,0,0,"[ɑ, ɹ, θ, ɹ, ɪ, t, ɪ, k]",8
529,baking,beɪkɪŋ,"[b, a, k, i, n, g]",6,0,0,"[b, e, ɪ, k, ɪ, ŋ]",6
...,...,...,...,...,...,...,...,...
4668,seventeenth,sɛvəntinθ,"[s, e, v, e, n, t, e, e, n, t, h]",11,0,0,"[s, ɛ, v, ə, n, t, i, n, θ]",9
1461,deductive,dɪdəktəv,"[d, e, d, u, c, t, i, v, e]",9,0,0,"[d, ɪ, d, ə, k, t, ə, v]",8
2548,highflying,haɪfɫaɪɪŋ,"[h, i, g, h, f, l, y, i, n, g]",10,0,0,"[h, a, ɪ, f, ɫ, a, ɪ, ɪ, ŋ]",9
2823,inductive,ɪndəktɪv,"[i, n, d, u, c, t, i, v, e]",9,0,0,"[ɪ, n, d, ə, k, t, ɪ, v]",8


In [649]:
## filtering raw_data by size
print(f"term_type: {term_type}")
if "sp_" in term_type:
    df_filtered = raw_df[ (raw_df['sp_size'] <= max_doc_size) & (raw_df['sp_size'] >= min_doc_size) & (raw_df['hyphen'] == 0) & (raw_df['period'] == 0) ]
else:
    df_filtered = raw_df[ (raw_df['sn_size'] <= max_doc_size) & (raw_df['sn_size'] >= min_doc_size) ]
#
df_filtered

term_type: sn_skippy5gram


Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
4617,secret,sikɹət,"[s, e, c, r, e, t]",6,0,0,"[s, i, k, ɹ, ə, t]",6
5626,unsold,ənsoʊɫd,"[u, n, s, o, l, d]",6,0,0,"[ə, n, s, o, ʊ, ɫ, d]",7
714,bodily,bɑdəɫi,"[b, o, d, i, l, y]",6,0,0,"[b, ɑ, d, ə, ɫ, i]",6
399,arthritic,ɑɹθɹɪtɪk,"[a, r, t, h, r, i, t, i, c]",9,0,0,"[ɑ, ɹ, θ, ɹ, ɪ, t, ɪ, k]",8
529,baking,beɪkɪŋ,"[b, a, k, i, n, g]",6,0,0,"[b, e, ɪ, k, ɪ, ŋ]",6
...,...,...,...,...,...,...,...,...
5264,transitional,tɹænsɪʃənəɫ,"[t, r, a, n, s, i, t, i, o, n, a, l]",12,0,0,"[t, ɹ, æ, n, s, ɪ, ʃ, ə, n, ə, ɫ]",11
4668,seventeenth,sɛvəntinθ,"[s, e, v, e, n, t, e, e, n, t, h]",11,0,0,"[s, ɛ, v, ə, n, t, i, n, θ]",9
1461,deductive,dɪdəktəv,"[d, e, d, u, c, t, i, v, e]",9,0,0,"[d, ɪ, d, ə, k, t, ə, v]",8
2548,highflying,haɪfɫaɪɪŋ,"[h, i, g, h, f, l, y, i, n, g]",10,0,0,"[h, a, ɪ, f, ɫ, a, ɪ, ɪ, ŋ]",9


In [650]:
## define df after second sampling if any
len(df_filtered)
if second_sampling:
    df = df_filtered.sample(round(len(df_filtered) * second_sampling_rate))
else:
    df = df_filtered
len(df)

4563

In [651]:
## spell 2grams
#import ngrams
import gen_ngrams
module_name = "gen_ngrams"
reload_module = False
if reload_module:
    import importlib
    importlib.reload(module_name)

if term_class == 'spell':
    #sp_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sp_1gram'] ]
    sp_2grams = [ gen_ngrams.gen_ngrams (x, n = 2, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_2grams):
            g.extend(list(df['sp_1gram'])[i])
    ## add sp_2gram
    df['sp_2gram'] = sp_2grams

In [652]:
## spell 3grams
#import ngrams
import gen_ngrams
if n_for_ngram > 2 and term_class == 'spell':
    #sp_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sp_1gram'] ]
    sp_3grams = [ gen_ngrams.gen_ngrams (x, n = 3, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_3grams):
            g.extend(list(df['sp_2gram'])[i])
    ## add sp_3gram
    df['sp_3gram'] = sp_3grams

In [653]:
## spell 4grams
#import ngrams
import gen_ngrams
if n_for_ngram > 3 and term_class == 'spell':
    #sp_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sp_1gram'] ]
    sp_4grams = [ gen_ngrams.gen_ngrams (x, n = 4, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_4grams):
            g.extend(list(df['sp_3gram'])[i])
    ## add sp_4gram
    df['sp_4gram'] = sp_4grams

In [654]:
## spell 5grams
#import ngrams
import gen_ngrams
if n_for_ngram > 4 and term_class == 'spell':
    sp_5grams = [ gen_ngrams.gen_ngrams (x, n = 5, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_5grams):
            g.extend(list(df['sp_4gram'])[i])
    ## add sp_5gram
    df['sp_5gram'] = sp_5grams

In [655]:
## spell skippy 2gram
#import ngrams_skippy
import gen_ngrams
reload_module = False
module_name = "gen_ngrams"
if reload_module:
    import importlib
    importlib.reload(module_name)
#
if term_class == 'spell':
    #sp_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy2grams = [ gen_ngrams.gen_skippy_ngrams(x, 2, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy2grams):
            g.extend(list(df['sp_1gram'])[i])
    #
    df['sp_skippy2gram'] = sp_skippy2grams

In [656]:
## spell skippy 3gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 2 and term_class == 'spell':
    #sp_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy3grams = [ gen_ngrams.gen_skippy_ngrams(x, 3, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy3grams):
            g.extend(list(df['sp_skippy2gram'])[i])
    #
    df['sp_skippy3gram'] = sp_skippy3grams

In [657]:
## spell skippy 4gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 3 and term_class == 'spell':
    #sp_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy4grams = [ gen_ngrams.gen_skippy_ngrams(x, 4, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy4grams):
            g.extend(list(df['sp_skippy3gram'])[i])
    #
    df['sp_skippy4gram'] = sp_skippy4grams

In [658]:
## spell skippy 5gram
import gen_ngrams
if n_for_ngram > 4 and term_class == 'spell':
    sp_skippy5grams = [ gen_ngrams.gen_skippy_ngrams(x, 5, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy5grams):
            g.extend(list(df['sp_skippy4gram'])[i])
    #
    df['sp_skippy5gram'] = sp_skippy5grams

In [659]:
## sound 2grams
#import ngrams
import gen_ngrams
module_name = "gen_ngrams"
reload_module = False
if reload_module:
    import importlib
    importlib.reload(module_name)
#
if term_class == 'sound':
    #sn_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sn_1gram'] ]
    sn_2grams = [ gen_ngrams.gen_ngrams (x, n = 2, sep ="", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_2grams):
            g.extend(list(df['sn_1gram'])[i])
    ## add sn_2gram
    df['sn_2gram'] = sn_2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_2gram'] = sn_2grams


In [660]:
## sound 3grams
#import ngrams
import gen_ngrams
if n_for_ngram > 2 and term_class == 'sound':
    #sn_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sn_1gram'] ]
    sn_3grams = [ gen_ngrams.gen_ngrams (x, n = 3, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_3gram
    df['sn_3gram'] = sn_3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_3gram'] = sn_3grams


In [661]:
## sound 4grams
#import ngrams
import gen_ngrams
if n_for_ngram > 3 and term_class == 'sound':
    #sn_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sn_1gram'] ]
    sn_4grams = [ gen_ngrams.gen_ngrams (x, n = 4, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_4grams):
            g.extend(list(df['sn_3gram'])[i])
    ## add sn_4gram
    df['sn_4gram'] = sn_4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_4gram'] = sn_4grams


In [662]:
## sound 5grams
import gen_ngrams
if n_for_ngram > 4 and term_class == 'sound':
    sn_5grams = [ gen_ngrams.gen_ngrams (x, n = 5, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_5grams):
            g.extend(list(df['sn_4gram'])[i])
    ## add sn_4gram
    df['sn_5gram'] = sn_5grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_5gram'] = sn_5grams


In [663]:
## sound skippy 2gram
#import ngrams_skippy
import gen_ngrams
if term_class == 'sound':
    #sn_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy2grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 2, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy2grams):
            g.extend(list(df['sn_1gram'])[i])
    #
    df['sn_skippy2gram'] = sn_skippy2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy2gram'] = sn_skippy2grams


In [664]:
## sound skippy 3gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 2 and term_class == 'sound':
    #sn_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy3grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 3, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy3grams):
            g.extend(list(df['sn_skippy2gram'])[i])
    #
    df['sn_skippy3gram'] = sn_skippy3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy3gram'] = sn_skippy3grams


In [665]:
## sound skippy 4gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 3 and term_class == 'sound':
    #sn_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy4grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 4, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy4grams):
            g.extend(list(df['sn_skippy3gram'])[i])
    #
    df['sn_skippy4gram'] = sn_skippy4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy4gram'] = sn_skippy4grams


In [666]:
## sound skippy 5gram
import gen_ngrams
if n_for_ngram > 4 and term_class == 'sound':
    sn_skippy5grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 5, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy5grams):
            g.extend(list(df['sn_skippy4gram'])[i])
    #
    df['sn_skippy5gram'] = sn_skippy5grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy5gram'] = sn_skippy5grams


In [667]:
## check df
dropped_vars = [ 'sp_size', 'hyphen', 'period', 'sn_size' ]
if term_class == 'spell':
    extra = [ 'sn_1gram', 'sn_2gram', 'sn_3gram', 'sn_4gram',
             'sn_skippy2gram', 'sn_skippy3gram', 'sn_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]
else:
    extra = [ 'sp_1gram', 'sp_2gram', 'sp_3gram', 'sp_4gram',
             'sp_skippy2gram', 'sp_skippy3gram', 'sp_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]    
#
df[target_vars]

Unnamed: 0,spell,sound,sn_1gram,sn_2gram,sn_3gram,sn_4gram,sn_5gram,sn_skippy2gram,sn_skippy3gram,sn_skippy4gram,sn_skippy5gram
4617,secret,sikɹət,"[s, i, k, ɹ, ə, t]","[si, ik, kɹ, ɹə, ət, s, i, k, ɹ, ə, t]","[sik, ikɹ, kɹə, ɹət, si, ik, kɹ, ɹə, ət, s, i,...","[sikɹ, ikɹə, kɹət, sik, ikɹ, kɹə, ɹət, si, ik,...","[sikɹə, ikɹət, sikɹ, ikɹə, kɹət, sik, ikɹ, kɹə...","[si, s…k, s…ɹ, s…ə, s…t, ik, i…ɹ, i…ə, i…t, kɹ...","[sik, si…ɹ, si…ə, si…t, s…kɹ, s…k…ə, s…k…t, s…...","[sikɹ, sik…ə, sik…t, si…ɹə, si…ɹ…t, si…ət, s…k...","[sikɹə, sikɹ…t, sik…ət, si…ɹət, s…kɹət, ikɹət,..."
5626,unsold,ənsoʊɫd,"[ə, n, s, o, ʊ, ɫ, d]","[ən, ns, so, oʊ, ʊɫ, ɫd, ə, n, s, o, ʊ, ɫ, d]","[əns, nso, soʊ, oʊɫ, ʊɫd, ən, ns, so, oʊ, ʊɫ, ...","[ənso, nsoʊ, soʊɫ, oʊɫd, əns, nso, soʊ, oʊɫ, ʊ...","[ənsoʊ, nsoʊɫ, soʊɫd, ənso, nsoʊ, soʊɫ, oʊɫd, ...","[ən, ə…s, ə…o, ə…ʊ, ə…ɫ, ə…d, ns, n…o, n…ʊ, n…...","[əns, ən…o, ən…ʊ, ən…ɫ, ən…d, ə…so, ə…s…ʊ, ə…s...","[ənso, əns…ʊ, əns…ɫ, əns…d, ən…oʊ, ən…o…ɫ, ən…...","[ənsoʊ, ənso…ɫ, ənso…d, əns…ʊɫ, əns…ʊ…d, əns…ɫ..."
714,bodily,bɑdəɫi,"[b, ɑ, d, ə, ɫ, i]","[bɑ, ɑd, də, əɫ, ɫi, b, ɑ, d, ə, ɫ, i]","[bɑd, ɑdə, dəɫ, əɫi, bɑ, ɑd, də, əɫ, ɫi, b, ɑ,...","[bɑdə, ɑdəɫ, dəɫi, bɑd, ɑdə, dəɫ, əɫi, bɑ, ɑd,...","[bɑdəɫ, ɑdəɫi, bɑdə, ɑdəɫ, dəɫi, bɑd, ɑdə, dəɫ...","[bɑ, b…d, b…ə, b…ɫ, b…i, ɑd, ɑ…ə, ɑ…ɫ, ɑ…i, də...","[bɑd, bɑ…ə, bɑ…ɫ, bɑ…i, b…də, b…d…ɫ, b…d…i, b…...","[bɑdə, bɑd…ɫ, bɑd…i, bɑ…əɫ, bɑ…ə…i, bɑ…ɫi, b…d...","[bɑdəɫ, bɑdə…i, bɑd…ɫi, bɑ…əɫi, b…dəɫi, ɑdəɫi,..."
399,arthritic,ɑɹθɹɪtɪk,"[ɑ, ɹ, θ, ɹ, ɪ, t, ɪ, k]","[ɑɹ, ɹθ, θɹ, ɹɪ, ɪt, tɪ, ɪk, ɑ, ɹ, θ, ɹ, ɪ, t,...","[ɑɹθ, ɹθɹ, θɹɪ, ɹɪt, ɪtɪ, tɪk, ɑɹ, ɹθ, θɹ, ɹɪ,...","[ɑɹθɹ, ɹθɹɪ, θɹɪt, ɹɪtɪ, ɪtɪk, ɑɹθ, ɹθɹ, θɹɪ, ...","[ɑɹθɹɪ, ɹθɹɪt, θɹɪtɪ, ɹɪtɪk, ɑɹθɹ, ɹθɹɪ, θɹɪt,...","[ɑɹ, ɑ…θ, ɑ…ɹ, ɑ…ɪ, ɑ…t, ɑ…ɪ, ɑ…k, ɹθ, ɹ…ɹ, ɹ…...","[ɑɹθ, ɑɹ…ɹ, ɑɹ…ɪ, ɑɹ…t, ɑɹ…ɪ, ɑɹ…k, ɑ…θɹ, ɑ…θ…...","[ɑɹθɹ, ɑɹθ…ɪ, ɑɹθ…t, ɑɹθ…ɪ, ɑɹθ…k, ɑɹ…ɹɪ, ɑɹ…ɹ...","[ɑɹθɹɪ, ɑɹθɹ…t, ɑɹθɹ…ɪ, ɑɹθɹ…k, ɑɹθ…ɪt, ɑɹθ…ɪ…..."
529,baking,beɪkɪŋ,"[b, e, ɪ, k, ɪ, ŋ]","[be, eɪ, ɪk, kɪ, ɪŋ, b, e, ɪ, k, ɪ, ŋ]","[beɪ, eɪk, ɪkɪ, kɪŋ, be, eɪ, ɪk, kɪ, ɪŋ, b, e,...","[beɪk, eɪkɪ, ɪkɪŋ, beɪ, eɪk, ɪkɪ, kɪŋ, be, eɪ,...","[beɪkɪ, eɪkɪŋ, beɪk, eɪkɪ, ɪkɪŋ, beɪ, eɪk, ɪkɪ...","[be, b…ɪ, b…k, b…ɪ, b…ŋ, eɪ, e…k, e…ɪ, e…ŋ, ɪk...","[beɪ, be…k, be…ɪ, be…ŋ, b…ɪk, b…ɪ…ɪ, b…ɪ…ŋ, b…...","[beɪk, beɪ…ɪ, beɪ…ŋ, be…kɪ, be…k…ŋ, be…ɪŋ, b…ɪ...","[beɪkɪ, beɪk…ŋ, beɪ…ɪŋ, be…kɪŋ, b…ɪkɪŋ, eɪkɪŋ,..."
...,...,...,...,...,...,...,...,...,...,...,...
5264,transitional,tɹænsɪʃənəɫ,"[t, ɹ, æ, n, s, ɪ, ʃ, ə, n, ə, ɫ]","[tɹ, ɹæ, æn, ns, sɪ, ɪʃ, ʃə, ən, nə, əɫ, t, ɹ,...","[tɹæ, ɹæn, æns, nsɪ, sɪʃ, ɪʃə, ʃən, ənə, nəɫ, ...","[tɹæn, ɹæns, ænsɪ, nsɪʃ, sɪʃə, ɪʃən, ʃənə, ənə...","[tɹæns, ɹænsɪ, ænsɪʃ, nsɪʃə, sɪʃən, ɪʃənə, ʃən...","[tɹ, t…æ, t…n, t…s, t…ɪ, t…ʃ, t…ə, t…n, ɹæ, ɹ…...","[tɹæ, tɹ…n, tɹ…s, tɹ…ɪ, tɹ…ʃ, tɹ…ə, tɹ…n, t…æn...","[tɹæn, tɹæ…s, tɹæ…ɪ, tɹæ…ʃ, tɹæ…ə, tɹæ…n, tɹ…n...","[tɹæns, tɹæn…ɪ, tɹæn…ʃ, tɹæn…ə, tɹæn…n, tɹæ…sɪ..."
4668,seventeenth,sɛvəntinθ,"[s, ɛ, v, ə, n, t, i, n, θ]","[sɛ, ɛv, və, ən, nt, ti, in, nθ, s, ɛ, v, ə, n...","[sɛv, ɛvə, vən, ənt, nti, tin, inθ, sɛ, ɛv, və...","[sɛvə, ɛvən, vənt, ənti, ntin, tinθ, sɛv, ɛvə,...","[sɛvən, ɛvənt, vənti, əntin, ntinθ, sɛvə, ɛvən...","[sɛ, s…v, s…ə, s…n, s…t, s…i, s…n, s…θ, ɛv, ɛ…...","[sɛv, sɛ…ə, sɛ…n, sɛ…t, sɛ…i, sɛ…n, sɛ…θ, s…və...","[sɛvə, sɛv…n, sɛv…t, sɛv…i, sɛv…n, sɛv…θ, sɛ…ə...","[sɛvən, sɛvə…t, sɛvə…i, sɛvə…n, sɛvə…θ, sɛv…nt..."
1461,deductive,dɪdəktəv,"[d, ɪ, d, ə, k, t, ə, v]","[dɪ, ɪd, də, ək, kt, tə, əv, d, ɪ, d, ə, k, t,...","[dɪd, ɪdə, dək, əkt, ktə, təv, dɪ, ɪd, də, ək,...","[dɪdə, ɪdək, dəkt, əktə, ktəv, dɪd, ɪdə, dək, ...","[dɪdək, ɪdəkt, dəktə, əktəv, dɪdə, ɪdək, dəkt,...","[dɪ, d…d, d…ə, d…k, d…t, d…ə, d…v, ɪd, ɪ…ə, ɪ…...","[dɪd, dɪ…ə, dɪ…k, dɪ…t, dɪ…ə, dɪ…v, d…də, d…d…...","[dɪdə, dɪd…k, dɪd…t, dɪd…ə, dɪd…v, dɪ…ək, dɪ…ə...","[dɪdək, dɪdə…t, dɪdə…ə, dɪdə…v, dɪd…kt, dɪd…k…..."
2548,highflying,haɪfɫaɪɪŋ,"[h, a, ɪ, f, ɫ, a, ɪ, ɪ, ŋ]","[ha, aɪ, ɪf, fɫ, ɫa, aɪ, ɪɪ, ɪŋ, h, a, ɪ, f, ɫ...","[haɪ, aɪf, ɪfɫ, fɫa, ɫaɪ, aɪɪ, ɪɪŋ, ha, aɪ, ɪf...","[haɪf, aɪfɫ, ɪfɫa, fɫaɪ, ɫaɪɪ, aɪɪŋ, haɪ, aɪf,...","[haɪfɫ, aɪfɫa, ɪfɫaɪ, fɫaɪɪ, ɫaɪɪŋ, haɪf, aɪfɫ...","[ha, h…ɪ, h…f, h…ɫ, h…a, h…ɪ, h…ɪ, h…ŋ, aɪ, a…...","[haɪ, ha…f, ha…ɫ, ha…a, ha…ɪ, ha…ɪ, ha…ŋ, h…ɪf...","[haɪf, haɪ…ɫ, haɪ…a, haɪ…ɪ, haɪ…ɪ, haɪ…ŋ, ha…f...","[haɪfɫ, haɪf…a, haɪf…ɪ, haɪf…ɪ, haɪf…ŋ, haɪ…ɫa..."


In [668]:
## select data type and define doc_dict
import random
if "sp_" in term_type:
    base_type = "spell"
else:
    base_type = "sound"
doc_dict = { i: x for i, x in enumerate(df[base_type]) }
## check
random.sample(doc_dict.items(), 10)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(doc_dict.items(), 10)


[(2679, 'bɹəsk'),
 (3270, 'nɑnhjumən'),
 (2502, 'æɫuviəɫ'),
 (3133, 'kəmpɫit'),
 (4149, 'wɔntɪd'),
 (4082, 'ɪnəɡɹəɫ'),
 (2876, 'iziɡoʊɪŋ'),
 (1986, 'aɪdɛntəfaɪd'),
 (954, 'ɫɔst'),
 (4209, 'ənhɝid')]

In [669]:
## select bots for analysis
enable_term_change = False # if you want to change term_type to save time and energy
if enable_term_change:
	term_type = 'sp_skippy4gram'
print(f"(changed) term_type: {term_type}")

## bot stands for 'bag-of-terms', a generalization of 'bag-of-words'
bots = [ x for x in df[term_type] if len(x) > min_bot_size ] # Crucially
import random
random.sample(bots, 1)

(changed) term_type: sn_skippy5gram


[['hwɪsp',
  'hwɪs…ɝ',
  'hwɪs…d',
  'hwɪ…pɝ',
  'hwɪ…p…d',
  'hwɪ…ɝd',
  'hw…spɝ',
  'hw…sp…d',
  'hw…s…ɝd',
  'hw…pɝd',
  'h…ɪspɝ',
  'h…ɪsp…d',
  'h…ɪs…ɝd',
  'h…ɪ…pɝd',
  'h…spɝd',
  'wɪspɝ',
  'wɪsp…d',
  'wɪs…ɝd',
  'wɪ…pɝd',
  'w…spɝd',
  'ɪspɝd',
  'wɪspɝ',
  'wɪsp…d',
  'wɪs…ɝd',
  'wɪ…pɝd',
  'w…spɝd',
  'ɪspɝd',
  'ɪspɝd',
  'hwɪs',
  'hwɪ…p',
  'hwɪ…ɝ',
  'hwɪ…d',
  'hw…sp',
  'hw…s…ɝ',
  'hw…s…d',
  'hw…pɝ',
  'hw…p…d',
  'hw…ɝd',
  'h…ɪsp',
  'h…ɪs…ɝ',
  'h…ɪs…d',
  'h…ɪ…pɝ',
  'h…ɪ…p…d',
  'h…ɪ…ɝd',
  'h…spɝ',
  'h…sp…d',
  'h…s…ɝd',
  'h…pɝd',
  'wɪsp',
  'wɪs…ɝ',
  'wɪs…d',
  'wɪ…pɝ',
  'wɪ…p…d',
  'wɪ…ɝd',
  'w…spɝ',
  'w…sp…d',
  'w…s…ɝd',
  'w…pɝd',
  'ɪspɝ',
  'ɪsp…d',
  'ɪs…ɝd',
  'ɪ…pɝd',
  'spɝd',
  'wɪsp',
  'wɪs…ɝ',
  'wɪs…d',
  'wɪ…pɝ',
  'wɪ…p…d',
  'wɪ…ɝd',
  'w…spɝ',
  'w…sp…d',
  'w…s…ɝd',
  'w…pɝd',
  'ɪspɝ',
  'ɪsp…d',
  'ɪs…ɝd',
  'ɪ…pɝd',
  'spɝd',
  'ɪspɝ',
  'ɪsp…d',
  'ɪs…ɝd',
  'ɪ…pɝd',
  'spɝd',
  'spɝd',
  'hwɪ',
  'hw…s',
  'hw…p',
  'hw…ɝ',
  

In [670]:
## generate dictionary
from gensim.corpora import Dictionary
diction = Dictionary(bots)
print(diction)
#apply_term_filtering = False
if apply_term_filtering:
    print(f"term filtering applied")
    diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
else:
    print(f"term filtering not applied")
## check
print(diction)
## generate DTM
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) > min_bot_size ] # Crucially

Dictionary<451130 unique tokens: ['i', 'ik', 'ikɹ', 'ikɹə', 'ikɹət']...>
term filtering applied
Dictionary<100000 unique tokens: ['ik', 'ikɹ', 'ik…t', 'ik…ə', 'ik…ət']...>


In [671]:
## HDP (n_topics = 15)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 15
hdp15 = gensim.models.HdpModel(corpus, diction, T = max_n_topics, random_state = 1)
vis_data15 = pyLDAvis.gensim.prepare(hdp15, corpus, diction)
pyLDAvis.display(vis_data15)

In [672]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	lang_dir_name = target_lang_dict[target_lang_key].split()[0]
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{term_type}{accent_status}.html"
	pyLDAvis.save_html(vis_data15, vis_output)

In [673]:
## topic investigation
import numpy as np
import HDP_helper
reload_module = False
if reload_module:
    import importlib
    importlib.reload(HDP_helper)

documents_topics = np.zeros([hdp15.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp15[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show  = 10
n_terms_to_pick = 60
hdp15.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp15.print_topic(topic_id, topn = n_terms_to_pick)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_pick)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * tɪk + 0.001 * ɪz + 0.001 * zd + 0.001 * æ…k + 0.001 * æ…ɪk + 0.001 * a…z + 0.001 * aɪz + 0.001 * s…ɪk + 0.001 * ɪzd + 0.001 * aɪ…d + 0.001 * jə + 0.001 * tɪd + 0.001 * ɫɝ + 0.001 * ə…ɝ + 0.001 * j…ɫ + 0.001 * p…k + 0.001 * a…zd + 0.001 * aɪzd + 0.001 * əm + 0.001 * stɪ + 0.001 * ɛ…tɪ + 0.001 * st…k + 0.001 * æt + 0.001 * jəɫ + 0.001 * ɛk + 0.001 * ə…a + 0.001 * ə…aɪ + 0.0 * ɛ…k + 0.0 * əɫɝ + 0.0 * ə…ən + 0.0 * m…k + 0.0 * ʃə + 0.0 * æt…k + 0.0 * j…ɝ + 0.0 * p…d + 0.0 * əɫ…ɪ + 0.0 * k…k + 0.0 * nɪ + 0.0 * j…ɫɝ + 0.0 * stɪk + 0.0 * eɪ…ə + 0.0 * e…ə + 0.0 * ætɪ + 0.0 * p…s + 0.0 * ə…ɪ…d + 0.0 * ɪst + 0.0 * ɪɫ + 0.0 * ætɪk + 0.0 * ktɪ + 0.0 * ə…z + 0.0 * ɛ…ɪk + 0.0 * ɛk…ɪ + 0.0 * k…əɫ + 0.0 * m…tɪ + 0.0 * ə…t…k + 0.0 * pt + 0.0 * jə…ɝ + 0.0 * t…ɪk + 0.0 * ɛkt + 0.0 * ə…ə…t
nonzero count:  1027
	0.9994: ənsəspɛktɪd
	0.9994: kəmpɫeɪsənt
	0.9994: kəmpɫeɪsənt
	0.9993: ənsəspɛktɪŋ
	0.9993: kɹɪstəɫaɪzd
	0.9993: ənpɹitɛnʃəs
	0.9993: ənɔɹɡənaɪzd
	0.9993: ənəseɪɫəbəɫ
	0.9993: ən

In [674]:
## save topic structures
#hdp.get_topics() # =/= show_topics()
#hdp.print_topics()
hdp_topics = hdp15.show_topics(num_topics = max_n_topics,
                               num_words = n_terms_to_show, formatted = False)
hdp_dict = { tid: values for tid, values in hdp_topics }
## convert to Pandas dataframe
topics_df = pd.DataFrame.from_dict(hdp_dict)
#hdp15_topics_out = f"results/terms-by-topics-raw/hdp{max_n_topics}_topics_raw.csv"
hdp15_topics_out = f"results/terms-by-topics-raw/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-topics{max_n_topics}-{term_type}{accent_status}.csv"
topics_df.to_csv(hdp15_topics_out, header = False, index = None)

In [675]:
## HDP (n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp45 = gensim.models.HdpModel(corpus, diction, T = max_n_topics, random_state = 1)
vis_data45 = pyLDAvis.gensim.prepare(hdp45, corpus, diction)
pyLDAvis.display(vis_data45)

In [676]:
## save LDAvis output as a html file
lang_dir_name = target_lang_dict[target_lang_key].split()[0]
save_LDAvis = True
if save_LDAvis:
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{term_type}{accent_status}.html"
	pyLDAvis.save_html(vis_data45, vis_output)

In [677]:
## save topic structures
#hdp.get_topics() # =/= show_topics()
#hdp.print_topics()
hdp_topics = hdp45.show_topics(num_topics = max_n_topics,
                               num_words = n_terms_to_show, formatted = False)
hdp_dict = { tid: values for tid, values in hdp_topics }
## convert to Pandas dataframe
topics_df = pd.DataFrame.from_dict(hdp_dict)
#hdp45_topics_out = f"results/terms-by-topics-raw/hdp{max_n_topics}_topics_raw.csv"
hdp45_topics_out = f"results/terms-by-topics-raw/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-topics{max_n_topics}-{term_type}{accent_status}.csv"
topics_df.to_csv(hdp45_topics_out, header = False, index = None)

In [678]:
## topic investigation
import numpy as np
import HDP_helper
documents_topics = np.zeros([hdp45.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp45[c]:
        documents_topics[topic_id][doc_id] = prob
#
n_docs_to_show  = 10
n_terms_to_pick = 10
hdp45.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp45.print_topic(topic_id, topn = n_terms_to_pick)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_pick)}")
    print(f"nonzero count: {len(probs.nonzero()[0])}")
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * t…b + 0.001 * tə…ə + 0.001 * t…bə + 0.001 * təb + 0.001 * təbə + 0.001 * ɝ…ɫ + 0.001 * t…b…ɫ + 0.001 * tə…ɫ + 0.001 * tə…əɫ + 0.001 * ɝə
nonzero count: 588
	0.9992: ɪnvəɫnɝəbəɫ
	0.9991: ɪndɑmətəbəɫ
	0.9991: ɪnhɛɹətəbəɫ
	0.9991: ənpɹɪntəbəɫ
	0.9991: pɹivɛntəbəɫ
	0.9991: ɹispɛktəbəɫ
	0.9991: pɹəzɛntəbəɫ
	0.9991: ɪnkɑmpɝəbəɫ
	0.9990: vɛɹəfaɪəbəɫ
	0.9990: kəmpɛnsəbəɫ
topic_id 1: 0.001 * v…ə + 0.001 * və + 0.001 * e…ə + 0.001 * eɪ…ə + 0.001 * ɪɫ + 0.001 * v…ɫ + 0.001 * ə…ə…t + 0.001 * e…ɫ + 0.001 * t…v + 0.001 * ə…ən
nonzero count: 473
	0.9992: dɪdʒɛnɝətɪv
	0.9992: ənəseɪɫəbəɫ
	0.9992: ənbɹeɪkəbəɫ
	0.9992: ənəveɪɫəbəɫ
	0.9991: kæɫəbɹeɪtəd
	0.9990: mɑnəveɪɫənt
	0.9990: ɪnkəndɛsənt
	0.9990: nɔɹðɝnmoʊst
	0.9990: ənseɪɫəbəɫ
	0.9990: kɑnvəɫɛsənt
topic_id 2: 0.001 * t…v + 0.001 * tɪv + 0.001 * tɪd + 0.001 * ɫɝ + 0.001 * f…d + 0.001 * aɪd + 0.001 * f…ɪd + 0.001 * jə + 0.001 * fa + 0.001 * ə…ɝ
nonzero count: 416
	0.9992: ɪnvɑɫəntɛɹi
	0.9992: ənəkɹɛdɪtɪd
	0.9992: pɹɛzədɛnʃəɫ
	0.9

In [679]:
## HDP (n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp90 = gensim.models.HdpModel(corpus, diction, T = max_n_topics,
                               random_state = 1,
                               #var_converge = 0.001
                               )
vis_data90 = pyLDAvis.gensim.prepare(hdp90, corpus, diction)
pyLDAvis.display(vis_data90)

In [680]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	lang_dir_name = target_lang_dict[target_lang_key].split()[0]
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{term_type}{accent_status}.html"
	pyLDAvis.save_html(vis_data90, vis_output)

In [681]:
## save topic structures
hdp_topics = hdp90.show_topics(num_topics = max_n_topics,
                               num_words = n_terms_to_show, formatted = False)
hdp_dict = { tid: values for tid, values in hdp_topics }
## convert to Pandas dataframe
topics_df = pd.DataFrame.from_dict(hdp_dict)
#hdp90_topics_out = f"results/terms-by-topics-raw/hdp{max_n_topics}_topics_raw.csv"
hdp90_topics_out = f"results/terms-by-topics-raw/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-topics{max_n_topics}-{term_type}{accent_status}.csv"
topics_df.to_csv(hdp90_topics_out, header = False, index = None)

In [682]:
## topic investigation
import numpy as np
import HDP_helper
documents_topics = np.zeros([hdp90.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp90[c]:
        documents_topics[topic_id][doc_id] = prob
## investigate topics
n_docs_to_show  = 10
n_terms_to_pick = 10
hdp90.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    #topic_encoding = ", ".join(hdp.show_topic(topic_id))
    topic_t = hdp90.print_topic(topic_id, topn = n_terms_to_pick)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_pick)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.002 * ɪz + 0.002 * zd + 0.002 * aɪ…d + 0.002 * a…z + 0.002 * aɪz + 0.002 * ɪzd + 0.001 * aɪzd + 0.001 * a…zd + 0.001 * iə + 0.001 * ə…a
nonzero count:  545
	0.9993: kɹɪstəɫaɪzd
	0.9992: ɹiəɫaɪzəbəɫ
	0.9992: ænəɫaɪzəbəɫ
	0.9991: pɝsənəɫaɪzd
	0.9991: ɹɛtɹoʊæktɪv
	0.9991: kəmɝʃəɫaɪzd
	0.9990: ɪmpɹæktəkəɫ
	0.9990: kɹɪstəɫaɪn
	0.9989: steɪbəɫaɪzd
	0.9989: mætɹɪɫɪniəɫ
topic_id 1: 0.003 * ɛ…i + 0.003 * ɛɹ + 0.002 * n…i + 0.002 * ɛɹi + 0.001 * ʃə + 0.001 * t…i + 0.001 * ə…ɛɹ + 0.001 * ndɪ + 0.001 * æ…d + 0.001 * ə…ɛ…i
nonzero count:  575
	0.9992: ənkəntɹoʊɫd
	0.9991: sɛntənɛɹiən
	0.9990: bɛnəfɪʃənt
	0.9990: ənɪntɛndɪd
	0.9989: æɫfənumɛɹɪk
	0.9989: æntipɝsənɛɫ
	0.9989: pɹəskɹɪpʃən
	0.9989: pɹɪkɔʃənɛɹi
	0.9989: ɹɛkəmɛndɪd
	0.9988: dɑkjəmɛnɝi
topic_id 2: 0.002 * t…b + 0.002 * t…bə + 0.002 * tə…ə + 0.002 * təb + 0.002 * ɝə + 0.002 * təbə + 0.002 * ɝ…ɫ + 0.002 * t…b…ɫ + 0.002 * t…bəɫ + 0.002 * təbəɫ
nonzero count:  314
	0.9992: ɪnvəɫnɝəbəɫ
	0.9991: ɪndɑmətəbəɫ
	0.9991: ɪnhɛɹətəbəɫ
	0.