In [1]:
#!pip install -U pandas

In [2]:
#!pip install -U pyLDAvis

In [3]:
## imports
import os, sys
import pprint as pp

In [4]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [5]:
## target language
## a key must be part of a file name 
target_lang_dict = {    'ar'    : 'Arabic',
                        'de'    : 'German',
                        'de_N_only' : 'German Nouns',
                        'de_non_N_only' : 'German Non-nouns',
                        'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'en_N_only' : 'English noun (WN)',
                        'en_V_only' : 'English verb (WN)',
                        'en_A_only' : 'English adj (WN)',
                        'en_R_only' : 'English adv (WN)',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
target_lang_keys = [    'ar', 'de', 'de_N_only', 'de_non_N_only',
                        'en_US', 'en_UK', 'en_N_only', 'en_V_only', 'en_A_only', 'en_R_only',
                        'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC',
                        'is', 'nl', 'ro', 'sw' ]
#
target_lang_key  = target_lang_keys[6]
print(f"target lang: {target_lang_dict[target_lang_key]} ({target_lang_key})")

target lang: English noun (WN) (en_N_only)


In [40]:
## term settings
term_classes       = [ 'spell', 'sound' ]
term_class         = term_classes[1]
ngram_is_inclusive = True
gap_mark           = "…"
term_is_skippy     = True
n_for_ngram        = 4
print(f"term_class: {term_class}")
print(f"term_is_skippy: {term_is_skippy}")
print(f"n_for_ngram: {n_for_ngram}")
## define term_type
if term_class == 'spell':
    if term_is_skippy:
        term_type = f"sp_skippy{n_for_ngram}gram"
    else:
        term_type = f"sp_{n_for_ngram}gram"
else:
    if term_is_skippy:
        term_type = f"sn_skippy{n_for_ngram}gram"
    else:
        term_type = f"sn_{n_for_ngram}gram"
## check
print(f"term_type: {term_type}")
## doc settings
max_doc_size       = 10
min_doc_size       =  5
### boundary handling
add_boundary       = True
boundary_mark      = "#"
### accent handling
suppress_accents   = True
accent_marks       = [ "ˈ", "ˌ" ] 
if term_class == 'sound':
    if suppress_accents:
        accent_status = "unaccented-"
    else:
        accent_stratus = "accented-"
else:
    accent_status = ""
print(f"accent: {accent_status}")

term_class: sound
term_is_skippy: True
n_for_ngram: 4
term_type: sn_skippy4gram
accent: -unaccented-


In [7]:
## LDA/HDP
apply_term_filtering = True
## The following parameters need to be relatively large to prevent "Row sum not equal 1" error
term_minfreq         = 3
abuse_threshold      = 0.03
min_bot_size         = 3

In [8]:
## sampling
source_sampling          = True
source_sampling_rate     = 0.5
source_sampling_max_size = 30000
second_sampling          = False
second_sampling_rate     = 0.7

In [9]:
## set target files
import glob
data_dir1     = "data/open-dict-ipa/data1/"
data_dir2     = "data/open-dict-ipa/data1a/"
data_dir3     = "data/wn3/"
target_files = glob.glob(f"{data_dir1}/*")
target_files2 = glob.glob(f"{data_dir2}/*")
target_files.extend(target_files2)
target_files3 = glob.glob(f"{data_dir3}/*")
target_files.extend(target_files3)
target_files = sorted([ file for file in target_files if ".csv" in file ])
pp.pprint(target_files)

['data/open-dict-ipa/data1/ar.csv.gz',
 'data/open-dict-ipa/data1/de.csv.gz',
 'data/open-dict-ipa/data1/en_UK.csv.gz',
 'data/open-dict-ipa/data1/en_US.csv.gz',
 'data/open-dict-ipa/data1/eo.csv.gz',
 'data/open-dict-ipa/data1/es_ES.csv.gz',
 'data/open-dict-ipa/data1/es_MX.csv.gz',
 'data/open-dict-ipa/data1/fa.csv.gz',
 'data/open-dict-ipa/data1/fi.csv.gz',
 'data/open-dict-ipa/data1/fr_FR.csv.gz',
 'data/open-dict-ipa/data1/fr_QC.csv.gz',
 'data/open-dict-ipa/data1/is.csv.gz',
 'data/open-dict-ipa/data1/ja.csv.gz',
 'data/open-dict-ipa/data1/jam.csv.gz',
 'data/open-dict-ipa/data1/ma.csv.gz',
 'data/open-dict-ipa/data1/nb.csv.gz',
 'data/open-dict-ipa/data1/nl.csv.gz',
 'data/open-dict-ipa/data1/or.csv.gz',
 'data/open-dict-ipa/data1/sv.csv.gz',
 'data/open-dict-ipa/data1/sw.csv.gz',
 'data/open-dict-ipa/data1/vi_C.csv.gz',
 'data/open-dict-ipa/data1/vi_N.csv.gz',
 'data/open-dict-ipa/data1/vi_S.csv.gz',
 'data/open-dict-ipa/data1/yue.csv.gz',
 'data/open-dict-ipa/data1/zh_hans.csv

In [10]:
## get source data from files
import pandas as pd
import gzip

#target_language_key = "en_US" # can be changed to get other languages
file = [ f for f in target_files if target_lang_key in f ][0]
print(f"processing: {file}")

if file.endswith(".gz"):
    with gzip.open(file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['spell', 'sound'])
else:
    with open(file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['spell', 'sound'])
try:
    sounds = raw_df['sound'].apply(lambda x: x.strip('/') )
    sounds = [ x.split("/,")[0] for x in sounds ] # picks up only the first of multiple entries
    raw_df['sound'] = sounds
except AttributeError:
    pass
#
raw_df.sample(10)

processing: data/wn3/en_N_only.csv


Unnamed: 0,spell,sound
9776,phrasing,ˈfɹeɪzɪŋ
13696,trip,ˈtɹɪp
12828,struggle,ˈstɹəɡəɫ
7179,jolt,ˈdʒoʊɫt
1846,bus,ˈbəs
3248,credence,ˈkɹidəns
3588,deduction,dɪˈdəkʃən
7310,kingfisher,ˈkɪŋˌfɪʃɝ
10307,prize,ˈpɹaɪz
1734,bronco,ˈbɹɑŋkoʊ


In [11]:
## source sampling
len(raw_df)
if source_sampling:
	print(f"source sampling applied")
	if len(raw_df) >= source_sampling_max_size:
		raw_df = raw_df.sample(source_sampling_max_size)
	else:
		raw_df = raw_df.sample(round(len(raw_df) * source_sampling_rate))
## remove accent marking
if suppress_accents:
	raw_df['sound'] = raw_df['sound'].apply(lambda x: "".join([ y for y in list(x) if y not in accent_marks ]))
## add boudary marks
if add_boundary:
	raw_df['spell'] = raw_df['spell'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
	raw_df['sound'] = raw_df['sound'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
#
print(raw_df)

source sampling applied
                spell         sound
5255         #flurry#        #fɫɝi#
7705        #linseed#      #ɫɪnsid#
4986       #felicity#    #fɪɫɪsəti#
11861  #sharpshooter#    #ʃɑɹpʃutɝ#
12908      #suckling#      #səkɫɪŋ#
...               ...           ...
9791         #picker#        #pɪkɝ#
8988           #nous#         #nus#
13308   #thalidomide#  #θəɫɪdəmaɪd#
9282        #overlap#     #oʊvɝɫæp#
7674         #limber#       #ɫɪmbɝ#

[7308 rows x 2 columns]


In [12]:
## generate 1-grams for spell and sound
## spell
raw_df['sp_1gram'] = raw_df['spell'].apply(lambda x: list(str(x)))
# add column of size
raw_df['sp_size'] = raw_df['sp_1gram'].apply(lambda x: len(x))
# add column of count of '-' inside
raw_df['hyphen'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("-"))
# add column of count of '.' inside
raw_df['period'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("."))
## sound
# takes the first entry, removes '/' around
try:
    raw_df['sn_1gram'] = raw_df['sound'].apply(lambda x: list(x) )
except TypeError:
    pass
# add column of size
try:
    raw_df['sn_size'] = raw_df['sn_1gram'].apply(lambda x: len(x))
except KeyError:
    pass
## check
raw_df

Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
5255,#flurry#,#fɫɝi#,"[#, f, l, u, r, r, y, #]",8,0,0,"[#, f, ɫ, ɝ, i, #]",6
7705,#linseed#,#ɫɪnsid#,"[#, l, i, n, s, e, e, d, #]",9,0,0,"[#, ɫ, ɪ, n, s, i, d, #]",8
4986,#felicity#,#fɪɫɪsəti#,"[#, f, e, l, i, c, i, t, y, #]",10,0,0,"[#, f, ɪ, ɫ, ɪ, s, ə, t, i, #]",10
11861,#sharpshooter#,#ʃɑɹpʃutɝ#,"[#, s, h, a, r, p, s, h, o, o, t, e, r, #]",14,0,0,"[#, ʃ, ɑ, ɹ, p, ʃ, u, t, ɝ, #]",10
12908,#suckling#,#səkɫɪŋ#,"[#, s, u, c, k, l, i, n, g, #]",10,0,0,"[#, s, ə, k, ɫ, ɪ, ŋ, #]",8
...,...,...,...,...,...,...,...,...
9791,#picker#,#pɪkɝ#,"[#, p, i, c, k, e, r, #]",8,0,0,"[#, p, ɪ, k, ɝ, #]",6
8988,#nous#,#nus#,"[#, n, o, u, s, #]",6,0,0,"[#, n, u, s, #]",5
13308,#thalidomide#,#θəɫɪdəmaɪd#,"[#, t, h, a, l, i, d, o, m, i, d, e, #]",13,0,0,"[#, θ, ə, ɫ, ɪ, d, ə, m, a, ɪ, d, #]",12
9282,#overlap#,#oʊvɝɫæp#,"[#, o, v, e, r, l, a, p, #]",9,0,0,"[#, o, ʊ, v, ɝ, ɫ, æ, p, #]",9


In [13]:
## filtering raw_data by size
print(f"term_type: {term_type}")
if "sp_" in term_type:
    df_filtered = raw_df[ (raw_df['sp_size'] <= max_doc_size) & (raw_df['sp_size'] >= min_doc_size) & (raw_df['hyphen'] == 0) & (raw_df['period'] == 0) ]
else:
    df_filtered = raw_df[ (raw_df['sn_size'] <= max_doc_size) & (raw_df['sn_size'] >= min_doc_size) ]
#
df_filtered

term_type: sn_skippy4gram


Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
5255,#flurry#,#fɫɝi#,"[#, f, l, u, r, r, y, #]",8,0,0,"[#, f, ɫ, ɝ, i, #]",6
7705,#linseed#,#ɫɪnsid#,"[#, l, i, n, s, e, e, d, #]",9,0,0,"[#, ɫ, ɪ, n, s, i, d, #]",8
4986,#felicity#,#fɪɫɪsəti#,"[#, f, e, l, i, c, i, t, y, #]",10,0,0,"[#, f, ɪ, ɫ, ɪ, s, ə, t, i, #]",10
11861,#sharpshooter#,#ʃɑɹpʃutɝ#,"[#, s, h, a, r, p, s, h, o, o, t, e, r, #]",14,0,0,"[#, ʃ, ɑ, ɹ, p, ʃ, u, t, ɝ, #]",10
12908,#suckling#,#səkɫɪŋ#,"[#, s, u, c, k, l, i, n, g, #]",10,0,0,"[#, s, ə, k, ɫ, ɪ, ŋ, #]",8
...,...,...,...,...,...,...,...,...
7597,#leukemia#,#ɫukimiə#,"[#, l, e, u, k, e, m, i, a, #]",10,0,0,"[#, ɫ, u, k, i, m, i, ə, #]",9
9791,#picker#,#pɪkɝ#,"[#, p, i, c, k, e, r, #]",8,0,0,"[#, p, ɪ, k, ɝ, #]",6
8988,#nous#,#nus#,"[#, n, o, u, s, #]",6,0,0,"[#, n, u, s, #]",5
9282,#overlap#,#oʊvɝɫæp#,"[#, o, v, e, r, l, a, p, #]",9,0,0,"[#, o, ʊ, v, ɝ, ɫ, æ, p, #]",9


In [14]:
## define df after second sampling if any
len(df_filtered)
if second_sampling:
    df = df_filtered.sample(round(len(df_filtered) * second_sampling_rate))
else:
    df = df_filtered
len(df)

5446

In [15]:
## spell 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)

if term_class == 'spell':
    sp_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_2grams):
            g.extend(list(df['sp_1gram'])[i])
    ## add sp_2gram
    df['sp_2gram'] = sp_2grams

In [16]:
## spell 3grams
import ngrams
if n_for_ngram > 2 and term_class == 'spell':
    sp_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_3grams):
            g.extend(list(df['sp_2gram'])[i])
    ## add sp_2gram
    df['sp_3gram'] = sp_3grams

In [17]:
## spell 4grams
import ngrams
if n_for_ngram > 3 and term_class == 'spell':
    sp_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_4grams):
            g.extend(list(df['sp_3gram'])[i])
    ## add sp_2gram
    df['sp_4gram'] = sp_4grams

In [18]:
## spell skippy2gram
import ngrams_skippy
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams_skippy)
#
if term_class == 'spell':
    sp_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy2grams):
            g.extend(list(df['sp_1gram'])[i])
    #
    df['sp_skippy2gram'] = sp_skippy2grams

In [19]:
## spell skippy3gram
import ngrams_skippy
if n_for_ngram > 2 and term_class == 'spell':
    sp_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy3grams):
            g.extend(list(df['sp_skippy2gram'])[i])
    #
    df['sp_skippy3gram'] = sp_skippy3grams

In [20]:
## spell skippy4gram
import ngrams_skippy
if n_for_ngram > 3 and term_class == 'spell':
    sp_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy4grams):
            g.extend(list(df['sp_skippy3gram'])[i])
    #
    df['sp_skippy4gram'] = sp_skippy4grams

In [21]:
## sound 2grams
import ngrams
reload_module = False
if reload_module:
    import importlib
    importlib.reload(ngrams)
#
if term_class == 'sound':
    sn_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_2grams):
            g.extend(list(df['sn_1gram'])[i])
    ## add sn_2gram
    df['sn_2gram'] = sn_2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_2gram'] = sn_2grams


In [22]:
## sound 3grams
import ngrams
if n_for_ngram > 2 and term_class == 'sound':
    sn_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_3gram
    df['sn_3gram'] = sn_3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_3gram'] = sn_3grams


In [23]:
## sound 4grams
import ngrams
if n_for_ngram > 3 and term_class == 'sound':
    sn_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_4gram
    df['sn_4gram'] = sn_3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_4gram'] = sn_3grams


In [24]:
## sound skippy2gram
import ngrams_skippy
if term_class == 'sound':
    sn_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy2grams):
            g.extend(list(df['sn_1gram'])[i])
    #
    df['sn_skippy2gram'] = sn_skippy2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy2gram'] = sn_skippy2grams


In [25]:
## sound skippy3gram
import ngrams_skippy
if n_for_ngram > 2 and term_class == 'sound':
    sn_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy3grams):
            g.extend(list(df['sn_skippy2gram'])[i])
    #
    df['sn_skippy3gram'] = sn_skippy3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy3gram'] = sn_skippy3grams


In [26]:
## sound skippy4gram
import ngrams_skippy
if n_for_ngram > 3 and term_class == 'sound':
    sn_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, check = False) for x in df['sn_1gram'] ]
    #
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy4grams):
            g.extend(list(df['sn_skippy3gram'])[i])
    #
    df['sn_skippy4gram'] = sn_skippy4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sn_skippy4gram'] = sn_skippy4grams


In [27]:
## check df
dropped_vars = [ 'sp_size', 'hyphen', 'period', 'sn_size' ]
if term_class == 'spell':
    extra = [ 'sn_1gram', 'sn_2gram', 'sn_3gram', 'sn_4gram',
             'sn_skippy2gram', 'sn_skippy3gram', 'sn_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]
else:
    extra = [ 'sp_1gram', 'sp_2gram', 'sp_3gram', 'sp_4gram',
             'sp_skippy2gram', 'sp_skippy3gram', 'sp_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]    
#
df[target_vars]

Unnamed: 0,spell,sound,sn_1gram,sn_2gram,sn_3gram,sn_4gram,sn_skippy2gram,sn_skippy3gram,sn_skippy4gram
5255,#flurry#,#fɫɝi#,"[#, f, ɫ, ɝ, i, #]","[#f, fɫ, ɫɝ, ɝi, i#, #, f, ɫ, ɝ, i, #]","[#fɫ, fɫɝ, ɫɝi, ɝi#, #f, fɫ, ɫɝ, ɝi, i#, #, f,...","[#fɫ, fɫɝ, ɫɝi, ɝi#, #f, fɫ, ɫɝ, ɝi, i#, #, f,...","[#f, #_ɫ, #_ɝ, #_i, #_#, fɫ, f_ɝ, f_i, f_#, ɫɝ...","[#fɫ, #f_ɝ, #f_i, #f_#, #_ɫɝ, #_ɫ_i, #_ɫ_#, #_...","[#fɫɝ, #fɫ_i, #fɫ_#, #f_ɝi, #f_ɝ_#, #f_i#, #_ɫ..."
7705,#linseed#,#ɫɪnsid#,"[#, ɫ, ɪ, n, s, i, d, #]","[#ɫ, ɫɪ, ɪn, ns, si, id, d#, #, ɫ, ɪ, n, s, i,...","[#ɫɪ, ɫɪn, ɪns, nsi, sid, id#, #ɫ, ɫɪ, ɪn, ns,...","[#ɫɪ, ɫɪn, ɪns, nsi, sid, id#, #ɫ, ɫɪ, ɪn, ns,...","[#ɫ, #_ɪ, #_n, #_s, #_i, #_d, #_#, ɫɪ, ɫ_n, ɫ_...","[#ɫɪ, #ɫ_n, #ɫ_s, #ɫ_i, #ɫ_d, #ɫ_#, #_ɪn, #_ɪ_...","[#ɫɪn, #ɫɪ_s, #ɫɪ_i, #ɫɪ_d, #ɫɪ_#, #ɫ_ns, #ɫ_n..."
4986,#felicity#,#fɪɫɪsəti#,"[#, f, ɪ, ɫ, ɪ, s, ə, t, i, #]","[#f, fɪ, ɪɫ, ɫɪ, ɪs, sə, ət, ti, i#, #, f, ɪ, ...","[#fɪ, fɪɫ, ɪɫɪ, ɫɪs, ɪsə, sət, əti, ti#, #f, f...","[#fɪ, fɪɫ, ɪɫɪ, ɫɪs, ɪsə, sət, əti, ti#, #f, f...","[#f, #_ɪ, #_ɫ, #_s, #_ə, #_t, #_i, #_#, fɪ, f_...","[#fɪ, #f_ɫ, #f_ɪ, #f_s, #f_ə, #f_t, #f_i, #f_#...","[#fɪɫ, #fɪ_ɪ, #fɪ_s, #fɪ_ə, #fɪ_t, #fɪ_i, #fɪ_..."
11861,#sharpshooter#,#ʃɑɹpʃutɝ#,"[#, ʃ, ɑ, ɹ, p, ʃ, u, t, ɝ, #]","[#ʃ, ʃɑ, ɑɹ, ɹp, pʃ, ʃu, ut, tɝ, ɝ#, #, ʃ, ɑ, ...","[#ʃɑ, ʃɑɹ, ɑɹp, ɹpʃ, pʃu, ʃut, utɝ, tɝ#, #ʃ, ʃ...","[#ʃɑ, ʃɑɹ, ɑɹp, ɹpʃ, pʃu, ʃut, utɝ, tɝ#, #ʃ, ʃ...","[#ʃ, #_ɑ, #_ɹ, #_p, #_ʃ, #_u, #_t, #_ɝ, #_#, ʃ...","[#ʃɑ, #ʃ_ɹ, #ʃ_p, #ʃ_ʃ, #ʃ_u, #ʃ_t, #ʃ_ɝ, #ʃ_#...","[#ʃɑɹ, #ʃɑ_p, #ʃɑ_ʃ, #ʃɑ_u, #ʃɑ_t, #ʃɑ_ɝ, #ʃɑ_..."
12908,#suckling#,#səkɫɪŋ#,"[#, s, ə, k, ɫ, ɪ, ŋ, #]","[#s, sə, ək, kɫ, ɫɪ, ɪŋ, ŋ#, #, s, ə, k, ɫ, ɪ,...","[#sə, sək, əkɫ, kɫɪ, ɫɪŋ, ɪŋ#, #s, sə, ək, kɫ,...","[#sə, sək, əkɫ, kɫɪ, ɫɪŋ, ɪŋ#, #s, sə, ək, kɫ,...","[#s, #_ə, #_k, #_ɫ, #_ɪ, #_ŋ, #_#, sə, s_k, s_...","[#sə, #s_k, #s_ɫ, #s_ɪ, #s_ŋ, #s_#, #_ək, #_ə_...","[#sək, #sə_ɫ, #sə_ɪ, #sə_ŋ, #sə_#, #s_kɫ, #s_k..."
...,...,...,...,...,...,...,...,...,...
7597,#leukemia#,#ɫukimiə#,"[#, ɫ, u, k, i, m, i, ə, #]","[#ɫ, ɫu, uk, ki, im, mi, iə, ə#, #, ɫ, u, k, i...","[#ɫu, ɫuk, uki, kim, imi, miə, iə#, #ɫ, ɫu, uk...","[#ɫu, ɫuk, uki, kim, imi, miə, iə#, #ɫ, ɫu, uk...","[#ɫ, #_u, #_k, #_i, #_m, #_ə, #_#, ɫu, ɫ_k, ɫ_...","[#ɫu, #ɫ_k, #ɫ_i, #ɫ_m, #ɫ_ə, #ɫ_#, #_uk, #_u_...","[#ɫuk, #ɫu_i, #ɫu_m, #ɫu_ə, #ɫu_#, #ɫ_ki, #ɫ_k..."
9791,#picker#,#pɪkɝ#,"[#, p, ɪ, k, ɝ, #]","[#p, pɪ, ɪk, kɝ, ɝ#, #, p, ɪ, k, ɝ, #]","[#pɪ, pɪk, ɪkɝ, kɝ#, #p, pɪ, ɪk, kɝ, ɝ#, #, p,...","[#pɪ, pɪk, ɪkɝ, kɝ#, #p, pɪ, ɪk, kɝ, ɝ#, #, p,...","[#p, #_ɪ, #_k, #_ɝ, #_#, pɪ, p_k, p_ɝ, p_#, ɪk...","[#pɪ, #p_k, #p_ɝ, #p_#, #_ɪk, #_ɪ_ɝ, #_ɪ_#, #_...","[#pɪk, #pɪ_ɝ, #pɪ_#, #p_kɝ, #p_k_#, #p_ɝ#, #_ɪ..."
8988,#nous#,#nus#,"[#, n, u, s, #]","[#n, nu, us, s#, #, n, u, s, #]","[#nu, nus, us#, #n, nu, us, s#, #, n, u, s, #,...","[#nu, nus, us#, #n, nu, us, s#, #, n, u, s, #,...","[#n, #_u, #_s, #_#, nu, n_s, n_#, us, u_#, s#,...","[#nu, #n_s, #n_#, #_us, #_u_#, #_s#, nus, nu_#...","[#nus, #nu_#, #n_s#, #_u_s#, nus#, #nu, #n_s, ..."
9282,#overlap#,#oʊvɝɫæp#,"[#, o, ʊ, v, ɝ, ɫ, æ, p, #]","[#o, oʊ, ʊv, vɝ, ɝɫ, ɫæ, æp, p#, #, o, ʊ, v, ɝ...","[#oʊ, oʊv, ʊvɝ, vɝɫ, ɝɫæ, ɫæp, æp#, #o, oʊ, ʊv...","[#oʊ, oʊv, ʊvɝ, vɝɫ, ɝɫæ, ɫæp, æp#, #o, oʊ, ʊv...","[#o, #_ʊ, #_v, #_ɝ, #_ɫ, #_æ, #_p, #_#, oʊ, o_...","[#oʊ, #o_v, #o_ɝ, #o_ɫ, #o_æ, #o_p, #o_#, #_ʊv...","[#oʊv, #oʊ_ɝ, #oʊ_ɫ, #oʊ_æ, #oʊ_p, #oʊ_#, #o_v..."


In [28]:
## select data type and define doc_dict
import random
if "sp_" in term_type:
    base_type = "spell"
else:
    base_type = "sound"
doc_dict = { i: x for i, x in enumerate(df[base_type]) }
## check
random.sample(doc_dict.items(), 10)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(doc_dict.items(), 10)


[(1403, '#dʒoʊɫt#'),
 (3715, '#fɹuɪʃən#'),
 (1451, '#bɫəstɝ#'),
 (3381, '#dɪnaɪəɫ#'),
 (3127, '#ɫændɪŋ#'),
 (2651, '#ɫɑkɝ#'),
 (859, '#pəsɪvɪti#'),
 (4902, '#ænəm#'),
 (1760, '#sɪnɪk#'),
 (610, '#twɛni#')]

In [29]:
## select bots for analysis
enable_term_change = False # if you want to change term_type to save time and energy
if enable_term_change:
	term_type = 'sp_skippy4gram'
print(f"(changed) term_type: {term_type}")

## bot stands for 'bag-of-terms', a generalization of 'bag-of-words'
bots = [ x for x in df[term_type] if len(x) > min_bot_size ] # Crucially
import random
random.sample(bots, 3)

(changed) term_type: sn_skippy4gram


[['#mɔɫ',
  '#mɔ_ɝ',
  '#mɔ_#',
  '#m_ɫɝ',
  '#m_ɫ_#',
  '#m_ɝ#',
  '#_ɔ_ɫɝ',
  '#_ɔɫ_#',
  '#_ɔ_ɝ#',
  '#_ɫ_ɝ#',
  'mɔɫɝ',
  'mɔɫ_#',
  'mɔ_ɝ#',
  'm_ɫ_ɝ#',
  'ɔɫɝ#',
  '#mɔ',
  '#m_ɫ',
  '#m_ɝ',
  '#m_#',
  '#_ɔɫ',
  '#_ɔ_ɝ',
  '#_ɔ_#',
  '#_ɫɝ',
  '#_ɫ_#',
  '#_ɝ#',
  'mɔɫ',
  'mɔ_ɝ',
  'mɔ_#',
  'm_ɫɝ',
  'm_ɫ_#',
  'm_ɝ#',
  'ɔɫɝ',
  'ɔɫ_#',
  'ɔ_ɝ#',
  'ɫɝ#',
  '#m',
  '#_ɔ',
  '#_ɫ',
  '#_ɝ',
  '#_#',
  'mɔ',
  'm_ɫ',
  'm_ɝ',
  'm_#',
  'ɔɫ',
  'ɔ_ɝ',
  'ɔ_#',
  'ɫɝ',
  'ɫ_#',
  'ɝ#',
  '#',
  'm',
  'ɔ',
  'ɫ',
  'ɝ',
  '#'],
 ['#tɹɪ',
  '#tɹ_p',
  '#tɹ_#',
  '#t_ɪp',
  '#t_ɪ_#',
  '#t_p#',
  '#_ɹ_ɪp',
  '#_ɹɪ_#',
  '#_ɹ_p#',
  '#_ɪ_p#',
  'tɹɪp',
  'tɹɪ_#',
  'tɹ_p#',
  't_ɪ_p#',
  'ɹɪp#',
  '#tɹ',
  '#t_ɪ',
  '#t_p',
  '#t_#',
  '#_ɹɪ',
  '#_ɹ_p',
  '#_ɹ_#',
  '#_ɪp',
  '#_ɪ_#',
  '#_p#',
  'tɹɪ',
  'tɹ_p',
  'tɹ_#',
  't_ɪp',
  't_ɪ_#',
  't_p#',
  'ɹɪp',
  'ɹɪ_#',
  'ɹ_p#',
  'ɪp#',
  '#t',
  '#_ɹ',
  '#_ɪ',
  '#_p',
  '#_#',
  'tɹ',
  't_ɪ',
  't_p',
  't_#',
  'ɹɪ',
  'ɹ

In [30]:
## generate dictionary
from gensim.corpora import Dictionary
diction = Dictionary(bots)
print(diction)

if apply_term_filtering:
    print(f"term filtering applied")
    diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
else:
    print(f"term filtering not applied")
print(diction)

## generate DTM
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) > min_bot_size ] # Crucially

Dictionary(222016 unique tokens: ['#', '#_#', '#_i', '#_i#', '#_ɝ']...)
term filtering applied
Dictionary(56785 unique tokens: ['#_ɝ_i#', '#_ɝi', '#_ɫ_i', '#_ɫ_i#', '#_ɫ_ɝi']...)


In [31]:
## HDP (n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [32]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{term_type}.html"
	pyLDAvis.save_html(vis_data, output)

In [33]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

## investigate topics
n_docs_to_show  = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    #topic_encoding = ", ".join(hdp.show_topic(topic_id))
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * aʊ + 0.001 * aʊ_# + 0.001 * m_t_# + 0.0 * s_d + 0.0 * #m_t + 0.0 * #_aʊ + 0.0 * ɑ_t
nonzero count:  652
	0.9965: #ətɹɑsəti#
	0.9965: #fɫaʊtɪst#
	0.9965: #fəsɪɫɪti#
	0.9964: #fɹistoʊn#
	0.9964: #æmpɫətud#
	0.9964: #məskitoʊ#
	0.9963: #pɹɑfətɪɹ#
	0.9963: #mɪsɹidɪŋ#
	0.9963: #mɑɹɡɝitə#
	0.9962: #fɹəktoʊs#
topic_id 1: 0.001 * əs# + 0.001 * n_s# + 0.001 * #_ɫ_ə + 0.001 * #_ɫ_ə_# + 0.001 * #_n_s + 0.001 * s_n_# + 0.001 * ɪ_n_#
nonzero count:  408
	0.9966: #koʊɫdnəs#
	0.9966: #səɫɛɹəti#
	0.9965: #kɫɛmənsi#
	0.9965: #boʊɫdnəs#
	0.9965: #əɫɑɹməst#
	0.9965: #sənɪɫəti#
	0.9964: #kɫeɪmənt#
	0.9964: #ɪkstɝnəɫ#
	0.9964: #ædvɝsɪti#
	0.9963: #kɑnfɝəns#
topic_id 2: 0.001 * #p_ə + 0.001 * #p_ə_# + 0.001 * #_ɪ_ɫ# + 0.001 * #p_t + 0.001 * #p_n + 0.0 * p_ɫ + 0.0 * p_t_#
nonzero count:  518
	0.9967: #pɛdəntɹi#
	0.9965: #pɫəndɝɪŋ#
	0.9964: #pɹɛfɝəns#
	0.9964: #bæɹəkeɪd#
	0.9963: #pɛzəntɹi#
	0.9963: #æɫəkeɪtɝ#
	0.9963: #sɪɫəkeɪt#
	0.9961: #ænəteɪtɝ#
	0.9961: #kəɫɛktɪv#
	0.9961: #pɫeɪtɫət#


In [34]:
## HDP (n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [35]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{accent_status}{term_type}.html"
	pyLDAvis.save_html(vis_data, output)

In [36]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: {len(probs.nonzero()[0])}")
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.0 * #d_ə + 0.0 * #d_ə_# + 0.0 * d_s + 0.0 * #_ɹ_ɝ + 0.0 * dʒ_# + 0.0 * #_ɑɹ_# + 0.0 * ʃə
nonzero count: 1837
	0.9973: #dɪtɛkʃən#
	0.9972: #dɪtɛnʃən#
	0.9971: #dɪmɛnʃən#
	0.9971: #daɪnəsti#
	0.9971: #kəmpɹɛsɝ#
	0.9971: #səpɹɛʃən#
	0.9970: #dɛnɪstɹi#
	0.9970: #dɛntɪʃən#
	0.9970: #pɑɹtəkəɫ#
	0.9970: #dɪpɹɛʃən#
topic_id 1: 0.0 * ɹ_k + 0.0 * #b_ɪ + 0.0 * n_ɪ + 0.0 * ɪ_n# + 0.0 * #b_ɪ_# + 0.0 * #_ɫ_ɪ_# + 0.0 * #_ɪs_#
nonzero count: 1008
	0.9966: #bɛɹətoʊn#
	0.9966: #ɪkstɝnəɫ#
	0.9965: #ɪtɝeɪʃən#
	0.9965: #mɔɹəɫɪst#
	0.9964: #æɫəkeɪtɝ#
	0.9964: #fɹæŋkɫɪn#
	0.9964: #səɫɪsətɝ#
	0.9963: #bɹeɪkmən#
	0.9963: #ænəkdoʊt#
	0.9963: #kwɪkənɪŋ#
topic_id 2: 0.001 * ɛ_ɝ + 0.001 * #_ɛ_ɝ + 0.001 * ɛ_ɝ# + 0.001 * #s_ɝ + 0.001 * #s_ɝ# + 0.0 * #_ɛ_ɝ# + 0.0 * #_n_ɝ#
nonzero count: 673
	0.9965: #kɫeɪmənt#
	0.9963: #fɛmənɪst#
	0.9963: #kɪɫəmitɝ#
	0.9963: #sɛdəmənt#
	0.9963: #mɪɫəmitɝ#
	0.9963: #fɹæɡmənt#
	0.9962: #ɪɫɪtɝəsi#
	0.9962: #eɪɫitɪst#
	0.9962: #ɪndɛntʃɝ#
	0.9962: #mæsəkɪst#
topic_id 3: 0.0 

In [37]:
## HDP (n_topics = 15)
import gensim.models
import pyLDAvis.gensim

max_n_topics = 15
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [38]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{term_type}.html"
	pyLDAvis.save_html(vis_data, output)

In [39]:
## topic investigation
import numpy as np
import HDP_helper
reload_module = True
if reload_module:
    import importlib
    importlib.reload(HDP_helper)

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.0 * f_ɪ + 0.0 * tʃ_# + 0.0 * n_ɪ + 0.0 * #_tʃ + 0.0 * dɪ_# + 0.0 * f_ɪ_# + 0.0 * dɪ
nonzero count:  2529
	0.9972: #skɹɪptʃɝ#
	0.9972: #ɫaɪtənɪŋ#
	0.9971: #mɑnətoʊn#
	0.9970: #dɛntɪʃən#
	0.9970: #faʊndɫɪŋ#
	0.9970: #bɛɹətoʊn#
	0.9969: #dɪtɝmənt#
	0.9969: #waɪdənɪŋ#
	0.9969: #pɹəsidɪŋ#
	0.9969: #dɹɛntʃɪŋ#
topic_id 1: 0.0 * #p_ə + 0.0 * dʒ_# + 0.0 * #_dʒ + 0.0 * d_i + 0.0 * #_ɹ_i# + 0.0 * #_ɹ_ɝ + 0.0 * #_ɛ_i
nonzero count:  1602
	0.9969: #pɛdəntɹi#
	0.9968: #səɫɛɹəti#
	0.9968: #kæpəɫɛɹi#
	0.9968: #dɛnɪstɹi#
	0.9968: #ɹiɫɪdʒən#
	0.9967: #kɑɹpəntɝ#
	0.9967: #tɛndənsi#
	0.9967: #ɹiɫɪdʒəs#
	0.9966: #ətɹɑsəti#
	0.9966: #pɹaɪvəsi#
topic_id 2: 0.0 * #_n_s + 0.0 * s_s + 0.0 * tɝ# + 0.0 * #s_ɝ + 0.0 * əs# + 0.0 * #s_t + 0.0 * #_n_ɝ#
nonzero count:  1733
	0.9970: #hoʊɫinəs#
	0.9969: #koʊɫdnəs#
	0.9968: #əɫɑɹməst#
	0.9968: #əpɹɛntəs#
	0.9967: #pɹɛfɝəns#
	0.9966: #bæɹəkeɪd#
	0.9966: #stɑɹdəst#
	0.9966: #ɛmptinəs#
	0.9966: #fɛmənɪst#
	0.9966: #ɪnspɛktɝ#
topic_id 3: 0.0 * ɫæ + 0.0 * æn_# 