In [269]:
#!pip install -U pandas

In [270]:
#!pip install -U pyLDAvis

In [271]:
## imports
import os, sys
import pprint as pp

In [272]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [273]:
## target language
## a key must be part of a file name 
target_lang_dict = {    'ar'    : 'Arabic',
                        'de'    : 'German',
                        'de_N_only' : 'German Nouns',
                        'de_non_N_only' : 'German Non-nouns',
                        'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'en_N_only' : 'English noun (WN)',
                        'en_V_only' : 'English verb (WN)',
                        'en_A_only' : 'English adj (WN)',
                        'en_R_only' : 'English adv (WN)',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
target_lang_keys = [    'ar', 'de', 'de_N_only', 'de_non_N_only',
                        'en_US', 'en_UK', 'en_N_only', 'en_V_only', 'en_A_only', 'en_R_only',
                        'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC',
                        'is', 'nl', 'ro', 'sw' ]
#
target_lang_key  = target_lang_keys[9]
print(f"target lang: {target_lang_dict[target_lang_key]} ({target_lang_key})")

target lang: English adv (WN) (en_R_only)


In [274]:
## term settings
term_classes       = [ 'spell', 'sound' ]
term_class         = term_classes[0]
ngram_is_inclusive = True

## doc settings
max_doc_size       = 10
min_doc_size       =  5
print(f"max_doc_size: {max_doc_size}")
print(f"min_doc_size: {min_doc_size}")

### boundary handling
add_boundary       = True
boundary_mark      = "#"
### accent handling
suppress_accents   = True
accent_marks       = [ "ˈ", "ˌ" ] 
if term_class == 'sound':
    if suppress_accents:
        accent_status = "unaccented-"
    else:
        accent_stratus = "accented-"
else:
    accent_status = ""
print(f"accent: {accent_status}")

## term setting
gap_mark           = "…"
term_is_skippy     = True
n_for_ngram        = 5
max_gap_ratio      = 0.8
max_gap_size       = round(max_doc_size* max_gap_ratio)
print(f"term_class: {term_class}")
print(f"term_is_skippy: {term_is_skippy}")
print(f"max_gap_size: {max_gap_size}")
print(f"n_for_ngram: {n_for_ngram}")

## define term_type
if term_class == 'spell':
    if term_is_skippy:
        term_type = f"sp_skippy{n_for_ngram}gram"
    else:
        term_type = f"sp_{n_for_ngram}gram"
else:
    if term_is_skippy:
        term_type = f"sn_skippy{n_for_ngram}gram"
    else:
        term_type = f"sn_{n_for_ngram}gram"
## check
print(f"term_type: {term_type}")

max_doc_size: 10
min_doc_size: 5
accent: 
term_class: spell
term_is_skippy: True
max_gap_size: 8
n_for_ngram: 5
term_type: sp_skippy5gram


In [275]:
## LDA/HDP
apply_term_filtering = True
## The following parameters need to be relatively large to prevent "Row sum not equal 1" error
term_minfreq         = 3
abuse_threshold      = 0.03
min_bot_size         = 3

In [276]:
## sampling
source_sampling          = True
source_sampling_rate     = 0.5
source_sampling_max_size = 10000
second_sampling          = False
second_sampling_rate     = 0.7

In [277]:
## set target files
import glob
data_dir1     = "data/open-dict-ipa/data1/"
data_dir2     = "data/open-dict-ipa/data1a/"
data_dir3     = "data/wn3/"
target_files = glob.glob(f"{data_dir1}/*")
target_files2 = glob.glob(f"{data_dir2}/*")
target_files.extend(target_files2)
target_files3 = glob.glob(f"{data_dir3}/*")
target_files.extend(target_files3)
target_files = sorted([ file for file in target_files if ".csv" in file ])
pp.pprint(target_files)

['data/open-dict-ipa/data1/ar.csv.gz',
 'data/open-dict-ipa/data1/de.csv.gz',
 'data/open-dict-ipa/data1/en_UK.csv.gz',
 'data/open-dict-ipa/data1/en_US.csv.gz',
 'data/open-dict-ipa/data1/eo.csv.gz',
 'data/open-dict-ipa/data1/es_ES.csv.gz',
 'data/open-dict-ipa/data1/es_MX.csv.gz',
 'data/open-dict-ipa/data1/fa.csv.gz',
 'data/open-dict-ipa/data1/fi.csv.gz',
 'data/open-dict-ipa/data1/fr_FR.csv.gz',
 'data/open-dict-ipa/data1/fr_QC.csv.gz',
 'data/open-dict-ipa/data1/is.csv.gz',
 'data/open-dict-ipa/data1/ja.csv.gz',
 'data/open-dict-ipa/data1/jam.csv.gz',
 'data/open-dict-ipa/data1/ma.csv.gz',
 'data/open-dict-ipa/data1/nb.csv.gz',
 'data/open-dict-ipa/data1/nl.csv.gz',
 'data/open-dict-ipa/data1/or.csv.gz',
 'data/open-dict-ipa/data1/sv.csv.gz',
 'data/open-dict-ipa/data1/sw.csv.gz',
 'data/open-dict-ipa/data1/vi_C.csv.gz',
 'data/open-dict-ipa/data1/vi_N.csv.gz',
 'data/open-dict-ipa/data1/vi_S.csv.gz',
 'data/open-dict-ipa/data1/yue.csv.gz',
 'data/open-dict-ipa/data1/zh_hans.csv

In [278]:
## get source data from files
import pandas as pd
import gzip

#target_language_key = "en_US" # can be changed to get other languages
file = [ f for f in target_files if target_lang_key in f ][0]
print(f"processing: {file}")

if file.endswith(".gz"):
    with gzip.open(file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['spell', 'sound'])
else:
    with open(file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['spell', 'sound'])
try:
    sounds = raw_df['sound'].apply(lambda x: x.strip('/') )
    sounds = [ x.split("/,")[0] for x in sounds ] # picks up only the first of multiple entries
    raw_df['sound'] = sounds
except AttributeError:
    pass
#
raw_df.sample(10)

processing: data/wn3/en_R_only.csv


Unnamed: 0,spell,sound
280,deathly,ˈdɛθɫi
1393,yesterday,ˈjɛstɝˌdeɪ
292,demandingly,dɪˈmændɪŋɫi
329,domestically,dəˈmɛstɪkɫi
76,angelically,ænˈdʒɛɫɪkɫi
92,appreciatively,əˈpɹiʃiˌeɪtɪvɫi
1355,vocally,ˈvoʊkəɫi
451,flush,ˈfɫəʃ
827,nowadays,ˈnaʊəˌdeɪz
468,fourthly,ˈfɔɹθɫi


In [279]:
## source sampling
len(raw_df)
if source_sampling:
	print(f"source sampling applied")
	if len(raw_df) >= source_sampling_max_size:
		raw_df = raw_df.sample(source_sampling_max_size)
	else:
		raw_df = raw_df.sample(round(len(raw_df) * source_sampling_rate))
## remove accent marking
if suppress_accents:
	raw_df['sound'] = raw_df['sound'].apply(lambda x: "".join([ y for y in list(x) if y not in accent_marks ]))
## add boudary marks
if add_boundary:
	raw_df['spell'] = raw_df['spell'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
	raw_df['sound'] = raw_df['sound'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
#
print(raw_df)

source sampling applied
              spell          sound
784          #most#         #moʊs#
1355      #vocally#      #voʊkəɫi#
342   #drastically#    #dɹæstɪkɫi#
239   #contentedly#  #kəntɛntədɫi#
142         #below#        #biɫoʊ#
...             ...            ...
786          #much#         #mətʃ#
49        #alertly#       #əɫɝtɫi#
797     #naturally#     #nætʃɝəɫi#
1034      #roaring#        #ɹɔɹɪŋ#
1043     #ruggedly#      #ɹəɡədɫi#

[698 rows x 2 columns]


In [280]:
## generate 1-grams for spell and sound
## spell
raw_df['sp_1gram'] = raw_df['spell'].apply(lambda x: list(str(x)))
# add column of size
raw_df['sp_size'] = raw_df['sp_1gram'].apply(lambda x: len(x))
# add column of count of '-' inside
raw_df['hyphen'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("-"))
# add column of count of '.' inside
raw_df['period'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("."))
## sound
# takes the first entry, removes '/' around
try:
    raw_df['sn_1gram'] = raw_df['sound'].apply(lambda x: list(x) )
except TypeError:
    pass
# add column of size
try:
    raw_df['sn_size'] = raw_df['sn_1gram'].apply(lambda x: len(x))
except KeyError:
    pass
## check
raw_df

Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
784,#most#,#moʊs#,"[#, m, o, s, t, #]",6,0,0,"[#, m, o, ʊ, s, #]",6
1355,#vocally#,#voʊkəɫi#,"[#, v, o, c, a, l, l, y, #]",9,0,0,"[#, v, o, ʊ, k, ə, ɫ, i, #]",9
342,#drastically#,#dɹæstɪkɫi#,"[#, d, r, a, s, t, i, c, a, l, l, y, #]",13,0,0,"[#, d, ɹ, æ, s, t, ɪ, k, ɫ, i, #]",11
239,#contentedly#,#kəntɛntədɫi#,"[#, c, o, n, t, e, n, t, e, d, l, y, #]",13,0,0,"[#, k, ə, n, t, ɛ, n, t, ə, d, ɫ, i, #]",13
142,#below#,#biɫoʊ#,"[#, b, e, l, o, w, #]",7,0,0,"[#, b, i, ɫ, o, ʊ, #]",7
...,...,...,...,...,...,...,...,...
786,#much#,#mətʃ#,"[#, m, u, c, h, #]",6,0,0,"[#, m, ə, t, ʃ, #]",6
49,#alertly#,#əɫɝtɫi#,"[#, a, l, e, r, t, l, y, #]",9,0,0,"[#, ə, ɫ, ɝ, t, ɫ, i, #]",8
797,#naturally#,#nætʃɝəɫi#,"[#, n, a, t, u, r, a, l, l, y, #]",11,0,0,"[#, n, æ, t, ʃ, ɝ, ə, ɫ, i, #]",10
1034,#roaring#,#ɹɔɹɪŋ#,"[#, r, o, a, r, i, n, g, #]",9,0,0,"[#, ɹ, ɔ, ɹ, ɪ, ŋ, #]",7


In [281]:
## filtering raw_data by size
print(f"term_type: {term_type}")
if "sp_" in term_type:
    df_filtered = raw_df[ (raw_df['sp_size'] <= max_doc_size) & (raw_df['sp_size'] >= min_doc_size) & (raw_df['hyphen'] == 0) & (raw_df['period'] == 0) ]
else:
    df_filtered = raw_df[ (raw_df['sn_size'] <= max_doc_size) & (raw_df['sn_size'] >= min_doc_size) ]
#
df_filtered

term_type: sp_skippy5gram


Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
784,#most#,#moʊs#,"[#, m, o, s, t, #]",6,0,0,"[#, m, o, ʊ, s, #]",6
1355,#vocally#,#voʊkəɫi#,"[#, v, o, c, a, l, l, y, #]",9,0,0,"[#, v, o, ʊ, k, ə, ɫ, i, #]",9
142,#below#,#biɫoʊ#,"[#, b, e, l, o, w, #]",7,0,0,"[#, b, i, ɫ, o, ʊ, #]",7
1038,#roughly#,#ɹəfɫi#,"[#, r, o, u, g, h, l, y, #]",9,0,0,"[#, ɹ, ə, f, ɫ, i, #]",7
849,#only#,#oʊnɫi#,"[#, o, n, l, y, #]",6,0,0,"[#, o, ʊ, n, ɫ, i, #]",7
...,...,...,...,...,...,...,...,...
818,#normally#,#nɔɹməɫi#,"[#, n, o, r, m, a, l, l, y, #]",10,0,0,"[#, n, ɔ, ɹ, m, ə, ɫ, i, #]",9
786,#much#,#mətʃ#,"[#, m, u, c, h, #]",6,0,0,"[#, m, ə, t, ʃ, #]",6
49,#alertly#,#əɫɝtɫi#,"[#, a, l, e, r, t, l, y, #]",9,0,0,"[#, ə, ɫ, ɝ, t, ɫ, i, #]",8
1034,#roaring#,#ɹɔɹɪŋ#,"[#, r, o, a, r, i, n, g, #]",9,0,0,"[#, ɹ, ɔ, ɹ, ɪ, ŋ, #]",7


In [282]:
## define df after second sampling if any
len(df_filtered)
if second_sampling:
    df = df_filtered.sample(round(len(df_filtered) * second_sampling_rate))
else:
    df = df_filtered
len(df)

294

In [283]:
## spell 2grams
#import ngrams
import gen_ngrams
module_name = "gen_ngrams"
reload_module = False
if reload_module:
    import importlib
    importlib.reload(module_name)

if term_class == 'spell':
    #sp_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sp_1gram'] ]
    sp_2grams = [ gen_ngrams.gen_ngrams (x, n = 2, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_2grams):
            g.extend(list(df['sp_1gram'])[i])
    ## add sp_2gram
    df['sp_2gram'] = sp_2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_2gram'] = sp_2grams


In [284]:
## spell 3grams
#import ngrams
import gen_ngrams
if n_for_ngram > 2 and term_class == 'spell':
    #sp_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sp_1gram'] ]
    sp_3grams = [ gen_ngrams.gen_ngrams (x, n = 3, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_3grams):
            g.extend(list(df['sp_2gram'])[i])
    ## add sp_2gram
    df['sp_3gram'] = sp_3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_3gram'] = sp_3grams


In [285]:
## spell 4grams
#import ngrams
import gen_ngrams
if n_for_ngram > 3 and term_class == 'spell':
    #sp_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sp_1gram'] ]
    sp_4grams = [ gen_ngrams.gen_ngrams (x, n = 4, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_4grams):
            g.extend(list(df['sp_3gram'])[i])
    ## add sp_2gram
    df['sp_4gram'] = sp_4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_4gram'] = sp_4grams


In [286]:
## spell skippy 2gram
#import ngrams_skippy
import gen_ngrams
reload_module = False
module_name = "gen_ngrams"
if reload_module:
    import importlib
    importlib.reload(module_name)
#
if term_class == 'spell':
    #sp_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy2grams = [ gen_ngrams.gen_skippy_ngrams(x, 2, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy2grams):
            g.extend(list(df['sp_1gram'])[i])
    #
    df['sp_skippy2gram'] = sp_skippy2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy2gram'] = sp_skippy2grams


In [287]:
## spell skippy 3gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 2 and term_class == 'spell':
    #sp_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy3grams = [ gen_ngrams.gen_skippy_ngrams(x, 3, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy3grams):
            g.extend(list(df['sp_skippy2gram'])[i])
    #
    df['sp_skippy3gram'] = sp_skippy3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy3gram'] = sp_skippy3grams


In [288]:
## spell skippy 4gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 3 and term_class == 'spell':
    #sp_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy4grams = [ gen_ngrams.gen_skippy_ngrams(x, 4, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy4grams):
            g.extend(list(df['sp_skippy3gram'])[i])
    #
    df['sp_skippy4gram'] = sp_skippy4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy4gram'] = sp_skippy4grams


In [289]:
## spell skippy 5gram
import gen_ngrams
if n_for_ngram > 4 and term_class == 'spell':
    sp_skippy5grams = [ gen_ngrams.gen_skippy_ngrams(x, 5, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy5grams):
            g.extend(list(df['sp_skippy4gram'])[i])
    #
    df['sp_skippy5gram'] = sp_skippy5grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy5gram'] = sp_skippy5grams


In [290]:
## sound 2grams
#import ngrams
import gen_ngrams
module_name = "gen_ngrams"
reload_module = False
if reload_module:
    import importlib
    importlib.reload(module_name)
#
if term_class == 'sound':
    #sn_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sn_1gram'] ]
    sn_2grams = [ gen_ngrams.gen_ngrams (x, n = 2, sep ="", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_2grams):
            g.extend(list(df['sn_1gram'])[i])
    ## add sn_2gram
    df['sn_2gram'] = sn_2grams

In [291]:
## sound 3grams
#import ngrams
import gen_ngrams
if n_for_ngram > 2 and term_class == 'sound':
    #sn_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sn_1gram'] ]
    sn_3grams = [ gen_ngrams.gen_ngrams (x, n = 3, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_3gram
    df['sn_3gram'] = sn_3grams

In [292]:
## sound 4grams
#import ngrams
import gen_ngrams
if n_for_ngram > 3 and term_class == 'sound':
    #sn_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sn_1gram'] ]
    sn_4grams = [ gen_ngrams.gen_ngrams (x, n = 4, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_4grams):
            g.extend(list(df['sn_3gram'])[i])
    ## add sn_4gram
    df['sn_4gram'] = sn_4grams

In [293]:
## sound 5grams
import gen_ngrams
if n_for_ngram > 4 and term_class == 'sound':
    sn_5grams = [ gen_ngrams.gen_ngrams (x, n = 5, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_5grams):
            g.extend(list(df['sn_4gram'])[i])
    ## add sn_4gram
    df['sn_5gram'] = sn_5grams

In [294]:
## sound skippy 2gram
#import ngrams_skippy
import gen_ngrams
if term_class == 'sound':
    #sn_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy2grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 2, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy2grams):
            g.extend(list(df['sn_1gram'])[i])
    #
    df['sn_skippy2gram'] = sn_skippy2grams

In [295]:
## sound skippy 3gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 2 and term_class == 'sound':
    #sn_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy3grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 3, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy3grams):
            g.extend(list(df['sn_skippy2gram'])[i])
    #
    df['sn_skippy3gram'] = sn_skippy3grams

In [296]:
## sound skippy 4gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 3 and term_class == 'sound':
    #sn_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy4grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 4, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy4grams):
            g.extend(list(df['sn_skippy3gram'])[i])
    #
    df['sn_skippy4gram'] = sn_skippy4grams

In [297]:
## sound skippy 5gram
import gen_ngrams
if n_for_ngram > 4 and term_class == 'sound':
    sn_skippy5grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 5, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy5grams):
            g.extend(list(df['sn_skippy4gram'])[i])
    #
    df['sn_skippy5gram'] = sn_skippy5grams

In [298]:
## check df
dropped_vars = [ 'sp_size', 'hyphen', 'period', 'sn_size' ]
if term_class == 'spell':
    extra = [ 'sn_1gram', 'sn_2gram', 'sn_3gram', 'sn_4gram',
             'sn_skippy2gram', 'sn_skippy3gram', 'sn_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]
else:
    extra = [ 'sp_1gram', 'sp_2gram', 'sp_3gram', 'sp_4gram',
             'sp_skippy2gram', 'sp_skippy3gram', 'sp_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]    
#
df[target_vars]

Unnamed: 0,spell,sound,sp_1gram,sp_2gram,sp_3gram,sp_4gram,sp_skippy2gram,sp_skippy3gram,sp_skippy4gram,sp_skippy5gram
784,#most#,#moʊs#,"[#, m, o, s, t, #]","[#m, mo, os, st, t#, #, m, o, s, t, #]","[#mo, mos, ost, st#, #m, mo, os, st, t#, #, m,...","[#mos, most, ost#, #mo, mos, ost, st#, #m, mo,...","[#m, #…o, #…s, #…t, #…#, mo, m…s, m…t, m…#, os...","[#mo, #m…s, #m…t, #m…#, #…os, #…o…t, #…o…#, #…...","[#mos, #mo…t, #mo…#, #m…st, #m…s…#, #m…t#, #…o...","[#most, #mos…#, #mo…t#, #m…st#, #…ost#, most#,..."
1355,#vocally#,#voʊkəɫi#,"[#, v, o, c, a, l, l, y, #]","[#v, vo, oc, ca, al, ll, ly, y#, #, v, o, c, a...","[#vo, voc, oca, cal, all, lly, ly#, #v, vo, oc...","[#voc, voca, ocal, call, ally, lly#, #vo, voc,...","[#v, #…o, #…c, #…a, #…l, #…l, #…y, vo, v…c, v…...","[#vo, #v…c, #v…a, #v…l, #v…l, #v…y, #…oc, #…o…...","[#voc, #vo…a, #vo…l, #vo…l, #vo…y, #v…ca, #v…c...","[#voca, #voc…l, #voc…l, #voc…y, #vo…al, #vo…a…..."
142,#below#,#biɫoʊ#,"[#, b, e, l, o, w, #]","[#b, be, el, lo, ow, w#, #, b, e, l, o, w, #]","[#be, bel, elo, low, ow#, #b, be, el, lo, ow, ...","[#bel, belo, elow, low#, #be, bel, elo, low, o...","[#b, #…e, #…l, #…o, #…w, #…#, be, b…l, b…o, b…...","[#be, #b…l, #b…o, #b…w, #b…#, #…el, #…e…o, #…e...","[#bel, #be…o, #be…w, #be…#, #b…lo, #b…l…w, #b…...","[#belo, #bel…w, #bel…#, #be…ow, #be…o…#, #be…w..."
1038,#roughly#,#ɹəfɫi#,"[#, r, o, u, g, h, l, y, #]","[#r, ro, ou, ug, gh, hl, ly, y#, #, r, o, u, g...","[#ro, rou, oug, ugh, ghl, hly, ly#, #r, ro, ou...","[#rou, roug, ough, ughl, ghly, hly#, #ro, rou,...","[#r, #…o, #…u, #…g, #…h, #…l, #…y, ro, r…u, r…...","[#ro, #r…u, #r…g, #r…h, #r…l, #r…y, #…ou, #…o…...","[#rou, #ro…g, #ro…h, #ro…l, #ro…y, #r…ug, #r…u...","[#roug, #rou…h, #rou…l, #rou…y, #ro…gh, #ro…g…..."
849,#only#,#oʊnɫi#,"[#, o, n, l, y, #]","[#o, on, nl, ly, y#, #, o, n, l, y, #]","[#on, onl, nly, ly#, #o, on, nl, ly, y#, #, o,...","[#onl, only, nly#, #on, onl, nly, ly#, #o, on,...","[#o, #…n, #…l, #…y, #…#, on, o…l, o…y, o…#, nl...","[#on, #o…l, #o…y, #o…#, #…nl, #…n…y, #…n…#, #…...","[#onl, #on…y, #on…#, #o…ly, #o…l…#, #o…y#, #…n...","[#only, #onl…#, #on…y#, #o…ly#, #…nly#, only#,..."
...,...,...,...,...,...,...,...,...,...,...
818,#normally#,#nɔɹməɫi#,"[#, n, o, r, m, a, l, l, y, #]","[#n, no, or, rm, ma, al, ll, ly, y#, #, n, o, ...","[#no, nor, orm, rma, mal, all, lly, ly#, #n, n...","[#nor, norm, orma, rmal, mall, ally, lly#, #no...","[#n, #…o, #…r, #…m, #…a, #…l, #…l, no, n…r, n…...","[#no, #n…r, #n…m, #n…a, #n…l, #n…l, #…or, #…o…...","[#nor, #no…m, #no…a, #no…l, #no…l, #n…rm, #n…r...","[#norm, #nor…a, #nor…l, #nor…l, #no…ma, #no…m…..."
786,#much#,#mətʃ#,"[#, m, u, c, h, #]","[#m, mu, uc, ch, h#, #, m, u, c, h, #]","[#mu, muc, uch, ch#, #m, mu, uc, ch, h#, #, m,...","[#muc, much, uch#, #mu, muc, uch, ch#, #m, mu,...","[#m, #…u, #…c, #…h, #…#, mu, m…c, m…h, m…#, uc...","[#mu, #m…c, #m…h, #m…#, #…uc, #…u…h, #…u…#, #…...","[#muc, #mu…h, #mu…#, #m…ch, #m…c…#, #m…h#, #…u...","[#much, #muc…#, #mu…h#, #m…ch#, #…uch#, much#,..."
49,#alertly#,#əɫɝtɫi#,"[#, a, l, e, r, t, l, y, #]","[#a, al, le, er, rt, tl, ly, y#, #, a, l, e, r...","[#al, ale, ler, ert, rtl, tly, ly#, #a, al, le...","[#ale, aler, lert, ertl, rtly, tly#, #al, ale,...","[#a, #…l, #…e, #…r, #…t, #…l, #…y, al, a…e, a…...","[#al, #a…e, #a…r, #a…t, #a…l, #a…y, #…le, #…l…...","[#ale, #al…r, #al…t, #al…l, #al…y, #a…er, #a…e...","[#aler, #ale…t, #ale…l, #ale…y, #al…rt, #al…r…..."
1034,#roaring#,#ɹɔɹɪŋ#,"[#, r, o, a, r, i, n, g, #]","[#r, ro, oa, ar, ri, in, ng, g#, #, r, o, a, r...","[#ro, roa, oar, ari, rin, ing, ng#, #r, ro, oa...","[#roa, roar, oari, arin, ring, ing#, #ro, roa,...","[#r, #…o, #…a, #…r, #…i, #…n, #…g, ro, r…a, r…...","[#ro, #r…a, #r…r, #r…i, #r…n, #r…g, #…oa, #…o…...","[#roa, #ro…r, #ro…i, #ro…n, #ro…g, #r…ar, #r…a...","[#roar, #roa…i, #roa…n, #roa…g, #ro…ri, #ro…r…..."


In [299]:
## select data type and define doc_dict
import random
if "sp_" in term_type:
    base_type = "spell"
else:
    base_type = "sound"
doc_dict = { i: x for i, x in enumerate(df[base_type]) }
## check
random.sample(doc_dict.items(), 10)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(doc_dict.items(), 10)


[(215, '#pat#'),
 (204, '#clear#'),
 (32, '#directly#'),
 (40, '#naively#'),
 (174, '#greenly#'),
 (183, '#slower#'),
 (70, '#here#'),
 (237, '#item#'),
 (50, '#unfairly#'),
 (78, '#visibly#')]

In [300]:
## select bots for analysis
enable_term_change = False # if you want to change term_type to save time and energy
if enable_term_change:
	term_type = 'sp_skippy4gram'
print(f"(changed) term_type: {term_type}")

## bot stands for 'bag-of-terms', a generalization of 'bag-of-words'
bots = [ x for x in df[term_type] if len(x) > min_bot_size ] # Crucially
import random
random.sample(bots, 3)

(changed) term_type: sp_skippy5gram


[['#stea',
  '#ste…d',
  '#ste…i',
  '#ste…l',
  '#st…ad',
  '#st…a…i',
  '#st…a…l',
  '#st…di',
  '#st…d…l',
  '#st…il',
  '#s…ead',
  '#s…ea…i',
  '#s…ea…l',
  '#s…e…di',
  '#s…e…d…l',
  '#s…e…il',
  '#s…adi',
  '#s…ad…l',
  '#s…a…il',
  '#s…dil',
  '#…tead',
  '#…tea…i',
  '#…tea…l',
  '#…te…di',
  '#…te…d…l',
  '#…te…il',
  '#…t…adi',
  '#…t…ad…l',
  '#…t…a…il',
  '#…t…dil',
  '#…eadi',
  '#…ead…l',
  '#…ea…il',
  '#…e…dil',
  '#…adil',
  'stead',
  'stea…i',
  'stea…l',
  'ste…di',
  'ste…d…l',
  'ste…il',
  'st…adi',
  'st…ad…l',
  'st…a…il',
  'st…dil',
  's…eadi',
  's…ead…l',
  's…ea…il',
  's…e…dil',
  's…adil',
  'teadi',
  'tead…l',
  'tea…il',
  'te…dil',
  't…adil',
  'eadil',
  'stead',
  'stea…i',
  'stea…l',
  'stea…y',
  'ste…di',
  'ste…d…l',
  'ste…d…y',
  'ste…il',
  'ste…i…y',
  'ste…ly',
  'st…adi',
  'st…ad…l',
  'st…ad…y',
  'st…a…il',
  'st…a…i…y',
  'st…a…ly',
  'st…dil',
  'st…di…y',
  'st…d…ly',
  'st…ily',
  's…eadi',
  's…ead…l',
  's…ead…y',
  's…ea…il',

In [301]:
## generate dictionary
from gensim.corpora import Dictionary
diction = Dictionary(bots)
print(diction)

if apply_term_filtering:
    print(f"term filtering applied")
    diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
else:
    print(f"term filtering not applied")
print(diction)

## generate DTM
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) > min_bot_size ] # Crucially

Dictionary<41459 unique tokens: ['#', '#m', '#mo', '#mos', '#most']...>
term filtering applied
Dictionary<3364 unique tokens: ['#m', '#mo', '#m…#', '#m…t', '#…os']...>


In [302]:
## HDP (n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [303]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	vis_output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{accent_status}{term_type}.html"
	pyLDAvis.save_html(vis_data, vis_output)

In [304]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

## investigate topics
n_docs_to_show  = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    #topic_encoding = ", ".join(hdp.show_topic(topic_id))
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.003 * nu…l + 0.003 * bly + 0.003 * ib + 0.002 * ibly + 0.002 * ib…y# + 0.002 * bly# + 0.002 * ibl
nonzero count:  14
	0.9966: #manually#
	0.9962: #annually#
	0.9925: #flexibly#
	0.9209: #glibly#
	0.6786: #audibly#
	0.6281: #tangibly#
	0.6247: #visibly#
	0.5848: #sexually#
	0.3926: #actually#
	0.3893: #equally#
topic_id 1: 0.003 * ml…# + 0.003 * kly + 0.002 * kl + 0.002 * k…y + 0.002 * kly# + 0.002 * kl…# + 0.002 * k…y#
nonzero count:  23
	0.9939: #starkly#
	0.9932: #downhill#
	0.9928: #forward#
	0.9928: #outward#
	0.9907: #darkly#
	0.9846: #weakly#
	0.9711: #west#
	0.9404: #ill#
	0.4638: #seaward#
	0.4525: #thickly#
topic_id 2: 0.003 * er…y + 0.003 * rl + 0.002 * er…y# + 0.002 * a…e…ly + 0.002 * rly + 0.002 * rl…# + 0.002 * rt…y#
nonzero count:  26
	0.9953: #easterly#
	0.9942: #sternly#
	0.9940: #greenly#
	0.7313: #vitally#
	0.6739: #evenly#
	0.6598: #civilly#
	0.5715: #directly#
	0.5589: #alertly#
	0.4430: #greatly#
	0.3883: #dolce#
topic_id 3: 0.003 * r…all + 0.003 * o…

In [305]:
## HDP (n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [306]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{accent_status}{term_type}.html"
	pyLDAvis.save_html(vis_data, vis_output)

In [307]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: {len(probs.nonzero()[0])}")
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.003 * r…all + 0.003 * o…m…l + 0.002 * or…l + 0.002 * #…iv + 0.002 * i…a…l…# + 0.002 * re…l…# + 0.002 * u…ge
nonzero count: 28
	0.9956: #formally#
	0.9955: #normally#
	0.9917: #ineptly#
	0.9405: #ill#
	0.8772: #royally#
	0.7095: #promptly#
	0.6867: #orally#
	0.6668: #neatly#
	0.6208: #anon#
	0.6095: #dorsally#
topic_id 1: 0.002 * u…ll…# + 0.002 * #…re…y + 0.002 * #alo + 0.002 * ma…l + 0.002 * lo…y# + 0.002 * u…l…y# + 0.002 * p…o
nonzero count: 26
	0.9966: #manually#
	0.9962: #annually#
	0.9916: #grimly#
	0.8056: #across#
	0.7851: #briefly#
	0.6381: #less#
	0.5715: #dimly#
	0.5288: #mutually#
	0.5070: #actually#
	0.4724: #deftly#
topic_id 2: 0.002 * o…est + 0.002 * sho…# + 0.002 * bly + 0.002 * t…i…l + 0.002 * #lo…# + 0.002 * f…tl + 0.002 * d…r…ly
nonzero count: 24
	0.9949: #terribly#
	0.7584: #visibly#
	0.6694: #audibly#
	0.6058: #last#
	0.6041: #tangibly#
	0.5347: #weakly#
	0.4362: #randomly#
	0.4122: #thickly#
	0.3517: #thereby#
	0.3386: #more#
topic_id 3: 0.002 * n…i…# 

In [308]:
## HDP (n_topics = 15)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 15
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [309]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	vis_output = f"results/LDAvis/{target_lang_dict[target_lang_key]}-HDP-max_ntop{max_n_topics}-{accent_status}{term_type}.html"
	pyLDAvis.save_html(vis_data, vis_output)

In [310]:
## topic investigation
import numpy as np
import HDP_helper
reload_module = True
if reload_module:
    import importlib
    importlib.reload(HDP_helper)

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.003 * r…all + 0.002 * #…iv + 0.002 * i…a…l…# + 0.002 * re…l…# + 0.002 * u…ge + 0.002 * iv…l…# + 0.002 * o…m…l
nonzero count:  70
	0.9958: #formally#
	0.9958: #normally#
	0.9439: #ill#
	0.8137: #royally#
	0.7147: #beyond#
	0.6271: #orally#
	0.6178: #ineptly#
	0.5586: #unduly#
	0.5511: #leeward#
	0.5455: #roundly#
topic_id 1: 0.002 * r…lly# + 0.002 * #…er…l + 0.002 * #i…e + 0.002 * ke…# + 0.002 * #f…a…l + 0.002 * #…ug + 0.002 * c…il
nonzero count:  85
	0.9952: #terribly#
	0.9921: #grimly#
	0.9060: #ruggedly#
	0.8755: #calmly#
	0.7165: #aft#
	0.6474: #adrift#
	0.6396: #dimly#
	0.5092: #askance#
	0.4927: #warily#
	0.4593: #doggedly#
topic_id 2: 0.002 * he…e + 0.002 * b…a…l + 0.002 * #…k…# + 0.002 * ho…e + 0.002 * #st + 0.002 * t…t…# + 0.002 * a…te…#
nonzero count:  82
	0.7898: #easterly#
	0.7659: #much#
	0.6831: #longer#
	0.6141: #each#
	0.5476: #next#
	0.4836: #very#
	0.4820: #fairly#
	0.4762: #largely#
	0.4540: #south#
	0.4526: #afresh#
topic_id 3: 0.003 * #a…e…y + 0.002 * 