In [668]:
#!pip install -U pyLDAvis
#!pip install -U pandas

In [669]:
## imports
import os, sys
import pprint as pp
import unicodedata

In [670]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [671]:
## target language
## a key must be part of a file name 
target_lang_dict = {    'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'en_N_only' : 'English noun (WN)',
                        'en_V_only' : 'English verb (WN)',
                        'en_A_only' : 'English adj (WN)',
                        'en_R_only' : 'English adv (WN)',
                        'ar'    : 'Arabic',
                        'de'    : 'German',
                        'de_N_only' : 'German Nouns',
                        'de_non_N_only' : 'German Non-nouns',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'ir'    : 'Irish',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
## proper language selection
target_lang_keys = [    'en_US', 'en_UK',
                        'en_N_only', # 2
                        'en_V_only', 'en_A_only', 'en_R_only',
                        'ar', # 6
                        'de', # 7
                        'de_N_only', 'de_non_N_only',
                        'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC',
                        'is',
                        'ir', # This lacks sound
                        'nl', 'ro', 'sw' ]
#
target_lang_key  = target_lang_keys[8]
print(f"target lang: {target_lang_dict[target_lang_key]} ({target_lang_key})")

## target_attr [effective only for Irish]
target_class = ""
if target_lang_key == "ir":
    target_classes = [ 'adjectives', 'nouns', 'verbs' ]
    target_class = f"-{target_classes[3]}"
print(f"target_class: {target_class}")

target lang: German Nouns (de_N_only)
target_class: 


In [672]:
## term settings
term_classes        = [ 'spell', 'sound' ]
term_class          = term_classes[0]
ngram_is_inclusive  = True

## doc settings
max_doc_size        = 10
min_doc_size        =  3
print(f"max_doc_size: {max_doc_size}")
print(f"min_doc_size: {min_doc_size}")

### boundary handling
add_boundary       = True
boundary_mark      = "#"
### accent handling
suppress_accents   = True
accent_marks       = [ "ˈ", "ˌ" ] 
if term_class == 'sound':
    if suppress_accents:
        accent_status = "-unaccented"
    else:
        accent_stratus = "-accented"
else:
    accent_status = ""
print(f"accent_status: {accent_status}")

## term setting
gap_mark           = "…"
term_is_skippy     = True
n_for_ngram        = 5
max_gap_ratio      = 0.8
max_gap_size       = round(max_doc_size * max_gap_ratio)
print(f"term_class: {term_class}")
print(f"term_is_skippy: {term_is_skippy}")
print(f"max_gap_size: {max_gap_size}")
print(f"n_for_ngram: {n_for_ngram}")

## define term_type
if term_class == 'spell':
    if term_is_skippy:
        term_type = f"sp_skippy{n_for_ngram}gram"
    else:
        term_type = f"sp_{n_for_ngram}gram"
else:
    if term_is_skippy:
        term_type = f"sn_skippy{n_for_ngram}gram"
    else:
        term_type = f"sn_{n_for_ngram}gram"
## check
print(f"term_type: {term_type}")

max_doc_size: 10
min_doc_size: 3
accent_status: 
term_class: spell
term_is_skippy: True
max_gap_size: 8
n_for_ngram: 5
term_type: sp_skippy5gram


In [673]:
## LDA/HDP
apply_term_filtering = True
## The following parameters need to be relatively large to prevent "Row sum not equal 1" error
term_minfreq         = 2
abuse_threshold      = 0.05 # larger value selects shorter units, smaller value selects longer units
min_bot_size         = 3

In [674]:
## sampling
source_sampling          = True
source_sampling_rate     = 0.5
source_sampling_max_size = 10000
second_sampling          = False
second_sampling_rate     = 0.7

In [675]:
## set target files
import glob
data_dir1     = "data/open-dict-ipa/data1"
data_dir2     = "data/open-dict-ipa/data1a"
data_dir3     = "data/wn3"
data_dir4     = "data/irish"
target_files = glob.glob(f"{data_dir1}/*")
target_files2 = glob.glob(f"{data_dir2}/*")
target_files.extend(target_files2)
target_files3 = glob.glob(f"{data_dir3}/*")
target_files.extend(target_files3)
target_files4 = glob.glob(f"{data_dir4}/*")
target_files.extend(target_files4)
#
target_files = sorted([ file for file in target_files if ".csv" in file ])
pp.pprint(target_files)

['data/irish/word-irish-adjectives-spell.csv',
 'data/irish/word-irish-noun-phrases-spell.csv',
 'data/irish/word-irish-nouns-spell.csv',
 'data/irish/word-irish-possessives-spell.csv',
 'data/irish/word-irish-prepositions-spell.csv',
 'data/irish/word-irish-verbs-spell.csv',
 'data/open-dict-ipa/data1/ar.csv.gz',
 'data/open-dict-ipa/data1/de.csv.gz',
 'data/open-dict-ipa/data1/en_UK.csv.gz',
 'data/open-dict-ipa/data1/en_US.csv.gz',
 'data/open-dict-ipa/data1/eo.csv.gz',
 'data/open-dict-ipa/data1/es_ES.csv.gz',
 'data/open-dict-ipa/data1/es_MX.csv.gz',
 'data/open-dict-ipa/data1/fa.csv.gz',
 'data/open-dict-ipa/data1/fi.csv.gz',
 'data/open-dict-ipa/data1/fr_FR.csv.gz',
 'data/open-dict-ipa/data1/fr_QC.csv.gz',
 'data/open-dict-ipa/data1/is.csv.gz',
 'data/open-dict-ipa/data1/ja.csv.gz',
 'data/open-dict-ipa/data1/jam.csv.gz',
 'data/open-dict-ipa/data1/ma.csv.gz',
 'data/open-dict-ipa/data1/nb.csv.gz',
 'data/open-dict-ipa/data1/nl.csv.gz',
 'data/open-dict-ipa/data1/or.csv.gz',
 '

In [676]:
## get source data from files
import pandas as pd
import gzip

#target_language_key = "en_US" # can be changed to get other languages
if target_class != "":
    file = [ f for f in target_files if target_lang_key in f and target_class in f ][0]
else:
    file = [ f for f in target_files if target_lang_key in f ][0]
print(f"processing: {file}")

##
if target_lang_key == "ir":
    col_names = ['spell', 'POS']
else:
    col_names = ['spell', 'sound']

if file.endswith(".gz"):
    with gzip.open(file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = col_names )
else:
    with open(file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = col_names )

## normalize
raw_df['spell'] = raw_df['spell'].apply(lambda x: unicodedata.normalize('NFC', str(x)))

## modify sound
try:
    sounds = raw_df['sound'].apply(lambda x: x.strip('/') )
    sounds = [ x.split("/,")[0] for x in sounds ] # picks up only the first of multiple entries
    raw_df['sound'] = sounds
except (AttributeError, KeyError):
    pass
#
raw_df.sample(10)

processing: data/open-dict-ipa/data1a/de_N_only.csv.gz


Unnamed: 0,spell,sound
80026,Unionen,ˈʔʊnĭoːnən
48499,Lieferlisten,liːfɛɾˈlɪstən
44398,Kontaktschalen,kɔntaktʃalən
36122,Hiebe,ˈhiːbə
36672,Höchstsätzen,ˈhœçstzɛtsən
71879,Spüllappen,ˈʃpʏllappən
27455,Fruchtbarkeit,fɾʊxtbaɾˈkəiːt
39306,Johannisbeeren,ˈjoːannɪsbəɐ̯ən
50037,Markensammlern,ˈmaɾkɛnzammlɛɾn
1687,Affenschande,ˈʔaffɛnʃandə


In [677]:
## source sampling
len(raw_df)
if source_sampling:
	print(f"source sampling applied")
	if len(raw_df) >= source_sampling_max_size:
		raw_df = raw_df.sample(source_sampling_max_size)
	else:
		raw_df = raw_df.sample(round(len(raw_df) * source_sampling_rate))
## remove accent marking
if suppress_accents:
	try:
		raw_df['sound'] = raw_df['sound'].apply(lambda x: "".join([ y for y in list(x) if y not in accent_marks ]))
	except KeyError:
		pass
## add boudary marks
if add_boundary:
	raw_df['spell'] = raw_df['spell'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
	try:
		raw_df['sound'] = raw_df['sound'].apply(lambda x: f"{boundary_mark}{x}{boundary_mark}")
	except KeyError:
		pass
#
print(raw_df)

source sampling applied
                           spell                      sound
21274               #Erektionen#             #eʁɛktsjoːnən#
58569                #Plauderei#                #plaodɐ̯ae#
60730                 #Püppchen#                  #pʏppçən#
74623        #Stundenberechnung#          #ʃtʊndɛnbeɾɛçnʊŋ#
15043           #Dachgeschossen#              #daçgɛʃɔszən#
...                          ...                        ...
3311                 #Anführers#                #ʔanfyɾɛɾs#
33139           #Grundbesitzers#           #gɾʊntbezɪtsɛɾs#
81918  #Verkehrsteilnehmerinnen#  #fɛːɐ̯keɾstaelnemeɾɪnnən#
76326            #Telefonkabeln#           #teːlefɔŋkɑbɛln#
43878               #Konditorei#               #kɔnditoɾae#

[10000 rows x 2 columns]


In [678]:
## generate 1-grams for spell and sound
## spell
raw_df['sp_1gram'] = raw_df['spell'].apply(lambda x: list(str(x)))
# add column of size
raw_df['sp_size'] = raw_df['sp_1gram'].apply(lambda x: len(x))
# add column of count of '-' inside
raw_df['hyphen'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("-"))
# add column of count of '.' inside
raw_df['period'] = raw_df['sp_1gram'].apply(lambda x: list(x).count("."))
## sound
# takes the first entry, removes '/' around
try:
    raw_df['sn_1gram'] = raw_df['sound'].apply(lambda x: list(x) )
except (TypeError, KeyError):
    pass
# add column of size
try:
    raw_df['sn_size'] = raw_df['sn_1gram'].apply(lambda x: len(x))
except KeyError:
    pass
## check
raw_df

Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
21274,#Erektionen#,#eʁɛktsjoːnən#,"[#, E, r, e, k, t, i, o, n, e, n, #]",12,0,0,"[#, e, ʁ, ɛ, k, t, s, j, o, ː, n, ə, n, #]",14
58569,#Plauderei#,#plaodɐ̯ae#,"[#, P, l, a, u, d, e, r, e, i, #]",11,0,0,"[#, p, l, a, o, d, ɐ, ̯, a, e, #]",11
60730,#Püppchen#,#pʏppçən#,"[#, P, ü, p, p, c, h, e, n, #]",10,0,0,"[#, p, ʏ, p, p, ç, ə, n, #]",9
74623,#Stundenberechnung#,#ʃtʊndɛnbeɾɛçnʊŋ#,"[#, S, t, u, n, d, e, n, b, e, r, e, c, h, n, ...",19,0,0,"[#, ʃ, t, ʊ, n, d, ɛ, n, b, e, ɾ, ɛ, ç, n, ʊ, ...",17
15043,#Dachgeschossen#,#daçgɛʃɔszən#,"[#, D, a, c, h, g, e, s, c, h, o, s, s, e, n, #]",16,0,0,"[#, d, a, ç, g, ɛ, ʃ, ɔ, s, z, ə, n, #]",13
...,...,...,...,...,...,...,...,...
3311,#Anführers#,#ʔanfyɾɛɾs#,"[#, A, n, f, ü, h, r, e, r, s, #]",11,0,0,"[#, ʔ, a, n, f, y, ɾ, ɛ, ɾ, s, #]",11
33139,#Grundbesitzers#,#gɾʊntbezɪtsɛɾs#,"[#, G, r, u, n, d, b, e, s, i, t, z, e, r, s, #]",16,0,0,"[#, g, ɾ, ʊ, n, t, b, e, z, ɪ, t, s, ɛ, ɾ, s, #]",16
81918,#Verkehrsteilnehmerinnen#,#fɛːɐ̯keɾstaelnemeɾɪnnən#,"[#, V, e, r, k, e, h, r, s, t, e, i, l, n, e, ...",25,0,0,"[#, f, ɛ, ː, ɐ, ̯, k, e, ɾ, s, t, a, e, l, n, ...",25
76326,#Telefonkabeln#,#teːlefɔŋkɑbɛln#,"[#, T, e, l, e, f, o, n, k, a, b, e, l, n, #]",15,0,0,"[#, t, e, ː, l, e, f, ɔ, ŋ, k, ɑ, b, ɛ, l, n, #]",16


In [679]:
## filtering raw_data by size
print(f"term_type: {term_type}")
if "sp_" in term_type:
    df_filtered = raw_df[ (raw_df['sp_size'] <= max_doc_size) & (raw_df['sp_size'] >= min_doc_size) & (raw_df['hyphen'] == 0) & (raw_df['period'] == 0) ]
else:
    df_filtered = raw_df[ (raw_df['sn_size'] <= max_doc_size) & (raw_df['sn_size'] >= min_doc_size) ]
#
df_filtered

term_type: sp_skippy5gram


Unnamed: 0,spell,sound,sp_1gram,sp_size,hyphen,period,sn_1gram,sn_size
60730,#Püppchen#,#pʏppçən#,"[#, P, ü, p, p, c, h, e, n, #]",10,0,0,"[#, p, ʏ, p, p, ç, ə, n, #]",9
66958,#Schrein#,#ʃɾəɪn#,"[#, S, c, h, r, e, i, n, #]",9,0,0,"[#, ʃ, ɾ, ə, ɪ, n, #]",7
55336,#Nute#,#nuːte#,"[#, N, u, t, e, #]",6,0,0,"[#, n, u, ː, t, e, #]",7
68186,#Schwur#,#ʃvuːɐ#,"[#, S, c, h, w, u, r, #]",8,0,0,"[#, ʃ, v, u, ː, ɐ, #]",7
72586,#Stände#,#ʃtɛndə#,"[#, S, t, ä, n, d, e, #]",8,0,0,"[#, ʃ, t, ɛ, n, d, ə, #]",8
...,...,...,...,...,...,...,...,...
60941,#Radlern#,#ɾadlɛɾn#,"[#, R, a, d, l, e, r, n, #]",9,0,0,"[#, ɾ, a, d, l, ɛ, ɾ, n, #]",9
14136,#Burg#,#bʊʁk#,"[#, B, u, r, g, #]",6,0,0,"[#, b, ʊ, ʁ, k, #]",6
58188,#Phosphat#,#fɔspɑt#,"[#, P, h, o, s, p, h, a, t, #]",10,0,0,"[#, f, ɔ, s, p, ɑ, t, #]",8
38409,#Insekt#,#ɪnzɛkt#,"[#, I, n, s, e, k, t, #]",8,0,0,"[#, ɪ, n, z, ɛ, k, t, #]",8


In [680]:
## define df after second sampling if any
len(df_filtered)
if second_sampling:
    df = df_filtered.sample(round(len(df_filtered) * second_sampling_rate))
else:
    df = df_filtered
len(df)

1679

In [681]:
## spell 2grams
#import ngrams
import gen_ngrams
module_name = "gen_ngrams"
reload_module = False
if reload_module:
    import importlib
    importlib.reload(module_name)

if term_class == 'spell':
    #sp_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sp_1gram'] ]
    sp_2grams = [ gen_ngrams.gen_ngrams (x, n = 2, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_2grams):
            g.extend(list(df['sp_1gram'])[i])
    ## add sp_2gram
    df['sp_2gram'] = sp_2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_2gram'] = sp_2grams


In [682]:
## spell 3grams
#import ngrams
import gen_ngrams
if n_for_ngram > 2 and term_class == 'spell':
    #sp_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sp_1gram'] ]
    sp_3grams = [ gen_ngrams.gen_ngrams (x, n = 3, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_3grams):
            g.extend(list(df['sp_2gram'])[i])
    ## add sp_2gram
    df['sp_3gram'] = sp_3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_3gram'] = sp_3grams


In [683]:
## spell 4grams
#import ngrams
import gen_ngrams
if n_for_ngram > 3 and term_class == 'spell':
    #sp_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sp_1gram'] ]
    sp_4grams = [ gen_ngrams.gen_ngrams (x, n = 4, sep = "", check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_4grams):
            g.extend(list(df['sp_3gram'])[i])
    ## add sp_2gram
    df['sp_4gram'] = sp_4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_4gram'] = sp_4grams


In [684]:
## spell skippy 2gram
#import ngrams_skippy
import gen_ngrams
reload_module = False
module_name = "gen_ngrams"
if reload_module:
    import importlib
    importlib.reload(module_name)
#
if term_class == 'spell':
    #sp_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy2grams = [ gen_ngrams.gen_skippy_ngrams(x, 2, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy2grams):
            g.extend(list(df['sp_1gram'])[i])
    #
    df['sp_skippy2gram'] = sp_skippy2grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy2gram'] = sp_skippy2grams


In [685]:
## spell skippy 3gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 2 and term_class == 'spell':
    #sp_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy3grams = [ gen_ngrams.gen_skippy_ngrams(x, 3, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy3grams):
            g.extend(list(df['sp_skippy2gram'])[i])
    #
    df['sp_skippy3gram'] = sp_skippy3grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy3gram'] = sp_skippy3grams


In [686]:
## spell skippy 4gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 3 and term_class == 'spell':
    #sp_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sp_skippy4grams = [ gen_ngrams.gen_skippy_ngrams(x, 4, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy4grams):
            g.extend(list(df['sp_skippy3gram'])[i])
    #
    df['sp_skippy4gram'] = sp_skippy4grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy4gram'] = sp_skippy4grams


In [687]:
## spell skippy 5gram
import gen_ngrams
if n_for_ngram > 4 and term_class == 'spell':
    sp_skippy5grams = [ gen_ngrams.gen_skippy_ngrams(x, 5, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sp_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sp_skippy5grams):
            g.extend(list(df['sp_skippy4gram'])[i])
    #
    df['sp_skippy5gram'] = sp_skippy5grams

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sp_skippy5gram'] = sp_skippy5grams


In [688]:
## sound 2grams
#import ngrams
import gen_ngrams
module_name = "gen_ngrams"
reload_module = False
if reload_module:
    import importlib
    importlib.reload(module_name)
#
if term_class == 'sound':
    #sn_2grams = [ ngrams.list_gen_ngrams (x, n = 2, check = False) for x in df['sn_1gram'] ]
    sn_2grams = [ gen_ngrams.gen_ngrams (x, n = 2, sep ="", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_2grams):
            g.extend(list(df['sn_1gram'])[i])
    ## add sn_2gram
    df['sn_2gram'] = sn_2grams

In [689]:
## sound 3grams
#import ngrams
import gen_ngrams
if n_for_ngram > 2 and term_class == 'sound':
    #sn_3grams = [ ngrams.list_gen_ngrams (x, n = 3, check = False) for x in df['sn_1gram'] ]
    sn_3grams = [ gen_ngrams.gen_ngrams (x, n = 3, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_3grams):
            g.extend(list(df['sn_2gram'])[i])
    ## add sn_3gram
    df['sn_3gram'] = sn_3grams

In [690]:
## sound 4grams
#import ngrams
import gen_ngrams
if n_for_ngram > 3 and term_class == 'sound':
    #sn_4grams = [ ngrams.list_gen_ngrams (x, n = 4, check = False) for x in df['sn_1gram'] ]
    sn_4grams = [ gen_ngrams.gen_ngrams (x, n = 4, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_4grams):
            g.extend(list(df['sn_3gram'])[i])
    ## add sn_4gram
    df['sn_4gram'] = sn_4grams

In [691]:
## sound 5grams
import gen_ngrams
if n_for_ngram > 4 and term_class == 'sound':
    sn_5grams = [ gen_ngrams.gen_ngrams (x, n = 5, sep = "", check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_5grams):
            g.extend(list(df['sn_4gram'])[i])
    ## add sn_4gram
    df['sn_5gram'] = sn_5grams

In [692]:
## sound skippy 2gram
#import ngrams_skippy
import gen_ngrams
if term_class == 'sound':
    #sn_skippy2grams = [ ngrams_skippy.gen_skippy2grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy2grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 2, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy2grams):
            g.extend(list(df['sn_1gram'])[i])
    #
    df['sn_skippy2gram'] = sn_skippy2grams

In [693]:
## sound skippy 3gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 2 and term_class == 'sound':
    #sn_skippy3grams = [ ngrams_skippy.gen_skippy3grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy3grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 3, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy3grams):
            g.extend(list(df['sn_skippy2gram'])[i])
    #
    df['sn_skippy3gram'] = sn_skippy3grams

In [694]:
## sound skippy 4gram
#import ngrams_skippy
import gen_ngrams
if n_for_ngram > 3 and term_class == 'sound':
    #sn_skippy4grams = [ ngrams_skippy.gen_skippy4grams(x, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    ## The code above was replaced by the following more efficient one
    sn_skippy4grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 4, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy4grams):
            g.extend(list(df['sn_skippy3gram'])[i])
    #
    df['sn_skippy4gram'] = sn_skippy4grams

In [695]:
## sound skippy 5gram
import gen_ngrams
if n_for_ngram > 4 and term_class == 'sound':
    sn_skippy5grams = [ gen_ngrams.gen_skippy_ngrams(x, n = 5, sep = "", max_distance = max_gap_size, missing_mark = gap_mark, check = False) for x in df['sn_1gram'] ]
    if ngram_is_inclusive:
        for i, g in enumerate(sn_skippy5grams):
            g.extend(list(df['sn_skippy4gram'])[i])
    #
    df['sn_skippy5gram'] = sn_skippy5grams

In [696]:
## check df
dropped_vars = [ 'sp_size', 'hyphen', 'period', 'sn_size' ]
if term_class == 'spell':
    extra = [ 'sn_1gram', 'sn_2gram', 'sn_3gram', 'sn_4gram',
             'sn_skippy2gram', 'sn_skippy3gram', 'sn_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]
else:
    extra = [ 'sp_1gram', 'sp_2gram', 'sp_3gram', 'sp_4gram',
             'sp_skippy2gram', 'sp_skippy3gram', 'sp_skippy4gram' ]
    dropped_vars.extend(extra)
    target_vars = [ x for x in df.columns if not x in dropped_vars ]    
#
df[target_vars]

Unnamed: 0,spell,sound,sp_1gram,sp_2gram,sp_3gram,sp_4gram,sp_skippy2gram,sp_skippy3gram,sp_skippy4gram,sp_skippy5gram
60730,#Püppchen#,#pʏppçən#,"[#, P, ü, p, p, c, h, e, n, #]","[#P, Pü, üp, pp, pc, ch, he, en, n#, #, P, ü, ...","[#Pü, Püp, üpp, ppc, pch, che, hen, en#, #P, P...","[#Püp, Püpp, üppc, ppch, pche, chen, hen#, #Pü...","[#P, #…ü, #…p, #…p, #…c, #…h, #…e, Pü, P…p, P…...","[#Pü, #P…p, #P…p, #P…c, #P…h, #P…e, #…üp, #…ü…...","[#Püp, #Pü…p, #Pü…c, #Pü…h, #Pü…e, #P…pp, #P…p...","[#Püpp, #Püp…c, #Püp…h, #Püp…e, #Pü…pc, #Pü…p…..."
66958,#Schrein#,#ʃɾəɪn#,"[#, S, c, h, r, e, i, n, #]","[#S, Sc, ch, hr, re, ei, in, n#, #, S, c, h, r...","[#Sc, Sch, chr, hre, rei, ein, in#, #S, Sc, ch...","[#Sch, Schr, chre, hrei, rein, ein#, #Sc, Sch,...","[#S, #…c, #…h, #…r, #…e, #…i, #…n, Sc, S…h, S…...","[#Sc, #S…h, #S…r, #S…e, #S…i, #S…n, #…ch, #…c…...","[#Sch, #Sc…r, #Sc…e, #Sc…i, #Sc…n, #S…hr, #S…h...","[#Schr, #Sch…e, #Sch…i, #Sch…n, #Sc…re, #Sc…r…..."
55336,#Nute#,#nuːte#,"[#, N, u, t, e, #]","[#N, Nu, ut, te, e#, #, N, u, t, e, #]","[#Nu, Nut, ute, te#, #N, Nu, ut, te, e#, #, N,...","[#Nut, Nute, ute#, #Nu, Nut, ute, te#, #N, Nu,...","[#N, #…u, #…t, #…e, #…#, Nu, N…t, N…e, N…#, ut...","[#Nu, #N…t, #N…e, #N…#, #…ut, #…u…e, #…u…#, #…...","[#Nut, #Nu…e, #Nu…#, #N…te, #N…t…#, #N…e#, #…u...","[#Nute, #Nut…#, #Nu…e#, #N…te#, #…ute#, Nute#,..."
68186,#Schwur#,#ʃvuːɐ#,"[#, S, c, h, w, u, r, #]","[#S, Sc, ch, hw, wu, ur, r#, #, S, c, h, w, u,...","[#Sc, Sch, chw, hwu, wur, ur#, #S, Sc, ch, hw,...","[#Sch, Schw, chwu, hwur, wur#, #Sc, Sch, chw, ...","[#S, #…c, #…h, #…w, #…u, #…r, #…#, Sc, S…h, S…...","[#Sc, #S…h, #S…w, #S…u, #S…r, #S…#, #…ch, #…c…...","[#Sch, #Sc…w, #Sc…u, #Sc…r, #Sc…#, #S…hw, #S…h...","[#Schw, #Sch…u, #Sch…r, #Sch…#, #Sc…wu, #Sc…w…..."
72586,#Stände#,#ʃtɛndə#,"[#, S, t, ä, n, d, e, #]","[#S, St, tä, än, nd, de, e#, #, S, t, ä, n, d,...","[#St, Stä, tän, änd, nde, de#, #S, St, tä, än,...","[#Stä, Stän, tänd, ände, nde#, #St, Stä, tän, ...","[#S, #…t, #…ä, #…n, #…d, #…e, #…#, St, S…ä, S…...","[#St, #S…ä, #S…n, #S…d, #S…e, #S…#, #…tä, #…t…...","[#Stä, #St…n, #St…d, #St…e, #St…#, #S…än, #S…ä...","[#Stän, #Stä…d, #Stä…e, #Stä…#, #St…nd, #St…n…..."
...,...,...,...,...,...,...,...,...,...,...
60941,#Radlern#,#ɾadlɛɾn#,"[#, R, a, d, l, e, r, n, #]","[#R, Ra, ad, dl, le, er, rn, n#, #, R, a, d, l...","[#Ra, Rad, adl, dle, ler, ern, rn#, #R, Ra, ad...","[#Rad, Radl, adle, dler, lern, ern#, #Ra, Rad,...","[#R, #…a, #…d, #…l, #…e, #…r, #…n, Ra, R…d, R…...","[#Ra, #R…d, #R…l, #R…e, #R…r, #R…n, #…ad, #…a…...","[#Rad, #Ra…l, #Ra…e, #Ra…r, #Ra…n, #R…dl, #R…d...","[#Radl, #Rad…e, #Rad…r, #Rad…n, #Ra…le, #Ra…l…..."
14136,#Burg#,#bʊʁk#,"[#, B, u, r, g, #]","[#B, Bu, ur, rg, g#, #, B, u, r, g, #]","[#Bu, Bur, urg, rg#, #B, Bu, ur, rg, g#, #, B,...","[#Bur, Burg, urg#, #Bu, Bur, urg, rg#, #B, Bu,...","[#B, #…u, #…r, #…g, #…#, Bu, B…r, B…g, B…#, ur...","[#Bu, #B…r, #B…g, #B…#, #…ur, #…u…g, #…u…#, #…...","[#Bur, #Bu…g, #Bu…#, #B…rg, #B…r…#, #B…g#, #…u...","[#Burg, #Bur…#, #Bu…g#, #B…rg#, #…urg#, Burg#,..."
58188,#Phosphat#,#fɔspɑt#,"[#, P, h, o, s, p, h, a, t, #]","[#P, Ph, ho, os, sp, ph, ha, at, t#, #, P, h, ...","[#Ph, Pho, hos, osp, sph, pha, hat, at#, #P, P...","[#Pho, Phos, hosp, osph, spha, phat, hat#, #Ph...","[#P, #…h, #…o, #…s, #…p, #…h, #…a, Ph, P…o, P…...","[#Ph, #P…o, #P…s, #P…p, #P…h, #P…a, #…ho, #…h…...","[#Pho, #Ph…s, #Ph…p, #Ph…h, #Ph…a, #P…os, #P…o...","[#Phos, #Pho…p, #Pho…h, #Pho…a, #Ph…sp, #Ph…s…..."
38409,#Insekt#,#ɪnzɛkt#,"[#, I, n, s, e, k, t, #]","[#I, In, ns, se, ek, kt, t#, #, I, n, s, e, k,...","[#In, Ins, nse, sek, ekt, kt#, #I, In, ns, se,...","[#Ins, Inse, nsek, sekt, ekt#, #In, Ins, nse, ...","[#I, #…n, #…s, #…e, #…k, #…t, #…#, In, I…s, I…...","[#In, #I…s, #I…e, #I…k, #I…t, #I…#, #…ns, #…n…...","[#Ins, #In…e, #In…k, #In…t, #In…#, #I…se, #I…s...","[#Inse, #Ins…k, #Ins…t, #Ins…#, #In…ek, #In…e…..."


In [697]:
## select data type and define doc_dict
import random
if "sp_" in term_type:
    base_type = "spell"
else:
    base_type = "sound"
doc_dict = { i: x for i, x in enumerate(df[base_type]) }
## check
random.sample(doc_dict.items(), 10)

since Python 3.9 and will be removed in a subsequent version.
  random.sample(doc_dict.items(), 10)


[(1447, '#Mischung#'),
 (803, '#Jammers#'),
 (110, '#Verdruss#'),
 (1044, '#Magiers#'),
 (1367, '#Gulasch#'),
 (1625, '#Werbegag#'),
 (957, '#Pfennige#'),
 (1541, '#Sohne#'),
 (128, '#Tausch#'),
 (1612, '#Mühlheim#')]

In [698]:
## select bots for analysis
enable_term_change = False # if you want to change term_type to save time and energy
if enable_term_change:
	term_type = 'sp_skippy4gram'
print(f"(changed) term_type: {term_type}")

## bot stands for 'bag-of-terms', a generalization of 'bag-of-words'
bots = [ x for x in df[term_type] if len(x) > min_bot_size ] # Crucially
import random
random.sample(bots, 3)

(changed) term_type: sp_skippy5gram


[['#Utop',
  '#Uto…i',
  '#Uto…e',
  '#Uto…#',
  '#Ut…pi',
  '#Ut…p…e',
  '#Ut…p…#',
  '#Ut…ie',
  '#Ut…i…#',
  '#Ut…e#',
  '#U…opi',
  '#U…op…e',
  '#U…op…#',
  '#U…o…ie',
  '#U…o…i…#',
  '#U…o…e#',
  '#U…pie',
  '#U…pi…#',
  '#U…p…e#',
  '#U…ie#',
  '#…topi',
  '#…top…e',
  '#…top…#',
  '#…to…ie',
  '#…to…i…#',
  '#…to…e#',
  '#…t…pie',
  '#…t…pi…#',
  '#…t…p…e#',
  '#…t…ie#',
  '#…opie',
  '#…opi…#',
  '#…op…e#',
  '#…o…ie#',
  '#…pie#',
  'Utopi',
  'Utop…e',
  'Utop…#',
  'Uto…ie',
  'Uto…i…#',
  'Uto…e#',
  'Ut…pie',
  'Ut…pi…#',
  'Ut…p…e#',
  'Ut…ie#',
  'U…opie',
  'U…opi…#',
  'U…op…e#',
  'U…o…ie#',
  'U…pie#',
  'topie',
  'topi…#',
  'top…e#',
  'to…ie#',
  't…pie#',
  'opie#',
  'Utopi',
  'Utop…e',
  'Utop…#',
  'Uto…ie',
  'Uto…i…#',
  'Uto…e#',
  'Ut…pie',
  'Ut…pi…#',
  'Ut…p…e#',
  'Ut…ie#',
  'U…opie',
  'U…opi…#',
  'U…op…e#',
  'U…o…ie#',
  'U…pie#',
  'topie',
  'topi…#',
  'top…e#',
  'to…ie#',
  't…pie#',
  'opie#',
  'topie',
  'topi…#',
  'top…e#',
  'to…ie#'

In [699]:
## generate dictionary
from gensim.corpora import Dictionary
diction = Dictionary(bots)
print(diction)

#apply_term_filtering = False
if apply_term_filtering:
    print(f"term filtering applied")
    diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
else:
    print(f"term filtering not applied")
## check
print(diction)
## generate DTM
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) > min_bot_size ] # Crucially

Dictionary<233200 unique tokens: ['#', '#P', '#Pü', '#Püp', '#Püpp']...>
term filtering applied
Dictionary<53883 unique tokens: ['#Pü', '#P…c', '#P…ch', '#P…c…e', '#P…e']...>


In [700]:
## HDP (n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [701]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	lang_dir_name = target_lang_dict[target_lang_key].split()[0]
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{term_type}{accent_status}.html"
	pyLDAvis.save_html(vis_data, vis_output)

In [702]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

## investigate topics
n_docs_to_show  = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    #topic_encoding = ", ".join(hdp.show_topic(topic_id))
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * rs + 0.001 * on + 0.001 * rs# + 0.001 * he + 0.001 * o…i + 0.001 * to + 0.001 * ik
nonzero count:  94
	0.9982: #Waldland#
	0.9981: #Traktors#
	0.9981: #Sprunges#
	0.9980: #Insektes#
	0.9980: #Goldrand#
	0.9979: #Grasland#
	0.9979: #Komikern#
	0.9979: #Speichel#
	0.9978: #Lektors#
	0.9978: #Vektors#
topic_id 1: 0.002 * ie + 0.001 * he + 0.001 * ie…# + 0.001 * s…r + 0.001 * ss + 0.001 * ht + 0.001 * s…e…#
nonzero count:  69
	0.9985: #Richtern#
	0.9982: #Trichter#
	0.9981: #Richter#
	0.9980: #Feigheit#
	0.9980: #Wählerin#
	0.9980: #Dienstag#
	0.9980: #Eckchens#
	0.9979: #Späherin#
	0.9979: #Aufstieg#
	0.9979: #Schlicks#
topic_id 2: 0.002 * rn + 0.002 * n…r + 0.002 * ne + 0.002 * rn# + 0.002 * ern + 0.002 * l…r + 0.002 * ern#
nonzero count:  70
	0.9985: #Trödlern#
	0.9984: #Wandlern#
	0.9982: #Händlern#
	0.9981: #Trödler#
	0.9979: #Gewinnes#
	0.9979: #Bannern#
	0.9979: #Gegners#
	0.9977: #Niethose#
	0.9977: #Windeier#
	0.9976: #Reglers#
topic_id 3: 0.002 * r…en + 0.002 

In [703]:
## HDP (n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [704]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	lang_dir_name = target_lang_dict[target_lang_key].split()[0]
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{accent_status}{term_type}.html"
	pyLDAvis.save_html(vis_data, vis_output)

In [705]:
## topic investigation
import numpy as np
import HDP_helper

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: {len(probs.nonzero()[0])}")
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * ne + 0.001 * rs + 0.001 * ke + 0.001 * k…n + 0.001 * he + 0.001 * rs# + 0.001 * et
nonzero count: 144
	0.9986: #Spinette#
	0.9985: #Spinetts#
	0.9985: #Strähnen#
	0.9984: #Vulkanen#
	0.9982: #Stilette#
	0.9982: #Waldland#
	0.9982: #Strähne#
	0.9982: #Herzchen#
	0.9982: #Vulkane#
	0.9982: #Traktors#
topic_id 1: 0.002 * el…# + 0.001 * ls + 0.001 * ls# + 0.001 * t…l + 0.001 * ln + 0.001 * t…s + 0.001 * ln#
nonzero count: 146
	0.9985: #Gestelle#
	0.9984: #Gestells#
	0.9984: #Muscheln#
	0.9983: #Stacheln#
	0.9983: #Büscheln#
	0.9982: #Westwall#
	0.9982: #Bläschen#
	0.9981: #Westfale#
	0.9981: #Restwert#
	0.9980: #Gitters#
topic_id 2: 0.001 * l# + 0.001 * g…n + 0.001 * ge…# + 0.001 * el# + 0.001 * he + 0.001 * gen + 0.001 * a#
nonzero count: 120
	0.9984: #Jahrgang#
	0.9983: #Lehrgang#
	0.9981: #Spielers#
	0.9981: #Geflügel#
	0.9980: #Vorwände#
	0.9980: #Hirschen#
	0.9980: #Saustall#
	0.9979: #Unrechte#
	0.9979: #Speichel#
	0.9979: #Seminars#
topic_id 3: 0.001 * rs + 0.001

In [706]:
## HDP (n_topics = 15)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 15
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [707]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	lang_dir_name = target_lang_dict[target_lang_key].split()[0]
	vis_output = f"results/LDAvis/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-HDP-max_ntop{max_n_topics}-{accent_status}{term_type}.html"
	pyLDAvis.save_html(vis_data, vis_output)

In [708]:
## topic investigation
import numpy as np
import HDP_helper
reload_module = True
if reload_module:
    import importlib
    importlib.reload(HDP_helper)

documents_topics = np.zeros([hdp.m_T, len(corpus)])
for doc_id, c in enumerate(corpus):
    for topic_id, prob in hdp[c]:
        documents_topics[topic_id][doc_id] = prob

n_docs_to_show = 10
n_terms_to_show = 7
hdp.optimal_ordering()
for topic_id, probs in enumerate(documents_topics):
    print(f"==============")
    topic_t = hdp.print_topic(topic_id, topn = n_terms_to_show)
    print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
    print(f"nonzero count: ", len(probs.nonzero()[0]))
    for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
        doc = doc_dict[doc_id]
        print(f"\t{probs[doc_id]:0.4f}: {doc}")

topic_id 0: 0.001 * el…# + 0.001 * ls + 0.001 * ls# + 0.001 * t…l + 0.001 * ne + 0.001 * ie + 0.001 * be
nonzero count:  317
	0.9986: #Gestelle#
	0.9985: #Algerien#
	0.9985: #Gestells#
	0.9985: #Algerier#
	0.9985: #Besitzer#
	0.9985: #Muscheln#
	0.9984: #Kapiteln#
	0.9984: #Büscheln#
	0.9983: #Westwall#
	0.9983: #Besätzen#
topic_id 1: 0.001 * rs + 0.001 * s…s + 0.0 * rs# + 0.0 * au + 0.0 * ss + 0.0 * s…r + 0.0 * l#
nonzero count:  261
	0.9987: #Brechers#
	0.9987: #Brechern#
	0.9985: #Gasofens#
	0.9985: #Zuhörern#
	0.9985: #Saxofone#
	0.9983: #Saxofon#
	0.9983: #Zuhörer#
	0.9983: #Gasofen#
	0.9983: #Masseuse#
	0.9982: #Masseurs#
topic_id 2: 0.001 * k…n + 0.0 * rs + 0.0 * au + 0.0 * o…t + 0.0 * ie + 0.0 * o…r + 0.0 * re
nonzero count:  238
	0.9986: #Klosters#
	0.9985: #Kleinode#
	0.9985: #Torwarts#
	0.9984: #Stickern#
	0.9983: #Treffens#
	0.9983: #Torwart#
	0.9982: #Kleinod#
	0.9982: #Kloster#
	0.9982: #Entwurfs#
	0.9982: #Tastatur#
topic_id 3: 0.001 * he + 0.001 * l# + 0.0 * he…# + 0.0 