In [2]:
import sys, os, re
import pandas as pd
import ast

In [3]:
## settings
## This cell defines the run-wide configuration: verbosity, the kind of term
## (spelling- or sound-based n-grams), the gap mark used inside skippy n-grams,
## and the accent-status suffix. `term_type` and `accent_status` are printed
## here so the chosen configuration is visible in the cell output.
verbose           = False

## term
term_classes      = [ 'spell', 'sound' ]
term_class        = term_classes[0]
n_for_ngram       = 4
ngram_is_skippy   = True
if ngram_is_skippy:
    term_type     = f"skippy{n_for_ngram}gram"
else:
    term_type     = f"{n_for_ngram}gram"
## prefix the term type with its class: "sn_" for sound, "sp_" otherwise
if term_class == 'sound':
    term_type = "sn_" + term_type
else:
    term_type = "sp_" + term_type
print(f"term_type: {term_type}")

## gap mark
gap_mark          = "…"
print(f"gap_mark: {gap_mark}")

## accent handling: the suffix is non-empty only for sound-based terms
suppress_accents   = True
if term_class == 'sound':
    if suppress_accents:
        accent_status = "-unaccented"
    else:
        # every branch must bind `accent_status`, otherwise the print below
        # raises NameError when accents are kept
        accent_status = "-accented"
else:
    accent_status = ""
print(f"accent_status: {accent_status}")

term_type: sp_skippy4gram
gap_mark: …
accent_status: 


In [4]:
## target language
## Maps a language key to its display name. A key must be part of a file name.
target_lang_dict = {    'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'en_N_only' : 'English noun (WN)',
                        'en_V_only' : 'English verb (WN)',
                        'en_A_only' : 'English adj (WN)',
                        'en_R_only' : 'English adv (WN)',
                        'ar'    : 'Arabic',
                        'de'    : 'German',
                        'de_N_only' : 'German Nouns',
                        'de_non_N_only' : 'German Non-nouns',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'ir'    : 'Irish',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
## proper language selection: pick the target by position in this list
target_lang_keys = [    'en_US', # 0
                        'en_UK', # 1
                        'en_N_only', # 2
                        'en_V_only', # 3
                        'en_A_only', # 4
                        'en_R_only', # 5
                        'ar', # 6
                        'de', # 7
                        'de_N_only', # 8
                        'de_non_N_only', # 9
                        'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC',
                        'is', 'nl', 'ro', 'sw',
                        'ir' # This lacks sound
                    ]
## check
target_lang_key  = target_lang_keys[4]
print(f"target_lang_key: {target_lang_key}")
print(f"target lang: {target_lang_dict[target_lang_key]} [{target_lang_key}]")
## target_class [effective only for Irish]
## Keep this a string: it is interpolated into the data file name, where None
## would be rendered as the literal text "None".
target_class = ""
if target_lang_key == "ir":
    target_classes = [ 'adjectives', 'nouns', 'verbs' ]
    # Valid indices are 0..2; index 3 raises IndexError on this 3-item list.
    # NOTE(review): 'verbs' is assumed to be the intended class -- confirm.
    target_class = f"-{target_classes[2]}"
print(f"target_class: {target_class}")
## the directory name is the first word of the display name (e.g. "English")
lang_dir_name = target_lang_dict[target_lang_key].split()[0]
print(f"lang_dir_name: {lang_dir_name}")


target_lang_key: en_A_only
target lang: English adj (WN) [en_A_only]
target_class: 
lang_dir_name: English


In [5]:
## read data from a file
import pandas as pd
import ast
## Build the path of the terms-by-topics CSV from the current settings,
## load it, and turn every cell back into a Python object.
max_n_topics = 45
##
use_sample = False
if use_sample:
    data_file = f"results/terms-by-topics-raw/samples/hdp{max_n_topics}_topics_raw.csv"
else:
    data_file = f"results/terms-by-topics-raw/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-topics{max_n_topics}-{term_type}{accent_status}.csv"
print(data_file)

## read a file and evaluate its content
raw_df = pd.read_csv(data_file, header = None)
for col in raw_df:
    # Each cell is parsed with ast.literal_eval so it becomes a Python object
    # instead of a string; evaluation is crucial for the cells that follow.
    raw_df[col] = raw_df[col].apply(ast.literal_eval)
##
if verbose:
    # A bare `raw_df` inside an `if` block is never rendered by Jupyter
    # (only the last top-level expression of a cell is), so display explicitly.
    display(raw_df)

results/terms-by-topics-raw/English/English adj (WN)-topics45-sp_skippy4gram.csv


In [6]:
## supplement boundaries: this modification improves alignment quality
## Every (term, score) pair gets `gap_mark` prepended to its term; the mark is
## also appended unless the term already ends with `boundary_mark`.
## NOTE: this rewrites `raw_df` in place, so running the cell a second time
## adds the gap marks again -- reload `raw_df` before re-running.
supplement_boundaries = True
boundary_mark = "#"
if supplement_boundaries:
    for col in raw_df:
        # str.endswith (rather than term[-1]) also copes with an empty term
        raw_df[col] = [
            (f"{gap_mark}{term}", score) if term.endswith(boundary_mark)
            else (f"{gap_mark}{term}{gap_mark}", score)
            for term, score in raw_df[col]
        ]
##
if verbose:
    # A bare `raw_df` inside an `if` block is never rendered by Jupyter,
    # so display explicitly.
    display(raw_df)

In [7]:
## run the following to install the string2string package if necessary
#!pip3 install -U string2string

In [8]:
## run the following if string2string needs it
#!pip install -U ipywidgets

In [26]:
## integrate redundant terms by superposition
## For every topic (column of `topics_df`), collect its terms, ask
## term_handler.derive_terms_by_superposition for superposed terms, drop the
## ones that were already among the raw terms, and print the rest, longest first.
topics_df = raw_df
##
import term_handler
reload_module = True
if reload_module:
    # pick up edits made to term_handler.py without restarting the kernel
    import importlib
    importlib.reload(term_handler)
## Good parameterization is crucial for convergence. Here are two samples of good ones: one for
## global and another for local alignment. NB: different values may produce a loop that never exits.
## Acceptable setting for global alignment
global_align_params = {
            'gap_weight'      : -2, # This may not be smaller
            'match_weight'    :  3,
            'mismatch_weight' : -2,
            'theta'           :  0 # If it takes too long to complete, try out a larger value.
            }
## Acceptable setting for local alignment: Local alignment is more robust for variation
local_align_params = {
            'gap_weight'      : -2.5,
            'match_weight'    :  1,
            'mismatch_weight' : -2.5,
            'theta'           :  0 # If it takes too long to complete, try out a larger value.
            }

## settings for execution
align_globally = False
if align_globally:
    method = "globally"
    align_params = global_align_params
else:
    method = "locally"
    align_params = local_align_params
expansive      = True
greedy         = True # This attribute is largely incompatible with global alignment
detailed       = False
check          = False

def effective_term_length(term):
    """Return the length of `term` without counting gap marks and square brackets."""
    return len(term) - term.count(gap_mark) - term.count("[") - term.count("]")

## main
print(f"method: {method}")
print(f"greedy: {greedy}")
print(f"expansive: {expansive}")
print(f"detailed: {detailed}")
print(f"check: {check}")
for tid in topics_df:
    topic_data = topics_df[tid]
    if check:
        print(topic_data)
    ## collect term -> score; entries whose score is not numeric are reported and skipped
    terms_as_dict = { }
    for data in topic_data:
        term, score = data[0], data[1]
        try:
            val = float(score)
            terms_as_dict[str(term)] = val
        except (TypeError, ValueError):
            # TypeError covers non-string, non-numeric scores such as None
            print(f"#{score}")
    if check:
        print(terms_as_dict)
    ## recursively update data
    T = list(terms_as_dict.keys())
    O = term_handler.derive_terms_by_superposition(T, align_globally = align_globally, expansive = expansive, greedy = greedy, check = check, **align_params)
    ## show result
    if detailed:
        T = sorted(T, key = effective_term_length, reverse = True)
        print(f"topic {tid:03} has {len(T):03} raw terms: {T}")
    ## Filter out terms already in originals (a set makes the membership test cheap)
    raw_terms = set(T)
    O = [ x for x in O if x not in raw_terms ]
    O = sorted(O, key = effective_term_length, reverse = True)
    print(f"topic {tid:03} found {len(O):03} {method} superposed terms: {O}")
##
print(f"# Topic-wise term integration for HDP {max_n_topics} topics via {target_lang_dict[target_lang_key]} {target_class}{term_type}{accent_status}")

method: locally
greedy: True
expansive: True
detailed: False
check: False
topic ID 000 found 142 locally superposed terms: ['…r…[i,t]…e…', 'ive…', 't…ve…', '…tiv', 'tiv', 'cal…', '…ica', '…a…iv', '…r…ti', '…e…ti', 'tiv…', '…c…iv', '…r…iv', '…at…v', '…cti', '…e…t…v', '…ate', '…ati', 'ti…e…', '…e…iv', '[i,t]', 've…', 'iv', '…iv', '…t…v', '…ve…', 't…v', '…ti', 'ti', '…ct', 'ct…', 'ca', 'al…', '…ic', 'ic', '…a…v', 'iv…', '…iv…', 'ic…', '…ic…', '…e…v', '…r…v', '…ec', '…si', '…ti…', 'ti…', '…r…t', 'r…t', '…e…t', '…c…i', 'te…', '…te…', 'te', '…te', '…ri', '…ri…', 'i…l…', '…e…t…', 't…e…', '…t…e…', '…at', '…at…', 'at…', '…r…i', '…i…e…', 'i…e…', '…i…i', 't…v…', '…to', '…c…v', 'ct', '…ra', 'ed…', 'at', '…s…i', 'i…a', '…i…a', 'i…a…', 'ia', 'ia…', '…er…', 'er…', '…p…i', '…t…i', '…e…i', '…e…i…', 'r…l…', '…al…', 'tv', '…tv', '…rt', 'rt', '…et', '…ci', 'ie…', '…ia', '…ei', '…r…t…', 'r…t…', 'rt…', '…s…v', 'e…', '…i', '…t', 't…', '…v', '…t…', 't', '…c', 'l…', 'a…', '…v…', 'v…', '…a', '…a…', 'v', 'c…', '