In [490]:
import sys, os, re
import pandas as pd

In [491]:
## install String2String if necessary
#!pip install string2string

In [492]:
## Run the following if string2string needs it
#!pip install -U ipywidgets

In [493]:
## You can test if string2string works with the following:
#from string2string.alignment import NeedlemanWunsch

In [494]:
## settings
## Notebook-wide switches. The names defined here (verbose, term_type, gap_mark,
## accent_status, ...) are read by later cells.
verbose           = False

## term
## term_class selects between spelling-based and sound-based terms.
term_classes      = [ 'spell', 'sound' ]
term_class        = term_classes[1]
n_for_ngram       = 5
ngram_is_skippy   = True
## term_type is built as "<sp|sn>_[skippy]<n>gram"
if ngram_is_skippy:
    term_type     = f"skippy{n_for_ngram}gram"
else:
    term_type     = f"{n_for_ngram}gram"
if term_class == 'sound':
    term_type = "sn_" + term_type
else:
    term_type = "sp_" + term_type
print(f"term_type: {term_type}")

## gap mark
gap_mark          = "…"
print(f"gap_mark: {gap_mark}")

## accent handling
## accent_status is a file-name suffix; it is empty for spelling-based terms.
suppress_accents   = True
if term_class == 'sound':
    if suppress_accents:
        accent_status = "-unaccented"
    else:
        ## was misspelled "accent_stratus", which left accent_status undefined
        accent_status = "-accented"
else:
    accent_status = ""
print(f"accent_status: {accent_status}")

term_type: sn_skippy5gram
gap_mark: …
accent_status: -unaccented


In [495]:
## target language
## Maps a language key to its display name. A key must be part of a file name,
## and the display name itself is used when the data file path is built.
target_lang_dict = {    'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'en_N_only' : 'English noun (WN)',
                        'en_V_only' : 'English verb (WN)',
                        'en_A_only' : 'English adj (WN)',
                        'en_R_only' : 'English adv (WN)',
                        'ar'    : 'Arabic',
                        'de'    : 'German',
                        'de_N_only' : 'German Nouns',
                        'de_non_N_only' : 'German Non-nouns',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'ir'    : 'Irish',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
## proper language selection
## The list exists so that a language can be picked by position below.
target_lang_keys = [    'en_US', # 0
                        'en_UK', # 1
                        'en_N_only', # 2
                        'en_V_only', # 3
                        'en_A_only', # 4
                        'en_R_only', # 5
                        'ar', # 6
                        'de', # 7
                        'de_N_only', # 8
                        'de_non_N_only', # 9
                        'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC',
                        'is', 'nl', 'ro', 'sw',
                        'ir' # This lacks sound
                    ]
## check
target_lang_key  = target_lang_keys[4]
print(f"target_lang_key: {target_lang_key}")
print(f"target lang: {target_lang_dict[target_lang_key]} [{target_lang_key}]")
## target_attr [effective only for Irish]
target_class = ""
#target_class = None # This causes an unreadable error
if target_lang_key == "ir":
    target_classes = [ 'adjectives', 'nouns', 'verbs' ]
    ## The original index 3 is out of range for this 3-element list (IndexError).
    ## Index 2 ('verbs') is assumed to be the intended third entry -- TODO confirm.
    target_class = f"-{target_classes[2]}"
print(f"target_class: {target_class}")
## the directory name is the first word of the display name (e.g. "English")
lang_dir_name = target_lang_dict[target_lang_key].split()[0]
print(f"lang_dir_name: {lang_dir_name}")


target_lang_key: en_A_only
target lang: English adj (WN) [en_A_only]
target_class: 
lang_dir_name: English


In [496]:
## read data from a file
import pandas as pd
import ast
## number of topics; part of the data file name
max_n_topics = 90
## boundary handling options read by the boundary-supplement cell
supplement_boundaries = True
boundary_mark = "#"
## choose between the bundled sample and the language-specific result file
use_sample = False
if use_sample:
    data_file = f"results/terms-by-topics-raw/samples/hdp{max_n_topics}_topics_raw.csv"
else:
    data_file = f"results/terms-by-topics-raw/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-topics{max_n_topics}-{term_type}{accent_status}.csv"
print(data_file)

## read a file and evaluate its content
raw_df = pd.read_csv(data_file, header = None)
for col in raw_df:
    ## The following evaluation is crucial: each cell is read as a string and
    ## must be parsed into a Python object (later cells unpack it as a pair).
    raw_df[col] = raw_df[col].apply(ast.literal_eval)
##
if verbose:
    ## A bare `raw_df` inside an `if` is never displayed, so print it explicitly.
    print(raw_df)

results/terms-by-topics-raw/English/English adj (WN)-topics90-sn_skippy5gram-unaccented.csv


In [497]:
## supplement boundaries
## Wrap every term in gap marks: a term that ends with boundary_mark gets a
## leading gap mark only, any other term gets one on both sides.
## NOTE(review): this rewrites raw_df in place, so re-running the cell adds
## another layer of gap marks.
if supplement_boundaries:
    for col in raw_df:
        ## endswith() instead of term[-1] so that an empty term does not raise IndexError
        raw_df[col] = [
            (f"{gap_mark}{term}" if term.endswith(boundary_mark) else f"{gap_mark}{term}{gap_mark}", score)
            for term, score in raw_df[col]
        ]
##
if verbose:
    ## A bare `raw_df` inside an `if` is never displayed, so print it explicitly.
    print(raw_df)

In [498]:
## sequence aligners: shared import and the optional BioPython-based helper
import numpy as np
## alternative BioPython entry point, kept for reference
#from Bio.Align import PairwiseAligner as aligner
## the helper below is only defined when this flag is switched on
use_BioPy = False
if use_BioPy:
    def biopy_show_alignments(s: str, t: str, check = False):
        "print every alignment that pairwise2.align.globalxx yields for str(s) and str(t)"
        ## `check` is accepted but not read here
        from Bio import pairwise2
        alignments = pairwise2.align.globalxx(str(s), str(t))
        for alignment in alignments:
            formatted = pairwise2.format_alignment(*alignment)
            print(formatted)
## string2string based
def NW_superpose(s: str, t: str, gap_char = "-", with_score_matrix = False, check = False, **params):
    """Globally align s and t (Needleman-Wunsch) and return their superposition.

    Parameters:
        s, t: the two sequences to align.
        gap_char: character the aligner uses for gaps; it is also handed to
            str_superpose so that both steps agree on what a gap looks like.
        with_score_matrix: when check is also true, print the score matrix.
        check: print the alignment, the score and the superposed result.
        **params: must contain 'gap_weight', 'match_weight', 'mismatch_weight'
            and 'theta'; any other keyword is accepted and ignored.

    Returns:
        The value returned by str_superpose for the aligned pair, or None when
        either input is empty or the score is below params['theta'].
        The score is the sum of all score-matrix entries divided by the length
        of the shorter input.

    Raises:
        KeyError: if one of the required entries is missing from params.
    """
    size = min(len(s), len(t))
    if size == 0:
        ## nothing to align; this also avoids dividing the score by zero
        return None
    from string2string.alignment import NeedlemanWunsch
    nw = NeedlemanWunsch(#gap_weight = -0, mismatch_weight = -2, match_weight = 2,
        gap_weight = params['gap_weight'],
        match_weight = params['match_weight'],
        mismatch_weight = params['mismatch_weight'],
        gap_char = gap_char)
    #
    s_aligned, t_aligned, score_matrix = nw.get_alignment(s, t, return_score_matrix = True)
    #
    score = np.sum(score_matrix)/size
    if score >= params['theta']:
        if check:
            nw.print_alignment(s_aligned, t_aligned)
            print(f"score:{score}")
        if check and with_score_matrix:
            print(score_matrix)
        ## pass gap_char on: without it str_superpose falls back to its own default
        ## gap mark and treats the aligner's gaps as mismatches whenever they differ
        superposed = str_superpose(s_aligned, t_aligned, gap_mark = gap_char, check = False)
        if check:
            print(f"superposed: {superposed}")
        return superposed
    return None
#
def SW_superpose(s: str, t: str, gap_char = "-", with_score_matrix = False, check = False, **params):
    """Locally align s and t (Smith-Waterman) and return their superposition.

    Parameters:
        s, t: the two sequences to align.
        gap_char: character the aligner uses for gaps; it is also handed to
            str_superpose so that both steps agree on what a gap looks like.
        with_score_matrix: when check is also true, print the score matrix.
        check: print the alignment, the score and the superposed result.
        **params: must contain 'gap_weight', 'match_weight', 'mismatch_weight'
            and 'theta'; any other keyword is accepted and ignored.

    Returns:
        The value returned by str_superpose for the aligned pair, or None when
        either input is empty or the score is below params['theta'].
        The score is the sum of all score-matrix entries divided by the length
        of the shorter input.

    Raises:
        KeyError: if one of the required entries is missing from params.
    """
    size = min(len(s), len(t))
    if size == 0:
        ## nothing to align; this also avoids dividing the score by zero
        return None
    from string2string.alignment import SmithWaterman
    sw = SmithWaterman(#gap_weight = -0, match_weight = 2, mismatch_weight = -2,
        gap_weight = params['gap_weight'],
        match_weight = params['match_weight'],
        mismatch_weight = params['mismatch_weight'],
        gap_char = gap_char)
    #
    s_aligned, t_aligned, score_matrix = sw.get_alignment(s, t, return_score_matrix = True)
    #
    score = np.sum(score_matrix)/size
    if score >= params['theta']:
        if check:
            sw.print_alignment(s_aligned, t_aligned)
            print(f"score:{score}")
        if check and with_score_matrix:
            print(score_matrix)
        ## pass gap_char on: without it str_superpose falls back to its own default
        ## gap mark and treats the aligner's gaps as mismatches whenever they differ
        superposed = str_superpose(s_aligned, t_aligned, gap_mark = gap_char, check = False)
        if check:
            print(f"superposed: {superposed}")
        return superposed
    return None

## superpose
def str_superpose(s_raw: str, t_raw: str, sep = "|", joint = "",
                  gap_mark = gap_mark, reduce = True, greedy = False,
                  check = False):
    """Merge two aligned, sep-delimited sequences into one string.

    Parameters:
        s_raw, t_raw: aligned sequences whose tokens are separated by sep,
            optionally surrounded by whitespace.
        sep: token separator, taken literally.
        joint: string used to join the merged tokens.
        gap_mark: token that stands for a gap; at a position where one side is
            a gap the other side's token is taken.
        reduce: collapse each run of consecutive gap marks in the result into one.
        greedy: on a mismatch (two different non-gap tokens) emit "[x,y]"
            instead of giving up.
        check: print the token lists.

    Returns:
        The merged string, or None when a mismatch is met and greedy is false.

    Raises:
        AssertionError: if the two sequences split into different numbers of tokens.
    """
    ## sep is escaped so that the default "|" is a literal bar rather than regex
    ## alternation; the raw string keeps "\s" from being an invalid string escape.
    splitter = re.compile(rf"\s*{re.escape(sep)}\s*")

    def tokenize(raw):
        ## strip padding around the outermost tokens and drop empty pieces
        pieces = ( piece.strip() for piece in splitter.split(raw) )
        return [ piece for piece in pieces if len(piece) > 0 and piece != sep ]

    s_split = tokenize(s_raw)
    t_split = tokenize(t_raw)
    if check:
        print(f"s_split: {s_split}")
        print(f"t_split: {t_split}")
    assert len(s_split) == len(t_split), "aligned sequences differ in token count"
    S = [ ]
    for x, y in zip(s_split, t_split):
        if x == y:
            S.append(x)
        elif x == gap_mark:
            S.append(y)
        elif y == gap_mark:
            S.append(x)
        else:
            ## handles mismatch case
            if greedy:
                S.append(f"[{x},{y}]")
            else:
                return None
    result = joint.join(S)
    if reduce and gap_mark:
        ## escape gap_mark so that marks such as "." or "*" are matched literally;
        ## the callable replacement keeps backslashes in gap_mark from being
        ## read as group references
        result = re.sub(rf"(?:{re.escape(gap_mark)})+", lambda _m: gap_mark, result)
    return result

In [499]:
## integrate redundant terms by superposition
## For every topic (column), every pair of terms is aligned and, where the
## alignment passes the threshold, superposed; the distinct superpositions are
## printed per topic.
topics_df = raw_df
## Good parameterization is crucial for good outcome. Here is a sample of good parameterization.
#params_dict = { 'gap_weight'      : -2,
#                'match_weight'    :  3,
#                'mismatch_weight' : -2,
#                'theta'           :  0 }
## Note that different valuation would result in an endless loop.
params_dict = { 'gap_weight'      : -2, # This may not be small
                'match_weight'    :  3,
                'mismatch_weight' : -2,
                'theta'           :  0 # If it takes too long, try a larger value.
                }

## Evaluation
align_globally = True
inclusive      = False
expansive      = True
check = False
## global (Needleman-Wunsch) or local (Smith-Waterman) alignment
superpose = NW_superpose if align_globally else SW_superpose
for tid in topics_df:
    topic_d = topics_df[tid]
    if check:
        print(topic_d)
    ## collect the terms of this topic; the dict keeps each term only once
    terms_as_dict = { }
    for d in topic_d:
        term, score = d[0], d[1]
        try:
            val = float(score)
            terms_as_dict[str(term)] = val
        except ValueError:
            print(f"#{score}")
    if check:
        print(terms_as_dict)
    ## recursively update data
    terms = list(terms_as_dict.keys())
    remains = terms
    O = [ ]
    while len(remains) > 0:
        seen = [ ]
        ## NOTE: `one` is removed from `remains` at the end of each step while the
        ## list is being iterated, so the for-loop skips the element that follows;
        ## skipped elements are handled by a later pass of the while-loop.
        ## This order is kept on purpose because it determines the order of O.
        for one in remains:
            seen.append(one)
            unseen = [ x for x in remains if not x in seen ]
            for another in unseen:
                if another != one:
                    ## greedy/inclusive are passed as before; with the aligner
                    ## signatures taking **params they arrive as extra keywords
                    superposed = superpose(one, another,
                                           gap_char = gap_mark, with_score_matrix = False,
                                           greedy = True, inclusive = True,
                                           check = check, **params_dict)
                    ## evaluate result
                    if superposed is None:
                        ## no usable superposition; previously a None could reach
                        ## the inclusive branch below and raise a TypeError
                        continue
                    if not superposed in O:
                        O.append(superposed)
                    if inclusive:
                        if not superposed in remains:
                            remains.append(superposed)
                            if expansive:
                                ## also record each variant with one non-initial gap mark removed
                                E = []
                                for loc in [ m.start() for m in re.finditer(re.escape(gap_mark), superposed)]:
                                    if 0 < loc < len(superposed):
                                        e = superposed[:loc] + superposed[loc+1:]
                                        E.append(e)
                                for e in E:
                                    if e not in O:
                                        O.append(e)
                            #unseen.append(superposed)
            ## take out the processed one
            remains.remove(one)
    ## longest first, counting only non-gap characters
    O = sorted(O, key = lambda x: (len(x) - x.count(gap_mark)), reverse = True)
    print(f"topic {tid:02}: {O}")
##
print(f"# topic-wise term integration for HDP {max_n_topics} topics via {target_lang_dict[target_lang_key]} {target_class}{term_type}{accent_status}")

topic 00: ['…ə…aɪzd…', '…ə…aɪ…zd…', '…aɪzd…', '…ə…aɪz…', '…ə…aɪ…d…', '…ə…a…zd…', '…ɪ…bə…ɫ…', '…ɪ…bəb…', '…ɪ…bəs…', '…ə…ɪzd…', '…ɪ…iəs…', '…ɪ…iə…ɫ…', '…ɪ…iəb…', '…ɪ…ə…əɫ…', '…ɪzd…', '…iəɫ…', '…ə…aɪ…', '…ɪʃə…', '…ə…a…z…', '…ɪ…bə…', '…ɪ…əb…', '…ɫaɪ…', '…ɫ…ɪz…', '…ə…a…d…', '…ə…ɪz…', '…ə…zd…', '…a…zd…']
topic 01: ['…ə…ɛɹi…', '…n…ɹi…', '…ʃən…', '…t…ɹi…', '…ɛ…ən…', '…ænd…', '…n…ɪŋ…', '…ɛɹi…', '…nɛɹ…']
topic 02: ['…təbəɫ…', '…təb…əɫ…', '…ɛ…əbəɫ…', '…n…əbəɫ…', '…ɛ…əb…əɫ…', '…t…bəɫ…', '…təbə…', '…tə…əɫ…', '…təb…ɫ…', '…n…əbə…', '…n…bəb…', '…n…bəɫ…', '…ɛ…əbə…', '…ɛ…bəb…', '…ɛ…bə…ɫ…', '…ɛ…bəɫ…', '…p…əbə…', '…ə…əbə…', '…n…əb…ɫ…', '…ɛ…ə…əɫ…', '…ɛ…əb…ɫ…', '…t…bə…', '…t…b…ɫ…', '…ɝ…əɫ…', '…n…bə…', '…ə…əb…', '…ə…bə…', '…ɛ…bə…', '…s…bə…', '…n…b…ɫ…', '…ɛ…b…ɫ…', '…n…əb…', '…ɛ…əb…', '…p…əb…', '…ɹ…əb…']
topic 03: ['…ɛktɪ…', '…jəɫɝ…', '…ɪkəɫ…', '…ɛ…tɪv…', '…kjəɫ…', '…t…ɪk…ɫ…', '…k…əɫɝ…', '…jəɫ…', '…jə…ɝ…', '…j…ɫɝ…', '…ɛk…ɪ…', '…kjə…', '…kj…ɫ…', '…ɛ…ɪv…', '…k…ɪv…', '…k…ɫɝ…', '…ɛkt…']
topic 04: ['…ʃən…əɫ…', '…ʃə