In [46]:
import sys, os, re
import pandas as pd

In [47]:
## install String2String if necessary
#!pip install string2string

In [48]:
## Run the following if string2string needs it
#!pip install -U ipywidgets

In [49]:
#import string2string
from string2string.alignment import NeedlemanWunsch

In [50]:
#!pip install pyalign # fails to install on 3.10, 3.11, 3.9

In [51]:
## Run the following if BioPython is missing
#!pip install biopython

In [52]:
## Bio is the abbreviation of BioPython
#from Bio import pairwise2 # obsolete code
#from Bio.Align import PairwiseAligner as aligner

In [123]:
## aligners
import numpy as np
#from Bio.Align import PairwiseAligner as aligner
## BioPython based
use_BioPy = False
if use_BioPy:
    def biopy_show_alignments(s: str, t: str, check = False):
        from Bio import pairwise2
        #result = pairwise2.align.globalxx(s, t)
        #print(pairwise2.format_alignment(*result[0]))
        for a in pairwise2.align.globalxx(str(s), str(t)):
            print(pairwise2.format_alignment(*a))
## string2string based
def nw_show_alignments(s: str, t: str, theta: float = 0, gap_char = "-", with_score = False, *args, **params):
    size = min(len(s), len(t))
    from string2string.alignment import NeedlemanWunsch
    nw = NeedlemanWunsch(
        #gap_weight = -0, mismatch_weight = -2, match_weight = 2,
        gap_weight = params[gap_weight],
        match_weight = params[match_weight],
        mismatch_weight = params[mismatch_weight],
        gap_char = gap_char)
    s_aligned, t_aligned, score_matrix = nw.get_alignment(s, t,
                                                          return_score_matrix = True)
    score = np.sum(score_matrix)/size
    if score >= theta:
        print(f"score:{score}")
        print(f'The alignment between "{s}" and "{t}":')
        nw.print_alignment(s_aligned, t_aligned)
        if with_score:
            print(score_matrix)
#
def sw_show_alignments(s: str, t: str, theta: float = 0, gap_char = "-", with_score = False):
    size = min(len(s), len(t))
    from string2string.alignment import SmithWaterman
    sw = SmithWaterman(gap_weight = -0, match_weight = 2, mismatch_weight = -2, gap_char = gap_char)
    s_aligned, t_aligned, score_matrix = sw.get_alignment(s, t,
                                                          return_score_matrix = True)
    score = np.sum(score_matrix)/size
    if score >= theta:
        print(f"score:{score}")
        print(f'The alignment between "{s}" and "{t}":')
        sw.print_alignment(s_aligned, t_aligned)
        if with_score:
            print(score_matrix)

In [54]:
## target language
## a key must be part of a file name 
target_lang_dict = {    'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'en_N_only' : 'English noun (WN)',
                        'en_V_only' : 'English verb (WN)',
                        'en_A_only' : 'English adj (WN)',
                        'en_R_only' : 'English adv (WN)',
                        'ar'    : 'Arabic',
                        'de'    : 'German',
                        'de_N_only' : 'German Nouns',
                        'de_non_N_only' : 'German Non-nouns',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'ir'    : 'Irish',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
## proper language selection
target_lang_keys = [    'en_US', # 0
                        'en_UK', # 1
                        'en_N_only', # 2
                        'en_V_only', # 3
                        'en_A_only', # 4
                        'en_R_only', # 5
                        'ar', # 6
                        'de', # 7
                        'de_N_only', # 8
                        'de_non_N_only', # 9
                        'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC',
                        'is', 'nl', 'ro', 'sw',
                        'ir' # This lacks sound
                    ]
## check
target_lang_key  = target_lang_keys[3]
print(f"target_lang_key: {target_lang_key}")
print(f"target lang: {target_lang_dict[target_lang_key]} [{target_lang_key}]")
## target_attr [effective only for Irish]
target_class = ""
#target_class = None # This causes an unrediable error
if target_lang_key == "ir":
    target_classes = [ 'adjectives', 'nouns', 'verbs' ]
    target_class = f"-{target_classes[3]}"
print(f"target_class: {target_class}")
lang_dir_name = target_lang_dict[target_lang_key].split()[0]
print(f"lang_dir_name: {lang_dir_name}")
##
term_class = "spell"
suppress_accents   = True
if term_class == 'sound':
    if suppress_accents:
        accent_status = "-unaccented"
    else:
        accent_stratus = "-accented"
else:
    accent_status = ""
print(f"accent_status: {accent_status}")
term_type = "skippy4gram"
term_class = 'sound'
if term_class == 'sound':
    term_type = "sn_" + term_type
else:
    term_type = "sp_" + term_type
print(f"term_type: {term_type}")
gap_mark = "…"
print(f"gap_mark: {gap_mark}")

target_lang_key: en_V_only
target lang: English verb (WN) [en_V_only]
target_class: 
lang_dir_name: English
accent_status: 
term_type: sn_skippy4gram
gap_mark: …


In [105]:
## read data from a file
import pandas as pd
import io
import ast
#
max_n_topics = 90
use_sample = True
if use_sample:
    data_file = f"results/terms-by-topics-raw/hdp{max_n_topics}_topics_raw.csv"
else:
    data_file = f"results/terms-by-topics-raw/{lang_dir_name}/{target_lang_dict[target_lang_key]}{target_class}-topics-max_ntop{max_n_topics}-{term_type}{accent_status}.csv"
print(data_file)
## read a file and evaluate its content
raw_df = pd.read_csv(data_file, header = None)
for col in raw_df:
    ## The following evaluation is crucial
    raw_df[col] = raw_df[col].apply(ast.literal_eval)
try:
    raw_df.sample(10)
except ValueError:
    raw_df
raw_df

results/terms-by-topics-raw/hdp90_topics_raw.csv


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,89
0,"(ɪz, 0.002392253534277575)","(z#, 0.004399957590749747)","(k…t#, 0.0018744175155801237)","(dʒ, 0.0018168859341307646)","(t…t, 0.0017032269071149838)","(ʒ#, 0.0014548912586663507)","(ɪd, 0.0015534304134885647)","(b#, 0.001693633356001427)","(z#, 0.0016760296028086635)","(ɪ#, 0.002047121115092113)",...,"(ʃ#, 0.0008392727553341726)","(p#, 0.0009395097279351407)","(m#, 0.0011506341875226272)","(ut, 0.0008072412950932711)","(p#, 0.001312256327329552)","(p#, 0.0008219307374459255)","(ɔk, 0.0005675191561492987)","(j…#, 0.0004951690028406118)","(ɛɫ, 0.0005807744730302993)","(p#, 0.00038158789660015145)"
1,"(z#, 0.002292763267122344)","(ɪz#, 0.0038123330954223894)","(ɝ…ɪ, 0.0018591781139291777)","(ʒ…#, 0.001516955475548606)","(t…t#, 0.001603893920998497)","(dʒ#, 0.0012663912537816831)","(ɪ…eɪ…#, 0.0011036236802056288)","(ʃ#, 0.0016084082306303136)","(ɪz#, 0.00137066588272702)","(aɪ#, 0.0017148140033927796)",...,"(ɪ#, 0.0006340200000208684)","(z…#, 0.0006080570260981964)","(im#, 0.0009768955161800848)","(ut#, 0.0007776164746133306)","(ɫə, 0.00042875941897470825)","(i…ɪ, 0.0006229777405971992)","(ɔk#, 0.0005448505171170135)","(jɝ, 0.0004305486589315036)","(p…ɫ#, 0.0005411688577107329)","(ɹd, 0.0003174392627479755)"
2,"(ə…a, 0.002175632400358344)","(ɪz, 0.003809012449005823)","(ɝ…t#, 0.0017615849384848045)","(dʒ…#, 0.0012192036827678634)","(ə…e…#, 0.0013983526349812517)","(dʒ, 0.0012596943858507047)","(ɪ…e…#, 0.001099426062082817)","(tʃ, 0.0014548132730324694)","(ɡ#, 0.0013678392997966756)","(f…ɪ, 0.0017086120898848994)",...,"(n…ʃ, 0.0004990072944468415)","(ɹe, 0.000579407742968596)","(im, 0.000966972986188745)","(i#, 0.0007384916413445101)","(ɫə…#, 0.0004168808485719172)","(i…eɪ, 0.0005444542518449789)","(m#, 0.0005299715751496309)","(ɡj, 0.0004188220240764474)","(p…ɫ, 0.0005287072024033467)","(ɹd#, 0.0002953599420800135)"
3,"(ə…a…#, 0.0021500972762892663)","(a…z#, 0.0034840668884730135)","(ɝ…ɪ…#, 0.001736029239641814)","(p…t, 0.00117890165439807)","(ə…eɪ, 0.001377330216149228)","(v#, 0.0011479035013920247)","(ɪ…eɪ, 0.001098073434160184)","(tʃ#, 0.001268074269541247)","(ɪz, 0.0013637702420184875)","(fa, 0.00148045217754547)",...,"(ɹe, 0.0004762809731206372)","(ɹeɪ, 0.0005417263149177658)","(ɹ…m#, 0.0007377540407582104)","(ɫ…t#, 0.0006768749538978902)","(nt, 0.00036937287134647634)","(ɪk#, 0.0005248074449570152)","(h…k#, 0.0004780429999949618)","(ɡɝ, 0.0004170254046092849)","(ɛɫ#, 0.0005228082327957904)","(#h…æ…#, 0.00024818084563018716)"
4,"(ə…aɪ, 0.0021494282912593024)","(a…z, 0.003472206034844537)","(ɝ…t, 0.0017298658188965917)","(ɪn…#, 0.00116888722395259)","(æ…t, 0.0013764462001531886)","(nd, 0.0011182535784337345)","(ɪ…e, 0.0010975461113033767)","(ɪb, 0.001168227033941343)","(a…z#, 0.0012699286537403778)","(fa…#, 0.00147389196020713)",...,"(ɹe…#, 0.00043331769492757513)","(ɪp#, 0.0005378086513083356)","(d…m#, 0.000721715709058473)","(tɝ, 0.0006087381203780225)","(ip, 0.0003590717399099637)","(i…e, 0.000518462250458858)","(ɑm, 0.00045009510433370316)","(jɝ#, 0.000416939839761882)","(pɛɫ#, 0.0004862084998915093)","(ɪp, 0.00024594372132637503)"
5,"(ɪz#, 0.001980828085478327)","(aɪz#, 0.0033727929149042976)","(ɝ…ɪt, 0.0015680615556768944)","(z#, 0.0011583646448692193)","(ə…e, 0.0013732688392108202)","(nd…#, 0.000994874042436843)","(eɪd, 0.00108733735068972)","(p#, 0.0010881026788992366)","(a…z, 0.0012621719721247)","(a…n, 0.0013683186983762742)",...,"(n…ʃ#, 0.0004258165696373692)","(ɹe…#, 0.0005318908778748448)","(ɹ…m, 0.0006775789136537149)","(tɝ#, 0.0005916270234017632)","(ip#, 0.00035188735964586994)","(i…e…#, 0.0005183927456453134)","(hɔk, 0.0004433198391079239)","(ɡj…#, 0.0004141653737691445)","(ɪz, 0.00043248141001033473)","(ɔɹd, 0.00024431291850739064)"
6,"(a…z#, 0.0018083948208497014)","(aɪz, 0.0033575879549128376)","(ɝ…ɪt#, 0.0015644210007643333)","(p…t#, 0.0011249933039669162)","(nt, 0.0013384925428987613)","(p…t, 0.000982997536238778)","(e…d, 0.0010658897702529946)","(s…aɪ, 0.0010358000123326702)","(aɪz#, 0.0012404428502823122)","(faɪ, 0.0013639976522090724)",...,"(ɹeɪ, 0.00041839248376837824)","(ɪp, 0.0005237221759548765)","(d…m, 0.0006649371692873192)","(ɫut#, 0.0005035341198144158)","(ɪɝ, 0.00034278590199382107)","(ɪk, 0.0005181397168373643)","(h…k, 0.00044100297659285375)","(ɡ…ɝ#, 0.0004130658803526419)","(pɛ…#, 0.0004266362166825513)","(k…ve…#, 0.00024380654510572863)"


In [124]:
## integrate redundant terms
import term_handlers
topics_df = raw_df
#
check = True
params_dict = { 'gap_weight' : -0,
                'match_weight' : 1,
                'mismatch_weight' : -1,
                'theta' : -0
          }
#
for id in topics_df:
    d = topics_df[id] # d is a string!
    if check:
        print(d)
    terms_as_dict = { }
    for x in d:
        term, score = x[0], x[1]
        try:
            val = float(score)
            terms_as_dict[str(term)] = val
        except ValueError:
            print(f"#{score}")
    if check:
        print(terms_as_dict)
    #
    terms = terms_as_dict.keys()
    seen = [ ]
    for one in terms:
        seen.append(one)
        unseen = [ x for x in terms if not x in seen ]
        for another in unseen:
            if another != one:
                #biopy_show_alignments(one, another)
                nw_show_alignments(one, another,
                                   theta = 10, gap_char = gap_mark, with_score = True, *params_dict)
                #sw_show_alignments(one, another, gap_char = gap_mark, with_score = True)
                

0        (ɪz, 0.002392253534277575)
1        (z#, 0.002292763267122344)
2       (ə…a, 0.002175632400358344)
3    (ə…a…#, 0.0021500972762892663)
4     (ə…aɪ, 0.0021494282912593024)
5       (ɪz#, 0.001980828085478327)
6     (a…z#, 0.0018083948208497014)
Name: 0, dtype: object
{'ɪz': 0.002392253534277575, 'z#': 0.002292763267122344, 'ə…a': 0.002175632400358344, 'ə…a…#': 0.0021500972762892663, 'ə…aɪ': 0.0021494282912593024, 'ɪz#': 0.001980828085478327, 'a…z#': 0.0018083948208497014}


TypeError: nw_show_alignments() got multiple values for argument 'theta'

In [73]:
## reduce terms: under implementation
import term_handlers
reload_modules = False
if reload_modules:
    import importlib
    importlib.reload(term_handlers)
#
check = False
for topic in topics_df:
    if check:
        print(topic)
    topic_id, topic_matrix = topic[0], topic[1]
    terms =  [ x[0] for x in topic_matrix ]
    extended_terms = terms.copy()
    print(f"topic id: {topic_id}")
    print(terms)
    reduced_terms = term_handlers.reduce_by_superposition(terms, min_overlap = 2, check = check)
    #reduced_terms = term_handlers.reduce_by_superposition_under_gap(terms, min_overlap = 2, gap_marker = gap_marker, check = check)
    print(reduced_terms)
    if len(reduced_terms) > 0:
        for term in reduced_terms:
            if not term in extended_terms:
                extended_terms.extend(term)
        extended_terms = sorted(extended_terms, key = lambda x: len(x), reverse = True)
    print(extended_terms)

TypeError: 'int' object is not subscriptable