# 分散的に表現された terms をまとめる
# string2string パッケージを使用

In [1]:
#!conda install -n base ipykernel --update-deps --force-reinstall -y

In [None]:
#!conda install pandas -y
#!conda install ast -y  # NOTE: not needed -- ast is part of the Python standard library

In [10]:
## run the following to install string2string package if necessary
#!pip install -U string2string
## run the following if string2string needs it
#!pip install -U ipywidgets

In [2]:
import sys, os, re
import pandas as pd
import ast

In [11]:
## Target-language settings.
## Each key of target_lang_dict must be part of a data file name, so the keys
## are kept exactly as they are (e.g. 'ir' for Irish).
target_lang_dict = {    'en_US' : 'English (US)',
                        'en_UK' : 'English (UK)',
                        'en_N_only' : 'English noun (WN)',
                        'en_V_only' : 'English verb (WN)',
                        'en_A_only' : 'English adj (WN)',
                        'en_R_only' : 'English adv (WN)',
                        'ar'    : 'Arabic',
                        'de'    : 'German',
                        'de_N_only' : 'German Nouns',
                        'de_non_N_only' : 'German Non-nouns',
                        'eo'    : 'Esperanto',
                        'es_ES' : 'Spanish (Spain)',
                        'es_MX' : 'Spanish (Mexico)',
                        'fi'    : 'Finnish',
                        'fr_FR' : 'French (France)',
                        'fr_QC' : 'French (Quebec)',
                        'is'    : 'Icelandic',
                        'ir'    : 'Irish',
                        'nl'    : 'Dutch',
                        'ro'    : 'Romanian',
                        'sw'    : 'Swahili' }
## Selectable languages: the position in this list is the selector used below.
target_lang_keys = [    'en_US', # 0
                        'en_UK', # 1
                        'en_N_only', # 2
                        'en_V_only', # 3
                        'en_A_only', # 4
                        'en_R_only', # 5
                        'ar', # 6
                        'de', # 7
                        'de_N_only', # 8
                        'de_non_N_only', # 9
                        'eo', 'es_ES', 'es_MX',
                        'fi', 'fr_FR', 'fr_QC',
                        'is', 'nl', 'ro', 'sw',
                        'ir' # This lacks sound
                    ]
## Select the target language by its position in target_lang_keys.
target_lang_key  = target_lang_keys[4]
print(f"target_lang_key: {target_lang_key}")
target_lang_name = target_lang_dict[target_lang_key]
print(f"target lang: {target_lang_name} [{target_lang_key}]")

## Target class [effective only for Irish].
target_classes = [ 'adjectives', 'nouns', 'verbs' ]

def select_target_class (lang_key, classes = target_classes, class_index = -1):
    """Return the class suffix for the selected language.

    Parameters:
        lang_key: a key of target_lang_dict.
        classes: the available class names.
        class_index: position in `classes` to use. The previous code used
            index 3, which is out of range for the three classes and raised
            IndexError whenever Irish was selected. The default is now the
            last class (the same convention as `term_classes[-1]` in the
            parameter cell); change it to pick another class.

    Returns:
        "-<class>" when lang_key is "ir", otherwise the empty string.
        NOTE: the result is always a string; the original note here reports
        that using None instead of "" causes an error.
    """
    if lang_key != "ir":
        return ""
    return f"-{classes[class_index]}"

target_class = select_target_class (target_lang_key)
print(f"target_class: {target_class}")

## The directory name is the first word of the display name (e.g. "English").
lang_dir_name = target_lang_name.split()[0]
print(f"lang_dir_name: {lang_dir_name}")

target_lang_key: en_A_only
target lang: English adj (WN) [en_A_only]
target_class: 
lang_dir_name: English


In [12]:
## parameters
verbose, use_sample = False, False

## settings for terms
gap_mark = "…"

## values selected for this run
max_n_topics  = 90
term_classes  = [ 'adj', 'noun', 'pair' ]
term_class    = term_classes[len(term_classes) - 1] # the last class in the list
selection_key = "5gram"

## report the selected values
print(f"max_n_topics: {max_n_topics}")
print(f"term_class: {term_class}")
print(f"selection_key: {selection_key}")

max_n_topics: 90
term_class: pair
selection_key: 5gram


In [13]:
## read data from a file
import glob
import pprint as pp

target_dir = f"results/terms-by-topics-raw/{lang_dir_name}"
print(f"target_dir: {target_dir}")

## glob returns files in arbitrary order, so sort them: otherwise "the first
## match" below could differ from run to run.
target_files = sorted(glob.glob(f"{target_dir}/*.csv"))
if verbose:
    print("target_files:")
    pp.pprint(target_files)
##
if use_sample:
    #data_file = f"results/terms-by-topics-raw/samples/hdp{max_n_topics}_topics_raw.csv"
    data_file = "results/terms-by-topics-raw/English/English-pair-topics90-sp_5gram-sn_5gram-unaccented.csv"
else:
    ## a file is a candidate when its path contains every selection key
    required_keys = [ str(max_n_topics), term_class, selection_key ]
    candidates = [ f for f in target_files if all(key in f for key in required_keys) ]
    if not candidates:
        ## fail with an explicit message instead of a bare IndexError
        raise FileNotFoundError(f"no CSV file in {target_dir!r} matches all of {required_keys}")
    if len(candidates) > 1:
        print(f"warning: {len(candidates)} files match {required_keys}; using the first one")
    data_file = candidates[0]
print(data_file)

## read a file and evaluate its content
raw_df = pd.read_csv(data_file, header = None)
for col in raw_df:
    ## each cell is stored as the text of a Python literal; evaluation is crucial
    ## because later cells unpack the cells as (term, score) pairs
    raw_df[col] = raw_df[col].apply(ast.literal_eval)

target_dir: results/terms-by-topics-raw/English
results/terms-by-topics-raw/English/English-pair-topics90-sp_5gram-sn_5gram-unaccented.csv


In [14]:
## check loaded data: preview at most 10 randomly chosen rows.
## The fixed random_state keeps the preview identical across re-runs, and the
## min() guard avoids a ValueError when the frame has fewer than 10 rows.
raw_df.sample(min(10, len(raw_df)), random_state = 0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,89
37,"(accou :: #əˈk, 0.00027995888279721317)","(chri :: ɹɪst, 0.00026171007107245344)","(ger# :: aɪɡɝ, 0.00028567288795285024)","(lett :: oʊˈɫ, 0.0002990336181912643)","(gress :: ˈɡɹɛs, 0.00028038033151860976)","(eier :: maɪɝ#, 0.00027901899442546815)","(#for :: ˈfɔɹə, 0.00027474488228069945)","(#sub :: #səb, 0.00028199526583029576)","(#age :: ˈeɪdʒ, 0.00028234870634618383)","(#valu :: ˈvæɫ, 0.0002734931527014768)",...,"(ahlst :: ɫstɹə, 0.0002899021434566825)","(yans# :: ɹaɪə, 0.0002896407159307481)","(schmi :: ˈʃmɪ, 0.00028978573080542017)","(ving :: ɪvɪŋ, 0.0002870577238068204)","(rium :: ɹiəm#, 0.00028006062171335766)","(atche :: ætʃɝ, 0.00027740860098954696)","(sors :: sɛnsɝ, 0.0002891113036920764)","(#tigh :: taɪt, 0.00028760650290806164)","(overh :: ˌhæŋ, 0.00027894317824225676)","(#dig :: ɪˈɡɹɛ, 0.00028542594132154216)"
57,"(enber :: bɝɡ#, 0.0002695626837655231)","(jung :: dʒəŋ, 0.0002482293097488468)","(liva :: ɫɪvə, 0.00026458847754261283)","(berg :: ɝɡmən, 0.00027221207992347965)","(nda# :: ndə#, 0.00026068174535916455)","(lubr :: ubɹək, 0.00026426326918523393)","(nell :: nəɫ#, 0.0002637926762249573)","(less# :: mɫəs, 0.00026657252411429186)","(ell# :: wɛɫ#, 0.00026479013226001055)","(ather :: kæðɝ, 0.00025196544081074835)",...,"(iney :: ɪni#, 0.00027081660656509763)","(cord :: kɔɹd, 0.00027260261164727026)","(#affe :: ˈfɛk, 0.00026786908982611324)","(ways# :: ˌweɪz, 0.0002749724973070744)","(ck's :: ɪks#, 0.00025034544382058636)","(#door :: ˈdɔɹ, 0.00026607351283903315)","(llin :: kɪɫɪ, 0.0002671230832610522)","(#o'ca :: #oʊˈk, 0.0002666746054990241)","(ahold :: #əˈho, 0.00026831893675027785)","(#sav :: seɪv, 0.0002715889862339495)"
44,"(sten :: ˈstɛ, 0.00027501406591143296)","(rand :: ˈænd, 0.00025422835924213886)","(#ring :: #ˈɹɪ, 0.0002763614105891574)","(den# :: ɪdən#, 0.00028968098087256265)","(ostic :: æɡˈnɑ, 0.00027642694025255027)","(plant :: ɫænt, 0.00027462272716316223)","(field :: fiɫd#, 0.0002704062507960915)","(esse :: ɛsɪz#, 0.0002782076569986235)","(nsons :: sənz, 0.0002766409434320719)","(#anno :: əˈna, 0.00026126432862799)",...,"(#rin :: #ˈɹɪ, 0.0002835870176457544)","(#moir :: ˈmɔɪ, 0.0002819941633528413)","(rket :: mɑɹkə, 0.0002823194614197145)","(erer :: dɝɝ#, 0.00028342183565460054)","(#jet :: dʒɛt, 0.0002702348298484784)","(fied :: aɪd#, 0.00027093417803254673)","(ways :: #ˈwɔ, 0.00027395452529130885)","(reic :: ˈɹaɪk, 0.0002828049794215727)","(acke :: ækt#, 0.00027569986998891193)","(ger# :: nɪŋɝ, 0.00028088862808730284)"
30,"(icho :: ˈnɪkə, 0.00029189171853487647)","(igre :: #daɪˈ, 0.0002714778258310494)","(ider :: ˈsnaɪ, 0.00028917658547587767)","(erma :: tɝmən, 0.00031151798303117933)","(grap :: əˌɡɹæ, 0.0002887232214118764)","(egres :: ˈɡɹɛ, 0.0002896754584085501)","(cath :: ˈkæðɝ, 0.00028634539032153613)","(enbe :: bɝɡ#, 0.00029615020031553403)","(gaze :: #ˈɡeɪ, 0.0002919032087882009)","(over :: oʊvɝ, 0.0002845061830140595)",...,"(illi :: #ˈkɪ, 0.0002973165908580117)","(uenc :: fɫuə, 0.0002935192601567867)","(#down :: ˈdaʊn, 0.0002962043864325961)","(ross :: #ˈɹɔs, 0.00029183498496917093)","(#duf :: #ˈdəf, 0.0002870737684867488)","(albur :: æɫbɝ, 0.00028925704332192196)","(torm :: tɔɹm, 0.0002973175664171026)","(ober :: ʊbɝ#, 0.00030255981729153955)","(orte :: ɹˈtɛ, 0.000287720600388299)","(ders# :: ndɝz#, 0.0002962806178634563)"
26,"(yar# :: ɹaɪɝ#, 0.0002981071909776036)","(ecap :: iˈkæ, 0.0002749170319130657)","(read :: ˈtɹɛd, 0.00029519339754681314)","(mega :: ˈmɛɡə, 0.00031829578829638516)","(mano :: ɑnoʊ#, 0.00029854869741936406)","(retr :: ˈtɹɛd, 0.00029549513498701804)","(erg# :: aɪnb, 0.00028970042233799546)","(ist# :: kwɪst, 0.00030253472388301356)","(rgan :: ɹɡən, 0.0003004353843927428)","(rhan :: #ˈoʊ, 0.0002862363942758124)",...,"(rnel :: ˈnɛɫ#, 0.0003059451948224653)","(enbe :: ɛnbɝ, 0.00029839420854079775)","(digr :: ɡɹɛs, 0.00030714141492249277)","(ving :: vɪŋz#, 0.00030289563038184527)","(arne :: ɑɹnz, 0.00029151311391299397)","(ted# :: pɔɹt, 0.0002926537054681)","(dden# :: ɛdən#, 0.0003019961093757303)","(rder :: ɝdɝ#, 0.00031864502868113857)","(#gar :: #ˈɡɑɹ, 0.00029283920003260336)","(#chop :: #ˈtʃ, 0.00030343778421150236)"
14,"(nberg :: bɝɡ#, 0.00032493373927063864)","(etra :: ɛˈtɹ, 0.0002907042877863979)","(luen :: uəns, 0.00031286861385967034)","(megad :: ˈmɛɡ, 0.00034794352733283113)","(#cov :: koʊv, 0.00032418279457137404)","(udin :: ˈkɫa, 0.00033293804080578905)","(#bro :: ˈbɹɑ, 0.0003034850017327405)","(enger :: ɛndʒ, 0.00032813718790491276)","(land# :: ɹɫən, 0.0003302928648936959)","(iden :: fədə, 0.0003148659159643958)",...,"(stro :: ɹəm#, 0.0003331155766821133)","(#byl :: #ˈba, 0.00032642804230008864)","(ngto :: ɫɪŋt, 0.00033834229937433445)","(black :: #ˈbɫ, 0.00034297007442826754)","(#tape :: ˈteɪ, 0.0003145203960470037)","(kopp :: ˈkɑp, 0.0003141394949665012)","(ogli :: ˈfɑɡɫ, 0.00032968623102880876)","(wicz :: vɪtʃ, 0.000343474232130208)","(atric :: ɪks#, 0.0003088519176539454)","(leas :: pɫɛzə, 0.0003326282187693364)"
47,"(chev :: #ˈtʃɛ, 0.00027312186744127135)","(mcca :: #məˈk, 0.00025266851389172204)","(rawli :: ɔɫɪŋ#, 0.00027371123454515814)","(#min :: ˈmɪn, 0.00028475548636172686)","(ress :: ɡɹɛs, 0.00026993222784682156)","(eism :: ɪzəm#, 0.00027317430237797357)","(tting :: ɛtɪŋ#, 0.0002672352764824228)","(deri :: ɝɪŋ#, 0.00027545462210884535)","(ndy# :: ndi#, 0.0002752668207377456)","(tone# :: toʊn, 0.00025908723646023715)",...,"(enber :: tɛnbɝ, 0.0002813095939724762)","(edden :: ɛdən#, 0.0002786419750548012)","(#kil :: kɪɫɪŋ, 0.0002780878815638176)","(#tre :: #ˈtɹɛ, 0.0002817778203102342)","(erha :: #ˈoʊ, 0.0002640463690736718)","(#sto :: stoʊ, 0.00027053395009424996)","(hair :: hɛɹˌ, 0.0002728594125392488)","(fiel :: əˌfiɫ, 0.00028029297637176286)","(gadea :: #ˈmɛ, 0.00027343327933947203)","(esse :: ɡɹɛs, 0.00027762180752672494)"
19,"(lenc :: əns#, 0.0003148980983793988)","(tabl :: təbəɫ, 0.0002797034042714559)","(moun :: maʊnt, 0.00030359738532860975)","(rman# :: ɝmən#, 0.00033718003960725353)","(#den :: #dɪˈ, 0.00030842116903539155)","(retr :: #ɹiˈt, 0.0003068892877281188)","(#arc :: #ˈɑɹ, 0.0002981348877250621)","(#sch :: ʃnaɪ, 0.00031286179906490105)","(rnet :: ˈnɛt, 0.00031590843976762113)","(anno :: əˈna, 0.00029700632648985647)",...,"(arre :: ɑˈɹɛ, 0.0003242334827584609)","(erin :: ŋɡɝɪ, 0.0003064029218899667)","(man# :: ɪɫmən, 0.0003270058234099841)","(#ava :: veɪɫ, 0.0003163044211452088)","(#pouc :: ˈpaʊ, 0.0003009615206433444)","(#clo :: #ˈkɫ, 0.00030882294627508806)","(mcca :: əˈkæ, 0.000308694055148726)","(ing# :: ɹsɪŋ, 0.00033193081037822267)","(ert# :: bɝt#, 0.00030274711904292213)","(rmand :: #ˈɑɹm, 0.0003217688652388229)"
8,"(berg :: bɝɡ#, 0.0003445067101555949)","(ahol :: ʊɫdz#, 0.0003196308382293911)","(#tur :: #ˈtɝ, 0.0003297820583500267)","(#sor :: ˈsɔɹt, 0.00035886238132111156)","(ried :: #ˈfɹ, 0.0003362924290945376)","(sion :: ənˈk, 0.0003626827863357978)","(rlin# :: ɝɫɪn#, 0.00032219231530136684)","(#min :: ˈmaɪn, 0.00035178662688237856)","(polit :: ˈɫito, 0.00034572354309580914)","(rino :: inoʊ, 0.00033191676907381263)",...,"(align :: ɫaɪn, 0.0003597849370076162)","(#rudo :: #ˈɹu, 0.00036889071247843094)","(tor# :: ɪtɝ#, 0.00035113083912677523)","(#bar :: ˈbɑɹˌ, 0.00036437264666532737)","(olor :: ˈkəɫɝ, 0.00033724167587785965)","(snow :: ˈsno, 0.00033933682348768443)","(#spr :: pɹeɪ, 0.000341142470286076)","(#ext :: ˈɛkˌ, 0.000355679069756069)","(unab :: ˈjunə, 0.0003373040308658177)","(erg# :: ɛnbɝɡ, 0.00036554009021584855)"
11,"(patr :: pætɹ, 0.00033569403633730647)","(gorsk :: ɔɹsk, 0.0003063023532227003)","(leasa :: ˈpɫɛ, 0.00032716549794744394)","(#hus :: #ˈhə, 0.0003536967192583878)","(holl :: ˈhɑɫ, 0.0003255205038614246)","(forg :: #fɝˈ, 0.0003480329561812719)","(rfiel :: fiɫd#, 0.00030723472712054995)","(tenb :: tənbɝ, 0.0003430121783512532)","(ounc :: nsɪŋ#, 0.00033184412534473234)","(#mcc :: məˈkɹ, 0.0003217597453144874)",...,"(#huf :: #ˈhəf, 0.0003514636074135609)","(anda :: ˈænd, 0.0003428632714476267)","(puzz :: #ˈpəz, 0.0003408122940591165)","(#how :: #ˈha, 0.0003544743804365437)","(#ava :: əˈve, 0.0003282815343455849)","(ennel :: #ˈpɛ, 0.00032637047351941615)","(confi :: #ˈkɑ, 0.00033419185807201195)","(#eve :: #ˈɛvə, 0.00035047622725062333)","(#pat :: #ˈpæ, 0.0003200796508007747)","(fax# :: ˌfæk, 0.00036199001660863127)"


# 境界記号の処理

In [15]:
## Supplement boundaries: this modification improves alignment quality,
## but it is most likely to do harm to paired terms.
use_paired_terms      = True
supplement_boundaries = False
boundary_mark         = "#"

def add_gap_marks (term, gap, boundary):
    """Attach `gap` to every edge of `term` that is not closed by `boundary`.

    Parameters:
        term: the term string to supplement.
        gap: the gap mark to attach to open edges.
        boundary: the boundary mark that closes an edge.

    Returns:
        The supplemented term. An empty term is returned unchanged.

    Differences from the previous inline expression:
      * the leading edge is now checked as well; before, only the trailing
        edge was tested, so a term starting with the boundary mark still
        received a leading gap mark
        (NOTE(review): assumed to be an oversight -- confirm);
      * an edge that already carries the gap mark is left alone, so
        re-running the cell does not stack gap marks;
      * an empty term no longer raises IndexError.
    """
    if not term:
        return term
    head = "" if term.startswith((boundary, gap)) else gap
    tail = "" if term.endswith((boundary, gap)) else gap
    return f"{head}{term}{tail}"

if supplement_boundaries and not use_paired_terms:
    for col in raw_df:
        ## every cell is a (term, score) pair; only the term is rewritten
        raw_df[col] = [ (add_gap_marks (term, gap_mark, boundary_mark), score) for term, score in raw_df[col] ]
## preview the first rows only rather than dumping the whole frame
raw_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,89
0,"(ence :: fɫuə, 0.00043651499294464473)","(illo :: ɪɫoʊ, 0.00037750259520552727)","(onnag :: ənədʒ, 0.0004910431434715073)","(rosan :: ɹoʊˈ, 0.0004895597273261253)","(inter :: ɪntɝˌ, 0.00047371187853034285)","(man# :: kmən, 0.0004344903045444988)","(cool :: #ˈkuɫ, 0.000417013685007599)","(#conf :: ˈkɑn, 0.00040665820235427527)","(union :: unjən, 0.0004000507501761628)","(elts :: #ˈjɛɫ, 0.0005289402879778112)",...,"(atric :: #ˈpæt, 0.0004269374352125742)","(#mcc :: məˈkɫ, 0.0004576376468452179)","(istle :: səɫ#, 0.0005059700780670552)","(ssee :: ˈsi#, 0.00044875641004962167)","(reras :: ɛɹɑz, 0.0005043573167682752)","(#hols :: #ˈhoʊ, 0.0005071013377160205)","(aine# :: keɪn, 0.0004010700774100492)","(lind :: ɫɪnd, 0.0004249127728870297)","(anick :: pænɪ, 0.0004322962687608556)","(cros :: kɹɑs, 0.0004564843063324881)"
1,"(ates# :: eɪts#, 0.00042877813846944093)","(nski :: ɪnski, 0.0003738866651853041)","(andr :: dɹɔɪ, 0.0004416362319212404)","(lins :: nski#, 0.0004560158056090709)","(nson :: ɹənsə, 0.00044405429499286075)","(#mar :: sən#, 0.00043198143512601477)","(ld's# :: oʊɫdz, 0.0003962038200048876)","(ing# :: əɫɪŋ, 0.00039594429498193755)","(icol :: ɪkəɫ, 0.00039884917150664504)","(mbra# :: æmbɹə, 0.0003846298847182896)",...,"(scho :: #ˈsk, 0.0003909790476129611)","(#zen :: ˈzɛn, 0.0004221994761037682)","(ades :: ɪdz#, 0.0004649130400498402)","(#gra :: ˈɡɹeɪ, 0.00042684941671176186)","(lets# :: əts#, 0.00041033731328737584)","(#chop :: #ˈtʃɑ, 0.0004644455528815378)","(armen :: ɑɹmən, 0.0003659175931841294)","(petr :: ɛˈtɹ, 0.00042426523771551226)","(#cho :: ˈtʃɑ, 0.00040219286130261674)","(cadd :: ˈkæd, 0.00044516193194208645)"
2,"(bric :: əkən, 0.0004097273380346505)","(ski# :: nski, 0.0003663450460073422)","(#boo :: #ˈbʊ, 0.0004203644275651039)","(man# :: mən#, 0.00041313883069818896)","(aph# :: ˌɡɹæ, 0.00036838035184401315)","(easte :: #ˈist, 0.00037708942099266417)","(ocke :: #ˈɫɑk, 0.00038848416406455386)","(#per :: #pɝˈ, 0.00038810843054622933)","(hilt :: ˈhɪɫt, 0.00038607586182357024)","(igma :: ɪɡmən, 0.0003761113384759615)",...,"(elli :: ɹɪˈbɛ, 0.000390491709597304)","(bergm :: ˈbɝɡ, 0.0004167082742796982)","(icol :: ɪkəɫ, 0.0004570018274022797)","(rtai :: ˈsɝtə, 0.00042662337128396353)","(mmit# :: dæmɪt, 0.0004004096980749901)","(scrib :: #ˈskɹ, 0.00040762446052046365)","(#ram :: #ˈɹæm, 0.0003616881960940826)","(etra :: ɛˈtɹ, 0.0004106859598544254)","(ito# :: oʊˈɫi, 0.00039821549725291516)","(ling# :: #ˈbɝ, 0.0004404266709411943)"
3,"(#mul :: #ˈmə, 0.0004088014965608648)","(#wal :: ˈwɔɫs, 0.0003635338700866835)","(#hei :: #ˈhaɪ, 0.0004034554788274737)","(#tick :: ˈtɪk, 0.0004110535097099955)","(rick :: ˈpæt, 0.0003524278391438973)","(opera :: #ˈɑp, 0.0003685489605121063)","(fied :: aɪd#, 0.00037929924142572326)","(#cour :: ˈkɔɹt, 0.00035866859282815096)","(boor :: #ˈbʊ, 0.00038006889690769976)","(ning# :: ɪnɪŋ, 0.0003721052872808423)",...,"(man# :: ˈbɝɡ, 0.0003766652671304949)","(ecke :: ɛkt#, 0.000394758598130487)","(ons# :: tʃən, 0.00042897239700085435)","(rgma :: bɝɡmə, 0.0003913276963833005)","(efie :: ˌfiɫd, 0.0003764972999639721)","(gel# :: ɡəɫ#, 0.0003930517864188718)","(inger :: ɪŋɝ#, 0.0003610387397332628)","(ahls :: ɑɫst, 0.0003942237771997)","(nick :: ˈpæn, 0.0003894506541911149)","(y's# :: biz#, 0.00041387128214071607)"
4,"(oniz :: #ˈaɪ, 0.00038059273321296893)","(gnost :: #æɡˈn, 0.0003509604101565168)","(nson :: nsən#, 0.00038295063075638187)","(#rin :: #ˈɹɪŋ, 0.00039745526234650857)","(offs# :: ɔfs#, 0.00034970763056638973)","(ingt :: ŋtən, 0.00036780801983661397)","(nson# :: ɹəns, 0.0003723796683992603)","(#mou :: #ˈmaʊ, 0.0003577658186212382)","(use# :: ˌhaʊ, 0.0003758846796117311)","(cros :: ˈkɹɑs, 0.0003594056247099446)",...,"(#dis :: dɪsˈm, 0.00037509403994054894)","(#cent :: #ˈsɛ, 0.00038829685111357735)","(#bomb :: #ˈbɑm, 0.0003811186019369058)","(#scal :: skeɪ, 0.00038440364431935645)","(#east :: ˈistɝ, 0.00037628187716495955)","(judge :: ˈdʒəd, 0.00039068770336881043)","(tors :: tɝz#, 0.00035663493204404626)","(#o'ca :: oʊˈkæ, 0.00036741834756083715)","(uipp :: ˈkwɪ, 0.0003705281463342066)","(erbe :: ˈhɝb, 0.00040836559328369273)"
5,"(enbe :: ɛnbɝ, 0.0003571073106433794)","(aric :: ɑˈɹi, 0.0003251073997042638)","(hing# :: tʃɪŋ, 0.00033318315476351263)","(nton :: ntən, 0.000395890648743174)","(sprag :: #ˈsp, 0.00034566375867330204)","(aving :: ɪvɪŋ, 0.0003667253916627093)","(uaker :: kɝz#, 0.0003455219469573625)","(ick# :: tɹɪk#, 0.00035584211155958827)","(cks# :: ɹɪks#, 0.0003753289322653495)","(olor :: kəɫɝ, 0.00035014565437639146)",...,"(#dec :: #ˈdɛ, 0.00037244643505218646)","(ine# :: aɪn#, 0.0003812219739015088)","(eras :: ɑˈɹɛ, 0.00037714502174315487)","(trom# :: stɹə, 0.00038341447618416084)","(nfide :: ɑnfə, 0.0003753674516956062)","(raws :: fski#, 0.00036575604098438945)","(ors# :: #ˌɪn, 0.0003484121124065531)","(ress :: ɹɛsɪ, 0.0003669138144960097)","(ploye :: ɛmˈpɫ, 0.000357224352566678)","(stee :: #ˈsti, 0.00037811460185650596)"
6,"(enber :: ənbɝ, 0.0003541840713577691)","(old' :: ʊɫdz#, 0.00032202398274108513)","(yter# :: ɪtɝ#, 0.000332375034028136)","(adea :: ɛɡəˌ, 0.000363771498695103)","(ter# :: aɪtɝ#, 0.0003453885913912597)","(bert :: #ˈɫæ, 0.00036587550522362237)","(oots# :: uts#, 0.00034049671242154914)","(#hai :: ˌɫaɪn, 0.00035442687654372587)","(spen :: ˈspɛn, 0.00036900289160819933)","(#imp :: ˌɪmˈp, 0.0003452683709993065)",...,"(ighbo :: ɪbɝz, 0.00036740889227102817)","(trow :: tɹoʊ, 0.0003720803980055846)","(#spra :: spɹeɪ, 0.00035663226181782665)","(resh :: fɹɛʃ, 0.0003825407213681378)","(ld's :: ʊɫdz#, 0.0003665980905240583)","(ortin :: ɔɹtɪ, 0.0003513141896486597)","(onal# :: nəɫ#, 0.00034407601699631366)","(#col :: #ˈkəɫ, 0.0003645845228639603)","(flur :: #ˈfɫ, 0.0003397485691016971)","(osad :: ˈsɑd, 0.0003753434398851098)"
7,"(iden :: ˈkɑnf, 0.0003539965599428721)","(ting# :: ʊtɪŋ, 0.00032012154113029024)","(rside :: saɪd#, 0.000330453162077603)","(late :: ˌɫeɪt, 0.0003612095544171249)","(aph# :: ɡɹæf, 0.0003406961811327769)","(chmid :: ˈʃmɪ, 0.00036289378455372546)","(#empl :: pɫɔɪ, 0.0003388175982816616)","(neigh :: #ˈneɪ, 0.00035354391042310306)","(lley :: ɪɫi#, 0.00036560680272138045)","(ewco :: ukəm#, 0.0003342776198783764)",...,"(#hil :: hɪɫt, 0.0003665702030499543)","(conf :: ˈkɑnf, 0.00036926570683164353)","(ntly# :: ntɫi#, 0.00035113486582373835)","(ter# :: ɪtɝ#, 0.00036680442033098126)","(arch :: ˈɑɹtʃ, 0.00035073412354957097)","(iler# :: ɪɫɝ#, 0.00033957245813200597)","(#cho :: ˈtʃɑp, 0.0003412996123276557)","(ino# :: inoʊ, 0.00036245842609556705)","(ole# :: ˌhoʊɫ, 0.00033902021491511336)","(olla :: #ˈkɑɫ, 0.000367412360703839)"
8,"(berg :: bɝɡ#, 0.0003445067101555949)","(ahol :: ʊɫdz#, 0.0003196308382293911)","(#tur :: #ˈtɝ, 0.0003297820583500267)","(#sor :: ˈsɔɹt, 0.00035886238132111156)","(ried :: #ˈfɹ, 0.0003362924290945376)","(sion :: ənˈk, 0.0003626827863357978)","(rlin# :: ɝɫɪn#, 0.00032219231530136684)","(#min :: ˈmaɪn, 0.00035178662688237856)","(polit :: ˈɫito, 0.00034572354309580914)","(rino :: inoʊ, 0.00033191676907381263)",...,"(align :: ɫaɪn, 0.0003597849370076162)","(#rudo :: #ˈɹu, 0.00036889071247843094)","(tor# :: ɪtɝ#, 0.00035113083912677523)","(#bar :: ˈbɑɹˌ, 0.00036437264666532737)","(olor :: ˈkəɫɝ, 0.00033724167587785965)","(snow :: ˈsno, 0.00033933682348768443)","(#spr :: pɹeɪ, 0.000341142470286076)","(#ext :: ˈɛkˌ, 0.000355679069756069)","(unab :: ˈjunə, 0.0003373040308658177)","(erg# :: ɛnbɝɡ, 0.00036554009021584855)"
9,"(ocess :: #ˈpɹ, 0.00034172591964726576)","(ssen# :: ɑsən#, 0.0003157187090041159)","(son# :: nsən, 0.0003278907522998519)","(reshe :: ˈfɹɛʃ, 0.0003562904385771946)","(forgi :: fɝˈɡ, 0.00032566587625498463)","(aster :: ˈistɝ, 0.00035622385043429034)","(troop :: #ˈtɹu, 0.0003196524771135944)","(gman :: mən#, 0.00034508930267784224)","(bill :: #ˈbɪ, 0.0003374881824268282)","(teri :: tɪɹiə, 0.0003288271048515481)",...,"(#leg :: ˈɫɛɡ, 0.00035662644231115913)","(#von :: #ˈvɑn, 0.0003612437661599527)","(idli :: ɪdɫɪ, 0.0003508275461260633)","(#rive :: #ˈɹɪ, 0.00036146021280861895)","(#cam :: #ˈkæ, 0.0003347108182506924)","(ppren :: əˈpɹ, 0.00033717990639615985)","(ander :: dɝz#, 0.0003390893413452186)","(eyser :: ɪzɝ#, 0.0003533330411382965)","(tar# :: stɑɹ, 0.00032912690695894583)","(nder :: ɛndɝz, 0.00036495250726996706)"


# 解析

In [16]:
## integrate redundant terms by superposition
import term_handler

## run the following if you need your updates in the module to take effect
reload_modules = False
if reload_modules:
    import importlib
    importlib.reload (term_handler)

## Good parameterization is crucial for convergence. Here are two samples of good
## settings, one for global and another for local alignment.
## NB: other values may cause a loop that never terminates.
## Acceptable setting for global alignment
global_align_params = {
            'gap_weight'      : -2, # This may not be smaller
            'match_weight'    :  3,
            'mismatch_weight' : -2,
            'theta'           :  0 # If it takes too long to complete, try out a larger value.
            }
## Acceptable setting for local alignment: local alignment is more robust to variation
local_align_params = {
            'gap_weight'      : -2.5,
            'match_weight'    :  1,
            'mismatch_weight' : -2.5,
            'theta'           :  0 # If it takes too long to complete, try out a larger value.
            }

## settings for execution
align_globally = False
method         = "globally" if align_globally else "locally"
expansive      = True
greedy         = True # This attribute is largely incompatible with global alignment
detailed       = True
check          = False

## choose the parameter set once so the alignment call is written only once
align_params = global_align_params if align_globally else local_align_params

def effective_length (term):
    """Return the length of `term` without counting gap marks and square brackets."""
    return len(term) - term.count(gap_mark) - term.count("[") - term.count("]")

##
topics_df = raw_df

## main
print(f"method: {method}")
print(f"greedy: {greedy}")
print(f"expansive: {expansive}")
print(f"detailed: {detailed}")
print(f"check: {check}")
print("=====================")
for tid in topics_df:
    topic_data = topics_df[tid]
    if check:
        print(topic_data)
    #
    ## collect the terms of this topic; using a dict drops duplicated terms
    terms_as_dict = { }
    for data in topic_data:
        term, score = data[0], data[1]
        try:
            terms_as_dict[str(term)] = float(score)
        except (TypeError, ValueError):
            ## A score that cannot be converted is reported and skipped.
            ## TypeError is caught too: float(None) raises it, and it used to
            ## abort the whole run.
            print(f"#{score}")
    if check:
        print(terms_as_dict)

    ## recursively update data
    T = list(terms_as_dict.keys())
    O = term_handler.derive_terms_by_superposition (T, align_globally = align_globally, expansive = expansive, greedy = greedy, check = check, **align_params)

    ## show result
    if detailed:
        T = sorted(T, key = effective_length, reverse = True)
        print(f"topic {tid:03} has {len(T):03} raw terms: {T}")
    #
    O = [ x for x in O if x not in T ] # Filter out terms already in originals
    O = sorted(O, key = effective_length, reverse = True)
    print(f"topic {tid:03} found {len(O):03} {method} superposed terms: {O}")

##
print(f"# Topic-wise term integration for HDP/LDA with {max_n_topics} topics via {target_lang_dict[target_lang_key]} {target_class}") 

method: locally
greedy: True
expansive: True
detailed: True
check: False



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/miniconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/miniconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/miniconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/opt/miniconda3/lib/python3.11/s

topic 000 has 060 raw terms: ['ates# :: eɪts#', 'berg# :: nbɝɡ#', 'gadea :: mɛɡəˌ', 'aking :: eɪkɪŋ', '#agen :: ʒənsi', '#chev :: #ˈtʃɛ', 'retre :: iˈtɹɛ', 'enber :: ənbɝ', 'iden :: ˈkɑnf', 'ocess :: #ˈpɹ', 'reich :: #ˈɹa', 'berg# :: bɝɡ#', 'nberg :: bɝɡ#', '#bag :: #ˈbæɡ', 'nfide :: ɑnfə', 'ueen# :: ˈkwi', 'enber :: nbɝɡ', 'aine# :: eɪn#', 'yar# :: ɹaɪɝ#', 'nberg :: nbɝɡ', 'ates# :: eɪts', '#toy :: #ˈtɔɪ', 'icho :: ˈnɪkə', 'hoos :: ænˈhu', '#ven :: #ˈvɛn', 'accou :: #əˈk', '#chi :: #ˈtʃɪ', '#tun :: #ˈtun', 'rnam :: ˌneɪm', 'confi :: kɑnf', '#sten :: stɛn', 'chev :: #ˈtʃɛ', 'ght# :: weɪnˌ', 'ring# :: ɔɹdɝ', 'fluen :: ɫuən', 'frai :: ˈfɹeɪ', 'enber :: bɝɡ#', 'rron :: ɛɹən#', 'ence :: fɫuə', 'bric :: əkən', '#mul :: #ˈmə', 'oniz :: #ˈaɪ', 'enbe :: ɛnbɝ', 'berg :: bɝɡ#', 'patr :: pætɹ', 'nico :: əɫaʊ', 'lenc :: əns#', 'isto :: ˈkɹɪ', 'ssio :: ʃən#', 'lman :: əɫmə', 'tes# :: ɪts#', '#pro :: ˈpɹɑ', 'tain :: tənz', 'nber :: bɝɡ#', 'sten :: ˈstɛ', 'ichm :: kmən', '#pla :: ɫænt', 'hes# :: ʃəz#