### Final version

In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import glob
import os

import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

import tmdglobals

In [13]:
# Load the model ONCE, at the beginning
tmdglobals.init_embedding_model(langs=['en','de'])

en model initialized
de model initialized


In [15]:
tmdglobals.xlang_model

EmbeddingModel with 2 langs: en, de

In [16]:
def load_chapter(ch_fpath, delimiter='\t'):
    # First annoying thing: need to escape the quote characters, otherwise
    # loading with Pandas will fail
    with open(ch_fpath, 'r', encoding='utf-8') as infile:
        orig_text = infile.read()
    new_text = orig_text.replace("\"", "\\\"")
    with open(ch_fpath, 'w', encoding='utf-8') as outfile:
        outfile.write(new_text)
    # Now we can load with Pandas
    ch_df = pd.read_csv(ch_fpath, delimiter=delimiter, header=None, encoding="utf8")
    return ch_df

In [17]:
def output_wmds(output_path, ch_name, wmd_df):
    # Make sure the output_path exists
    if not os.path.isdir(output_path):
        os.mkdir(output_path)
    output_fname = f"{ch_name}.wmd.csv"
    wmd_fpath = os.path.join(output_path, output_fname)
    wmd_df.to_csv(wmd_fpath)
    return wmd_fpath

In [18]:
verbose = True
vprint = print if verbose else lambda x: None

In [21]:
doc1 = "de_tge"
lang1 = tmdglobals.get_lang_code(doc1)
doc2 = "en_reitter"
lang2 = tmdglobals.get_lang_code(doc2)
alignment_code = tmdglobals.get_alignment_code(doc1, doc2)

In [22]:
alignment_code

'de_tge.en_reitter'

In [24]:
## Load the chapters
if alignment_code == "de_tge.kk_prog":
    # Special case for Kazakh
    alignment_path = tmdglobals.get_alignment_path(alignment_code + ".pickled")
else:
    alignment_path = tmdglobals.get_alignment_path(alignment_code)
print(alignment_path)
if alignment_code == "de_tge.kk_prog":
    glob_ext = "*.pkl"
else:
    glob_ext = "*.txt"
alignment_glob = os.path.join(alignment_path, glob_ext)
print(alignment_glob)
all_ch_fpaths = sorted(glob.glob(alignment_glob))

..\Texts_Aligned\de_tge.en_reitter
..\Texts_Aligned\de_tge.en_reitter\*.txt


In [25]:
all_ch_fpaths

['..\\Texts_Aligned\\de_tge.en_reitter\\ch01.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch02.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch03.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch04.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch05.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch06.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch07.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch08.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch09.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch10.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch11.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch12.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch13.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch14.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch15a.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch15b.align.txt',
 '..\\Texts_Aligned\\de_tge.en_reitter\\ch16.align.txt',
 '..\\Texts_Aligned\\de_tge.e

In [26]:
alignment_data_path = tmdglobals.get_data_path(alignment_code)
if not os.path.isdir(alignment_data_path):
    os.mkdir(alignment_data_path)
alignment_data_path

'..\\data_tmd\\de_tge.en_reitter'

In [31]:
!pip install pyemd

Collecting pyemd
  Downloading pyemd-0.5.1.tar.gz (91 kB)
     ---------------------------------------- 91.5/91.5 kB 2.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pyemd
  Building wheel for pyemd (setup.py): started
  Building wheel for pyemd (setup.py): finished with status 'error'
  Running setup.py clean for pyemd
Failed to build pyemd
Installing collected packages: pyemd
  Running setup.py install for pyemd: started
  Running setup.py install for pyemd: finished with status 'error'


  error: subprocess-exited-with-error
  
  python setup.py bdist_wheel did not run successfully.
  exit code: 1
  
  [16 lines of output]
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-3.10
  creating build\lib.win-amd64-3.10\pyemd
  copying pyemd\__about__.py -> build\lib.win-amd64-3.10\pyemd
  copying pyemd\__init__.py -> build\lib.win-amd64-3.10\pyemd
  running build_ext
  creating build\temp.win-amd64-3.10
  creating build\temp.win-amd64-3.10\Release
  creating build\temp.win-amd64-3.10\Release\pyemd
  "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Tools\MSVC\14.32.31326\bin\HostX86\x64\cl.exe" /c /nologo /O2 /W3 /GL /DNDEBUG /MD -Ipyemd -IC:\Users\jpjac\include -IC:\Python310\include -IC:\Python310\Include -IC:\Users\jpjac\lib\site-packages\numpy\core\include "-IC:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Tools\MSVC\14.32.31326\include" /EHsc /Tppyemd/emd.cpp /Fobuild\temp.win-amd64

In [30]:
# Test run, one chapter, for debugging
compute_wmd_partial = lambda cur_row: tmdglobals.compute_tmd(cur_row, lang1, lang2, verbose=False)
cur_ch_fpath = all_ch_fpaths[0]
cur_fname, fname_prefix, tr_code, ch_name = tmdglobals.get_file_info(cur_ch_fpath)
print(f"***** Processing {ch_name}")

cur_df = load_chapter(cur_ch_fpath)
cur_df.columns = [lang1, lang2, "alignment_id"]
cur_df['chapter'] = ch_name
# Computation
cur_df['wmd'] = cur_df.progress_apply(compute_wmd_partial, axis=1)

***** Processing 01


  0%|          | 1/587 [00:00<01:50,  5.31it/s]


ImportError: Please install pyemd Python package to compute WMD.

In [None]:
# Special case for Kazakh, we need to treat it as Russian
if lang2 == "kk":
    ch_num = 1
    ch_num_str = str(ch_num).zfill(2)
    first_fpath = f"../Texts_Aligned/de_tge.kk_prog.pickled/ch{ch_num_str}.pkl"
    kk_df = pd.read_pickle(first_fpath)
    kk_df.columns = ['de','ru']
    kk_df['chapter'] = f"ch{ch_num_str}"
    kk_df['tmd'] = trglobals.compute_tmd_df(kk_df)

In [44]:
all_ch_fpaths

['..\\Texts_Aligned\\de_tge.el_mod\\ch01.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch02.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch03.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch04.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch05.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch06.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch07.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch08.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch09.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch10.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch11.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch12.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch13.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch14.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch15a.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch15b.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch16.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch17.align.txt',
 '..\\Texts_Aligned\\de_tge.el_mod\\ch18.ali

In [45]:
# Full run, over all chapters
all_dfs = []
for cur_ch_fpath in all_ch_fpaths:
    cur_fname = os.path.basename(cur_ch_fpath)
    print(f"****** Processing {cur_fname}")
    if lang2 == "kk":
        # Special case for Kazakh, since it wouldn't load the .csv
        cur_df = pd.read_pickle(cur_ch_fpath)
    else:
        cur_df = load_chapter(cur_ch_fpath)
    if lang2 == "kk":
        cur_df.columns = [lang1, 'ru']
    else:
        cur_df.columns = [lang1,lang2,'alignment_id']
    cur_df['chapter'] = ch_name
    # Computation
    cur_df['tmd'] = trglobals.compute_tmd_df(cur_df)
    # Output current chapter file
    ch_data_fpath = os.path.join(alignment_data_path, f"{ch_name}.pkl")
    cur_df.to_pickle(ch_data_fpath)
    #vprint(f"Saved to {ch_data_fpath}")
    # And append to all_dfs
    all_dfs.append(cur_df)

****** Processing ch01.align.txt


100%|██████████| 788/788 [00:01<00:00, 538.63it/s]


****** Processing ch02.align.txt


100%|██████████| 152/152 [00:00<00:00, 685.17it/s]


****** Processing ch03.align.txt


100%|██████████| 874/874 [00:01<00:00, 707.93it/s]


****** Processing ch04.align.txt


100%|██████████| 271/271 [00:00<00:00, 369.64it/s]


****** Processing ch05.align.txt


100%|██████████| 207/207 [00:00<00:00, 742.57it/s]


****** Processing ch06.align.txt


100%|██████████| 167/167 [00:00<00:00, 652.35it/s]


****** Processing ch07.align.txt


100%|██████████| 387/387 [00:00<00:00, 598.53it/s]


****** Processing ch08.align.txt


100%|██████████| 181/181 [00:00<00:00, 578.27it/s]


****** Processing ch09.align.txt


100%|██████████| 266/266 [00:00<00:00, 462.04it/s]


****** Processing ch10.align.txt


100%|██████████| 1180/1180 [00:01<00:00, 630.21it/s]


****** Processing ch11.align.txt


100%|██████████| 89/89 [00:00<00:00, 274.11it/s]


****** Processing ch12.align.txt


100%|██████████| 107/107 [00:00<00:00, 360.27it/s]


****** Processing ch13.align.txt


100%|██████████| 141/141 [00:00<00:00, 328.55it/s]


****** Processing ch14.align.txt


100%|██████████| 477/477 [00:00<00:00, 624.91it/s]


****** Processing ch15a.align.txt


100%|██████████| 719/719 [00:02<00:00, 356.55it/s]


****** Processing ch15b.align.txt


100%|██████████| 688/688 [00:01<00:00, 441.38it/s]


****** Processing ch16.align.txt


100%|██████████| 160/160 [00:00<00:00, 545.69it/s]


****** Processing ch17.align.txt


100%|██████████| 157/157 [00:00<00:00, 525.80it/s]


****** Processing ch18.align.txt


100%|██████████| 50/50 [00:00<00:00, 555.54it/s]


****** Processing ch19.align.txt


100%|██████████| 117/117 [00:00<00:00, 506.44it/s]


****** Processing ch20.align.txt


100%|██████████| 120/120 [00:00<00:00, 641.70it/s]


****** Processing ch21.align.txt


100%|██████████| 129/129 [00:00<00:00, 704.93it/s]


****** Processing ch22.align.txt


100%|██████████| 44/44 [00:00<00:00, 273.30it/s]


****** Processing ch23.align.txt


100%|██████████| 190/190 [00:00<00:00, 458.25it/s]


****** Processing ch24.align.txt


100%|██████████| 520/520 [00:00<00:00, 551.50it/s]


****** Processing ch25.align.txt


100%|██████████| 1516/1516 [00:02<00:00, 569.59it/s]


****** Processing ch26.align.txt


100%|██████████| 47/47 [00:00<00:00, 527.73it/s]


****** Processing ch27.align.txt


100%|██████████| 289/289 [00:00<00:00, 923.04it/s]


****** Processing ch28.align.txt


100%|██████████| 148/148 [00:00<00:00, 680.20it/s]


****** Processing ch29.align.txt


100%|██████████| 34/34 [00:00<00:00, 871.79it/s]


****** Processing ch30.align.txt


100%|██████████| 76/76 [00:00<00:00, 826.13it/s]


****** Processing ch31.align.txt


100%|██████████| 162/162 [00:00<00:00, 548.07it/s]


****** Processing ch32.align.txt


100%|██████████| 40/40 [00:00<00:00, 571.43it/s]


****** Processing ch33.align.txt


100%|██████████| 115/115 [00:00<00:00, 412.19it/s]


In [46]:
full_df = pd.concat(all_dfs)

In [47]:
full_df

Unnamed: 0,de,el,alignment_id,chapter,tmd
0,,ΤΟ ΕΜΠΟΡΕΥΜΑ 1.,ch01_clean.de_tge-ch01_clean.el_mod,ch15,inf
1,ERSTES KAPITEL Die Ware 1. Die zwei Faktoren d...,ΟΙ ΔΥΟ ΠΑΡΑΓΟΝΤΕΣ ΤΟΥ ΕΜΠΟΡΕΥΜΑΤΟΣ: ΑΞΙΑ ΧΡΗΣΗ...,ch01_clean.de_tge-ch01_clean.el_mod,ch15,0.963290
2,Unsere Untersuchung beginnt daher mit der Anal...,"Γιαυτό, ή Ερευνά μας αρχίζει μέ την άνάλυση το...",ch01_clean.de_tge-ch01_clean.el_mod,ch15,1.214450
3,"Die Ware ist zunachst ein ausserer Gegenstand,...",Τό έμπόρευμα είναι πριν άπ’ όλα ένα εξωτερικό ...,ch01_clean.de_tge-ch01_clean.el_mod,ch15,1.084576
4,"Die Natur dieser Bedurfnisse, ob sie z.B. dem ...","Ή φύση αυτών τών αναγκών, άν λχ. προέρχονται ά...",ch01_clean.de_tge-ch01_clean.el_mod,ch15,0.960944
...,...,...,...,...,...
110,"Die grosse Republik hat also aufgehort, das ge...",“Ετσι ή μεγάλη δημοκρατία έπαψε νά είναι ή γή ...,ch33_clean.de_tge-ch33_clean.el_mod,ch15,1.134638
111,Die kapitalistische Produktion geht dort mit R...,Ή κεφαλαιοκρατική παραγωγή προχωρεί έκεΐ μέ γι...,ch33_clean.de_tge-ch33_clean.el_mod,ch15,1.125233
112,"Die von Wakefield selbst so laut denunzierte, ...",Τό έπαίσχυντο ξεπούλημα σέ έξευτελιστικές τιμέ...,ch33_clean.de_tge-ch33_clean.el_mod,ch15,1.096947
113,Jedoch beschaftigt uns hier nicht der Zustand ...,Ωστόσο τό θέμα πού μάς άπασχολεΐ έδώ δέν είναι...,ch33_clean.de_tge-ch33_clean.el_mod,ch15,1.027426


In [49]:
full_df['tmd'].mean()

inf

In [50]:
trglobals.noninf_mean(full_df, 'tmd')

1.1669852750570167

In [51]:
full_fpath = os.path.join(alignment_data_path, "full.pkl")
full_df.to_pickle(full_fpath)
vprint(f"Saved to {full_fpath}")

Saved to ..\data_tmd\de_tge.el_mod\full.pkl
