## Use LFAlign to align the two texts

Important note: LFAlign checks for nonbreaking prefixes for the desired languages within this folder:

`SentenceAligner\scripts\sentence_splitter\nonbreaking_prefixes\`

Languages currently available there: bg, ca, cs, de, el, en, es, et, fr, hu, is, it, nl, pl, pt, ro, ru, sk, sl, sv

In [49]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
#!pip install git+https://github.com/rsennrich/Bleualign.git

In [51]:
import bleualign

In [52]:
import glob
import itertools
import os
import pathlib
import shutil
import subprocess

from tqdm import tqdm

import tmdglobals

In [53]:
def output_logs(align_logs, output_fname):
    """Write the per-chapter aligner logs to a single file under ``./logs``.

    Args:
        align_logs: Iterable of ``bytes`` objects, one per chapter, in chapter
            order (the first entry is reported as chapter 1).
        output_fname: Name the log file name is derived from; every ".txt"
            in it is replaced by "_log.txt".

    Returns:
        The path of the log file that was written.
    """
    log_path = "./logs"
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(log_path, exist_ok=True)
    log_fpath = os.path.join(log_path, output_fname.replace(".txt", "_log.txt"))
    with open(log_fpath, "w", encoding="utf-8") as outfile:
        # Chapter numbers start at 1, hence start=1.
        for ch_num, cur_log in enumerate(align_logs, start=1):
            # Header and separator each go on their own line so that the
            # chapters can be told apart when reading the log.
            outfile.write(f"Chapter #{ch_num}\n")
            outfile.write("------------\n")
            # A stray non-UTF-8 byte in the captured output should not abort
            # writing the whole log.
            log_text = cur_log.decode(errors="replace")
            outfile.write(log_text)
            if not log_text.endswith("\n"):
                outfile.write("\n")
    return log_fpath

In [54]:
# Set to True to have vprint() forward its arguments to print().
verbose = False


def _discard(*args, **kwargs):
    """Accept anything print() accepts and do nothing with it."""


# Bound once, when this cell runs: either the real print or a silent stand-in
# with the same call signature, so vprint(a, b, end="") works in both modes.
vprint = print if verbose else _discard

In [55]:
# Identifiers of the two texts that are going to be aligned
doc1, doc2 = "de_tge", "en_reitter"
# Language code for each text, looked up through tmdglobals
lang1, lang2 = (tmdglobals.get_lang_code(doc) for doc in (doc1, doc2))
# Combined code for the pair, e.g. "de_tge.en_reitter"
alignment_code = tmdglobals.get_alignment_code(doc1, doc2)

In [56]:
# Scratch check: zip() pairs up the items of two lists position by position.
x = list("abc")
y = list("123")
list(zip(x, y))

[('a', '1'), ('b', '2'), ('c', '3')]

In [57]:
# Captured output of each aligner run, one entry per chapter
align_logs = []
# First we collect the cleaned chapter files of both documents; their paths
# go into the command-line arguments passed to LF Align.
d1_path = tmdglobals.get_cleaned_path(doc1)
d2_path = tmdglobals.get_cleaned_path(doc2)
# The two lists are paired up by position, so both must be in the same
# order. glob.glob() makes no ordering guarantee, hence the sorted().
d1_fpaths = sorted(glob.glob(os.path.join(d1_path, "*.txt")))
d2_fpaths = sorted(glob.glob(os.path.join(d2_path, "*.txt")))
print(list(zip(d1_fpaths, d2_fpaths)))

[('..\\Texts\\ThirdGermanEdition\\cleaned\\ch01_clean.de_tge.txt', '..\\Texts\\Reitter\\cleaned\\ch01_clean.en_reitter.txt'), ('..\\Texts\\ThirdGermanEdition\\cleaned\\ch02_clean.de_tge.txt', '..\\Texts\\Reitter\\cleaned\\ch02_clean.en_reitter.txt'), ('..\\Texts\\ThirdGermanEdition\\cleaned\\ch03_clean.de_tge.txt', '..\\Texts\\Reitter\\cleaned\\ch03_clean.en_reitter.txt'), ('..\\Texts\\ThirdGermanEdition\\cleaned\\ch04_clean.de_tge.txt', '..\\Texts\\Reitter\\cleaned\\ch04_clean.en_reitter.txt'), ('..\\Texts\\ThirdGermanEdition\\cleaned\\ch05_clean.de_tge.txt', '..\\Texts\\Reitter\\cleaned\\ch05_clean.en_reitter.txt'), ('..\\Texts\\ThirdGermanEdition\\cleaned\\ch06_clean.de_tge.txt', '..\\Texts\\Reitter\\cleaned\\ch06_clean.en_reitter.txt'), ('..\\Texts\\ThirdGermanEdition\\cleaned\\ch07_clean.de_tge.txt', '..\\Texts\\Reitter\\cleaned\\ch07_clean.en_reitter.txt'), ('..\\Texts\\ThirdGermanEdition\\cleaned\\ch08_clean.de_tge.txt', '..\\Texts\\Reitter\\cleaned\\ch08_clean.en_reitter.txt'),

In [59]:
def align_files(d1_fpath, d2_fpath, skip_if_exists=False):
    """Align one pair of chapter files by running the LF Align command.

    The output file is named after the chapter name of ``d1_fpath`` and is
    placed in the module-level ``alignment_path``. The command's captured
    output is appended to the module-level ``align_logs`` list. The
    module-level ``lang1`` and ``lang2`` are passed on to the command.

    Args:
        d1_fpath: Path of the first document's chapter file.
        d2_fpath: Path of the second document's chapter file.
        skip_if_exists: If True and a non-empty output file for this chapter
            already exists, keep it and do not run the aligner. A placeholder
            entry is still appended to ``align_logs`` so that its entries stay
            in step with the chapters.

    Returns:
        The path of this chapter's alignment output file.

    Raises:
        subprocess.CalledProcessError: If the aligner command exits with a
            non-zero status.
    """
    # Use the arguments that were passed in, not whatever loop variables
    # happen to exist at module level.
    _, _, _, d1_ch_name = tmdglobals.get_file_info(d1_fpath)
    output_fname = f"ch{d1_ch_name}.align.txt"
    output_fpath = os.path.join(alignment_path, output_fname)
    already_done = (os.path.isfile(output_fpath)
                    and os.path.getsize(output_fpath) > 0)
    if skip_if_exists and already_done:
        vprint(f"Skipping {output_fpath} (already exists)")
        align_logs.append(b"(skipped: output file already exists)\n")
        return output_fpath
    # Delete any previous output ...
    if os.path.isfile(output_fpath):
        os.remove(output_fpath)
    # ... and create the blank text file that the aligner will write into
    pathlib.Path(output_fpath).touch()
    cur_cmd = tmdglobals.get_lf_cmd(lang1=lang1, lang2=lang2,
                                    d1_fpath=d1_fpath, d2_fpath=d2_fpath,
                                    out_fpath=output_fpath)
    vprint(f"Running {cur_cmd}")
    # Temporarily change into the directory of LF_aligner, run the command,
    # then change back -- also when the command fails, so that a failed run
    # does not leave the rest of the notebook in the wrong directory.
    orig_dir = os.getcwd()
    os.chdir(tmdglobals.lf_path)
    try:
        vprint(f"Running command from {os.getcwd()}")
        # cur_cmd is a single command string, so it is run through the shell.
        result = subprocess.check_output(cur_cmd, shell=True)
    finally:
        os.chdir(orig_dir)
    align_logs.append(result)
    return output_fpath

In [60]:
# Now make sure a subfolder for this specific language pair exists
alignment_path = tmdglobals.get_alignment_path(alignment_code)
if not os.path.isdir(alignment_path):
    os.mkdir(alignment_path)
# And loop over the fpath pairs
print(f"Performing alignment: {alignment_code}")
num_docs = len(d1_fpaths)
tqdm_iter = tqdm(zip(d1_fpaths, d2_fpaths), total=num_docs)
for cur_d1_fpath, cur_d2_fpath in tqdm_iter:
    d1_fname = os.path.basename(cur_d1_fpath)
    tqdm_iter.set_description(d1_fname)
    align_files(cur_d1_fpath, cur_d2_fpath)
# Done with for loop - just output the alignment logs to a file
log_fpath = output_logs(align_logs, output_fname)
print(f"Logs saved to {log_fpath}")
# Last thing: delete the stupid align_<date> folder that it creates,
# that we don't need (because we set the output file)
align_folders = glob.glob(os.path.join(alignment_path, "align_*"))
for cur_folder in align_folders:
    shutil.rmtree(cur_folder)

Performing alignment: de_tge.en_reitter


ch33_clean.de_tge.txt: 100%|██████████| 34/34 [00:31<00:00,  1.10it/s] 

Logs saved to ./logs\ch33.align_log.txt





In [None]:

# Uses the LF Align command-line interface to do the batch alignment of each
# chapter [and thus has to be run on Windows :(] [also you have to DISABLE
# DROPBOX!!!]
