In [1]:
def get_lines_of_file(path, encoding="utf-8"):
    """Read a text file and return its lines.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII characters (e.g. curly quotes) decode the same way
            regardless of the platform's locale.

    Returns:
        A list with one string per line, as produced by ``readlines()``;
        each line keeps its trailing newline if it had one.
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.readlines()

import re
def tokenize(sentence):
    """Split a sentence into word tokens and single-character punctuation tokens.

    A token is either a maximal run of word characters or one character that
    is neither whitespace nor a word character. Whitespace is discarded.

    Args:
        sentence: The text to tokenize.

    Returns:
        The list of token strings in order of appearance.
    """
    token_pattern = r"[\w]+|[^\s\w]"
    return [found.group(0) for found in re.finditer(token_pattern, sentence)]

def generate_aligned_lines(src, trg):
    """Pair up source and target lines by position and tokenize each pair.

    Lines are matched by index. A pair is kept only when both lines still
    contain text after surrounding whitespace is stripped.

    Args:
        src: Sequence of source-language lines.
        trg: Sequence of target-language lines, parallel to ``src``.

    Returns:
        A list of ``[source, target]`` pairs, where each element is the
        line's tokens (see ``tokenize``) joined by single spaces.
    """
    aligned_lines = []
    # zip() stops at the shorter input, so a length mismatch between the two
    # sequences cannot raise IndexError.
    for src_line, trg_line in zip(src, trg):
        src_text = src_line.strip()
        trg_text = trg_line.strip()
        # Check the stripped text rather than the raw length so that lines
        # holding only whitespace (e.g. "\r\n" or " \n") are skipped instead
        # of producing an empty sentence.
        if not src_text or not trg_text:
            continue
        aligned_lines.append([
            " ".join(tokenize(src_text)),
            " ".join(tokenize(trg_text)),
        ])
    return aligned_lines

# eflomal expects input files like:
# source sentence ||| target sentence
def store_in_eflomal_input_format(aligned_lines, path, encoding="utf-8"):
    """Write sentence pairs to ``path``, one ``source ||| target`` line per pair.

    Args:
        aligned_lines: Iterable of two-element sequences ``[source, target]``.
        path: Destination file; it is created or overwritten.
        encoding: Text encoding of the written file. Defaults to UTF-8 so the
            output does not depend on the platform's locale.
    """
    with open(path, 'w', encoding=encoding) as f:
        f.writelines(f"{pair[0]} ||| {pair[1]}\n" for pair in aligned_lines)

In [10]:
# The source files match by line number, so only lines in which both files have content are used
def only_mark(lines, start=24284, end=24962):
    """Return the sub-range ``lines[start:end]`` of a corpus.

    The defaults select 678 consecutive lines. In the corpus files loaded by
    this notebook the first selected line is the opening of the Gospel of
    Mark (see the printed output of this cell); presumably the range covers
    that whole book -- verify against the corpus' verse index.

    Args:
        lines: Sequence of corpus lines.
        start: Zero-based index of the first line to keep.
        end: Zero-based index one past the last line to keep.

    Returns:
        The selected slice of ``lines``.
    """
    return lines[start:end]

# Load the same line range from the eng-engULB (source) and tpi-tpiOTNT (target) corpus files.
src, trg = (
    only_mark(get_lines_of_file(corpus_path))
    for corpus_path in (
        "/home/jcuenod/Programming/ebible-lancedb/ebible/corpus/eng-engULB.txt",
        "/home/jcuenod/Programming/ebible-lancedb/ebible/corpus/tpi-tpiOTNT.txt",
    )
)

# Base name shared by the files this notebook writes.
filename = "eflomal_ulb_tpi"
input_filename = f"{filename}_input.txt"

# Tokenize the line pairs and write them out in eflomal's input format.
aligned_lines = generate_aligned_lines(src, trg)
store_in_eflomal_input_format(aligned_lines, input_filename)

# Sanity check: number of usable pairs and the first pair.
print(len(aligned_lines))
print(aligned_lines[0])

673
['This is the beginning of the gospel of Jesus Christ , the Son of God .', 'Dispela em i gutnius bilong Jisas Krais , Pikinini Bilong God .']


In [11]:
# Now we run the eflomal alignment tool and output the results to a file
output_filename = filename + "_alignment.txt"
# delete the output file if it already exists
import os
if os.path.exists(output_filename):
    os.remove(output_filename)
!(cd eflomal/ && python/scripts/eflomal-align -i $f'../{input_filename}' -f $f'../{output_filename}')

In [13]:
# The alignments were generated by running eflomal-align on the input file:
# ```
# eflomal/python/scripts/eflomal-align -i <input_filename> -f <output_filename>
# ```

# We can now read that file in:
with open(output_filename, 'r') as f:
    alignments = f.readlines()

# Each line holds space-separated "sourceIndex-targetIndex" pairs, e.g.:
# 0-1 1-4 2-6 3-7 4-8 5-10 6-11 7-13

# Print every aligned token pair (target token first, then source token).
# Line i of the alignment file corresponds to aligned_lines[i].
for i in range(len(alignments)):
    # split() with no argument returns [] for a blank line (a sentence pair
    # with no links) and tolerates repeated spaces; split(" ") would return
    # [""] and the index lookup below would raise IndexError.
    alignment = [token_map.split("-") for token_map in alignments[i].split()]

    src_tokens = aligned_lines[i][0].split(" ")
    trg_tokens = aligned_lines[i][1].split(" ")

    for token_map in alignment:
        print(trg_tokens[int(token_map[1])], src_tokens[int(token_map[0])])
    

Dispela This
em is
i the
gutnius gospel
bilong of
Jisas Jesus
Krais Christ
, ,
Pikinini Son
Bilong of
God God
. .
gutnius As
em it
i is
pastaim written
profet Isaiah
Aisaia Isaiah
i the
raitim prophet
, ,
“ “
Harim Look
. ,
mi I
salim sending
man who
mi my
em messenger
paslain before
yu your
. ,
Na the
bai will
redim prepare
rot way
yu your
. .
Long in
ples the
no wilderness
, ,
maus voice
bilong of
wanpela one
singaut out
, ,
‘ '
Redim Make
rot way
bilong of
Bikpela Lord
. ;
Stretim make
ol his
rot paths
. .
’ '
” ”
Jon John
, ,
baptais baptizing
long in
ples the
no wilderness
na and
autim preaching
long a
ol a
i came
tanim baptizing
baptais baptism
lusim the
sin forgiveness
bilong of
ol sins
. .
Na The
olgeta whole
Judia Judea
na and
olgeta all
manmeri people
bilong of
Jerusalem Jerusalem
i went
go out
long to
Jon him
. .
ol They
i were
autim confessing
sin sins
ol their
, ,
Jon him
i were
baptaisim baptized
ol by
long the
wara Jordan
Jordan River
. .
Jon John
i was
save wearing
puti