In [1]:
def get_lines_of_file(path):
    """Read a UTF-8 text file and return its lines.

    The encoding is given explicitly so that non-ASCII text (for example
    polytonic Greek) is decoded the same way on every machine, instead of
    depending on the platform's default locale encoding.

    Args:
        path: Path of the file to read.

    Returns:
        A list of strings, one per line, each keeping its trailing newline
        (the last line has none if the file does not end with one).
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()

import re
def tokenize(sentence):
    """Break a sentence into word tokens and punctuation tokens.

    A token is either a maximal run of word characters, or one single
    character that is neither whitespace nor a word character. Whitespace
    itself never appears in the result.

    Returns:
        The tokens as a list of strings, in their original order.
    """
    token_pattern = r"[\w]+|[^\s\w]"
    return [match.group(0) for match in re.finditer(token_pattern, sentence)]

def generate_aligned_lines(src, trg):
    """Pair up the lines of two line-aligned texts, tokenizing both sides.

    Lines are matched by position. A pair is kept only when both sides
    still contain at least one token after stripping whitespace, so blank
    and whitespace-only lines are skipped, while a short final line without
    a trailing newline is kept.

    If the two inputs differ in length, only the common prefix is paired;
    the extra lines of the longer input are ignored.

    Args:
        src: Source-language lines (strings, with or without newlines).
        trg: Target-language lines, aligned with ``src`` by index.

    Returns:
        A list of ``[source, target]`` pairs, where each element is the
        line's tokens joined by single spaces.
    """
    aligned_lines = []
    for src_line, trg_line in zip(src, trg):
        src_tokens = tokenize(src_line.strip())
        trg_tokens = tokenize(trg_line.strip())
        # Test the actual content rather than the raw line length, so that
        # an empty sentence is never emitted on either side.
        if src_tokens and trg_tokens:
            aligned_lines.append([" ".join(src_tokens), " ".join(trg_tokens)])
    return aligned_lines

# eflomal expects input files like:
# source sentence ||| target sentence
def store_in_eflomal_input_format(aligned_lines, path):
    """Write sentence pairs to ``path``, one ``source ||| target`` per line.

    The file is written as UTF-8 explicitly so that non-ASCII text is
    encoded the same way regardless of the platform's default encoding.
    An existing file at ``path`` is overwritten.

    Args:
        aligned_lines: Sequence of pairs; element 0 is the source sentence
            and element 1 is the target sentence.
        path: Destination file path.
    """
    with open(path, 'w', encoding='utf-8') as f:
        for line in aligned_lines:
            f.write(line[0] + " ||| " + line[1] + "\n")

In [2]:
# The source files match by line number, so only lines in which both files have content are used
import os

# Directory containing the corpus files. The default is the original
# author's local checkout; set EBIBLE_CORPUS_DIR to run this elsewhere
# without editing the notebook.
CORPUS_DIR = os.environ.get(
    "EBIBLE_CORPUS_DIR",
    "/home/jcuenod/Programming/ebible-lancedb/ebible/corpus",
)
src = get_lines_of_file(os.path.join(CORPUS_DIR, "grc-grcbyz.txt"))
trg = get_lines_of_file(os.path.join(CORPUS_DIR, "tpi-tpiOTNT.txt"))
filename = "eflomal_byz_tpi"

# Alignment relies on line N of one file corresponding to line N of the
# other, so a length mismatch is worth surfacing before going further.
if len(src) != len(trg):
    print("WARNING: line counts differ:", len(src), "source vs", len(trg), "target")

aligned_lines = generate_aligned_lines(src, trg)
input_filename = filename + "_input.txt"
store_in_eflomal_input_format(aligned_lines, input_filename)

# Sanity check: number of usable sentence pairs, and the first pair.
print(len(aligned_lines))
print(aligned_lines[0])

7952
['Βίβλος γενέσεως Ἰησοῦ Χριστοῦ , υἱοῦ Δαυῒδ υἱοῦ Ἀβραάμ .', 'Dispela em i tok long ol lain tumbuna bilong Jisas Krais . Jisas em i bilong lain bilong Devit na Devit em i bilong lain bilong Abraham .']


In [3]:
# Now we run the eflomal alignment tool and output the results to a file
output_filename = filename + "_alignment.txt"
# delete the output file if it already exists
import os
if os.path.exists(output_filename):
    os.remove(output_filename)
!(cd eflomal/ && python/scripts/eflomal-align -i $f'../{input_filename}' -f $f'../{output_filename}')

In [4]:
# The alignment file read here is the one written by eflomal-align, e.g.:
# ```
# eflomal/python/scripts/eflomal-align -i <input file> -f <alignment file>
# ```

# We can now read that file in:
with open(output_filename, 'r', encoding='utf-8') as alignment_file:
    alignments = alignment_file.readlines()

# The alignments are in the format:
# 0-1 1-4 2-6 3-7 4-8 5-10 6-11 7-13
# where each pair is <source token index>-<target token index>.

# Let's print out the aligned tokens of the first few sentences only;
# printing every sentence would flood the notebook output.
PREVIEW_LINES = 10
for alignment_line, sentence_pair in zip(alignments[:PREVIEW_LINES], aligned_lines):
    src_tokens = sentence_pair[0].split(" ")
    trg_tokens = sentence_pair[1].split(" ")

    # split() with no argument yields nothing for an empty alignment line,
    # so sentences without any links are skipped instead of crashing.
    for token_map in alignment_line.split():
        src_index, trg_index = token_map.split("-")
        print(src_tokens[int(src_index)], trg_tokens[int(trg_index)])
    

Δαυῒδ Dispela
υἱοῦ lain
Ἀβραάμ tumbuna
Ἰησοῦ Jisas
Χριστοῦ Krais
, .
υἱοῦ lain
Δαυῒδ Devit
Δαυῒδ Devit
υἱοῦ lain
Ἀβραάμ Abraham
. .
Ἀβραὰμ Abraham
ἐγέννησε papa
τὸν bilong
Ἰσαάκ Aisak
, .
Ἰσαὰκ Aisak
δὲ i
ἐγέννησε papa
τὸν bilong
Ἰακώβ Jekop
, .
Ἰακὼβ Jekop
δὲ i
ἐγέννησε papa
τὸν bilong
Ἰούδαν Juda
καὶ wantaim
τοὺς ol
ἀδελφοὺς brata
αὐτοῦ bilong
αὐτοῦ en
, .
Ἰούδας Juda
δὲ i
ἐγέννησε papa
τὸν bilong
Φαρὲς Peres
καὶ wantaim
Ζαρὰ Sera
ἐκ .
Θάμαρ Mama
τῆς bilong
Θάμαρ tupela
Θάμαρ Tamar
, .
Φαρὲς Peres
δὲ i
ἐγέννησε papa
τὸν bilong
Ἐσρώμ Hesron
, .
Θάμαρ Hesron
δὲ em
ἐγέννησε papa
τὸν bilong
Ἀράμ Ram
, .
Ἀρὰμ Ram
δὲ em
ἐγέννησε papa
τὸν bilong
Ἀμιναδάβ Aminadap
, .
Ἀμιναδάβ Aminadap
δὲ i
ἐγέννησε papa
τὸν bilong
Ναασσών Nason
, .
, Na
Ναασσών Nason
, em
δὲ i
ἐγέννησε papa
τὸν bilong
Σαλμών Salmon
, .
Σαλμὼν Salmon
δὲ i
ἐγέννησε papa
τὸν bilong
Ὠβὴδ Boas
ἐκ Mama
τῆς bilong
Ῥούθ Rahap
, .
Ὠβὴδ Boas
δὲ i
ἐγέννησε papa
τὸν bilong
Βοὸζ Obet
ἐκ .
ἐκ Mama
τῆς bilong
Ῥαχάβ Rut
, .
Βοὸζ Obet
δὲ i
