In [63]:
import requests
import os
from data.annotated.convert_annotations_utils import Alignment

In [64]:
def align_texts_fastalign(source_text, target_text, langpair, timeout=60):
    """Request a word alignment for one pre-tokenised sentence pair.

    Both texts are split on whitespace and POSTed as token lists to the
    ptakopet-mt380 ``align/<langpair>`` endpoint. No ``method`` query
    parameter is sent, so the service picks its default aligner
    (NOTE(review): presumably fast_align, going by this function's name —
    confirm against the service documentation).

    Args:
        source_text: Source sentence; tokenised with ``str.split()``.
        target_text: Target sentence; tokenised with ``str.split()``.
        langpair: Language-pair path segment appended to the endpoint URL,
            e.g. ``"cs-uk"``.
        timeout: Seconds handed to ``requests.post`` as its ``timeout``.
            Without one, a stalled server would block the notebook cell
            indefinitely.

    Returns:
        The decoded JSON response, with its ``"alignment"`` entry converted
        from a ``"0-0 1-2"`` style string into a list of integer lists
        (``[[0, 0], [1, 2]]``). ``None`` when the service answers with a
        status other than 200; the status and body are printed in that case.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            ``timeout`` elapses (propagated from ``requests.post``).
    """
    url = 'https://quest.ms.mff.cuni.cz/ptakopet-mt380/align/' + langpair
    headers = {
        'Content-Type': 'application/json',
    }
    data = {
        'src_tokens': source_text.split(),
        'trg_tokens': target_text.split(),
    }

    response = requests.post(url, headers=headers, json=data, timeout=timeout)

    if response.status_code != 200:
        # Best-effort: report the failure and let the caller decide what to
        # do with the missing result.
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

    alignment = response.json()
    # The service returns the links as one whitespace-separated string of
    # "src-trg" pairs; turn it into nested integer lists.
    alignment["alignment"] = [
        [int(idx) for idx in pair.split("-")]
        for pair in alignment["alignment"].split()
    ]
    return alignment


def align_texts(source_text, target_text, langpair, timeout=60):
    """Request a word alignment for one raw sentence pair using ``method=awesome``.

    The untokenised texts are POSTed to the ptakopet-mt380
    ``align/<langpair>?method=awesome`` endpoint.

    Args:
        source_text: Source sentence, sent as-is under ``src_text``.
        target_text: Target sentence, sent as-is under ``trg_text``.
        langpair: Language-pair path segment of the endpoint URL, e.g.
            ``"cs-uk"``. Previously this argument was ignored and the URL
            was hard-coded to ``cs-uk``.
        timeout: Seconds handed to ``requests.post`` as its ``timeout``.
            Without one, a stalled server would block the notebook cell
            indefinitely.

    Returns:
        The decoded JSON response, unmodified. ``None`` when the service
        answers with a status other than 200; the status and body are
        printed in that case.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            ``timeout`` elapses (propagated from ``requests.post``).
    """
    url = (
        'https://quest.ms.mff.cuni.cz/ptakopet-mt380/align/'
        + langpair
        + '?method=awesome'
    )
    headers = {
        'Content-Type': 'application/json',
    }
    data = {
        'src_text': source_text,
        'trg_text': target_text,
    }

    response = requests.post(url, headers=headers, json=data, timeout=timeout)

    if response.status_code != 200:
        # Best-effort: report the failure and let the caller decide what to
        # do with the missing result.
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

    return response.json()

def write_alignments(alignments, output_dir, filename):
    """Write alignment results to ``output_dir/filename``, one sentence per line.

    Each element of ``alignments`` must be subscriptable with ``"alignment"``,
    yielding an iterable of index groups. Every group is rendered with its
    members joined by ``-`` and the groups of one sentence are separated by
    single spaces (e.g. ``0-0 1-2``). ``output_dir`` is created if missing and
    an existing file of the same name is overwritten.
    """
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, filename)

    def render(result):
        # One output line: "i-j i-j ..." followed by a newline.
        links = ("-".join(str(index) for index in link) for link in result["alignment"])
        return " ".join(links) + "\n"

    with open(destination, "w") as handle:
        handle.writelines(render(result) for result in alignments)

In [72]:
%%time
# Align every line of the cs-uk file with align_texts (method=awesome).
# Note: the next alignment cell rebinds `alignments`, so this result is only
# kept until that cell runs.
alignments = []
# Decode explicitly instead of relying on the platform default encoding.
# Assumes the corpus is UTF-8 (presumably Czech + Ukrainian text) — confirm.
with open("./data/annotated/cs-uk/csuk.src-tgt", encoding="utf-8") as sentences:
    for pair in sentences:
        # Each line is expected to be "<source> ||| <target>".
        src, tgt = pair.strip().split(" ||| ")
        alignment = align_texts(src, tgt, "cs-uk")
        alignments.append(alignment)

CPU times: user 7.67 s, sys: 54.5 ms, total: 7.72 s
Wall time: 41 s


In [71]:
%%time
# Align every line of the cs-uk file with align_texts_fastalign.
alignments = []
# Decode explicitly instead of relying on the platform default encoding.
# Assumes the corpus is UTF-8 (presumably Czech + Ukrainian text) — confirm.
with open("./data/annotated/cs-uk/csuk.src-tgt", encoding="utf-8") as sentences:
    for pair in sentences:
        # Each line is expected to be "<source> ||| <target>".
        src, tgt = pair.strip().split(" ||| ")
        alignment = align_texts_fastalign(src, tgt, "cs-uk")
        alignments.append(alignment)

CPU times: user 7.25 s, sys: 67.5 ms, total: 7.32 s
Wall time: 8.37 s


In [66]:
# Persist the current `alignments` list, one sentence per line.
# NOTE(review): `alignments` is whatever the most recently executed alignment
# cell produced — run the cs-uk fastalign cell immediately before this one.
write_alignments(alignments, "./finetune/fastAlign", "csuk.fastAlign.out")

In [67]:
# Align every line of the en-cs file with align_texts_fastalign.
alignments = []
# Decode explicitly instead of relying on the platform default encoding.
# Assumes the corpus is UTF-8 — confirm.
with open("./data/annotated/en-cs/encs.src-tgt", encoding="utf-8") as sentences:
    for pair in sentences:
        # Each line is expected to be "<source> ||| <target>".
        src, tgt = pair.strip().split(" ||| ")
        alignment = align_texts_fastalign(src, tgt, "en-cs")
        alignments.append(alignment)

In [68]:
# Persist the current `alignments` list, one sentence per line.
# NOTE(review): `alignments` is whatever the most recently executed alignment
# cell produced — run the en-cs fastalign cell immediately before this one.
write_alignments(alignments, "./finetune/fastAlign", "encs.fastAlign.out")

In [69]:
from notebook_utils import evaluate

In [70]:
# Compare the written cs-uk alignments with the csuk.gold file.
# NOTE(review): the returned 4-tuple is presumably (AER, precision, recall, F1)
# — in the recorded output the 4th value is the harmonic mean of the 2nd and
# 3rd; confirm against notebook_utils.evaluate. Meaning of the `[]` argument
# is not visible here.
evaluate("./data/annotated/cs-uk/csuk.gold", "./finetune/fastAlign/csuk.fastAlign.out", [])

(0.2591642228739003,
 0.6748582230623819,
 0.8326029798422436,
 0.7454771856285451)

In [53]:
# Compare the written en-cs alignments with the encs.gold file.
# NOTE(review): the returned 4-tuple is presumably (AER, precision, recall, F1)
# — in the recorded output the 4th value is the harmonic mean of the 2nd and
# 3rd; confirm against notebook_utils.evaluate. Meaning of the `[]` argument
# is not visible here.
evaluate("./data/annotated/en-cs/encs.gold", "./finetune/fastAlign/encs.fastAlign.out", [])

(0.18151571164510172,
 0.7921810699588477,
 0.8492381716118684,
 0.8197179445680354)