### Load T2S Lusa corpus

In [None]:
import os

# Folder that holds the corpus .txt files (and, further down, the .ann files).
directory = "../../Datasets/lusa_news/"

# Base name (text before the first ".") of every .txt file whose name does
# not contain the substring "en".
files = [
    name.split(".")[0]
    for name in os.listdir(directory)
    if name.endswith(".txt") and "en" not in name
]

len(files)

In [None]:
# Take the first collected base name as a working sample and display it.
file = files[0]
file

### Parse Brat format

In [None]:
from brat_parser import get_entities_relations_attributes_groups

def parseBratFile(file):
    """Parse one Brat annotation file.

    Thin wrapper around ``get_entities_relations_attributes_groups``.

    Args:
        file: path handed unchanged to the parser.

    Returns:
        The 4-tuple ``(entities, relations, attributes, groups)`` produced
        by the parser.
    """
    ents, rels, attrs, grps = get_entities_relations_attributes_groups(file)
    return (ents, rels, attrs, grps)


def filter_annotations(annotations, type, match=False, exceptions=None):
    """Select the annotations of a given type.

    Args:
        annotations: mapping of annotation id -> annotation object exposing
            a ``type`` attribute.
        type: the type to look for.
        match: when True, keep annotations for which ``type in ann.type``
            holds (a substring test when the types are strings); when False,
            require ``ann.type == type``.
        exceptions: container of types to leave out of the result. ``None``
            (the default) excludes nothing.

    Returns:
        A new dict, in the iteration order of ``annotations``, holding only
        the selected ``id -> annotation`` pairs.
    """
    # A None default avoids sharing one mutable list between calls.
    excluded = () if exceptions is None else exceptions

    selected = {}
    for ann_id, ann in annotations.items():
        hit = (type in ann.type) if match else (ann.type == type)
        if hit and ann.type not in excluded:
            selected[ann_id] = ann
    return selected
    

def get_entities_translations(entities, entities_translations):
    """Attach to every entity the text of its same-id counterpart.

    Each entity object is mutated in place: it gains a ``translation``
    attribute taken from ``entities_translations[id].text``.

    Args:
        entities: mapping of id -> entity object.
        entities_translations: mapping of id -> object with a ``text``
            attribute; it must contain every id found in ``entities``
            (a missing id raises ``KeyError``).

    Returns:
        A new dict with the same ids and the same (now updated) entity
        objects as ``entities``.
    """
    for key, entity in entities.items():
        entity.translation = entities_translations[key].text
    return dict(entities)

# Annotations of the sample file.
entities, relations, attributes, groups = get_entities_relations_attributes_groups(
    f"{directory}{file}.ann"
)

# From the counterpart under translated_lusa/en/ only the entities are kept.
entities_translations, _, _, _ = get_entities_relations_attributes_groups(
    f"{directory}translated_lusa/en/{file}.ann"
)

# Give every entity a `.translation` taken from the same-id translated entity.
entities = get_entities_translations(entities, entities_translations)

In [None]:
def sentence_spans(file):
    """Split a text file into lines paired with their character offsets.

    Args:
        file: path of the text file to read.

    Returns:
        A list of ``(line, (start, end))`` tuples in file order. ``line`` is
        the line as read (trailing newline included), ``start`` is the
        offset of its first character and ``end`` is ``start + len(line)``,
        so consecutive spans are contiguous and ``end`` is exclusive.
        An empty file yields an empty list.
    """
    spans = []
    start_index = 0
    # Decode explicitly as UTF-8 instead of the platform default, so that
    # accented characters are read the same way on every machine.
    # NOTE(review): text mode turns "\r\n" into "\n", so offsets would drift
    # from the on-disk character count for CRLF files - confirm the corpus
    # uses LF line endings.
    with open(file, encoding="utf-8") as f:
        for line in f:
            end_index = start_index + len(line)
            spans.append((line, (start_index, end_index)))
            start_index = end_index
    return spans


    
def get_sentence_entities(span, entities):
    """Return the entities whose first fragment lies inside ``span``.

    Args:
        span: ``(start, end)`` pair delimiting the sentence.
        entities: mapping of id -> entity; only ``entity.span[0]`` (the
            first ``(start, end)`` fragment) is inspected.

    Returns:
        A list, in the iteration order of ``entities``, of the entities with
        ``start >= span[0]`` and ``end <= span[1]``.
    """
    return [
        candidate
        for candidate in entities.values()
        if candidate.span[0][0] >= span[0] and candidate.span[0][1] <= span[1]
    ]



def process_file(files):
    """Build, for each file, its sentences paired with their annotations.

    For every base name the source ``.ann``/``.txt`` pair under ``directory``
    and the pair under ``directory + "translated_lusa/en/"`` are read. Source
    entities receive a ``translation`` attribute, and each source line is
    matched by position with the translated line at the same index.

    Args:
        files: iterable of file base names (no extension).

    Returns:
        dict mapping base name -> entry with keys:
            "sents": dict keyed by the source line span, each value holding
                "src_sent", "src_anns", "tgt_sent" and "sent_tg_span";
            "entities": id -> entity mapping (with translations attached);
            "relations", "attributes", "groups": lists of parsed objects.
    """
    entries = {}
    for file in files:
        source_base = f"{directory}{file}"
        target_base = f"{directory}translated_lusa/en/{file}"

        entities, relations, attributes, groups = get_entities_relations_attributes_groups(source_base + ".ann")
        translated_entities, _, _, _ = get_entities_relations_attributes_groups(target_base + ".ann")
        entities = get_entities_translations(entities, translated_entities)

        source_lines = sentence_spans(source_base + ".txt")
        target_lines = sentence_spans(target_base + ".txt")

        # A line-count mismatch is only reported; processing continues and
        # raises IndexError below if the translated file has fewer lines.
        if len(source_lines) != len(target_lines):
            print("error")

        sentences = {
            span: {
                "src_sent": sentence,
                "src_anns": get_sentence_entities(span, entities),
                "tgt_sent": target_lines[index][0],
                "sent_tg_span": target_lines[index][1],
            }
            for index, (sentence, span) in enumerate(source_lines)
        }

        entries[file] = {
            "sents": sentences,
            "entities": entities,
            "relations": list(relations.values()),
            "attributes": list(attributes.values()),
            "groups": list(groups.values()),
        }
    return entries

# Build one entry per collected file and display how many were produced.
entries = process_file(files)
len(entries)

### Load LinguAligner

In [None]:
from LinguAligner import AlignmentPipeline, translation

# Settings passed to AlignmentPipeline: the alignment steps plus the model
# names that two of those steps need.
config= {
    "pipeline": [ "lemma", "M_Trans", "word_aligner","gestalt","leveinstein"], # can be changed according to the desired pipeline
    "spacy_model": "en_core_web_sm", # needed for lemma
    "WAligner_model": "bert-base-multilingual-uncased", # needed for word_aligner
}
# Single aligner instance used by the alignment cell further down.
aligner = AlignmentPipeline(config)

### Generating lookup Table (Multiple Translations)

In [None]:

# Distinct source-language texts of every annotation in the corpus.
annotations = set()

# `entries` is a dict keyed by file name, so the entry dicts are its values;
# iterating the dict itself would yield the file-name strings.
for entry in entries.values():
    for sent in entry["sents"].values():
        for ann in sent["src_anns"]:
            annotations.add(ann.text)
print(len(annotations), annotations )

In [None]:
# Maps an annotation text to the translations obtained for it; starts empty.
lookupTable = {}

In [None]:
import os

# The translator credential is read from the environment so that it is not
# stored in the notebook.
# NOTE(review): the key that used to be hard-coded here has been exposed in
# source control and should be rotated.
auth_key = os.environ.get("MICROSOFT_TRANSLATOR_KEY")
if not auth_key:
    raise RuntimeError(
        "Set the MICROSOFT_TRANSLATOR_KEY environment variable to a valid "
        "Microsoft Translator key before running this cell."
    )

translator = translation.MicrosoftTranslator(source_lang="pt", target_lang="en", auth_key=auth_key)
for word in annotations:
    # Only query texts that are not cached yet and are shorter than 50
    # characters.
    if word not in lookupTable and len(word) < 50:
        mtrans = translator.getMultipleTranslations(word)
        lookupTable[word] = mtrans
        print(word, mtrans)

In [None]:
# To save the in-memory table to disk, uncomment the lines below:
#import json
#f_out = open("lookupTable_annotations.json", "w")
#json.dump(lookupTable, f_out, indent=4, ensure_ascii=False)
#f_out.flush()
#f_out.close()

import json

# Load the saved lookup table; the context manager closes the file handle
# as soon as parsing is done.
with open("lookupTable_annotations.json") as f_in:
    lookupTable = json.load(f_in)

### Using LinguAligner to align translated annotations with the translated text

In [None]:
# Align every source annotation with the translated sentence and store the
# aligner's result on the annotation as `tgt_ann`. A running count of
# processed files is printed after each file.
i = 0
for i, (file, entry) in enumerate(entries.items(), start=1):
    for sent in entry["sents"].values():
        for ann in sent["src_anns"]:
            # NOTE(review): src_ann_start is the offset within the whole
            # document, while the sentence passed is a single line - confirm
            # which one align_annotation expects.
            ann.tgt_ann = aligner.align_annotation(
                sent["src_sent"],
                ann.text,
                sent["tgt_sent"],
                ann.translation,
                lookupTable=lookupTable,
                src_ann_start=ann.span[0][0],
            )
    print(i)

In [None]:
from pickle import dump, load
#dump(entries, open("lusa_entries.pkl", "wb"))

### Save data in Brat format

In [None]:
# Restore the aligned entries saved earlier. The context manager closes the
# file handle once loading finishes.
# NOTE: unpickling can execute arbitrary code - only load a file that you
# produced yourself.
with open("lusa_entries.pkl", "rb") as pkl_in:
    entries = load(pkl_in)

In [None]:

# Rewrite every annotation so that it carries the aligned text and a span
# expressed in offsets of the translated document, and collect the rewritten
# annotations as the entry's new "entities" list.
for file, entry in entries.items():
    entry["entities"] = []
    for sent in entry["sents"].values():
        for ann in sent["src_anns"]:
            print(ann.text, ann.span, ann.translation, ann.tgt_ann)
            ann.text = ann.tgt_ann[0]
            # tgt_ann[1] holds offsets relative to the translated sentence;
            # shift both by the sentence start, and the end by one more.
            ann.span = (
                (
                    ann.tgt_ann[1][0] + sent["sent_tg_span"][0],
                    ann.tgt_ann[1][1] + sent["sent_tg_span"][0] + 1,
                ),
            )
            entry["entities"].append(ann)

In [None]:
#read a brat file, swap every entity with a new entity and write a new file with the new entities
def get_entity(id,entities):
    """Look up an entity by its id.

    Args:
        id: identifier compared against each entity's ``id`` attribute.
        entities: iterable of entity objects.

    Returns:
        The last entity whose ``id`` equals ``id``, or ``None`` when there is
        no match (in which case "not found" is printed).
    """
    matches = [candidate for candidate in entities if candidate.id == id]
    found = matches[-1] if matches else None
    if not found:
        print("not found")
    return found


def swap_entities(file, entry, output_file):
    """Copy a Brat ``.ann`` file, replacing each entity line with new data.

    Every line starting with "T" is rewritten from the entity in
    ``entry["entities"]`` that has the same id, as
    ``<id>\\t<type> <start> <end>\\t<text>`` using the entity's first span
    fragment. All other lines are copied unchanged.

    Args:
        file: path of the ``.ann`` file to read.
        entry: dict whose "entities" value is an iterable of objects with
            ``id``, ``type``, ``span`` and ``text`` attributes.
        output_file: path of the ``.ann`` file to write (overwritten).

    Raises:
        ValueError: if a "T" line has no entity with the same id in
            ``entry["entities"]``.
    """
    # Index once instead of scanning the list for every line; when ids
    # repeat, the later entity wins.
    by_id = {entity.id: entity for entity in entry["entities"]}

    # Both files are handled as UTF-8 so that non-ASCII entity text does not
    # depend on the platform default encoding.
    with open(file, encoding="utf-8") as f, open(output_file, "w", encoding="utf-8") as f_out:
        for line in f:
            if not line.startswith("T"):
                f_out.write(line)
                continue

            entity_id = line.strip().split("\t")[0]
            entity = by_id.get(entity_id)
            if entity is None:
                # Fail with a clear message rather than dereferencing None.
                raise ValueError(
                    f"entity {entity_id!r} in {file!r} has no replacement entity"
                )
            f_out.write(f"{entity.id}\t{entity.type} {entity.span[0][0]} {entity.span[0][1]}\t{entity.text}\n")
                    
# Write an aligned .ann file for every corpus file.
for file in files:
    swap_entities(
        f"{directory}{file}.ann",
        entries[file],
        f"{directory}translated_lusa/en_aligned/{file}.ann",
    )

In [None]:
# Copy the text files from translated_lusa/en to translated_lusa/en_aligned.
import shutil

for file in files:
    shutil.copy(
        f"{directory}translated_lusa/en/{file}.txt",
        f"{directory}translated_lusa/en_aligned/{file}.txt",
    )

In [None]:
# Sanity check: parse the aligned .ann of the first file and show its entities.
for file in files[:1]:
    print(file)
    entities, relations, attributes, groups = get_entities_relations_attributes_groups(
        f"{directory}translated_lusa/en_aligned/{file}.ann"
    )
    print(entities)

    #print("parsed translated")