### Load Data

In [69]:
import json
def _load_jsonl(path):
    """Read a JSON Lines file and return a list with one parsed object per line.

    The file is opened in a ``with`` block so the handle is closed even when a
    line fails to parse.
    """
    with open(path) as handle:
        return [json.loads(line) for line in handle]


# WikiEvents splits, each stored as one JSON document per line.
docs_dev = _load_jsonl("../../Datasets/WikiEvents/dev.jsonl")
docs_train = _load_jsonl("../../Datasets/WikiEvents/train.jsonl")
docs_test = _load_jsonl("../../Datasets/WikiEvents/test.jsonl")

### Load LinguAligner

In [56]:
from LinguAligner import AlignmentPipeline, translation
# Translator configured for English -> Portuguese.
translator = translation.GoogleTranslator(source_lang="en", target_lang="pt")
# Alignment pipeline; per the cell output it loads a spaCy model and a WAligner model on construction.
aligner = AlignmentPipeline()

Loading spacy model: pt_core_news_lg
Model loaded
Loading WAligner model: bert-base-multilingual-uncased
Model loaded


### Generate multi-translation lookup table

In [4]:
# Gather the distinct surface strings of every entity mention and every event
# trigger found in the dev, train and test splits (visited in that order).
entities = set()
triggers = set()
for split in (docs_dev, docs_train, docs_test):
    for doc in split:
        entities.update(mention["text"] for mention in doc["entity_mentions"])
        triggers.update(mention["trigger"]["text"] for mention in doc["event_mentions"])
# Downstream cells iterate over these as lists.
triggers = list(triggers)
entities = list(entities)

In [8]:
# Maps a source word to its translations; the lookup-table cell skips words that are already present.
lookupTable = {}

In [None]:
import os

# The API key is read from the environment instead of being hard-coded, so the
# secret never lands in the notebook or in version control. The key that was
# previously committed here should be treated as compromised and rotated.
_ms_auth_key = os.environ.get("MICROSOFT_TRANSLATOR_KEY")
if not _ms_auth_key:
    raise RuntimeError(
        "Set the MICROSOFT_TRANSLATOR_KEY environment variable to your "
        "Microsoft Translator key before running this cell."
    )

# Change the language codes according to the desired languages.
# NOTE: this rebinds the global `translator` created in the LinguAligner cell.
translator = translation.MicrosoftTranslator(source_lang="en", target_lang="pt", auth_key=_ms_auth_key)

# NOTE(review): presumably switched to `entities` to build the entity table -- confirm.
annotations_list = triggers
for word in annotations_list:
    # Skip words that are already cached and strings of 50 characters or more.
    if word not in lookupTable and len(word) < 50:
        print(word)
        lookupTable[word] = translator.getMultipleTranslations(word)

In [13]:
#f_out = open("lookupTable_triggers.json", "w")
#json.dump(lookupTable, f_out, indent=4, ensure_ascii=False)

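### Translate and align annotations at the sentence level
(Aligning annotations sentence by sentence is easier than aligning them against the whole document; the resulting spans are converted to document level afterwards.)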

In [4]:
import json
# Load the pre-computed multi-translation lookup tables. The `with` blocks
# close each file handle as soon as it has been parsed.
with open("lookupTable_triggers.json") as _lookup_file:
    lookupTable_triggers = json.load(_lookup_file)
with open("lookupTable_entities.json") as _lookup_file:
    lookupTable_entities = json.load(_lookup_file)

In [6]:
def token_index_doc_to_sent(sentences, sent_idx, ann_index):
    """Convert a document-level token index into a sentence-local one.

    Each item of ``sentences`` is indexed with ``[0]`` to get its token list.
    The token counts of all sentences located before ``sent_idx`` are summed
    and subtracted from ``ann_index``.
    """
    preceding_tokens = sum(len(sentences[k][0]) for k in range(sent_idx))
    return ann_index - preceding_tokens

#print(token_index_doc_to_sent(docs_dev[0]["sentences"],2, 58))

def token_to_char_offset(tokens, index):
    """Return the ``(start, end)`` character offsets of ``tokens[index]``.

    ``start`` counts the length of every preceding token plus one separator
    character per preceding token; ``end`` is ``start`` plus the length of the
    token itself.
    """
    preceding = tokens[:index]
    start = sum(len(tok) for tok in preceding) + len(preceding)
    return start, start + len(tokens[index])

#token_to_char_offset(docs_dev[0]["tokens"],58)

In [7]:
import spacy

# Portuguese spaCy pipeline; translate_docs uses its sentence splitter on over-long sentences.
nlp = spacy.load("pt_core_news_lg")

In [46]:
def getEntityText(id, entities):
    """Return the ``"text"`` of the first entity whose ``"id"`` equals ``id``.

    Returns ``None`` when no entity in ``entities`` matches.
    """
    matching_texts = (candidate["text"] for candidate in entities if candidate["id"] == id)
    return next(matching_texts, None)


def align_annotations(sentences_src, sentences_trans, sent_idx, ann_src, ann_src_start, translated_ann, lookupTable):
    """Align one source annotation inside the translated sentence.

    The document-level token index ``ann_src_start`` is converted to a
    character offset within source sentence ``sent_idx``; the module-level
    ``aligner`` is then asked to locate the annotation in the translated
    sentence. Progress is printed along the way.

    Returns whatever ``aligner.align_annotation`` returns (callers in this
    notebook unpack it as ``(text, span)``).
    """
    tokens, source_text = sentences_src[sent_idx]
    target_text = sentences_trans[sent_idx]
    print(source_text, target_text, ann_src, translated_ann, sep="\n")

    # Document-level token index -> sentence-local token index -> character span.
    local_token_index = token_index_doc_to_sent(sentences_src, sent_idx, ann_src_start)
    char_start, char_end = token_to_char_offset(tokens, local_token_index)
    print(source_text, ann_src, target_text, translated_ann, sep="\n")

    aligned = aligner.align_annotation(
        source_text,
        ann_src,
        target_text,
        translated_ann,
        lookupTable=lookupTable,
        src_ann_start=char_start,
    )
    print(aligned, ann_src, char_start, char_end)
    return aligned

def translate_docs(docs):
    """Translate documents and project their annotations onto the translation.

    For every document the sentences are translated with the module-level
    ``translator``. Each entity mention and event trigger is then translated
    and aligned inside its translated sentence via ``align_annotations``.
    The original English values are kept under ``*_en`` keys.

    The input documents are not modified: each one is deep-copied first, so
    the function can safely be re-run on the same ``docs``. Previously the
    annotations were overwritten in place, so a second run operated on
    already-translated text and sentence-level spans.

    Args:
        docs: iterable of dicts with the keys ``"sentences"`` (items of
            ``[tokens, text]``), ``"entity_mentions"``, ``"event_mentions"``,
            ``"relation_mentions"`` and ``"text"``.

    Returns:
        A list with one new dict per input document containing
        ``event_mentions``, ``entity_mentions``, ``relation_mentions``,
        ``sentences_en``, ``sentences``, ``text_en`` and ``text``. The
        ``start``/``end`` of mentions and triggers are taken from the span
        returned by the aligner for the translated sentence.
    """
    import copy  # function-scope import so the cell does not depend on an extra top-level import

    # NOTE(review): presumably the translation service's per-request character limit -- confirm.
    max_chars_per_request = 5000

    new_docs = []
    for source_doc in docs:
        # Work on a private copy so the caller's documents stay untouched.
        doc = copy.deepcopy(source_doc)
        trans_sentences = []
        new_doc = {}
        new_doc["event_mentions"] = []
        new_doc["entity_mentions"] = []
        new_doc["relation_mentions"] = doc["relation_mentions"]

        # Translate sentences
        for sent in doc["sentences"]:
            # NOTE(review): as written this replaces "" with "" (a no-op); it presumably
            # targeted an invisible character that is not visible here -- confirm.
            sent[1] = sent[1].replace("","")
            if len(sent[1]) > max_chars_per_request:
                # Too long for a single request: split with spaCy and translate piecewise.
                pieces = [translator.translate(piece.text) for piece in nlp(sent[1]).sents]
                trans_sent = " ".join(pieces).strip()
            else:
                trans_sent = translator.translate(sent[1])
            trans_sentences.append(trans_sent)

        # Translate and align entities
        for ann in doc["entity_mentions"]:
            ann["text"] = ann["text"].replace("","")
            translated_ann = translator.translate(ann["text"])
            aligned_ann, span = align_annotations(doc["sentences"],
                                                  trans_sentences,
                                                  ann["sent_idx"],
                                                  ann["text"],
                                                  ann["start"],
                                                  translated_ann,
                                                  lookupTable_entities)
            # Keep the English values next to the aligned ones.
            ann["text_en"] = ann["text"]
            ann["text"] = aligned_ann
            ann["start_en"] = ann["start"]
            ann["end_en"] = ann["end"]
            ann["start"] = span[0]
            ann["end"] = span[1]
            new_doc["entity_mentions"].append(ann)

        # Translate and align events
        for ann in doc["event_mentions"]:
            trigger = ann["trigger"]
            trigger["text"] = trigger["text"].replace("","")
            translated_ann = translator.translate(trigger["text"]).replace("","")
            aligned_ann, span = align_annotations(doc["sentences"],
                                                  trans_sentences,
                                                  trigger["sent_idx"],
                                                  trigger["text"],
                                                  trigger["start"],
                                                  translated_ann,
                                                  lookupTable_triggers)
            trigger["text_en"] = trigger["text"]
            trigger["text"] = aligned_ann
            trigger["start_en"] = trigger["start"]
            trigger["end_en"] = trigger["end"]
            trigger["start"] = span[0]
            trigger["end"] = span[1]
            # Arguments reuse the aligned text of the entity they point to
            # (None when the entity id is not among the aligned entities).
            for arg in ann["arguments"]:
                text = getEntityText(arg["entity_id"], new_doc["entity_mentions"])
                arg["text_en"] = arg["text"]
                arg["text"] = text
            new_doc["event_mentions"].append(ann)

        new_doc["sentences_en"] = [sent for _, sent in doc["sentences"]]
        new_doc["sentences"] = trans_sentences
        new_doc["text_en"] = doc["text"]
        new_doc["text"] = " ".join(trans_sentences)
        new_docs.append(new_doc)
    return new_docs
    
def saveJsonLines(docs, filename):
    """Write ``docs`` to ``filename`` in JSON Lines format.

    Each entry is serialised with ``json.dump`` and followed by a newline;
    an existing file is overwritten.
    """
    with open(filename, 'w') as sink:
        for record in docs:
            json.dump(record, sink)
            sink.write('\n')
        sink.flush()
        

In [11]:

# Translate each split and store the sentence-level result twice: as JSON
# Lines and as an indented JSON file. The `with` blocks make sure every output
# file is flushed and closed even if serialisation fails.
docs_dev_pt = translate_docs(docs_dev)
saveJsonLines(docs_dev_pt, "../../Datasets/WikiEvents/dev_pt_sent.jsonl")
with open("../../Datasets/WikiEvents/dev_pt_sent.json", "w") as f_out:
    json.dump(docs_dev_pt, f_out, indent=4, ensure_ascii=False)

docs_train_pt = translate_docs(docs_train)
saveJsonLines(docs_train_pt, "../../Datasets/WikiEvents/train_pt_sent.jsonl")
with open("../../Datasets/WikiEvents/train_pt_sent.json", "w") as f_out:
    json.dump(docs_train_pt, f_out, indent=4, ensure_ascii=False)

docs_test_pt = translate_docs(docs_test)
saveJsonLines(docs_test_pt, "../../Datasets/WikiEvents/test_pt_sent.jsonl")
with open("../../Datasets/WikiEvents/test_pt_sent.json", "w") as f_out:
    json.dump(docs_test_pt, f_out, indent=4, ensure_ascii=False)

('IED', (0, 2)) IED 4 7
('russo', (42, 46)) Russian 12 15
('major-general', (28, 40)) general 20 23
('Síria', (51, 55)) Syria 28 31
('deserto', (2, 8)) desert 4 7
('Deir ez-Zor', (29, 39)) Deir ez-Zor 24 27
('regime', (53, 58)) regime 48 51
('Eufrates', (63, 70)) Euphrates 72 75
('insurgentes', (89, 99)) insurgents 88 91
('Rússia', (14, 19)) Russia 0 3
('Síria', (24, 28)) Syria 20 23
('país', (109, 112)) nation 64 67
('Ministério da Defesa', (80, 99)) Defense Ministry 76 79
('major-general', (131, 143)) general 100 103
('Síria', (158, 162)) Syria 116 119
('dispositivo explosivo', (171, 191)) explosive device 132 135
('Al-Monitor', (217, 226)) Al-Monitor 144 147
('major-general', (2, 14)) General 4 7
('Vyacheslav Gladkikh', (16, 34)) Vyacheslav Gladkikh 8 11
('IED', (57, 59)) IED 32 35
('comboio', (92, 98)) convoy 48 51
('russos', (112, 117)) Russian 56 59
('soldados', (103, 110)) soldiers 60 63
('sírios', (143, 148)) Syrian 68 71
('regime', (136, 141)) regime 80 83
('milicianos', (121,

### Convert sentence-level annotation spans to document level

In [40]:
# Scratch check: load the sentence-level dev file into `x` for inspection.
# The handle is named after the file it actually opens and is closed by `with`.
with open("../../Datasets/WikiEvents/dev_pt_sent.json") as file_dev_sent:
    x = json.load(file_dev_sent)

In [42]:
import json
# Reload the sentence-level translations from disk. Each handle is closed by
# its `with` block, also when parsing raises.
with open("../../Datasets/WikiEvents/dev_pt_sent.json") as file_dev:
    docs_dev_pt = json.load(file_dev)
with open("../../Datasets/WikiEvents/train_pt_sent.json") as file_train:
    docs_train_pt = json.load(file_train)
with open("../../Datasets/WikiEvents/test_pt_sent.json") as file_test:
    docs_test_pt = json.load(file_test)

In [43]:
def sentence_to_doc_offset(annotation, sentences):
    """Shift an annotation's sentence-local ``start``/``end`` to document offsets.

    The shift is the summed length of every sentence that precedes
    ``annotation["sent_idx"]``, plus one per preceding sentence. The
    annotation dict is updated in place and also returned, so calling this
    twice on the same annotation applies the shift twice.
    """
    sent_idx = annotation["sent_idx"]
    local_start = annotation["start"]
    shift = sum(len(sentences[k]) + 1 for k in range(sent_idx))
    annotation["start"] = shift + local_start
    annotation["end"] = shift + annotation["end"]
    return annotation
    


# Convert every entity mention and event trigger of the three splits from
# sentence-level to document-level offsets. The annotations are modified in
# place, so this cell must run exactly once per freshly loaded dataset.
for split_docs in (docs_dev_pt, docs_train_pt, docs_test_pt):
    for doc in split_docs:
        for ann in doc["entity_mentions"]:
            sentence_to_doc_offset(ann, doc["sentences"])
        for ann in doc["event_mentions"]:
            ann["trigger"] = sentence_to_doc_offset(ann["trigger"], doc["sentences"])


In [50]:
# Persist the document-level annotations of every split, both as JSON Lines
# and as an indented JSON file. The `with` block makes sure each output file
# is flushed and closed even if serialisation fails.
for docs_pt, base_path in (
    (docs_test_pt, "../../Datasets/WikiEvents/test_pt"),
    (docs_train_pt, "../../Datasets/WikiEvents/train_pt"),
    (docs_dev_pt, "../../Datasets/WikiEvents/dev_pt"),
):
    saveJsonLines(docs_pt, base_path + ".jsonl")
    with open(base_path + ".json", "w") as f_out:
        json.dump(docs_pt, f_out, indent=4, ensure_ascii=False)

In [58]:
# Print the index of each training document for every entity mention in it that carries this id.
target_entity_id = "2_VOA_EN_NW_2015.01.04.2583337-T346"
for i, doc in enumerate(docs_train):
    for ann in doc["entity_mentions"]:
        if ann["id"] != target_entity_id:
            continue
        print(i)
            
        

51


In [65]:
# Reload the document-level Portuguese splits from the JSON files written earlier in this notebook.
# NOTE(review): these handles are not closed in the lines visible here -- confirm they are
# closed later, or switch to `with` blocks.
file_dev = open("../../Datasets/WikiEvents/dev_pt.json")
file_train = open("../../Datasets/WikiEvents/train_pt.json")
file_test = open("../../Datasets/WikiEvents/test_pt.json")
docs_dev_pt = json.load(file_dev)
docs_train_pt = json.load(file_train)
docs_test_pt = json.load(file_test)