In [2]:
import numpy as np
from pathlib import Path
from collections import Counter
import os
import re
import nltk

os.chdir('../')

from data import read_sentences

In [384]:
PT_VERBS = ['tratar', 'estabelecer', 'marcar', 'vir', 'colocar',\
           'fechar', 'dar', 'cair', 'encontrar', 'registrar',\
           'levar', 'receber', 'apresentar', 'passar', 'deixar',\
           'chegar', 'ficar', 'fazer', 'ter', 'ser']

In [385]:
PT_DICT_VERBS ={
    'tratar': ['deal with', 'treat', 'process', 'deal', 'manage', 'do', 'attend', 'look after', 'cherrish', 'misuse', 'size'],\
    'estabelecer': ['establish', 'set', 'fix', 'lay down', 'make', 'settle', 'determine', 'presribe', 'impose', 'enter into', 'stipulate', 'organize', 'seat'],\
    'marcar': ['mark', 'brand', 'dial', 'book', 'stamp', 'show', 'read', 'define', 'trace', 'earmark', 'reserve', 'feature', 'signilize', 'scribe'],\
    'vir': ['come', 'arrive', 'come on', 'come up with'],\
    'colocar': ['put', 'lay', 'set', 'place', 'post', 'pose', 'stick', 'plant', 'dispose', 'collocate', 'posit', 'bestow', 'pitch', 'clap', 'ship', 'placatory'],\
    'fechar': ['close', 'shut', 'seal', 'turn off', 'pin down', 'fasten', 'occlude', 'shut in', 'box', 'impount', 'bar', 'berate', 'stop up', 'mure', 'stopple', 'rail', 'pen', 'inmure'],\
    'dar': ['to give', 'give', 'impart', 'provide', 'render', 'afford', 'yield', 'allow', 'hand', 'deal', 'administer', 'give in', 'gift', 'confer', 'inflict', 'handsel', 'accord'],\
    'cair': ['fall', 'go down', 'drop', 'sink', 'collapse', 'founder', 'topple', 'lapse', 'sleet', 'keel over', 'prey', 'prostrate', 'pelt', 'plump', 'flump'],\
    'encontrar': ['meet', 'find', 'detect', 'encounter', 'find out', 'discover', 'meet with', 'experience', 'get together', 'impinge', 'hunt up'],\
    'registrar': ['register', 'record', 'read', 'book', 'enroll', 'inscribe', 'enrol', 'list', 'write down', 'set down', 'trace', 'score', 'label', 'matriculate', 'prick down', 'calendar', 'signalize'],\
    'levar': ['take along', 'take', 'carry', 'convey', 'go', 'prompt', 'induce', 'hold', 'charge', 'ravish'],\
    'receber': ['to receive', 'receive', 'welcome', 'get', 'have', 'accept', 'collect', 'meet', 'entertain', 'do', 'reap', 'derive', 'salute'],\
    'apresentar': ['to present', 'present', 'introduce', 'show', 'exhibit', 'lodge', 'produce', 'put', 'bring forward', 'come up with', 'represent', 'bring up', 'render'],\
    'passar': ['pass', 'spend', 'hand', 'go', 'go by', 'elapse', 'slip away', 'come', 'transfer', 'give in'],\
    'deixar': ['leave', 'let', 'have', 'quit', 'let go', 'depart', 'go away', 'drop off', 'leave out'],\
    'chegar': ['to arrive', 'arrive', ' get in', 'come', 'achieve', 'land', 'get around', 'turn up'],\
    'ficar': ['stay', 'bide', 'be', 'remain', 'go', 'continue', 'keep', 'come'],\
    'fazer': ['do', 'perform', 'make', 'cause', 'create', 'produce', 'render', 'manufacture'],\
    'ter': ['tue', 'have', 'take'],\
    'ser': ['to be', 'be', 'being']
}

# Eval senses tagged

## Read translations

In [534]:
or_file_semcor = open('data/translation/semcor/en_pt/semcor_or_2.txt', 'r')
tr_file_semcor = open('data/translation/semcor/en_pt/semcor_tr_2.txt', 'r')
id_file_semcor = open('data/translation/semcor/en_pt/semcor_id_2.txt', 'r')
al_file_semcor = open('data/translation/semcor/en_pt/semcor_al_2.txt', 'r')

or_file_omsti = open('data/translation/omsti/en_pt/omsti_or.txt', 'r')
tr_file_omsti = open('data/translation/omsti/en_pt/omsti_tr.txt', 'r')
id_file_omsti = open('data/translation/omsti/en_pt/omsti_id.txt', 'r')
al_file_omsti = open('data/translation/omsti/en_pt/omsti_al.txt', 'r')

In [451]:
def parse_alignment(chars):
    alignment = []
    for e in re.split(r'[,|;]', chars):
        temp = e.split('-')
        range_a = temp[0].split(':')
        range_b = temp[1].split(':')
        alignment.append([np.arange(int(range_a[0]), int(range_a[0]) + int(range_a[1]) + 1),\
                         np.arange(int(range_b[0]), int(range_b[0]) + int(range_b[1]) + 1)])
        
    return alignment

In [452]:
def load_arrays(or_file, tr_file, id_file, al_file):
    tr_ar = []
    for sent in tr_file:
        tr_ar.append(sent.replace('\n', ''))
        
    or_ar = []
    for sent in or_file:
        or_ar.append(sent.replace('\n', ''))

    id_ar = []
    for sent in id_file:
        id_ar.append(int(sent.replace('\n', '')))

    al_ar = []
    for sent in al_file:
        al_ar.append(parse_alignment(sent.replace('\n', '')))
        
    return np.array(or_ar), np.array(tr_ar), np.array(id_ar), np.array(al_ar)

In [453]:
or_semcor, tr_semcor, id_semcor, al_semcor = load_arrays(or_file_semcor, tr_file_semcor, id_file_semcor, al_file_semcor)
or_omsti, tr_omsti, id_omsti, al_omsti = load_arrays(or_file_omsti, tr_file_omsti, id_file_omsti, al_file_omsti)

In [547]:
len(or_semcor), len(tr_semcor), len(id_semcor), len(al_semcor)

(37163, 37163, 37163, 37163)

In [548]:
len(or_omsti), len(tr_omsti), len(id_omsti), len(al_omsti)

(40260, 40260, 40260, 40260)

In [445]:
or_file_semcor.close()
tr_file_semcor.close()
id_file_semcor.close()
al_file_semcor.close()

or_file_omsti.close()
tr_file_omsti.close()
id_file_omsti.close()
al_file_omsti.close()

## Read corpus annotations

In [17]:
input_sent_semcor, sent_to_id_semcor, target_sent_semcor = read_sentences(Path.cwd() / 'data/WSD_Evaluation_Framework/Training_Corpora/', 'SemCor', False)
input_sent_temp, sent_to_id_temp, target_sent_temp = read_sentences(Path.cwd() / 'data/WSD_Evaluation_Framework/Training_Corpora/', 'SemCor+OMSTI', True)

# Filter sentences
input_sent_omsti = []
sent_to_id_omsti = []
target_sent_omsti = []
for ix, sent in enumerate(input_sent_temp):
    if sent not in input_sent_semcor:
        input_sent_omsti.append(sent)
        sent_to_id_omsti.append(sent_to_id_temp[ix])
        target_sent_omsti.append(target_sent_temp[ix])

## Check amount of valid data

In [150]:
def get_alignment(pos_word, al):
    temp = []
    for e in al:
        if pos_word in e[1]:
            temp.append(e)
    
    if len(temp):
        temp = sorted(temp, key= lambda x: len(x[1]))
        temp = temp[0]
        
    return temp

In [367]:
def count_words_with_verb(sent):
    temp_ix = []
    temp_cont = []
    cont_char = 0
    for ix, word in enumerate(sent.split()):
        if word in PT_VERBS:
            temp_cont.append(cont_char)
            temp_ix.append(ix)
        cont_char += len(word) + 1
    return temp_cont, temp_ix

In [368]:
def count_tagged_instance(id_arr, al_arr, tr_arr, input_sent_arr, sent_to_id_arr, target_sent_arr):
    cont = 0
    for ix_tr, ix_input_sent in enumerate(id_arr):
        
        input_sent = input_sent_arr[ix_input_sent]
        sent_to_id = sent_to_id_arr[ix_input_sent]
        target_sent = target_sent_arr[ix_input_sent]
        
        al = al_arr[ix_tr]
        tr = tr_arr[ix_tr]
        
        arr_cont_char, arr_pos_word = count_words_with_verb(tr)
        if len(arr_cont_char):
            for cont_char, pos_word in zip(arr_cont_char, arr_pos_word):
                alingment = get_alignment(cont_char, al)
                #print(alingment)
                t_or = min(alingment[0])
                d_or = max(alingment[0])
                t_tr = min(alingment[1])
                d_tr = max(alingment[1])

                ant_n_words = len(input_sent[:t_or].split())
                n_words = len(input_sent[t_or:d_or].split())
                for id_sense in sent_to_id[ant_n_words:(ant_n_words+n_words)]:
                    if target_sent[id_sense] != 'no_instance':
                        cont +=1
    
    return cont

In [216]:
count_tagged_instance(id_semcor, al_semcor, tr_semcor, input_sent_semcor, sent_to_id_semcor, target_sent_semcor)

7084

In [217]:
count_tagged_instance(id_omsti, al_omsti, tr_omsti, input_sent_omsti, sent_to_id_omsti, target_sent_omsti)

6611

In [214]:
len(id_semcor) + len(id_omsti)

76260

# Eval metrics

In [280]:
or_file_semcor = open('data/translation/semcor/pt_en/semcor_or_3.txt', 'r')
tr_file_semcor = open('data/translation/semcor/pt_en/semcor_tr_3.txt', 'r')
al_file_semcor = open('data/translation/semcor/pt_en/semcor_al_3.txt', 'r')

or_file_omsti = open('data/translation/omsti/pt_en/omsti_or_2.txt', 'r')
tr_file_omsti = open('data/translation/omsti/pt_en/omsti_tr_2.txt', 'r')
al_file_omsti = open('data/translation/omsti/pt_en/omsti_al_2.txt', 'r')

In [281]:
bck_or_semcor, bck_tr_semcor, _, bck_al_semcor = load_arrays(or_file_semcor, tr_file_semcor, [], al_file_semcor)
bck_or_omsti, bck_tr_omsti, _, bck_al_omsti = load_arrays(or_file_omsti, tr_file_omsti, [], al_file_omsti)

In [282]:
or_file_semcor.close()
tr_file_semcor.close()
al_file_semcor.close()

or_file_omsti.close()
tr_file_omsti.close()
al_file_omsti.close()

In [242]:
bck_tr_semcor[-100], or_semcor[-100]

('the next days may show where things are .',
 'the next days may show where things stand .')

In [243]:
bck_tr_omsti[-100], or_omsti[-100]

('whether as athletes or spectators , when we find ourselves in and participate in international sporting events , we share the brilliance of world-class competition , whether it be in the pain of losing, or the glory of winning , but more importantly , on a willingness of participation .',
 'whether as athletes or spectators , when we meet at and participate in international sporting events , we share in the glow of world-class competition , whether it be in the heartbreak of losing or in the glory of winning , but most importantly , in the goodwill of participation .')

## Eval Rouge

In [None]:
import rouge

In [1083]:
def get_rouge_per_sentence(input_sentences, trns_sentences):
    f1_scores = []
    for sent, trns in zip(input_sentences, trns_sentences):
        evaluator = rouge.Rouge(metrics=['rouge-n'],
                               max_n=2,
                               limit_length=True,
                               length_limit=100,
                               length_limit_type='words',
                               alpha=0.5, # Default F1_score
                               weight_factor=1.2,
                               stemming=True)

        all_hypothesis = [trns]
        all_references = [sent]

        scores = evaluator.get_scores(all_hypothesis, all_references)
        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if metric == 'rouge-2':
                f1_scores.append(results['f'])

    return np.array(f1_scores)

In [1084]:
rouge_omsti = get_rouge_per_sentence(or_omsti, bck_tr_omsti)
rouge_semcor = get_rouge_per_sentence(or_semcor, bck_tr_semcor)

## Eval Bleu

In [1085]:
import nltk
from nltk.translate.bleu_score import SmoothingFunction

def get_bleu_per_sentence(input_sentences, trns_sentences):
    bleu_scores = []
    smoothie = SmoothingFunction().method4
    for sent, trns in zip(input_sentences, trns_sentences):
        hypothesis = trns.split() 
        reference = sent.split()
        
        references = [reference]
        bleu = nltk.translate.bleu_score.sentence_bleu(references, hypothesis, smoothing_function=smoothie)
        
        bleu_scores.append(bleu)

    return np.array(bleu_scores)

In [1086]:
bleu_omsti = get_bleu_per_sentence(or_omsti, bck_tr_omsti)
bleu_semcor = get_bleu_per_sentence(or_semcor, bck_tr_semcor)

# Transform translations

In [272]:
def count_words(sent):
    temp_ix = []
    temp_cont = []
    cont_char = 0
    for ix, word in enumerate(sent.split()):
        temp_cont.append(cont_char)
        temp_ix.append(ix)
        
        cont_char += len(word) + 1
        
    return temp_cont, temp_ix

In [388]:
def check_word(tr_word, orig_words):
    if tr_word in PT_DICT_VERBS:
        orig_options = PT_DICT_VERBS[tr_word]
        for orig_word in orig_words:
            if orig_word in orig_options:
                return True
                
    return False

In [909]:
from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)


In [910]:
import spacy

nlp_pt = spacy.load('pt_core_news_sm')
nlp_pt.tokenizer = WhitespaceTokenizer(nlp_pt.vocab)

nlp_en = spacy.load('en_core_web_sm')
nlp_en.tokenizer = WhitespaceTokenizer(nlp_en.vocab)

In [912]:
def get_pos_tags(orig_sentences, trns_sentences): 
    tags_tr = []
    for sent in (trns_sentences):
        if len(sent):
            tag_tr = []
            temp = nlp_pt(str(sent).strip())
            for token in temp:
                tag_tr.append([token.text, token.pos_])
            tags_tr.append(tag_tr)
        else:
            tags_tr.append(['invalid', 'invalid'])
    
    tags_or = []
    for sent in (orig_sentences):
        if len(sent):
            tag_or = []
            temp = nlp_en(str(sent).strip())
            for token in temp:
                tag_or.append([token.text, token.pos_])
            tags_or.append(tag_or)
        else:
            tags_or.append(['invalid', 'invalid'])
        
    return tags_or, tags_tr

In [1009]:
def tagged_translations(selected_id_tr, id_arr, al_arr, tr_arr, input_sent_arr, sent_to_id_arr, target_sent_arr, pos_or, pos_tr):
    cont = 0
    tagged_tr = []
    
    for ix_tr, ix_input_sent in zip(selected_id_tr, id_arr.take(selected_id_tr)):
        input_sent = input_sent_arr[ix_input_sent]
        sent_to_id = sent_to_id_arr[ix_input_sent]
        target_sent = target_sent_arr[ix_input_sent]
        
        al = al_arr[ix_tr]
        tr = tr_arr[ix_tr]
        
        arr_cont_char, arr_pos_word = count_words(tr)
        if len(arr_cont_char):
            
            tags_tr = pos_tr[ix_tr]
            tags_or = pos_or[ix_input_sent]
            if verbose:
                print()
                print('sentence orig: ', input_sent)
                print('sentence translated: ', tr)
            temp = []
            for cont_char, pos_word in zip(arr_cont_char, arr_pos_word):
                alingment = get_alignment(cont_char, al)
                if len(alingment):
                    t_or = min(alingment[0])
                    d_or = max(alingment[0])
                    t_tr = min(alingment[1])
                    d_tr = max(alingment[1])

                    ant_n_words_or = len(input_sent[:t_or].split())
                    n_words_or = len(input_sent[t_or:d_or].split())
                    
                    ant_n_words_tr = len(tr[:t_tr].split())
                    n_words_tr = len(tr[t_tr:d_tr].split())
                    
                    input_sent_tokens = input_sent.split()[ant_n_words_or:(ant_n_words_or+n_words_or)]
                    trns_sent_tokens = tr.split()[ant_n_words_tr:(ant_n_words_tr+n_words_tr)]
                    
                    if verbose:
                        print()
                        print('translated word: ', tr.split()[pos_word])
                        print('senses: ', np.array(target_sent).take(sent_to_id[ant_n_words_or:(ant_n_words_or+n_words_or)]))
                        print('token input original: ', input_sent_tokens)
                        print('token translations: ', tr.split()[ant_n_words_tr:(ant_n_words_tr+n_words_tr)])
                        print('pos original: ', tags_tr[pos_word])
                        print('pos translation: ', tags_or[ant_n_words_or:(ant_n_words_or+n_words_or)])
                    if len(input_sent_tokens) == 1:   
                        
                        tag_or = tags_or[ant_n_words_or:(ant_n_words_or+n_words_or)][0]
                        tag_tr = tags_tr[pos_word]
                        id_sense = sent_to_id[ant_n_words_or:(ant_n_words_or+n_words_or)][0]
                        if verbose:
                            print('token 1: ', target_sent[id_sense])
                        if target_sent[id_sense] != 'no_instance' and tag_or[1] == tag_tr[1]: 
                            temp.append(target_sent[id_sense])
                        else:
                            temp.append('no_instance') 

                    elif len(input_sent_tokens) <= 4 and len(trns_sent_tokens) <= 5 and check_word(tr.split()[pos_word], input_sent[t_or:d_or].split()):
                        is_ambiguous = False

                        for id_sense, word in zip(sent_to_id[ant_n_words_or:(ant_n_words_or+n_words_or)], input_sent_tokens):
                            if target_sent[id_sense] != 'no_instance' and word in PT_DICT_VERBS[tr.split()[pos_word]]:
                                is_ambiguous = True
                                break
                        
                        if is_ambiguous:
                            if verbose:
                                print('token check_word: ', target_sent[id_sense])
                            temp.append(target_sent[id_sense])
                        else:
                            temp.append('no_instance')
                            
                    elif len(input_sent_tokens) <= 4 and len(trns_sent_tokens) <= 4:   
                        
                        is_ambiguous = False
                        tag_tr = tags_tr[pos_word]
                        
                        tag_cntr_or = Counter([tag[1] for tag in tags_or[ant_n_words_or:(ant_n_words_or+n_words_or)]])
                        tag_cntr_tr = Counter([tag[1] for tag in tags_tr[ant_n_words_tr:(ant_n_words_tr+n_words_tr)]])

                        for id_sense, tag_or in zip(sent_to_id[ant_n_words_or:(ant_n_words_or+n_words_or)], tags_or[ant_n_words_or:(ant_n_words_or+n_words_or)]):
                            if target_sent[id_sense] != 'no_instance' and tag_or[1] == tag_tr[1] and tag_cntr_tr[tag_tr[1]] == 1 and tag_cntr_or[tag_or[1]] == 1: 
                                is_ambiguous = True
                                temp_id = target_sent[id_sense]
                                if verbose:
                                    print('sentido encontrado: ', temp_id)
                                break
                                
                        if is_ambiguous:
                            temp.append(temp_id)
                        else:
                            temp.append('no_instance')

                    else:
                        temp.append('no_instance')
                else:
                    temp.append('no_instance')
            
            tagged_tr.append([temp, tr])
    
    return tagged_tr

In [1053]:
(target_sent_semcor[id_semcor[728]]), list(zip((input_sent_semcor[728].split()), sent_to_id_semcor[728]))

(['no_instance',
  'd006.s071.t000',
  'd006.s071.t001',
  'no_instance',
  'no_instance',
  'no_instance',
  'd006.s071.t002',
  'no_instance',
  'no_instance',
  'd006.s071.t003',
  'd006.s071.t004',
  'no_instance',
  'd006.s071.t005',
  'd006.s071.t006',
  'no_instance',
  'd006.s071.t007',
  'd006.s071.t008',
  'no_instance',
  'd006.s071.t009',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'd006.s071.t010',
  'd006.s071.t011',
  'd006.s071.t012',
  'no_instance',
  'no_instance',
  'd006.s071.t013',
  'no_instance',
  'd006.s071.t014',
  'no_instance',
  'd006.s071.t015',
  'no_instance',
  'd006.s071.t016',
  'no_instance',
  'd006.s071.t017',
  'no_instance'],
 [('on', 0),
  ('december', 1),
  ('9', 2),
  (',', 3),
  ('1862', 4),
  (',', 5),
  ('sergeant', 6),
  ('edwin', 6),
  ('h.', 6),
  ('fay', 6),
  (',', 7),
  ('an', 8),
  ('unusual', 9),
  ('louisianan', 10),
  ('who', 11),
  ('held', 12),
  ('a.', 13),
  ('b.', 13),
  ('and', 14),
  ('m.', 15),
 

In [1054]:
tagged_tr_semcor_r = tagged_translations(np.array([728]), id_semcor, al_semcor, tr_semcor, input_sent_semcor, sent_to_id_semcor, target_sent_semcor, pos_or, pos_tr)



sentence orig:  on december 9 , 1862 , sergeant edwin h. fay , an unusual louisianan who held a. b. and m. a. degrees from harvard university and who before the war was headmaster of a private school for boys in louisiana , wrote his wife :
sentence translated:  em 9 de dezembro de 1862 , o sargento edwin h. fay , uma invulgar louisianan, que realizou uma. b. e m. um. graus da universidade de harvard e que antes da guerra, era diretor de uma escola particular para meninos, em louisiana , escreveu a sua esposa :

translated word:  em
senses:  ['no_instance' 'd006.s071.t000' 'd006.s071.t001' 'no_instance']
token input original:  ['on', 'december', '9', ',']
token translations:  ['em', '9', 'de', 'dezembro', 'de']
pos original:  ['em', 'ADP']
pos translation:  [['on', 'ADP'], ['december', 'NOUN'], ['9', 'NUM'], [',', 'PUNCT']]

translated word:  9
senses:  ['no_instance' 'd006.s071.t000' 'd006.s071.t001' 'no_instance']
token input original:  ['on', 'december', '9', ',']
token translation

In [1103]:
def filter_index(rouge, id_arr, or_arr, bleu=[], p=''):
    
    if len(bleu):
        filtered_ids = np.where(((np.array(rouge) > (np.mean(rouge) - np.std(rouge))) & ((np.array(bleu) > (np.mean(bleu) - np.std(bleu))))))[0]
    else:
        filtered_ids = np.where(np.array(rouge) > (np.mean(rouge) - np.std(rouge)))[0]
    print(f'{p:15s}total ids: {len(id_arr)}\t|\t After filter: {len(filtered_ids)}')
    return filtered_ids

In [666]:
def get_verbs(sent):
    temp_cont = []
    for ix, word in enumerate(sent.split()):
        if word in PT_VERBS:
            temp_cont.append(ix)
    return temp_cont

In [497]:
def count_instances_with_verbs(tagged_tr):
    cont = []
    for ix, instance in enumerate(tagged_tr):
        arr_pos_word = get_verbs(instance[1])
        if len(arr_pos_word):
            for pos_word in arr_pos_word:
                if instance[0][pos_word] != 'no_instance':
                    cont.append(ix)
                    
    return len(cont)

## Filtering semcor

### Filter id's

In [1104]:
filtered_ids_semcor_r = filter_index(rouge_semcor, id_semcor, tr_semcor, p='rouge')
filtered_ids_semcor_b = filter_index(bleu_semcor, id_semcor, tr_semcor, p='bleu')
filtered_ids_semcor_r_b = filter_index(rouge_semcor, id_semcor, tr_semcor, bleu_semcor, p='bleu-rouge')

rouge          total ids: 37163	|	 After filter: 32488
bleu           total ids: 37163	|	 After filter: 31132
bleu-rouge     total ids: 37163	|	 After filter: 30699


### Make target

In [913]:
pos_or, pos_tr = get_pos_tags(input_sent_semcor, tr_semcor)

In [866]:
for ix, s in enumerate(tr_semcor):
    if len(s.split()) != len(pos_tr[ix]):
        print(False)
        break

In [867]:
for ix, s in enumerate(input_sent_semcor):
    if len(s.split()) != len(pos_or[ix]):
        print(False)
        break

In [917]:
np.save('pos_or.npy', pos_or)
np.save('pos_tr.npy', pos_tr)

In [1066]:
verbose=False

In [1067]:
print('tagging with rouge')
tagged_tr_semcor_r = tagged_translations(filtered_ids_semcor_r, id_semcor, al_semcor, tr_semcor, input_sent_semcor, sent_to_id_semcor, target_sent_semcor, pos_or, pos_tr)
print('tagging with bleu')
tagged_tr_semcor_b = tagged_translations(filtered_ids_semcor_b, id_semcor, al_semcor, tr_semcor, input_sent_semcor, sent_to_id_semcor, target_sent_semcor, pos_or, pos_tr)
print('tagging with rouge and bleu')
tagged_tr_semcor_r_b = tagged_translations(filtered_ids_semcor_r_b, id_semcor, al_semcor, tr_semcor, input_sent_semcor, sent_to_id_semcor, target_sent_semcor, pos_or, pos_tr)

tagging with rouge
tagging with bleu
tagging with rouge and bleu


In [1052]:
filtered_ids_semcor_r[649]

728

In [1028]:
k= []
for ix, sent in enumerate(tagged_tr_semcor_r):
    c = Counter([i for i in sent[0] if i != 'no_instance'])
    if len(c) > 0 and (c.most_common(1)[0][1] > 1) :
        k.append([ix, sent])


In [1030]:
filtered_ids_semcor_r[43]

45

In [1046]:
k[100]

[649,
 [['no_instance',
   'no_instance',
   'no_instance',
   'no_instance',
   'no_instance',
   'no_instance',
   'no_instance',
   'no_instance',
   'd006.s071.t002',
   'no_instance',
   'no_instance',
   'no_instance',
   'no_instance',
   'no_instance',
   'no_instance',
   'd006.s071.t004',
   'no_instance',
   'd006.s071.t005',
   'no_instance',
   'd006.s071.t006',
   'no_instance',
   'd006.s071.t007',
   'no_instance',
   'd006.s071.t008',
   'no_instance',
   'no_instance',
   'no_instance',
   'no_instance',
   'no_instance',
   'no_instance',
   'no_instance',
   'no_instance',
   'd006.s071.t010',
   'd006.s071.t011',
   'd006.s071.t012',
   'no_instance',
   'no_instance',
   'd006.s071.t013',
   'd006.s071.t013',
   'no_instance',
   'd006.s071.t014',
   'no_instance',
   'd006.s071.t015',
   'no_instance',
   'd006.s071.t016',
   'no_instance',
   'no_instance',
   'd006.s071.t017',
   'no_instance'],
  'em 9 de dezembro de 1862 , o sargento edwin h. fay , uma invulg

In [1018]:
len(c)

0

In [946]:
target_sent_semcor[1174]

['no_instance',
 'd010.s002.t000',
 'no_instance',
 'no_instance',
 'd010.s002.t001',
 'no_instance',
 'd010.s002.t002',
 'd010.s002.t003',
 'no_instance',
 'd010.s002.t004',
 'no_instance',
 'no_instance',
 'd010.s002.t005',
 'no_instance',
 'no_instance',
 'no_instance',
 'd010.s002.t006',
 'no_instance',
 'd010.s002.t007',
 'no_instance',
 'no_instance',
 'd010.s002.t008',
 'd010.s002.t009',
 'no_instance',
 'd010.s002.t010',
 'd010.s002.t011',
 'no_instance',
 'no_instance',
 'd010.s002.t012',
 'd010.s002.t013',
 'd010.s002.t014',
 'no_instance',
 'd010.s002.t015',
 'd010.s002.t016',
 'no_instance',
 'd010.s002.t017',
 'no_instance',
 'no_instance',
 'd010.s002.t018',
 'no_instance',
 'd010.s002.t019',
 'd010.s002.t020',
 'no_instance',
 'd010.s002.t021',
 'd010.s002.t022',
 'no_instance',
 'd010.s002.t023',
 'd010.s002.t024',
 'd010.s002.t025',
 'no_instance',
 'd010.s002.t026',
 'd010.s002.t027',
 'no_instance']

In [976]:
tagged_tr_semcor_r[10000]

[['no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'd106.s080.t004',
  'no_instance',
  'no_instance',
  'd106.s080.t006',
  'no_instance',
  'd106.s080.t008',
  'd106.s080.t007',
  'no_instance',
  'no_instance',
  'd106.s080.t009',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'd106.s080.t011',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'd106.s080.t013',
  'no_instance',
  'no_instance',
  'd106.s080.t014',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance'],
 'o restante ( incompleta ) componentes, todos têm um mesmo número de pontos comuns a qualquer argumento , e são definidos apenas em uma forma de sub-intervalo de [ 0 , t ] .']

### Checking target

In [1068]:
for instance in tagged_tr_semcor_r:
    if len(instance[0]) != len(instance[1].split()):
        print(False)

In [1069]:
# Rouge
count_instances_with_verbs(tagged_tr_semcor_r)

3054

In [1070]:
for instance in tagged_tr_semcor_b:
    if len(instance[0]) != len(instance[1].split()):
        print(False)

In [1071]:
# Bleu
count_instances_with_verbs(tagged_tr_semcor_b)

2968

In [1072]:
for instance in tagged_tr_semcor_r_b:
    if len(instance[0]) != len(instance[1].split()):
        print(False)

In [1073]:
# Rouge and Bleu
count_instances_with_verbs(tagged_tr_semcor_r_b)

2940

### Annotate target

In [1074]:
d_senses = {}
with open(Path.cwd() / 'data/WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.gold.key.txt', 'r') as f:
    for line in f.read().split('\n'):
        if len(line):
            temp = line.split()
            d_senses[temp[0]] = temp[1:]

In [1161]:
def save_tagged_sentence(dir_path, tagged_tr, name_file, d_senses, metric, mode='w', wordnet_senses={}):
    tagged_sense_sent = []
    cont = 0
    for instance in tagged_tr:
        temp = []
        for ix, word in enumerate(instance[1].split()):
            if instance[0][ix] != 'no_instance':
                if len(wordnet_senses) != 0:
                    if d_senses[instance[0][ix]][0] in wordnet_senses:
                        temp.append(wordnet_senses[d_senses[instance[0][ix]][0]])
                    else:
                        cont += 1
                        temp.append(word)
                else:
                    temp.append(d_senses[instance[0][ix]][0])
            else:
                temp.append(word)

        tagged_sense_sent.append(' '.join(temp))

    if mode == 'w':
        np.save(dir_path / f'{name_file}.npy', np.array(zip(tagged_sense_sent, metric)))
    
    with open(dir_path / f'{name_file}_out.txt', mode) as file:
        for sent in tagged_sense_sent:
            file.write(sent + '\n')
    file.close()
            
    with open(dir_path / f'{name_file}_in.txt', mode) as file:
        for sent in tagged_tr:
            file.write(sent[1] + '\n')
    file.close()        
    return cont

In [1153]:
wordnet_senses = {}
with open(Path.cwd() / 'data/wordnet/index.sense') as f:
    for line in f.read().split('\n'):
        if len(line) > 1:
            wordnet_senses[line.split()[0]] = line.split()[1]
f.close()

In [1160]:
print('Saving rouge')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/semcor', tagged_tr_semcor_r, 'semcor_rouge_map', d_senses, rouge_semcor.take(filtered_ids_semcor_r), wordnet_senses=wordnet_senses))
print('Saving bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/semcor', tagged_tr_semcor_b, 'semcor_bleu_map', d_senses, bleu_semcor.take(filtered_ids_semcor_r), wordnet_senses=wordnet_senses))
print('Saving rouge and bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/semcor', tagged_tr_semcor_r_b, 'semcor_rouge_bleu_map', d_senses, list(zip(rouge_semcor.take(filtered_ids_semcor_r), bleu_semcor.take(filtered_ids_semcor_r))), wordnet_senses=wordnet_senses))

Saving rouge
0
Saving bleu
0
Saving rouge and bleu
0


In [1063]:
tagged_sense_sent

['em 9 de dezembro de 1862 , o person%1:03:00:: edwin h. fay , uma invulgar louisianan%1:18:00:: que hold%2:40:00:: uma. ab%1:10:00:: e ma%1:10:00:: um. degree%1:10:00:: da universidade de harvard e que antes da war%1:26:00:: be%2:42:06:: headmaster%1:18:00:: de uma private_school%1:14:00:: private_school%1:14:00:: para boy%1:18:00:: em louisiana%1:15:00:: , write%2:32:00:: a sua wife%1:18:00:: :']

In [1064]:
tagged_sense_sent

['em 9 de dezembro de 1862 , o person%1:03:00:: edwin h. fay , uma invulgar louisianan%1:18:00:: que hold%2:40:00:: uma. ab%1:10:00:: e ma%1:10:00:: um. degree%1:10:00:: da universidade de harvard e que antes da war%1:26:00:: be%2:42:06:: headmaster%1:18:00:: de uma private_school%1:14:00:: private_school%1:14:00:: para boy%1:18:00:: em louisiana%1:15:00:: , write%2:32:00:: a sua wife%1:18:00:: :']

In [645]:
instance

['no_instance',
 'no_instance',
 'no_instance',
 'no_instance',
 'no_instance',
 'no_instance',
 'no_instance',
 'no_instance',
 'no_instance',
 'no_instance',
 'no_instance',
 'no_instance',
 'no_instance',
 'd000.s000.t005',
 'd000.s000.t005',
 'd000.s000.t006',
 'no_instance']

In [598]:
tagged_tr_semcor_r[0]

[['no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'no_instance',
  'd000.s000.t005',
  'd000.s000.t005',
  'd000.s000.t006',
  'no_instance'],
 'quanto tempo tem sido desde que reviu os objectivos do seu benefício e serviço de programa ?']

In [600]:
input_sent_semcor[0]

'how long has it been since you reviewed the objectives of your benefit and service program ?'

In [601]:
target_sent_semcor[0]

['no_instance',
 'd000.s000.t000',
 'no_instance',
 'no_instance',
 'd000.s000.t001',
 'no_instance',
 'no_instance',
 'd000.s000.t002',
 'no_instance',
 'd000.s000.t003',
 'no_instance',
 'no_instance',
 'd000.s000.t004',
 'no_instance',
 'd000.s000.t005',
 'd000.s000.t006',
 'no_instance']

In [642]:
d_senses

{'d000.s000.t000': ['long%3:00:02::'],
 'd000.s000.t001': ['be%2:42:03::'],
 'd000.s000.t002': ['review%2:31:00::'],
 'd000.s000.t003': ['objective%1:09:00::'],
 'd000.s000.t004': ['benefit%1:21:00::'],
 'd000.s000.t005': ['service%1:04:07::'],
 'd000.s000.t006': ['program%1:09:01::'],
 'd000.s001.t000': ['permit%2:41:00::'],
 'd000.s001.t001': ['become%2:42:01::'],
 'd000.s001.t002': ['giveaway%1:21:00::'],
 'd000.s001.t003': ['program%1:09:01::'],
 'd000.s001.t004': ['rather%4:02:02::'],
 'd000.s001.t005': ['have%2:42:00::'],
 'd000.s001.t006': ['goal%1:09:00::'],
 'd000.s001.t007': ['improved%3:00:00::'],
 'd000.s001.t008': ['employee%1:18:00::'],
 'd000.s001.t009': ['morale%1:26:00::'],
 'd000.s001.t010': ['consequently%4:02:00::'],
 'd000.s001.t011': ['increased%3:00:00::'],
 'd000.s001.t012': ['productivity%1:07:00::'],
 'd000.s002.t000': ['effort%1:04:00::'],
 'd000.s002.t001': ['make%2:41:00::'],
 'd000.s002.t002': ['assess%2:31:00::'],
 'd000.s002.t003': ['result%1:11:00::'],


In [627]:
f.close()

In [620]:
print(input_sent_semcor[0][75:82])
tr_semcor[0][71:81]

service


'serviço de'

In [606]:
tr_semcor[0]

'quanto tempo tem sido desde que reviu os objectivos do seu benefício e serviço de programa ?'

In [607]:
al_semcor[0]

[[array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
         34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
         51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
         68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
         85, 86, 87, 88, 89, 90, 91, 92]),
  array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
         34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
         51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
         68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
         85, 86, 87, 88, 89, 90, 91, 92])],
 [array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20]),
  array([ 0,  1,  2,  3,  4,  5,  6,  7

In [602]:
sent_to_id_semcor[0]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

## Filtering omsti

### Filter id's

In [1106]:
filtered_ids_omsti_r = filter_index(rouge_omsti, id_omsti, tr_omsti, p='rouge')
filtered_ids_omsti_b = filter_index(bleu_omsti, id_omsti, tr_omsti, p='bleu')
filtered_ids_omsti_r_b = filter_index(rouge_omsti, id_omsti, tr_omsti, bleu_omsti, p='bleu-rouge')

rouge          total ids: 40260	|	 After filter: 34132
bleu           total ids: 40260	|	 After filter: 33846
bleu-rouge     total ids: 40260	|	 After filter: 32502


### Make target

In [1108]:
pos_or_omsti, pos_tr_omsti = get_pos_tags(input_sent_omsti, tr_omsti)

In [1109]:
for ix, s in enumerate(tr_omsti):
    if len(s.split()) != len(pos_tr_omsti[ix]):
        print(False)
        break

In [1110]:
for ix, s in enumerate(input_sent_omsti):
    if len(s.split()) != len(pos_or_omsti[ix]):
        print(False)
        break

In [1111]:
np.save('pos_or_omsti.npy', pos_or_omsti)
np.save('pos_tr_omsti.npy', pos_tr_omsti)

In [1112]:
verbose=False

In [1113]:
print('tagging with rouge')
tagged_tr_omsti_r = tagged_translations(filtered_ids_omsti_r, id_omsti, al_omsti, tr_omsti, input_sent_omsti, sent_to_id_omsti, target_sent_omsti, pos_or_omsti, pos_tr_omsti, wordnet_senses=wordnet_senses)
print('tagging with bleu')
tagged_tr_omsti_b = tagged_translations(filtered_ids_omsti_b, id_omsti, al_omsti, tr_omsti, input_sent_omsti, sent_to_id_omsti, target_sent_omsti, pos_or_omsti, pos_tr_omsti, wordnet_senses=wordnet_senses)
print('tagging with rouge and bleu')
tagged_tr_omsti_r_b = tagged_translations(filtered_ids_omsti_r_b, id_omsti, al_omsti, tr_omsti, input_sent_omsti, sent_to_id_omsti, target_sent_omsti, pos_or_omsti, pos_tr_omsti, wordnet_senses=wordnet_senses)

tagging with rouge
tagging with bleu
tagging with rouge and bleu


In [1114]:
for instance in tagged_tr_omsti_r:
    if len(instance[0]) != len(instance[1].split()):
        print(False)

In [1115]:
# Rouge
count_instances_with_verbs(tagged_tr_omsti_r)

4268

In [1116]:
for instance in tagged_tr_omsti_b:
    if len(instance[0]) != len(instance[1].split()):
        print(False)

In [1117]:
# Bleu
count_instances_with_verbs(tagged_tr_omsti_b)

4238

In [1118]:
for instance in tagged_tr_omsti_r_b:
    if len(instance[0]) != len(instance[1].split()):
        print(False)

In [1119]:
# Bleu
count_instances_with_verbs(tagged_tr_omsti_r_b)

4063

### Annotate corpus

In [1121]:
d_senses_omsti = {}
with open(Path.cwd() / 'data/WSD_Evaluation_Framework/Training_Corpora/SemCor+OMSTI/semcor+omsti.gold.key.txt', 'r') as f:
    for line in f.read().split('\n'):
        if len(line):
            temp = line.split()
            d_senses_omsti[temp[0]] = temp[1:]

In [1162]:
print('Saving rouge')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/omsti', tagged_tr_omsti_r, 'omsti_rouge_map', d_senses_omsti, rouge_omsti.take(filtered_ids_omsti_r), wordnet_senses=wordnet_senses))
print('Saving bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/omsti', tagged_tr_omsti_b, 'omsti_bleu_map', d_senses_omsti, bleu_omsti.take(filtered_ids_omsti_r), wordnet_senses=wordnet_senses))
print('Saving rouge and bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/omsti', tagged_tr_omsti_r_b, 'omsti_rouge_bleu_map', d_senses_omsti, list(zip(rouge_omsti.take(filtered_ids_omsti_r), bleu_omsti.take(filtered_ids_omsti_r))), wordnet_senses=wordnet_senses))

Saving rouge
0
Saving bleu
0
Saving rouge and bleu
0


# Saving all

In [1163]:
print('Saving rouge')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_r, 'all_rouge_map', d_senses_omsti, rouge_omsti.take(filtered_ids_omsti_r), wordnet_senses=wordnet_senses))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_r, 'all_rouge_map', d_senses, rouge_semcor.take(filtered_ids_semcor_r), mode='a', wordnet_senses=wordnet_senses))
print('Saving bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_b, 'all_bleu_map', d_senses_omsti, bleu_omsti.take(filtered_ids_omsti_r), wordnet_senses=wordnet_senses))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_b, 'all_bleu_map', d_senses, bleu_semcor.take(filtered_ids_semcor_r), mode='a', wordnet_senses=wordnet_senses))
print('Saving rouge and bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_r_b, 'all_rouge_bleu_map', d_senses_omsti, list(zip(rouge_omsti.take(filtered_ids_omsti_r), bleu_omsti.take(filtered_ids_omsti_r))), wordnet_senses=wordnet_senses))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_r_b, 'all_rouge_bleu_map', d_senses, list(zip(rouge_semcor.take(filtered_ids_semcor_r), bleu_semcor.take(filtered_ids_semcor_r))), mode='a', wordnet_senses=wordnet_senses))

Saving rouge
0
0
Saving bleu
0
0
Saving rouge and bleu
0
0


In [1161]:
def save_tagged_sentence(dir_path, tagged_tr, name_file, d_senses, metric, mode='w', wordnet_senses={}):
    tagged_sense_sent = []
    cont = 0
    for instance in tagged_tr:
        temp = []
        for ix, word in enumerate(instance[1].split()):
            if instance[0][ix] != 'no_instance':
                if len(wordnet_senses) != 0:
                    if d_senses[instance[0][ix]][0] in wordnet_senses:
                        temp.append(wordnet_senses[d_senses[instance[0][ix]][0]])
                    else:
                        cont += 1
                        temp.append(word)
                else:
                    temp.append(d_senses[instance[0][ix]][0])
            else:
                temp.append(word)

        tagged_sense_sent.append(' '.join(temp))

    if mode == 'w':
        np.save(dir_path / f'{name_file}.npy', np.array(zip(tagged_sense_sent, metric)))
    
    with open(dir_path / f'{name_file}_out.txt', mode) as file:
        for sent in tagged_sense_sent:
            file.write(sent + '\n')
    file.close()
            
    with open(dir_path / f'{name_file}_in.txt', mode) as file:
        for sent in tagged_tr:
            file.write(sent[1] + '\n')
    file.close()        
    return cont

In [1163]:
print('Saving rouge')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_r, 'all_rouge_map', d_senses_omsti, rouge_omsti.take(filtered_ids_omsti_r), wordnet_senses=wordnet_senses))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_r, 'all_rouge_map', d_senses, rouge_semcor.take(filtered_ids_semcor_r), mode='a', wordnet_senses=wordnet_senses))
print('Saving bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_b, 'all_bleu_map', d_senses_omsti, bleu_omsti.take(filtered_ids_omsti_r), wordnet_senses=wordnet_senses))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_b, 'all_bleu_map', d_senses, bleu_semcor.take(filtered_ids_semcor_r), mode='a', wordnet_senses=wordnet_senses))
print('Saving rouge and bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_r_b, 'all_rouge_bleu_map', d_senses_omsti, list(zip(rouge_omsti.take(filtered_ids_omsti_r), bleu_omsti.take(filtered_ids_omsti_r))), wordnet_senses=wordnet_senses))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_r_b, 'all_rouge_bleu_map', d_senses, list(zip(rouge_semcor.take(filtered_ids_semcor_r), bleu_semcor.take(filtered_ids_semcor_r))), mode='a', wordnet_senses=wordnet_senses))

Saving rouge
0
0
Saving bleu
0
0
Saving rouge and bleu
0
0


In [1167]:
f_i = open(Path.cwd() / 'data/disambiguation/all/all_rouge_map_in.txt', 'r')
f_o = open(Path.cwd() / 'data/disambiguation/all/all_rouge_map_out.txt', 'r')
f = open(Path.cwd() / 'data/disambiguation/all/all_rouge_map.tsv', 'w')
for l_i, l_o in zip(f_i.read().split('\n'), f_o.read().split('\n')):
    f.write(l_i + '\t' + l_o + '\n')
f.close()
f_i.close()
f_o.close()

In [1168]:
f_i = open(Path.cwd() / 'data/disambiguation/all/all_bleu_map_in.txt', 'r')
f_o = open(Path.cwd() / 'data/disambiguation/all/all_bleu_map_out.txt', 'r')
f = open(Path.cwd() / 'data/disambiguation/all/all_bleu_map.tsv', 'w')
for l_i, l_o in zip(f_i.read().split('\n'), f_o.read().split('\n')):
    f.write(l_i + '\t' + l_o + '\n')
f.close()
f_i.close()
f_o.close()

In [1169]:
f_i = open(Path.cwd() / 'data/disambiguation/all/all_rouge_bleu_map_in.txt', 'r')
f_o = open(Path.cwd() / 'data/disambiguation/all/all_rouge_bleu_map_out.txt', 'r')
f = open(Path.cwd() / 'data/disambiguation/all/all_rouge_bleu_map.tsv', 'w')
for l_i, l_o in zip(f_i.read().split('\n'), f_o.read().split('\n')):
    f.write(l_i + '\t' + l_o + '\n')
f.close()
f_i.close()
f_o.close()

In [1150]:
    cont = []
    with open(Path.cwd() / 'data/disambiguation/semcor' / 'semcor_rouge_out.txt', 'r') as file:
        for sent in file.read().split('\n'):
            cont.append(sent)
    file.close()

In [1151]:
cont[0]

'quanto tempo tem sido desde que review%2:31:00:: os objectivos do seu benefit%1:21:00:: e service%1:04:07:: de program%1:09:01:: ?'

In [1147]:
    cont = 0
    with open(Path.cwd() / 'data/disambiguation/omsti' / 'omsti_rouge_in.txt', 'r') as file:
        for sent in file.read().split('\n'):
            cont+=1
    file.close()

In [1148]:
cont

34133

In [1146]:
cont

32489

In [1144]:
cont

66621