In [1]:
import numpy as np
from pathlib import Path
from collections import Counter
import os
import re
import nltk

os.chdir('../')

from src.data import read_sentences

In [77]:
PT_VERBS = ['tratar', 'estabelecer', 'marcar', 'vir', 'colocar',\
           'fechar', 'dar', 'cair', 'encontrar', 'registrar',\
           'levar', 'receber', 'apresentar', 'passar', 'deixar',\
           'chegar', 'ficar', 'fazer', 'ter', 'ser']

In [3]:
SELECTED_SYNSETS = np.load(Path.cwd() / 'data/disambiguation/selected_synsets.npy')

In [4]:
PT_DICT_VERBS ={
    'tratar': ['deal with', 'treat', 'process', 'deal', 'manage', 'do', 'attend', 'look after', 'cherrish', 'misuse', 'size'],\
    'estabelecer': ['establish', 'set', 'fix', 'lay down', 'make', 'settle', 'determine', 'presribe', 'impose', 'enter into', 'stipulate', 'organize', 'seat'],\
    'marcar': ['mark', 'brand', 'dial', 'book', 'stamp', 'show', 'read', 'define', 'trace', 'earmark', 'reserve', 'feature', 'signilize', 'scribe'],\
    'vir': ['come', 'arrive', 'come on', 'come up with'],\
    'colocar': ['put', 'lay', 'set', 'place', 'post', 'pose', 'stick', 'plant', 'dispose', 'collocate', 'posit', 'bestow', 'pitch', 'clap', 'ship', 'placatory'],\
    'fechar': ['close', 'shut', 'seal', 'turn off', 'pin down', 'fasten', 'occlude', 'shut in', 'box', 'impount', 'bar', 'berate', 'stop up', 'mure', 'stopple', 'rail', 'pen', 'inmure'],\
    'dar': ['to give', 'give', 'impart', 'provide', 'render', 'afford', 'yield', 'allow', 'hand', 'deal', 'administer', 'give in', 'gift', 'confer', 'inflict', 'handsel', 'accord'],\
    'cair': ['fall', 'go down', 'drop', 'sink', 'collapse', 'founder', 'topple', 'lapse', 'sleet', 'keel over', 'prey', 'prostrate', 'pelt', 'plump', 'flump'],\
    'encontrar': ['meet', 'find', 'detect', 'encounter', 'find out', 'discover', 'meet with', 'experience', 'get together', 'impinge', 'hunt up'],\
    'registrar': ['register', 'record', 'read', 'book', 'enroll', 'inscribe', 'enrol', 'list', 'write down', 'set down', 'trace', 'score', 'label', 'matriculate', 'prick down', 'calendar', 'signalize'],\
    'levar': ['take along', 'take', 'carry', 'convey', 'go', 'prompt', 'induce', 'hold', 'charge', 'ravish'],\
    'receber': ['to receive', 'receive', 'welcome', 'get', 'have', 'accept', 'collect', 'meet', 'entertain', 'do', 'reap', 'derive', 'salute'],\
    'apresentar': ['to present', 'present', 'introduce', 'show', 'exhibit', 'lodge', 'produce', 'put', 'bring forward', 'come up with', 'represent', 'bring up', 'render'],\
    'passar': ['pass', 'spend', 'hand', 'go', 'go by', 'elapse', 'slip away', 'come', 'transfer', 'give in'],\
    'deixar': ['leave', 'let', 'have', 'quit', 'let go', 'depart', 'go away', 'drop off', 'leave out'],\
    'chegar': ['to arrive', 'arrive', ' get in', 'come', 'achieve', 'land', 'get around', 'turn up'],\
    'ficar': ['stay', 'bide', 'be', 'remain', 'go', 'continue', 'keep', 'come'],\
    'fazer': ['do', 'perform', 'make', 'cause', 'create', 'produce', 'render', 'manufacture'],\
    'ter': ['tue', 'have', 'take'],\
    'ser': ['to be', 'be', 'being']
}

# Eval senses tagged

## Read translations

In [5]:
or_file_semcor = open('data/translation/semcor/en_pt/semcor_or_2.txt', 'r')
tr_file_semcor = open('data/translation/semcor/en_pt/semcor_tr_2.txt', 'r')
id_file_semcor = open('data/translation/semcor/en_pt/semcor_id_2.txt', 'r')
al_file_semcor = open('data/translation/semcor/en_pt/semcor_al_2.txt', 'r')

or_file_omsti = open('data/translation/omsti/en_pt/omsti_or.txt', 'r')
tr_file_omsti = open('data/translation/omsti/en_pt/omsti_tr.txt', 'r')
id_file_omsti = open('data/translation/omsti/en_pt/omsti_id.txt', 'r')
al_file_omsti = open('data/translation/omsti/en_pt/omsti_al.txt', 'r')

In [6]:
def parse_alignment(chars):
    alignment = []
    for e in re.split(r'[,|;]', chars):
        temp = e.split('-')
        range_a = temp[0].split(':')
        range_b = temp[1].split(':')
        alignment.append([np.arange(int(range_a[0]), int(range_a[0]) + int(range_a[1]) + 1),\
                         np.arange(int(range_b[0]), int(range_b[0]) + int(range_b[1]) + 1)])
        
    return alignment

In [7]:
def load_arrays(or_file, tr_file, id_file, al_file):
    tr_ar = []
    for sent in tr_file:
        tr_ar.append(sent.replace('\n', ''))
        
    or_ar = []
    for sent in or_file:
        or_ar.append(sent.replace('\n', ''))

    id_ar = []
    for sent in id_file:
        id_ar.append(int(sent.replace('\n', '')))

    al_ar = []
    for sent in al_file:
        al_ar.append(parse_alignment(sent.replace('\n', '')))
        
    return np.array(or_ar), np.array(tr_ar), np.array(id_ar), np.array(al_ar)

In [8]:
or_semcor, tr_semcor, id_semcor, al_semcor = load_arrays(or_file_semcor, tr_file_semcor, id_file_semcor, al_file_semcor)
or_omsti, tr_omsti, id_omsti, al_omsti = load_arrays(or_file_omsti, tr_file_omsti, id_file_omsti, al_file_omsti)

In [9]:
len(or_semcor), len(tr_semcor), len(id_semcor), len(al_semcor)

(37163, 37163, 37163, 37163)

In [10]:
len(or_omsti), len(tr_omsti), len(id_omsti), len(al_omsti)

(40260, 40260, 40260, 40260)

In [11]:
or_file_semcor.close()
tr_file_semcor.close()
id_file_semcor.close()
al_file_semcor.close()

or_file_omsti.close()
tr_file_omsti.close()
id_file_omsti.close()
al_file_omsti.close()

## Read corpus annotations

In [12]:
input_sent_semcor, sent_to_id_semcor, target_sent_semcor = read_sentences(Path.cwd() / 'data/WSD_Evaluation_Framework/Training_Corpora/', 'SemCor', False)
input_sent_temp, sent_to_id_temp, target_sent_temp = read_sentences(Path.cwd() / 'data/WSD_Evaluation_Framework/Training_Corpora/', 'SemCor+OMSTI', True)

# Filter sentences
input_sent_omsti = []
sent_to_id_omsti = []
target_sent_omsti = []
for ix, sent in enumerate(input_sent_temp):
    if sent not in input_sent_semcor:
        input_sent_omsti.append(sent)
        sent_to_id_omsti.append(sent_to_id_temp[ix])
        target_sent_omsti.append(target_sent_temp[ix])

## Check amount of valid data

In [13]:
def get_alignment(pos_word, al):
    temp = []
    for e in al:
        if pos_word in e[1]:
            temp.append(e)
    
    if len(temp):
        temp = sorted(temp, key= lambda x: len(x[1]))
        temp = temp[0]
        
    return temp

In [14]:
def count_words_with_verb(sent):
    temp_ix = []
    temp_cont = []
    cont_char = 0
    for ix, word in enumerate(sent.split()):
        if word in PT_VERBS:
            temp_cont.append(cont_char)
            temp_ix.append(ix)
        cont_char += len(word) + 1
    return temp_cont, temp_ix

In [15]:
def count_tagged_instance(id_arr, al_arr, tr_arr, input_sent_arr, sent_to_id_arr, target_sent_arr):
    cont = 0
    for ix_tr, ix_input_sent in enumerate(id_arr):
        
        input_sent = input_sent_arr[ix_input_sent]
        sent_to_id = sent_to_id_arr[ix_input_sent]
        target_sent = target_sent_arr[ix_input_sent]
        
        al = al_arr[ix_tr]
        tr = tr_arr[ix_tr]
        
        arr_cont_char, arr_pos_word = count_words_with_verb(tr)
        if len(arr_cont_char):
            for cont_char, pos_word in zip(arr_cont_char, arr_pos_word):
                alingment = get_alignment(cont_char, al)
                #print(alingment)
                t_or = min(alingment[0])
                d_or = max(alingment[0])
                t_tr = min(alingment[1])
                d_tr = max(alingment[1])

                ant_n_words = len(input_sent[:t_or].split())
                n_words = len(input_sent[t_or:d_or].split())
                for id_sense in sent_to_id[ant_n_words:(ant_n_words+n_words)]:
                    if target_sent[id_sense] != 'no_instance':
                        cont +=1
    
    return cont

In [16]:
count_tagged_instance(id_semcor, al_semcor, tr_semcor, input_sent_semcor, sent_to_id_semcor, target_sent_semcor)

7280

In [17]:
count_tagged_instance(id_omsti, al_omsti, tr_omsti, input_sent_omsti, sent_to_id_omsti, target_sent_omsti)

6611

In [18]:
len(id_semcor) + len(id_omsti)

77423

In [19]:
len(id_semcor) + len(id_omsti)

77423

# Eval metrics

In [20]:
or_file_semcor = open('data/translation/semcor/pt_en/semcor_or_2.txt', 'r')
tr_file_semcor = open('data/translation/semcor/pt_en/semcor_tr_2.txt', 'r')
al_file_semcor = open('data/translation/semcor/pt_en/semcor_al_2.txt', 'r')

or_file_omsti = open('data/translation/omsti/pt_en/omsti_or_2.txt', 'r')
tr_file_omsti = open('data/translation/omsti/pt_en/omsti_tr_2.txt', 'r')
al_file_omsti = open('data/translation/omsti/pt_en/omsti_al_2.txt', 'r')

In [21]:
bck_or_semcor, bck_tr_semcor, _, bck_al_semcor = load_arrays(or_file_semcor, tr_file_semcor, [], al_file_semcor)
bck_or_omsti, bck_tr_omsti, _, bck_al_omsti = load_arrays(or_file_omsti, tr_file_omsti, [], al_file_omsti)

In [22]:
or_file_semcor.close()
tr_file_semcor.close()
al_file_semcor.close()

or_file_omsti.close()
tr_file_omsti.close()
al_file_omsti.close()

In [23]:
bck_tr_semcor[-100], or_semcor[-100]

('the next days may show where things are .',
 'the next days may show where things stand .')

In [24]:
bck_tr_omsti[-100], or_omsti[-100]

('whether as athletes or spectators , when we find ourselves in and participate in international sporting events , we share the brilliance of world-class competition , whether it be in the pain of losing, or the glory of winning , but more importantly , on a willingness of participation .',
 'whether as athletes or spectators , when we meet at and participate in international sporting events , we share in the glow of world-class competition , whether it be in the heartbreak of losing or in the glory of winning , but most importantly , in the goodwill of participation .')

## Eval Rouge

In [25]:
import rouge

In [26]:
def get_rouge_per_sentence(input_sentences, trns_sentences):
    f1_scores = []
    for sent, trns in zip(input_sentences, trns_sentences):
        evaluator = rouge.Rouge(metrics=['rouge-n'],
                               max_n=2,
                               limit_length=True,
                               length_limit=100,
                               length_limit_type='words',
                               alpha=0.5, # Default F1_score
                               weight_factor=1.2,
                               stemming=True)

        all_hypothesis = [trns]
        all_references = [sent]

        scores = evaluator.get_scores(all_hypothesis, all_references)
        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if metric == 'rouge-2':
                f1_scores.append(results['f'])

    return np.array(f1_scores)

In [27]:
rouge_omsti = get_rouge_per_sentence(or_omsti, bck_tr_omsti)
rouge_semcor = get_rouge_per_sentence(or_semcor, bck_tr_semcor)

## Eval Bleu

In [28]:
import nltk
from nltk.translate.bleu_score import SmoothingFunction

def get_bleu_per_sentence(input_sentences, trns_sentences):
    bleu_scores = []
    smoothie = SmoothingFunction().method4
    for sent, trns in zip(input_sentences, trns_sentences):
        hypothesis = trns.split() 
        reference = sent.split()
        
        references = [reference]
        bleu = nltk.translate.bleu_score.sentence_bleu(references, hypothesis, smoothing_function=smoothie)
        
        bleu_scores.append(bleu)

    return np.array(bleu_scores)

In [29]:
bleu_omsti = get_bleu_per_sentence(or_omsti, bck_tr_omsti)
bleu_semcor = get_bleu_per_sentence(or_semcor, bck_tr_semcor)

## Eval f1

In [30]:
def get_bleu_per_sentence(bleu, rouge):
    
    f1 = []
    for b, r in zip(bleu, rouge):
        if b != 0 or r != 0:
            f1.append(2 * (b * r) / (b + r))
        else:
            f1.append(0)
        
    return np.array(f1)

In [31]:
f1_omsti = get_bleu_per_sentence(bleu_omsti, rouge_omsti)
f1_semcor = get_bleu_per_sentence(bleu_semcor, rouge_semcor)

## Eval similarity

In [32]:
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
import re, string, unicodedata

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words


def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words


def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words


def normalize_text(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = remove_stopwords(words)
    return words


In [33]:
import html
import scipy 

def calc_vector(text, model):
    item = html.unescape(text)
    gen_doc = [w for w in word_tokenize(item)]
    normalized = normalize_text(gen_doc)
    return model.infer_vector(normalized)

def similarity(text1, text2, model):
    vec1 = calc_vector(text1, model)
    vec2 = calc_vector(text2, model)    
    return 1-scipy.spatial.distance.cosine(vec1, vec2)

In [34]:
def get_sim_per_sentence(or_sents, bck_sents):
    
    sims = []
    for or_s, bck_s in zip(or_sents, bck_sents):
        sims.append(similarity(or_s, bck_s, model))
        
    return np.array(sims)

In [35]:
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load('/home/krivas/projects/disambiguation-dataset/data/vectors/enwiki_doc2vec')

In [36]:
sim_omsti = get_sim_per_sentence(or_omsti, bck_tr_omsti)
sim_semcor = get_sim_per_sentence(or_semcor, bck_tr_semcor)

# Transform translations

In [37]:
def count_words(sent):
    temp_ix = []
    temp_cont = []
    cont_char = 0
    for ix, word in enumerate(sent.split()):
        temp_cont.append(cont_char)
        temp_ix.append(ix)
        
        cont_char += len(word) + 1
        
    return temp_cont, temp_ix

In [38]:
def check_word(tr_word, orig_words):
    if tr_word in PT_DICT_VERBS:
        orig_options = PT_DICT_VERBS[tr_word]
        for orig_word in orig_words:
            if orig_word in orig_options:
                return True
                
    return False

In [39]:
from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)


In [40]:
import spacy

nlp_pt = spacy.load('pt_core_news_sm')
nlp_pt.tokenizer = WhitespaceTokenizer(nlp_pt.vocab)

nlp_en = spacy.load('en_core_web_sm')
nlp_en.tokenizer = WhitespaceTokenizer(nlp_en.vocab)

In [41]:
def get_pos_tags(orig_sentences, trns_sentences): 
    tags_tr = []
    for sent in (trns_sentences):
        if len(sent):
            tag_tr = []
            temp = nlp_pt(str(sent).strip())
            for token in temp:
                tag_tr.append([token.text, token.pos_])
            tags_tr.append(tag_tr)
        else:
            tags_tr.append(['invalid', 'invalid'])
    
    tags_or = []
    for sent in (orig_sentences):
        if len(sent):
            tag_or = []
            temp = nlp_en(str(sent).strip())
            for token in temp:
                tag_or.append([token.text, token.pos_])
            tags_or.append(tag_or)
        else:
            tags_or.append(['invalid', 'invalid'])
        
    return tags_or, tags_tr

In [92]:
def tagged_translations(selected_id_tr, id_arr, al_arr, tr_arr, input_sent_arr, sent_to_id_arr, target_sent_arr, pos_or, pos_tr):
    cont = 0
    tagged_tr = []
    
    for ix_tr, ix_input_sent in zip(selected_id_tr, id_arr.take(selected_id_tr)):
        input_sent = input_sent_arr[ix_input_sent]
        sent_to_id = sent_to_id_arr[ix_input_sent]
        target_sent = target_sent_arr[ix_input_sent]
        
        al = al_arr[ix_tr]
        tr = tr_arr[ix_tr]
        
        arr_cont_char, arr_pos_word = count_words(tr)
        if len(arr_cont_char):
            
            tags_tr = pos_tr[ix_tr]
            tags_or = pos_or[ix_input_sent]
            if verbose:
                print()
                print('sentence orig: ', input_sent)
                print('sentence translated: ', tr)
            temp = []
            for cont_char, pos_word in zip(arr_cont_char, arr_pos_word):
                alingment = get_alignment(cont_char, al)
                if len(alingment):
                    t_or = min(alingment[0])
                    d_or = max(alingment[0])
                    t_tr = min(alingment[1])
                    d_tr = max(alingment[1])

                    ant_n_words_or = len(input_sent[:t_or].split())
                    n_words_or = len(input_sent[t_or:d_or].split())
                    
                    ant_n_words_tr = len(tr[:t_tr].split())
                    n_words_tr = len(tr[t_tr:d_tr].split())
                    
                    input_sent_tokens = input_sent.split()[ant_n_words_or:(ant_n_words_or+n_words_or)]
                    trns_sent_tokens = tr.split()[ant_n_words_tr:(ant_n_words_tr+n_words_tr)]
                    
                    if verbose:
                        print()
                        print('translated word: ', tr.split()[pos_word])
                        print('senses: ', np.array(target_sent).take(sent_to_id[ant_n_words_or:(ant_n_words_or+n_words_or)]))
                        print('token input original: ', input_sent_tokens)
                        print('token translations: ', tr.split()[ant_n_words_tr:(ant_n_words_tr+n_words_tr)])
                        print('pos original: ', tags_tr[pos_word])
                        print('pos translation: ', tags_or[ant_n_words_or:(ant_n_words_or+n_words_or)])
                    if len(input_sent_tokens) == 1:   
                        
                        tag_or = tags_or[ant_n_words_or:(ant_n_words_or+n_words_or)][0]
                        tag_tr = tags_tr[pos_word]
                        id_sense = sent_to_id[ant_n_words_or:(ant_n_words_or+n_words_or)][0]
                        if verbose:
                            print('token 1: ', target_sent[id_sense])
                        if target_sent[id_sense] != 'no_instance' and tag_or[1] == tag_tr[1]: 
                            temp.append(target_sent[id_sense])
                        else:
                            temp.append('no_instance') 

                    elif len(input_sent_tokens) <= 4 and len(trns_sent_tokens) <= 5 and check_word(tr.split()[pos_word], input_sent[t_or:d_or].split()):
                        is_ambiguous = False

                        for id_sense, word in zip(sent_to_id[ant_n_words_or:(ant_n_words_or+n_words_or)], input_sent_tokens):
                            if target_sent[id_sense] != 'no_instance' and word in PT_DICT_VERBS[tr.split()[pos_word]]:
                                is_ambiguous = True
                                break
                        
                        if is_ambiguous:
                            if verbose:
                                print('token check_word: ', target_sent[id_sense])
                            temp.append(target_sent[id_sense])
                        else:
                            temp.append('no_instance')
                            
                    elif len(input_sent_tokens) <= 4 and len(trns_sent_tokens) <= 4:   
                        
                        is_ambiguous = False
                        tag_tr = tags_tr[pos_word]
                        
                        tag_cntr_or = Counter([tag[1] for tag in tags_or[ant_n_words_or:(ant_n_words_or+n_words_or)]])
                        tag_cntr_tr = Counter([tag[1] for tag in tags_tr[ant_n_words_tr:(ant_n_words_tr+n_words_tr)]])

                        for id_sense, tag_or in zip(sent_to_id[ant_n_words_or:(ant_n_words_or+n_words_or)], tags_or[ant_n_words_or:(ant_n_words_or+n_words_or)]):
                            if target_sent[id_sense] != 'no_instance' and tag_or[1] == tag_tr[1] and tag_cntr_tr[tag_tr[1]] == 1 and tag_cntr_or[tag_or[1]] == 1: 
                                is_ambiguous = True
                                temp_id = target_sent[id_sense]
                                if verbose:
                                    print('sentido encontrado: ', temp_id)
                                break
                                
                        if is_ambiguous:
                            temp.append(temp_id)
                        else:
                            temp.append('no_instance')

                    else:
                        temp.append('no_instance')
                else:
                    temp.append('no_instance')
            
            tagged_tr.append([temp, tr, input_sent])
    
    return tagged_tr

In [93]:
def get_verbs(sent):
    temp_cont = []
    for ix, word in enumerate(sent.split()):
        if word in PT_VERBS:
            temp_cont.append(ix)
    return temp_cont

In [94]:
def count_instances_with_verbs(tagged_tr):
    cont = []
    for ix, instance in enumerate(tagged_tr):
        arr_pos_word = get_verbs(instance[1])
        if len(arr_pos_word):
            for pos_word in arr_pos_word:
                if instance[0][pos_word] != 'no_instance':
                    cont.append(ix)
                    
    return len(cont)

In [95]:
def filter_index(metrics, sims, id_arr, p=''):
    
    filtered_ids = []
    threshold_metric = np.mean(metrics) - np.std(metrics)
    threshold_sim = np.mean(sims) - np.std(sims)

    for ix, (metric, sim) in enumerate(zip(metrics, sims)):
        if metric > threshold_metric:
            filtered_ids.append(ix)
        elif sim > threshold_sim:
            filtered_ids.append(ix)
                
    print(f'{p:15s}total ids: {len(id_arr)}\t|\t After filter: {len(filtered_ids)}')
    return filtered_ids

## Filtering semcor

### Filter id's

In [46]:
filtered_ids_semcor_r = filter_index(rouge_semcor, sim_semcor, id_semcor, p='rouge')
filtered_ids_semcor_b = filter_index(bleu_semcor, sim_semcor, id_semcor, p='bleu')
filtered_ids_semcor_r_b = filter_index(f1_semcor, sim_semcor, id_semcor, p='f1')

rouge          total ids: 37163	|	 After filter: 34774
bleu           total ids: 37163	|	 After filter: 34355
f1             total ids: 37163	|	 After filter: 34523


### Make target

In [47]:
# pos_or, pos_tr = get_pos_tags(input_sent_semcor, tr_semcor)
pos_or = np.load('data/serialize/pos_or_semcor.npy')
pos_tr = np.load('data/serialize/pos_tr_semcor.npy')

In [48]:
#np.save('pos_or.npy', pos_or)
#np.save('pos_tr.npy', pos_tr)

In [49]:
verbose=False

In [96]:
print('tagging with rouge')
tagged_tr_semcor_r = tagged_translations(filtered_ids_semcor_r, id_semcor, al_semcor, tr_semcor, input_sent_semcor, sent_to_id_semcor, target_sent_semcor, pos_or, pos_tr)
print('tagging with bleu')
tagged_tr_semcor_b = tagged_translations(filtered_ids_semcor_b, id_semcor, al_semcor, tr_semcor, input_sent_semcor, sent_to_id_semcor, target_sent_semcor, pos_or, pos_tr)
print('tagging with rouge and bleu')
tagged_tr_semcor_r_b = tagged_translations(filtered_ids_semcor_r_b, id_semcor, al_semcor, tr_semcor, input_sent_semcor, sent_to_id_semcor, target_sent_semcor, pos_or, pos_tr)

tagging with rouge
tagging with bleu
tagging with rouge and bleu


In [51]:
k= []
for ix, sent in enumerate(tagged_tr_semcor_r):
    c = Counter([i for i in sent[0] if i != 'no_instance'])
    if len(c) > 0 and (c.most_common(1)[0][1] > 1) :
        k.append([ix, sent])


### Checking target

In [56]:
for instance in tagged_tr_semcor_r:
    if len(instance[0]) != len(instance[1].split()):
        print(False)

In [110]:
# Rouge
count_instances_with_verbs(tagged_tr_semcor_r)

3206

In [111]:
for instance in tagged_tr_semcor_b:
    if len(instance[0]) != len(instance[1].split()):
        print(False)

In [47]:
# Bleu
count_instances_with_verbs(tagged_tr_semcor_b)

2968

In [112]:
# Bleu
count_instances_with_verbs(tagged_tr_semcor_b)

3188

In [113]:
for instance in tagged_tr_semcor_r_b:
    if len(instance[0]) != len(instance[1].split()):
        print(False)

In [49]:
# Rouge and Bleu
count_instances_with_verbs(tagged_tr_semcor_r_b)

2962

In [114]:
# Rouge and Bleu
count_instances_with_verbs(tagged_tr_semcor_r_b)

3195

In [115]:
len(tagged_tr_semcor_r_b)

34536

### Annotate target

In [52]:
d_senses = {}
with open(Path.cwd() / 'data/WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.gold.key.txt', 'r') as f:
    for line in f.read().split('\n'):
        if len(line):
            temp = line.split()
            d_senses[temp[0]] = temp[1:]

In [121]:
def save_tagged_sentence(dir_path, tagged_tr, name_file, d_senses, metric, mode='w', wordnet_senses={}, lemma=False, sel_verbs=True):
    tagged_sense_sent = []
    input_sense_sent = []
    orig_sent = []
    cont = 0
    a = 0
    for instance in tagged_tr:
        temp_o = []
        temp_i = []
        for ix, word in enumerate(instance[1].split()):
            if instance[0][ix] != 'no_instance':
                if len(wordnet_senses) != 0:
                    if sel_verbs and d_senses[instance[0][ix]][0] in wordnet_senses and wordnet_senses[d_senses[instance[0][ix]][0]] in SELECTED_SYNSETS:
                        temp_o.append(wordnet_senses[d_senses[instance[0][ix]][0]])
                        if lemma:
                            d = nlp_pt(word)
                            lemma = d[0].lemma_ 
                            if lemma == 'deixar-me':
                                lemma = 'deixar'
                            temp_i.append(lemma + '_tag')
                        else:
                            temp_i.append(word)
                        a += 1
                    elif not sel_verbs and d_senses[instance[0][ix]][0] in wordnet_senses:
                        temp_o.append(wordnet_senses[d_senses[instance[0][ix]][0]])
                        if lemma:
                            d = nlp_pt(word)
                            lemma = d[0].lemma_ 
                            if lemma == 'deixar-me':
                                lemma = 'deixar'
                            temp_i.append(lemma + '_tag')
                        else:
                            temp_i.append(word)
                        a += 1
                    else:
                        cont += 1
                        temp_o.append(word)
                        temp_i.append(word)
                else:
                    temp_o.append(d_senses[instance[0][ix]][0])
                    temp_i.append(word)
            else:
                temp_o.append(word)
                temp_i.append(word)

        input_sense_sent.append(' '.join(temp_i))
        tagged_sense_sent.append(' '.join(temp_o))
        orig_sent.append(instance[2])

    if mode == 'w':
        np.save(dir_path / f'{name_file}.npy', np.array(list(zip(tagged_sense_sent, metric))))
    else:
        temp = np.load(dir_path / f'{name_file}.npy')
        print(len(temp))
        np.save(dir_path / f'{name_file}.npy', np.concatenate((temp, np.array(list(zip(tagged_sense_sent, metric))))))
    
    with open(dir_path / f'{name_file}_out.txt', mode) as file:
        for sent in tagged_sense_sent:
            file.write(sent + '\n')
    file.close()
    
    with open(dir_path / f'{name_file}_in.txt', mode) as file:
        for sent in input_sense_sent:
            file.write(sent + '\n')
    file.close() 
    
    with open(dir_path / f'{name_file}_orig.txt', mode) as file:
        for sent in orig_sent:
            file.write(sent + '\n')
    file.close()     
    
    return cont, a

In [54]:
wordnet_senses = {}
with open(Path.cwd() / 'data/wordnet/index.sense') as f:
    for line in f.read().split('\n'):
        if len(line) > 1:
            wordnet_senses[line.split()[0]] = line.split()[1]
f.close()

In [138]:
print('Saving rouge')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/semcor', tagged_tr_semcor_r, 'semcor_rouge_map_lemma', d_senses, rouge_semcor.take(filtered_ids_semcor_r), wordnet_senses=wordnet_senses), lemma=True)
print('Saving bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/semcor', tagged_tr_semcor_b, 'semcor_bleu_map_lemma', d_senses, bleu_semcor.take(filtered_ids_semcor_r), wordnet_senses=wordnet_senses), lemma=True)
print('Saving rouge and bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/semcor', tagged_tr_semcor_r_b, 'semcor_rouge_bleu_map_lemma', d_senses, list(zip(rouge_semcor.take(filtered_ids_semcor_r), bleu_semcor.take(filtered_ids_semcor_r))), wordnet_senses=wordnet_senses), lemma=True)

Saving rouge
(0, 136807)
Saving bleu
(0, 136300)
Saving rouge and bleu
(0, 136554)


In [68]:
print('Saving rouge')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/semcor', tagged_tr_semcor_r, 'semcor_rouge_map_sel', d_senses, rouge_semcor.take(filtered_ids_semcor_r), wordnet_senses=wordnet_senses, lemma=False, sel_verbs=True))
print('Saving bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/semcor', tagged_tr_semcor_b, 'semcor_bleu_map_sel', d_senses, bleu_semcor.take(filtered_ids_semcor_r), wordnet_senses=wordnet_senses, lemma=False, sel_verbs=True))
print('Saving rouge and bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/semcor', tagged_tr_semcor_r_b, 'semcor_rouge_bleu_map_sel', d_senses, list(zip(rouge_semcor.take(filtered_ids_semcor_r), bleu_semcor.take(filtered_ids_semcor_r))), wordnet_senses=wordnet_senses, lemma=False, sel_verbs=True))

Saving rouge
(116859, 19941)
Saving bleu
(116421, 19772)
Saving rouge and bleu
(116654, 19854)


## Filtering omsti

### Filter id's

In [62]:
filtered_ids_omsti_r = filter_index(rouge_omsti, sim_omsti, id_omsti, p='rouge')
filtered_ids_omsti_b = filter_index(bleu_omsti, sim_omsti, id_omsti, p='bleu')
filtered_ids_omsti_r_b = filter_index(f1_omsti, sim_omsti, id_omsti, p='bleu-rouge')

rouge          total ids: 40260	|	 After filter: 37890
bleu           total ids: 40260	|	 After filter: 37823
bleu-rouge     total ids: 40260	|	 After filter: 37810


### Make target

In [63]:
#pos_or_omsti, pos_tr_omsti = get_pos_tags(input_sent_omsti, tr_omsti)
pos_or_omsti = np.load('data/serialize/pos_or_omsti.npy')
pos_tr_omsti = np.load('data/serialize/pos_tr_omsti.npy')

In [64]:
#np.save('pos_or_omsti.npy', pos_or_omsti)
#np.save('pos_tr_omsti.npy', pos_tr_omsti)

In [65]:
verbose=False

In [97]:
print('tagging with rouge')
tagged_tr_omsti_r = tagged_translations(filtered_ids_omsti_r, id_omsti, al_omsti, tr_omsti, input_sent_omsti, sent_to_id_omsti, target_sent_omsti, pos_or_omsti, pos_tr_omsti)
print('tagging with bleu')
tagged_tr_omsti_b = tagged_translations(filtered_ids_omsti_b, id_omsti, al_omsti, tr_omsti, input_sent_omsti, sent_to_id_omsti, target_sent_omsti, pos_or_omsti, pos_tr_omsti)
print('tagging with rouge and bleu')
tagged_tr_omsti_r_b = tagged_translations(filtered_ids_omsti_r_b, id_omsti, al_omsti, tr_omsti, input_sent_omsti, sent_to_id_omsti, target_sent_omsti, pos_or_omsti, pos_tr_omsti)


tagging with rouge
tagging with bleu
tagging with rouge and bleu


In [131]:
for instance in tagged_tr_omsti_r:
    if len(instance[0]) != len(instance[1].split()):
        print(False)

In [90]:
# Rouge
count_instances_with_verbs(tagged_tr_omsti_r)

4267

In [91]:
for instance in tagged_tr_omsti_b:
    if len(instance[0]) != len(instance[1].split()):
        print(False)

In [92]:
# Bleu
count_instances_with_verbs(tagged_tr_omsti_b)

4238

In [132]:
for instance in tagged_tr_omsti_r_b:
    if len(instance[0]) != len(instance[1].split()):
        print(False)

In [135]:
# Bleu
count_instances_with_verbs(tagged_tr_omsti_r_b)

4700

In [67]:
# Bleu
count_instances_with_verbs(tagged_tr_omsti_r_b)

4719

### Annotate corpus

In [69]:
d_senses_omsti = {}
with open(Path.cwd() / 'data/WSD_Evaluation_Framework/Training_Corpora/SemCor+OMSTI/semcor+omsti.gold.key.txt', 'r') as f:
    for line in f.read().split('\n'):
        if len(line):
            temp = line.split()
            d_senses_omsti[temp[0]] = temp[1:]

In [139]:
print('Saving rouge')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/omsti', tagged_tr_omsti_r, 'omsti_rouge_map_lemma', d_senses_omsti, rouge_omsti.take(filtered_ids_omsti_r), wordnet_senses=wordnet_senses, lemma=True, sel_verbs=True))
print('Saving bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/omsti', tagged_tr_omsti_b, 'omsti_bleu_map_lemma', d_senses_omsti, bleu_omsti.take(filtered_ids_omsti_r), wordnet_senses=wordnet_senses, lemma=True, sel_verbs=True))
print('Saving rouge and bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/omsti', tagged_tr_omsti_r_b, 'omsti_rouge_bleu_map_lemma', d_senses_omsti, list(zip(rouge_omsti.take(filtered_ids_omsti_r), bleu_omsti.take(filtered_ids_omsti_r))), wordnet_senses=wordnet_senses, lemma=True, sel_verbs=True))


Saving rouge
(0, 23472)
Saving bleu
(0, 23444)
Saving rouge and bleu
(0, 23423)


In [70]:
print('Saving rouge')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/omsti', tagged_tr_omsti_r, 'omsti_rouge_map_sel', d_senses_omsti, rouge_omsti.take(filtered_ids_omsti_r), wordnet_senses=wordnet_senses, lemma=False, sel_verbs=True))
print('Saving bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/omsti', tagged_tr_omsti_b, 'omsti_bleu_map_sel', d_senses_omsti, bleu_omsti.take(filtered_ids_omsti_r), wordnet_senses=wordnet_senses, lemma=False, sel_verbs=True))
print('Saving rouge and bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/omsti', tagged_tr_omsti_r_b, 'omsti_rouge_bleu_map_sel', d_senses_omsti, list(zip(rouge_omsti.take(filtered_ids_omsti_r), bleu_omsti.take(filtered_ids_omsti_r))), wordnet_senses=wordnet_senses, lemma=False, sel_verbs=True))


Saving rouge
(17060, 6438)
Saving bleu
(17022, 6438)
Saving rouge and bleu
(17008, 6438)


# Saving all

In [122]:
print('Saving rouge')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_r, 'all_rouge_map_sel', d_senses_omsti, rouge_omsti.take(filtered_ids_omsti_r), wordnet_senses=wordnet_senses, lemma=False, sel_verbs=True))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_r, 'all_rouge_map_sel', d_senses, rouge_semcor.take(filtered_ids_semcor_r), mode='a', wordnet_senses=wordnet_senses, lemma=False, sel_verbs=True))
print('Saving bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_b, 'all_bleu_map_sel', d_senses_omsti, bleu_omsti.take(filtered_ids_omsti_b), wordnet_senses=wordnet_senses, lemma=False, sel_verbs=True))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_b, 'all_bleu_map_sel', d_senses, bleu_semcor.take(filtered_ids_semcor_b), mode='a', wordnet_senses=wordnet_senses, lemma=False, sel_verbs=True))
print('Saving rouge and bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_r_b, 'all_rouge_bleu_map_sel', d_senses_omsti, f1_omsti.take(filtered_ids_omsti_r_b), wordnet_senses=wordnet_senses, lemma=False, sel_verbs=True))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_r_b, 'all_rouge_bleu_map_sel', d_senses, f1_semcor.take(filtered_ids_semcor_r_b), mode='a', wordnet_senses=wordnet_senses, lemma=False, sel_verbs=True))

Saving rouge
(17060, 6438)
37890
(116859, 19941)
Saving bleu
(17022, 6438)
37823
(116421, 19772)
Saving rouge and bleu
(17008, 6438)
37810
(116654, 19854)


In [123]:
print('Saving rouge')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_r, 'all_rouge_map_sel_lemma', d_senses_omsti, rouge_omsti.take(filtered_ids_omsti_r), wordnet_senses=wordnet_senses, lemma=True, sel_verbs=True))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_r, 'all_rouge_map_sel_lemma', d_senses, rouge_semcor.take(filtered_ids_semcor_r), mode='a', wordnet_senses=wordnet_senses, lemma=True, sel_verbs=True))
print('Saving bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_b, 'all_bleu_map_sel_lemma', d_senses_omsti, bleu_omsti.take(filtered_ids_omsti_b), wordnet_senses=wordnet_senses, lemma=True, sel_verbs=True))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_b, 'all_bleu_map_sel_lemma', d_senses, bleu_semcor.take(filtered_ids_semcor_b), mode='a', wordnet_senses=wordnet_senses, lemma=True, sel_verbs=True))
print('Saving rouge and bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_r_b, 'all_rouge_bleu_map_sel_lemma', d_senses_omsti, f1_omsti.take(filtered_ids_omsti_r_b), wordnet_senses=wordnet_senses, lemma=True, sel_verbs=True))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_r_b, 'all_rouge_bleu_map_sel_lemma', d_senses, f1_semcor.take(filtered_ids_semcor_r_b), mode='a', wordnet_senses=wordnet_senses, lemma=True, sel_verbs=True))

Saving rouge
(17060, 6438)
37890
(116859, 19941)
Saving bleu
(17022, 6438)
37823
(116421, 19772)
Saving rouge and bleu
(17008, 6438)
37810
(116654, 19854)


In [1161]:
def save_tagged_sentence(dir_path, tagged_tr, name_file, d_senses, metric, mode='w', wordnet_senses={}):
    tagged_sense_sent = []
    cont = 0
    for instance in tagged_tr:
        temp = []
        for ix, word in enumerate(instance[1].split()):
            if instance[0][ix] != 'no_instance':
                if len(wordnet_senses) != 0:
                    if d_senses[instance[0][ix]][0] in wordnet_senses:
                        temp.append(wordnet_senses[d_senses[instance[0][ix]][0]])
                    else:
                        cont += 1
                        temp.append(word)
                else:
                    temp.append(d_senses[instance[0][ix]][0])
            else:
                temp.append(word)

        tagged_sense_sent.append(' '.join(temp))

    if mode == 'w':
        np.save(dir_path / f'{name_file}.npy', np.array(zip(tagged_sense_sent, metric)))
    
    with open(dir_path / f'{name_file}_out.txt', mode) as file:
        for sent in tagged_sense_sent:
            file.write(sent + '\n')
    file.close()
            
    with open(dir_path / f'{name_file}_in.txt', mode) as file:
        for sent in tagged_tr:
            file.write(sent[1] + '\n')
    file.close()        
    return cont

In [1163]:
print('Saving rouge')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_r, 'all_rouge_map', d_senses_omsti, rouge_omsti.take(filtered_ids_omsti_r), wordnet_senses=wordnet_senses))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_r, 'all_rouge_map', d_senses, rouge_semcor.take(filtered_ids_semcor_r), mode='a', wordnet_senses=wordnet_senses))
print('Saving bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_b, 'all_bleu_map', d_senses_omsti, bleu_omsti.take(filtered_ids_omsti_b), wordnet_senses=wordnet_senses))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_b, 'all_bleu_map', d_senses, bleu_semcor.take(filtered_ids_semcor_b), mode='a', wordnet_senses=wordnet_senses))
print('Saving rouge and bleu')
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_omsti_r_b, 'all_rouge_bleu_map', d_senses_omsti, list(zip(rouge_omsti.take(filtered_ids_omsti_r_b), bleu_omsti.take(filtered_ids_omsti_r))), wordnet_senses=wordnet_senses))
print(save_tagged_sentence(Path.cwd() / 'data/disambiguation/all', tagged_tr_semcor_r_b, 'all_rouge_bleu_map', d_senses, list(zip(rouge_semcor.take(filtered_ids_semcor_r_b), bleu_semcor.take(filtered_ids_semcor_r))), mode='a', wordnet_senses=wordnet_senses))

Saving rouge
0
0
Saving bleu
0
0
Saving rouge and bleu
0
0


In [1167]:
f_i = open(Path.cwd() / 'data/disambiguation/all/all_rouge_map_in.txt', 'r')
f_o = open(Path.cwd() / 'data/disambiguation/all/all_rouge_map_out.txt', 'r')
f = open(Path.cwd() / 'data/disambiguation/all/all_rouge_map.tsv', 'w')
for l_i, l_o in zip(f_i.read().split('\n'), f_o.read().split('\n')):
    f.write(l_i + '\t' + l_o + '\n')
f.close()
f_i.close()
f_o.close()

In [1168]:
f_i = open(Path.cwd() / 'data/disambiguation/all/all_bleu_map_in.txt', 'r')
f_o = open(Path.cwd() / 'data/disambiguation/all/all_bleu_map_out.txt', 'r')
f = open(Path.cwd() / 'data/disambiguation/all/all_bleu_map.tsv', 'w')
for l_i, l_o in zip(f_i.read().split('\n'), f_o.read().split('\n')):
    f.write(l_i + '\t' + l_o + '\n')
f.close()
f_i.close()
f_o.close()

In [1169]:
f_i = open(Path.cwd() / 'data/disambiguation/all/all_rouge_bleu_map_in.txt', 'r')
f_o = open(Path.cwd() / 'data/disambiguation/all/all_rouge_bleu_map_out.txt', 'r')
f = open(Path.cwd() / 'data/disambiguation/all/all_rouge_bleu_map.tsv', 'w')
for l_i, l_o in zip(f_i.read().split('\n'), f_o.read().split('\n')):
    f.write(l_i + '\t' + l_o + '\n')
f.close()
f_i.close()
f_o.close()

In [1150]:
    cont = []
    with open(Path.cwd() / 'data/disambiguation/semcor' / 'semcor_rouge_out.txt', 'r') as file:
        for sent in file.read().split('\n'):
            cont.append(sent)
    file.close()

In [1151]:
cont[0]

'quanto tempo tem sido desde que review%2:31:00:: os objectivos do seu benefit%1:21:00:: e service%1:04:07:: de program%1:09:01:: ?'

In [1147]:
    cont = 0
    with open(Path.cwd() / 'data/disambiguation/omsti' / 'omsti_rouge_in.txt', 'r') as file:
        for sent in file.read().split('\n'):
            cont+=1
    file.close()

In [4]:
cont

NameError: name 'cont' is not defined

In [1146]:
cont

32489

In [1144]:
cont

66621