In [171]:
import os
from stanfordcorenlp import StanfordCoreNLP
from tqdm import tqdm
import corenlp

import re
import numpy as np

import nltk
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

import enchant

os.environ["CORENLP_HOME"] = '/home/krivas/projects/neural-wsd/new_experiments/data/lib/stanford-corenlp'

def join_words(sentences, word_dict):
    arr_sentences = []
    
    for sentence in sentences:
        tokens = sentence.split()
        ant = ''
        new_sentence = ''
        add_word = True
        
        for ix, token in enumerate(tokens):
            if(add_word):
                if(token == '-'  and ix > 0 and ix < (len(tokens) - 1)):
                    join_word = tokens[ix-1] + '-' + tokens[ix+1]
                    if word_dict.check(join_word):
                        ant = join_word + ' '
                        add_word = False

                if(add_word):
                    new_sentence += ant
                    ant = token + " "
            else:
                add_word = True

        new_sentence += ant
        arr_sentences.append(new_sentence)
    
    return arr_sentences

def remove_unnecesary_char(sentence):
    sentence = sentence.strip(' ')
    sentence = sentence.lstrip(')')
    
    return sentence

def find_index(sentence):
    words = sentence.split()
    
    lst_index = []
    for ix_word, word in enumerate(words):
        if '<head>' in word:
            lst_index.append(ix_word)

    return lst_index

def find_trim_sentence(sentence, iz_del, der_del):
    cont_iz = 0
    seqs = []
    seq = ''
    tokens = sentence.split()
    add = False
    
    for token in tokens:
        if iz_del in token:
            cont_iz += 1
        
        if cont_iz > 0:
            add = True
            seq += token + ' '
        
        if der_del in token and cont_iz:
            cont_iz -= 1
        
        if cont_iz == 0 and add:
            seq = seq.strip(' ')
            seqs.append(seq)
            seq = ''
            add = False
            
    if cont_iz:
        seqs.append(seq)
            
    return seqs

def remove_LRB_RRB(sentences, iz_del, der_del):
    arr_sentence = []
    
    for sentence in sentences:
        seqs = find_trim_sentence(sentence, iz_del, der_del)
        remove = True
        for seq in seqs:
            arr = find_index(seq)
            if len(arr):
                sentence = seq.strip(iz_del + der_del + ' ')
                remove = False
                break

        if remove:
            sentence = re.sub(r'\((.*?)\)', '', sentence, re.DOTALL)
        if ')' not in sentence.split()[-1]:
            sentence = re.sub(r'(.*?)\)', '', sentence, re.DOTALL)
        else:
            sentence = re.sub(r'\)', '', sentence, re.DOTALL) 

        if '(' not in sentence.split()[0]:
            sentence += ' .'
            sentence = re.sub(r'\((.*?)\.', '', sentence, re.DOTALL) 
        else:
            sentence = re.sub(r'\(', '', sentence, re.DOTALL)     
        arr_sentence.append(sentence)
    
    return arr_sentence

def process_instance(file, word_target, text, word_dict, sense_ids=None, tokenizer=None, verbose=False):
    pairs = []
    sentences = []
    
    if not sense_ids:
        sense_ids = re.findall(r'senseid=\"(.*?)\"', text, re.DOTALL)
        is_test = False
    else:
        is_test = True
        
    context = re.findall(r'<context>(.*?)</context>', text, re.DOTALL)
    word_ambiguos = re.findall(r'<head>(.*?)</head>', context[0], re.DOTALL)
    
    sentences = re.split(r'[\.|:|?|!]', context[0])    
    for sentence in sentences:
        if '<head>' in sentence:
            sentence = remove_unnecesary_char(sentence)     
            prunes = remove_LRB_RRB([sentence], '(', ')')
            
            if(verbose):
                print('---oracion')
                print(sentence)
                print('---oracion sin parentesis')
                print(prunes[0])
                print('\n')
                
            for prune in prunes:
                prune = join_words([prune], word_dict)[0] + ' .'
                prune = re.sub('<head>', ' <head>', prune)
                prune = re.sub('</head>', '</head> ', prune)
                ann = tokenizer.annotate(prune)
                prune = ' '.join([w.word for w in ann.sentence[0].token])
                index_word = find_index(prune) 

                for sense_id in sense_ids:   
                    pair = [[],[],[],[]]
                    sense_id = re.sub(r'%|:', '', sense_id)
                    pair[0] = ' '.join(re.sub(r'<head>(.*?)</head>', ' ' + word_ambiguos[0] + ' ', prune).split())
                    pair[1] = ' '.join(re.sub(r'<head>(.*?)</head>', ' ' + word_target + '_' + sense_id + ' ', prune).split())
                    pair[2] = word_target + '_' + sense_id
                    pair[3] = index_word
                    pairs.append(pair)
                    
                    # Saving in a file
                    if not is_test:
                        file.write(pair[0] + '\t' + pair[1] + '\n')
                if is_test:
                    index_word = str(index_word[0] if len(index_word) else index_word)
                    file.write(pair[0] + '\t' + index_word)
                    for pair in pairs:
                        file.write('\t' + pair[2])
                    file.write('\n')
        
    return pairs

def load_senses(path_senses, path_test):
    
    targets_all = []
    with open(path_test, 'r', encoding='iso-8859-1') as f:
        xml = f.read()  
    
    instances = re.findall(r'<instance(.*?)</instance>', xml, re.DOTALL)
    for ix_ins, instance in enumerate(instances):
        data = '<instance' + instance + '</instance>'
        senses_ids = re.findall(r'<head>', data, re.DOTALL)
        targets_all.append(len(senses_ids))
    
    senses_all = []
    with open(path_senses, 'r') as f:
        lines = f.read().split('\n')
        for line in lines:
            senses = []
            words = line.split()
            for ix, word in enumerate(words):
                if ix > 1:
                    word = re.sub(r'%|:', '', word)
                    senses.append(word)
                    
            senses_all.append(senses)
    
    return senses_all, targets_all

def replace_characters(instance):
    
    data = '<instance' + instance + '</instance>'
    data = re.sub(r'[^\x20-\x7E]', '', data)
    data = re.sub(r' n\'t', 'n\'t', data)
    data = re.sub(r'wou \'d', 'uld', data)

    data = re.sub(r' \'re', ' are', data)
    data = re.sub(r' \'ve', ' have', data)

    data = re.sub(r'it \'s', 'it is', data)
    data = re.sub(r'he \'s', 'he is', data)
    data = re.sub(r'i \'m', 'i am', data)
    data = re.sub(r'It \'s', 'it is', data)
    data = re.sub(r'He \'s', 'he is', data)
    data = re.sub(r'I \'m', 'i am', data)

    data = re.sub(r'\[(.*?)\]', '', data)
    data = re.sub(r'&(.*?);', '', data)

    data = re.sub(r' \'d', 'd', data)
    data = re.sub(r'&', '', data)
    
    return data

def construct_pairs(path_source, path_model, test_path = None, tokenizer = None, verbose=True, name_file='input.raw'):
    
    word_dict = enchant.Dict('en_US')
    
    with open(path_source, 'r', encoding='iso-8859-1') as f:
        xml = f.read()

    if test_path:
        senses_all, _ = load_senses(test_path, path_source)    
    
    lexelts = re.findall(r'<lexelt(.*?)</lexelt>', xml, re.DOTALL)
    pairs= []
    ix_ins = 0
    with open(os.path.dirname(path_source) + f'/{name_file}', 'w') as file:
        for ix_lex, lexelt in enumerate(lexelts):
            item = re.findall(r'item=\"(.*?)\"', lexelt, re.DOTALL)
            word_target = item[0].split('.')[0]

            instances = re.findall(r'<instance(.*?)</instance>', lexelt, re.DOTALL)
            for instance in instances:
                data = replace_characters(instance)
                
                context = re.findall(r'<context>(.*?)</context>', data, re.DOTALL)

                if not test_path:
                    pairs.extend(process_instance(file, word_target, data, word_dict, None, tokenizer, verbose))
                else:
                    pairs.extend(process_instance(file, word_target, data, word_dict, senses_all[ix_ins], tokenizer, verbose))
                ix_ins += 1
    return pairs

def make_dirs(dirs):
    for d in dirs:
        if not os.path.exists(d):
            os.makedirs(d)
            
def dependency_parse(filepath,  client, tokenize=True):
    print('\nDependency parsing ' + filepath)
    dirpath = os.path.dirname(filepath)
    filepre = os.path.splitext(os.path.basename(filepath))[0]
    parentpath = os.path.join(dirpath, filepre + '.parents')
    deps = []
    with open(filepath) as file:
        for line in tqdm(file, total=file.tell()):
            temp = client.dependency_parse(line)
            temp = list(map(lambda x: [int(x[1]), int(x[2])], temp))
            temp = list(itertools.chain(*temp))
            deps.append(temp)
    np.save(parentpath, np.array(deps))

def build_vocab(filepaths, dst_path, lowercase=True):
    vocab = set()
    for filepath in filepaths:
        with open(filepath) as f:
            for line in f:
                if lowercase:
                    line = line.lower()
                vocab |= set(line.split())
    with open(dst_path, 'w') as f:
        for w in sorted(vocab):
            f.write(w + '\n')

def split(filepath, dst_dir, client):
    with open(filepath) as datafile, \
            open(os.path.join(dst_dir, 'in.txt'), 'w') as afile, \
            open(os.path.join(dst_dir, 'out.txt'), 'w') as bfile:
        datafile.readline()
        for line in datafile:
            a, b = line.strip().split('\t')

            ann = client.annotate(a)
            s = ' '.join([w.word for w in ann.sentence[0].token])
            afile.write(s + '\n')
                
            ann = client.annotate(b)
            s = ' '.join([w.word for w in ann.sentence[0].token])
            bfile.write(s + '\n')

def parse(dirpath, client):
    dependency_parse(os.path.join(dirpath, 'in.txt'), client, tokenize=True)
    #dependency_parse(os.path.join(dirpath, 'out.txt'), client, cp=cp, tokenize=True)


In [172]:

if __name__ == '__main__':
    print('=' * 80)
    print('Preprocessing dataset')
    print('=' * 80)
    base_dir = ''
    data_dir = os.path.join(base_dir, 'data')
    lib_dir = os.path.join(data_dir, 'lib')
    client_tree = StanfordCoreNLP(os.path.join(lib_dir, 'stanford-corenlp'))
    client_tok = corenlp.CoreNLPClient(annotators="tokenize ssplit".split())
    
    #Processing corpus train
    print('Construction train data')
    construct_pairs('data/disambiguation/sense-eval3/train/EnglishLS.train', '/home/krivas/projects/wsd-v2/data/lib/',\
                              test_path=None, tokenizer=client_tok, verbose=False, name_file='train.raw')
    construct_pairs('data/disambiguation/sense-eval2/train/eng-lex-sample.training.xml', '/home/krivas/projects/wsd-v2/data/lib/',\
                              test_path=None, tokenizer=client_tok, verbose=False, name_file='train.raw')
    #Processing corpus test
    print('Construction test data')
    construct_pairs('data/disambiguation/sense-eval3/test/EnglishLS.test', '/home/krivas/projects/wsd-v2/data/lib/',\
                             test_path='data/disambiguation/sense-eval3/test/EnglishLS.test.key', tokenizer=client_tok, verbose=False, name_file='test.raw')
    construct_pairs('data/disambiguation/sense-eval2/test/eng-lex-samp.evaluation.xml', '/home/krivas/projects/wsd-v2/data/lib/',\
                             test_path='data/disambiguation/sense-eval2/test/key.txt', tokenizer=client_tok, verbose=False, name_file='test.raw')
    
    senses_dir=['sense-eval2', 'sense-eval3']
    for sense_dir in senses_dir:
        train_dir = os.path.join(data_dir, f'disambiguation/{sense_dir}/train')
        test_dir = os.path.join(data_dir, f'disambiguation/{sense_dir}/test')

        # split into separate files
        print('Splitting data')
        split(os.path.join(train_dir, 'train.raw'), train_dir, client_tok)
        #split(os.path.join(test_dir, 'test.raw'), test_dir, client_tok)

        # parse sentences
        print('Parsing data')
        parse(train_dir, client_tree)

Preprocessing dataset
Construction train data
Construction test data
Splitting data


0it [00:00, ?it/s]

Parsing data

Dependency parsing data/disambiguation/sense-eval2/train/in.txt





NameError: name 'itertools' is not defined