In [3]:
import os
import numpy as np
import glob
from stanfordcorenlp import StanfordCoreNLP
from tqdm import tqdm
import itertools
import corenlp

# Point CORENLP_HOME at the unzipped Stanford CoreNLP distribution.
# NOTE(review): this is an absolute, machine-specific path; adjust it for your environment.
os.environ["CORENLP_HOME"] = '/home/krivas/projects/neural-wsd/new_experiments/data/lib/stanford-corenlp'

In [None]:
def make_dirs(dirs):
    """Create every directory listed in ``dirs`` (including missing parents).

    Args:
        dirs: iterable of directory paths.

    Directories that already exist are left untouched.

    Raises:
        FileExistsError: if a path exists but is not a directory.
    """
    for d in dirs:
        # exist_ok avoids the race between an os.path.exists() check and makedirs().
        os.makedirs(d, exist_ok=True)

def dependency_parse(filepath,  client, cp='', tokenize=True):
    """Dependency-parse each line of ``filepath`` and save the result next to it.

    Every line is handed to ``client.dependency_parse``. From each returned
    triple the second and third items are converted to ``int`` and all of them
    are flattened into one list per line. The per-line lists are stored with
    ``np.save`` under ``<dir>/<basename>.parents`` (``np.save`` appends
    ``.npy`` to that name).

    Args:
        filepath: text file holding one sentence per line.
        client: object exposing ``dependency_parse(text)`` that returns an
            iterable of triples.
        cp: unused; kept so existing callers keep working.
        tokenize: unused; kept so existing callers keep working.

    The saved array is always a 1-D object array with one Python list per
    input line, so it must be loaded with ``allow_pickle=True``.
    """
    print('\nDependency parsing ' + filepath)
    dirpath = os.path.dirname(filepath)
    filepre = os.path.splitext(os.path.basename(filepath))[0]
    parentpath = os.path.join(dirpath, filepre + '.parents')
    deps = []
    with open(filepath) as file:
        # The line count is not known up front, so tqdm runs without a total
        # (the previous total=file.tell() was always 0 on a freshly opened file).
        for line in tqdm(file):
            arcs = client.dependency_parse(line)
            deps.append([int(value) for arc in arcs for value in (arc[1], arc[2])])

    # Sentences differ in length, so the rows are ragged. np.array() on ragged
    # nested lists raises on current NumPy; fill an object array explicitly.
    parents = np.empty(len(deps), dtype=object)
    for row_ix, row in enumerate(deps):
        parents[row_ix] = row
    np.save(parentpath, parents)

def split(filepath, dst_dir, client):
    """Split a tab-separated parallel file into tokenized ``a.txt`` / ``b.txt``.

    The first line of ``filepath`` is discarded. Every following non-blank
    line must contain exactly two tab-separated fields; each field is
    tokenized through ``client.annotate`` and written, tokens joined by single
    spaces, as one line of ``a.txt`` (first field) or ``b.txt`` (second field)
    inside ``dst_dir``.

    Args:
        filepath: path of the tab-separated source file.
        dst_dir: existing directory that receives ``a.txt`` and ``b.txt``.
        client: object whose ``annotate(text)`` result exposes
            ``.sentence[*].token[*].word``.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    def _tokenized(text):
        # Join the tokens of every detected sentence so nothing after the
        # first sentence boundary is dropped.
        ann = client.annotate(text)
        return ' '.join(tok.word for sent in ann.sentence for tok in sent.token)

    with open(filepath) as datafile, \
            open(os.path.join(dst_dir, 'a.txt'), 'w') as afile, \
            open(os.path.join(dst_dir, 'b.txt'), 'w') as bfile:
        datafile.readline()  # the first line is skipped, as in the original script
        for line in tqdm(datafile):
            line = line.strip()
            if not line:
                # Blank lines carry no pair; skipping both sides keeps a/b aligned.
                continue
            a, b = line.split('\t')

            # Write the tokenized text. It used to be computed and then thrown
            # away while the raw field was written instead.
            afile.write(_tokenized(a) + '\n')
            bfile.write(_tokenized(b) + '\n')

def parse(dirpath, client, cp=''):
    """Dependency-parse ``a.txt`` and then ``b.txt`` located in ``dirpath``.

    Args:
        dirpath: directory containing both files.
        client: parser client forwarded to ``dependency_parse``.
        cp: forwarded unchanged to ``dependency_parse``.
    """
    for filename in ('a.txt', 'b.txt'):
        target = os.path.join(dirpath, filename)
        dependency_parse(target, client, cp=cp, tokenize=True)

if __name__ == '__main__':
    # Driver: split the parallel corpus into a.txt / b.txt, then dependency-parse both.
    print('=' * 80)
    print('Preprocessing dataset')
    print('=' * 80)

    base_dir = ''
    data_dir = os.path.join(base_dir, 'data')
    all_dir = os.path.join(data_dir, 'translation/all_data')
    lib_dir = os.path.join(base_dir, 'lib')
    train_dir = os.path.join(data_dir, 'translation/train')
    #dev_dir = os.path.join(data_dir, 'translation/dev')
    #test_dir = os.path.join(data_dir, 'translation/test')
    make_dirs([train_dir])

    # java classpath for calling Stanford parser
    # (only forwarded as `cp`, which dependency_parse currently ignores)
    classpath = ':'.join([
        lib_dir,
        os.path.join(lib_dir, 'stanford-parser/stanford-parser.jar'),
        os.path.join(lib_dir, 'stanford-parser/stanford-parser-3.5.1-models.jar')])

    # split into separate files
    # The context manager releases the CoreNLP client once splitting is done
    # instead of leaving it open for the rest of the session.
    with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client:
        print('create client')
        split(os.path.join(all_dir, 'en-spa.txt'), train_dir, client)
        #split(os.path.join(all_dir, 'SICK_trial.txt'), dev_dir)
        #split(os.path.join(all_dir, 'SICK_test_annotated.txt'), test_dir)

    # parse sentences
    client = StanfordCoreNLP(r'data/lib/stanford-corenlp')
    try:
        parse(train_dir, client, cp=classpath)
    finally:
        # Always shut the wrapper down, even if parsing fails midway.
        client.close()

In [65]:
import unicodedata
import string
import re
import random
import time
import math
import os
import sys
import pandas as pd
import numpy as np

import nltk

import torch
import torch.nn as nn
from torch.nn import functional
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

from stanfordcorenlp import StanfordCoreNLP
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

import enchant

import torchtext 
from torchtext import data
from torchtext import datasets

def find_words_cutoff(sentences, ner):
    """Collect, per sentence, the candidate cut-off words joined by ``|``.

    Each sentence is split on whitespace and tagged with ``ner.tag``. A tagged
    token is kept when its text is non-empty, its tag is ``'O'`` and its first
    character is ``S``, ``H``, ``I`` or ``T`` (aimed at words such as She, He,
    It, That, To; only the first letter is actually checked).

    Args:
        sentences: iterable of sentence strings.
        ner: tagger exposing ``tag(tokens)`` returning ``(token, tag)`` pairs.

    Returns:
        A list with one string per sentence of the form ``word1|word2|word3``
        (empty string when nothing qualifies).
    """
    initials = ('S', 'H', 'I', 'T')
    words_cutoff = []

    for sentence in sentences:
        tagged = ner.tag(sentence.split())
        picked = [
            tag[0]
            for tag in tagged
            if tag[0] != '' and tag[1] == 'O' and tag[0][0] in initials
        ]
        words_cutoff.append("|".join(picked))

    return words_cutoff

def is_target_tree(tree, word_target):
    """Return True when ``word_target`` is the token of any pair in ``tree.pos()``."""
    return any(entry[0] == word_target for entry in tree.pos())

def prune_subtree(tree, word_target):
    """Delete nested ``S`` subtrees that do not contain ``word_target``.

    All subtrees labelled ``"S"`` are collected first and then visited in
    reverse order. A subtree is removed from ``tree`` unless it sits at the
    root position or ``is_target_tree`` finds ``word_target`` in it.

    Args:
        tree: tree object supporting ``subtrees``, ``treeposition`` and
            deletion by tree position; it is modified in place.
        word_target: token that must be preserved.

    Returns:
        The same ``tree`` object after pruning.
    """
    clause_nodes = list(tree.subtrees(filter=lambda node: node.label() == "S"))

    # Walk backwards so positions of not-yet-visited nodes stay valid after a deletion.
    for node in clause_nodes[::-1]:
        where = node.treeposition()
        if len(where) == 0 or is_target_tree(node, word_target):
            continue
        del tree[where]

    return tree

def find_subtree(tree, pos_word):
    """Return a deep copy of the subtree chosen as the clause around a leaf.

    The path ``pos_word`` (minus its final step) is followed from the root and
    the deepest step that lands on a node labelled ``'S'`` is remembered. If
    no such node exists, or the node at that position has a direct child
    labelled ``'NP'``, that position is used as is. Otherwise the search
    climbs through ``parent()`` until it reaches a node labelled ``'NP'`` or
    ``'ROOT'``.

    Args:
        tree: parented tree (nodes must provide ``label``, ``parent`` and
            ``treeposition``).
        pos_word: tree position of the target leaf.

    Returns:
        A deep copy of the selected subtree.
    """
    # Descend along the leaf path, tracking the last step that reaches an 'S' node.
    cursor = tree.copy(deep=True)
    deepest_s = -1
    for depth, step in enumerate(pos_word[:-1]):
        cursor = cursor[step]
        if cursor.label() == 'S':
            deepest_s = depth

    clause_pos = pos_word[:deepest_s + 1]
    child_labels = [child.label() for child in tree[clause_pos]]

    if 'NP' in child_labels or deepest_s == -1:
        chosen = clause_pos
    else:
        # The clause has no NP child: widen the selection by climbing the ancestors.
        chosen = tree[clause_pos].parent().treeposition()
        while tree[chosen].label() not in ('NP', 'ROOT'):
            chosen = tree[chosen].parent().treeposition()

    return tree[chosen].copy(deep=True)

def find_last_pos(lst_pos):
    """Count the trailing ``(token, tag)`` pairs whose tag should be dropped.

    Scanning from the end, pairs tagged ``'WDT'``, ``','``, ``'IN'`` or
    ``'WRB'`` are counted until a pair with any other tag is met.

    Returns:
        The number of such trailing pairs, or ``-1`` when every pair (or an
        empty list) was scanned without meeting another tag.
    """
    droppable = ('WDT', ',', 'IN', 'WRB')

    for offset, pair in enumerate(reversed(lst_pos)):
        if pair[1] not in droppable:
            return offset

    return -1

def tree_to_sentence(tree):
    """Flatten ``tree`` into a sentence string ending in ``'.'``.

    Leading tokens tagged ``','``, ``'WRB'`` or ``'IN'`` are skipped until the
    first token with another tag. The trailing tokens counted by
    ``find_last_pos`` are cut off. Each kept token is followed by a space and
    a final ``'.'`` is appended.
    """
    tagged = list(tree.pos())
    stop = len(tagged) - find_last_pos(tagged)
    leading_skippable = (',', 'WRB', 'IN')

    kept = []
    started = False
    for ix, pair in enumerate(tree.pos()):
        if ix >= stop:
            break
        if pair[1] not in leading_skippable:
            started = True
        if started:
            kept.append(pair[0] + ' ')

    return ''.join(kept) + '.'

def sentence_prune(sentence, lst_index, parser):
    """Build one pruned sentence per target word index.

    ``sentence`` is parsed once with ``parser.raw_parse`` and the first parse
    is converted to an ``nltk.ParentedTree``. For every index in
    ``lst_index`` the enclosing subtree is selected with ``find_subtree``,
    pruned with ``prune_subtree`` and rendered with ``tree_to_sentence``.

    Args:
        sentence: whitespace-tokenized sentence.
        lst_index: positions (into ``sentence.split()``) of the target words.
            NOTE(review): the same index is used as the leaf index of the
            parse tree, which assumes the parser's tokenization matches the
            whitespace split; confirm.
        parser: object exposing ``raw_parse(sentence)``.

    Returns:
        The distinct pruned sentences in first-seen order.
    """
    parses = list(parser.raw_parse(sentence))
    tree = nltk.ParentedTree.convert(parses[0])
    words = sentence.split()  # loop-invariant, so split once
    lst_sentences = []

    for index in lst_index:
        word_target = words[index]
        pos_word = tree.leaf_treeposition(index)
        aux_tree = find_subtree(tree, pos_word)
        aux_tree = prune_subtree(aux_tree, word_target)
        lst_sentences.append(tree_to_sentence(aux_tree))

    # dict.fromkeys removes duplicates and keeps insertion order, unlike
    # set(), whose ordering of strings can change between runs.
    return list(dict.fromkeys(lst_sentences))

def join_words(sentences, word_dict):
    """Re-attach hyphenated words that were split into ``left - right``.

    Each sentence is split on whitespace. When a standalone ``'-'`` token has
    a neighbour on both sides and ``word_dict.check`` accepts
    ``left-right``, the joined word replaces the pending left token and the
    right token is skipped. Otherwise tokens pass through unchanged.

    Args:
        sentences: iterable of sentence strings.
        word_dict: object exposing ``check(word) -> bool``.

    Returns:
        A list of rebuilt sentences. Every emitted token is followed by a
        single space, so non-empty results end with a space.
    """
    arr_sentences = []

    for sentence in sentences:
        tokens = sentence.split()
        final_ix = len(tokens) - 1
        emitted = []
        pending = ''          # last token, held back until we know it is not merged
        skip_next = False     # True right after a merge: the right-hand word is consumed

        for ix, token in enumerate(tokens):
            if skip_next:
                skip_next = False
                continue

            if token == '-' and 0 < ix < final_ix:
                candidate = tokens[ix - 1] + '-' + tokens[ix + 1]
                if word_dict.check(candidate):
                    pending = candidate + ' '
                    skip_next = True
                    continue

            emitted.append(pending)
            pending = token + ' '

        emitted.append(pending)
        arr_sentences.append(''.join(emitted))

    return arr_sentences

def remove_unnecesary_char(sentence):
    """Trim surrounding spaces, then drop any leading ``')'`` characters."""
    return sentence.strip(' ').lstrip(')')

def find_index(sentence):
    """Return the positions of whitespace-separated words starting with ``<head>``."""
    return [
        position
        for position, word in enumerate(sentence.split())
        if word.startswith('<head>')
    ]

def find_trim_sentence(sentence, iz_del, der_del):
    """Extract the delimiter-enclosed spans of a whitespace-tokenized sentence.

    Tokens are scanned left to right while a nesting depth is tracked: a token
    containing ``iz_del`` opens one level and a token containing ``der_del``
    closes one (each counted at most once per token). Tokens seen while the
    depth is positive are collected, and a span is emitted each time the depth
    returns to zero. A span still open at the end of the sentence is dropped.

    Args:
        sentence: sentence string.
        iz_del: opening delimiter (e.g. ``'('`` or ``'-LRB-'``).
        der_del: closing delimiter (e.g. ``')'`` or ``'-RRB-'``).

    Returns:
        The spans, in order, each as its tokens joined by single spaces
        (delimiters included).
    """
    depth = 0
    seqs = []
    current = []

    for token in sentence.split():
        if iz_del in token:
            depth += 1

        if depth > 0:
            current.append(token)

        # Only close a level that was opened. A stray closing delimiter used
        # to push the depth below zero, which hid every later span.
        if der_del in token and depth > 0:
            depth -= 1

        if depth == 0 and current:
            seqs.append(' '.join(current))
            current = []

    return seqs

def _strip_delimiters(text, delimiters):
    """Remove whole leading/trailing occurrences of any string in ``delimiters``."""
    usable = [d for d in delimiters if d]
    trimmed = True
    while trimmed:
        trimmed = False
        for d in usable:
            if text.startswith(d):
                text = text[len(d):]
                trimmed = True
            if text.endswith(d):
                text = text[:-len(d)]
                trimmed = True
    return text


def remove_LRB_RRB(sentences, iz_del, der_del):
    """Resolve delimiter-enclosed spans in each sentence.

    Spans are located with ``find_trim_sentence`` and handled in order:

    * a span without a ``<head>``-prefixed word (per ``find_index``) is
      removed from the sentence;
    * the first span that does contain one replaces the whole sentence, with
      its surrounding delimiters and spaces stripped, and the remaining spans
      are ignored.

    Args:
        sentences: iterable of sentence strings.
        iz_del: opening delimiter (e.g. ``'('`` or ``'-LRB-'``).
        der_del: closing delimiter (e.g. ``')'`` or ``'-RRB-'``).

    Returns:
        A list with one processed sentence per input sentence.
    """
    arr_sentence = []

    for sentence in sentences:
        for seq in find_trim_sentence(sentence, iz_del, der_del):
            if find_index(seq):
                # Strip the delimiters as whole tokens. str.strip() treats its
                # argument as a character set, so for '-LRB-'/'-RRB-' it also
                # ate leading/trailing 'L', 'R', 'B' and '-' of real words.
                sentence = _strip_delimiters(seq, (iz_del, der_del, ' '))
                break
            sentence = sentence.replace(seq, '')

        arr_sentence.append(sentence)

    return arr_sentence

def process_instance(word_target, ix_ins, text, ner, parser, word_dict, nlp, is_train = True, sense_ids = None, prune_sentence = False, verbose = False):
    """Turn one ``<instance>`` fragment into training/evaluation rows.

    The ``<context>`` of ``text`` is split into sentences, hyphenated words
    are re-joined with ``join_words`` and every sentence that contains a
    ``<head>...</head>`` marker yields one row per sense id.

    Args:
        word_target: lemma used as prefix of the sense label.
        ix_ins: running index of the instance, stored in each row.
        text: XML fragment of the instance.
        ner: unused here; kept for interface compatibility.
        parser: constituency parser, only used when ``prune_sentence`` is True.
        word_dict: dictionary object forwarded to ``join_words``.
        nlp: unused here; kept for interface compatibility.
        is_train: when True the sense ids are read from the ``senseid="..."``
            attributes of ``text`` and the ``sense_ids`` argument is ignored.
        sense_ids: sense ids to use when ``is_train`` is False.
        prune_sentence: when True, sentences are reduced with
            ``sentence_prune`` before the rows are built.
        verbose: when True, the intermediate sentences are printed.

    Returns:
        A list of ``[plain_sentence, tagged_sentence, label, ix_ins]`` rows,
        where ``label`` is ``word_target + '_' + sense_id`` (sense id with
        ``%`` and ``:`` removed). Empty when the fragment has no context or
        no head word.
    """
    pairs = []

    if is_train:
        sense_ids = re.findall(r'senseid=\"(.*?)\"', text, re.DOTALL)

    context = re.findall(r'<context>(.*?)</context>', text, re.DOTALL)
    if not context:
        return pairs

    word_ambiguos = re.findall(r'<head>(.*?)</head>', context[0], re.DOTALL)
    if not word_ambiguos:
        # Without a head marker no sentence can produce a row.
        return pairs
    head_word = word_ambiguos[0]

    # Match the head word literally and only as a whole word, so that e.g.
    # "art" is not replaced inside "party".
    head_word_pattern = re.compile(r'(?<!\w)' + re.escape(head_word) + r'(?!\w)')

    # NOTE(review): the character class also contains a literal '|', so text
    # is split on '|' as well as on . : ? ! -- kept as is, confirm intent.
    c = re.split(r'[\.|:|?|!]', context[0])

    # Cut-off based splitting (find_words_cutoff) is currently disabled, so
    # the re-joined chunks are used as sentences directly.
    sentences = join_words(c, word_dict)

    for sentence in sentences:
        # join_words leaves a trailing space, so trim before looking for a
        # dangling conjunction; otherwise this check can never succeed.
        trimmed = sentence.rstrip()
        if trimmed.endswith(' and'):
            sentence = trimmed[:-len(' and')]

        if not re.findall(r'<head>(.*?)</head>', sentence):
            continue

        sentence = remove_unnecesary_char(sentence)
        index_word = find_index(sentence)

        if verbose:
            print('---oracion')
            print(sentence)

        if prune_sentence:
            # A function replacement inserts the word literally (no
            # backslash/group-reference interpretation).
            sentence = re.sub(r'<head>(.*?)</head>', lambda _m: head_word, sentence)
            sentences_prune = sentence_prune(sentence, index_word, parser)
            sentences_prune = remove_LRB_RRB(sentences_prune, '-LRB-', '-RRB-')
        else:
            sentences_prune = remove_LRB_RRB([sentence], '(', ')')

        if verbose:
            print('---oracion sin parentesis')
            if sentences_prune:
                # Pruning can leave nothing to show; avoid indexing an empty list.
                print(sentences_prune[0])
            print('\n')

        for s in sentences_prune:
            for sense_id in sense_ids:
                sense_id = re.sub(r'%|:', '', sense_id)
                label = word_target + '_' + sense_id
                if prune_sentence:
                    # Head markers were already replaced by the plain word.
                    plain = s
                    tagged = head_word_pattern.sub(lambda _m: label, s)
                else:
                    plain = re.sub(r'<head>(.*?)</head>', lambda _m: head_word, s)
                    tagged = re.sub(r'<head>(.*?)</head>', lambda _m: label, s)
                pairs.append([plain, tagged, label, ix_ins])

    return pairs

def load_senses(path_senses, path_test):
    """Read the sense key file and count head markers per test instance.

    Args:
        path_senses: key file; on each line everything after the first two
            whitespace-separated fields is taken as a sense id.
        path_test: XML file (read as ISO-8859-1) containing ``<instance>``
            elements.

    Returns:
        ``(senses_all, targets_all)`` where ``senses_all`` holds, per line of
        the key file (split on ``'\\n'``, so a trailing newline adds an empty
        entry), the sense ids with ``%`` and ``:`` removed, and
        ``targets_all`` holds the number of ``<head>`` markers of each
        instance.
    """
    with open(path_test, 'r', encoding='iso-8859-1') as handle:
        xml = handle.read()

    targets_all = [
        len(re.findall(r'<head>', '<instance' + body + '</instance>', re.DOTALL))
        for body in re.findall(r'<instance(.*?)</instance>', xml, re.DOTALL)
    ]

    with open(path_senses, 'r') as handle:
        key_lines = handle.read().split('\n')

    senses_all = [
        [re.sub(r'%|:', '', field) for field in line.split()[2:]]
        for line in key_lines
    ]

    return senses_all, targets_all

def _clean_instance_text(data):
    """Normalize one ``<instance>`` fragment before sentence extraction."""
    substitutions = (
        (r'[^\x20-\x7E]', ''),      # keep printable ASCII only (also removes newlines)
        (r' n\'t', 'n\'t'),         # re-attach the negation clitic
        (r'wou \'d', 'would'),      # was replaced by 'uld', which dropped the leading "wo"
        (r' \'re', ' are'),
        (r' \'ve', ' have'),
        (r'it \'s', 'it is'),
        (r'he \'s', 'he is'),
        (r'i \'m', 'i am'),
        (r'It \'s', 'it is'),
        (r'He \'s', 'he is'),
        (r'I \'m', 'i am'),
        (r'\[(.*?)\]', ''),         # bracketed annotations
        (r'&(.*?);', ''),           # character entities
        (r' \'d', 'd'),
        (r'&', ''),
    )
    # Order matters: later patterns rely on what earlier ones already rewrote.
    for pattern, replacement in substitutions:
        data = re.sub(pattern, replacement, data)
    return data


def construct_pairs(path_source, path_model, is_train = True, test_path = None, prune_sentence = False, verbose=True):
    """Build the sentence/sense rows for a Senseval-style lexical-sample file.

    The source XML is read as ISO-8859-1 and split into ``<lexelt>`` and
    ``<instance>`` elements. Each instance is cleaned, its context is appended
    to ``input.raw`` (written next to ``path_source``) and the instance is
    converted into rows by ``process_instance``.

    Args:
        path_source: path of the XML file.
        path_model: directory prefix (with trailing separator) that holds the
            ``stanford-ner``, ``stanford-parser`` and ``stanford-corenlp``
            folders.
        is_train: when True sense ids come from the XML itself; otherwise they
            are taken, by instance order, from ``load_senses(test_path, ...)``.
        test_path: key file with the sense ids; needed when ``is_train`` is
            False.
        prune_sentence: forwarded to ``process_instance``.
        verbose: forwarded to ``process_instance``.

    Returns:
        ``np.array`` of the rows ``[plain, tagged, label, instance_index]``.
        NumPy stores mixed str/int rows as strings, so the index comes back
        as text.

    Raises:
        IndexError: if a lexelt has no ``item`` attribute, an instance has no
            ``<context>``, or fewer key lines than instances are available.
    """
    ner = StanfordNERTagger(path_model + 'stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                        path_model + 'stanford-ner/stanford-ner.jar',
                        encoding='utf-8')

    parser=StanfordParser(path_model + "stanford-parser/stanford-parser.jar", \
                     path_model + "stanford-parser/stanford-parser-3.9.2-models.jar")

    nlp = StanfordCoreNLP(path_model + "stanford-corenlp/")

    try:
        word_dict = enchant.Dict('en_US')

        with open(path_source, 'r', encoding='iso-8859-1') as f:
            xml = f.read()

        if(not is_train):
            senses_all, _ = load_senses(test_path, path_source)

        lexelts = re.findall(r'<lexelt(.*?)</lexelt>', xml, re.DOTALL)
        pairs = []
        ix_ins = 0

        # os.path.join keeps the output beside the source even when
        # path_source has no directory part (string concatenation produced
        # '/input.raw' in that case).
        raw_path = os.path.join(os.path.dirname(path_source), 'input.raw')

        with open(raw_path, 'w') as file:
            for lexelt in lexelts:
                item = re.findall(r'item=\"(.*?)\"', lexelt, re.DOTALL)
                word_target = item[0].split('.')[0]

                instances = re.findall(r'<instance(.*?)</instance>', lexelt, re.DOTALL)
                for instance in instances:
                    data = _clean_instance_text('<instance' + instance + '</instance>')

                    context = re.findall(r'<context>(.*?)</context>', data, re.DOTALL)
                    file.write(context[0] + '\n')

                    # In training mode process_instance extracts the sense
                    # ids itself; otherwise they come from the key file.
                    sense_ids = None if is_train else senses_all[ix_ins]
                    pairs.extend(process_instance(word_target, ix_ins, data, ner, parser, word_dict, nlp,
                                                  is_train, sense_ids, prune_sentence, verbose))
                    ix_ins += 1

        return np.array(pairs)
    finally:
        # The wrapper starts a CoreNLP server; shut it down on success and on error.
        nlp.close()

In [66]:
# Build the evaluation rows: sense ids are read from the separate key file (is_train=False).
# NOTE(review): the model directory is an absolute, machine-specific path.
pairs_test = construct_pairs('data/disambiguation/sense-eval2/test/eng-lex-samp.evaluation.xml', '/home/krivas/projects/wsd-v2/data/lib/',\
                             is_train=False, test_path='data/disambiguation/sense-eval2/test/key.txt', prune_sentence=False, verbose=True)

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)


---oracion
Pop <head>Art</head>is an example
---oracion sin parentesis
Pop <head>Art</head>is an example


---oracion
Reform and decentralise <head>arts</head>funding and organisation
---oracion sin parentesis
Reform and decentralise <head>arts</head>funding and organisation


---oracion
You ever notice how all <head>art</head>focuses on people in trouble
---oracion sin parentesis
You ever notice how all <head>art</head>focuses on people in trouble


---oracion
Countless books and classes teach the <head>art</head>of asserting oneself and, if you want to know more, turn to page 120
---oracion sin parentesis
Countless books and classes teach the <head>art</head>of asserting oneself and, if you want to know more, turn to page 120


---oracion
Forsake your usual stationer, who probably won't have the room or the reason for stocking unusual papers or cards, and go to a specialist <head>art</head>or paper shop instead
---oracion sin parentesis
Forsake your usual stationer, who probably won'

In [8]:
# Build the training rows: sense ids are read from the senseid attributes of the XML (is_train=True).
# NOTE(review): the model directory is an absolute, machine-specific path.
pairs_train = construct_pairs('data/disambiguation/sense-eval3/train/EnglishLS.train', '/home/krivas/projects/wsd-v2/data/lib/',\
                              is_train=True, test_path=None, prune_sentence=False, verbose=True)

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)


---oracion
It is quite a hefty spade , with bicycle-type handlebars and a sprung lever at the rear , which you step on to <head>activate</head> it
---oracion sin parentesis
It is quite a hefty spade , with bicycle-type handlebars and a sprung lever at the rear , which you step on to <head>activate</head> it


---oracion
Qualitative Aspects of Experience The quality or modality of the experience depends less upon the quality of energy reaching the nervous system than upon which parts of the sensory system are <head>activated</head>
---oracion sin parentesis
Qualitative Aspects of Experience The quality or modality of the experience depends less upon the quality of energy reaching the nervous system than upon which parts of the sensory system are <head>activated</head>


---oracion
At a neurophysiological level , the intensity of an experience is typically reflected in the number of neurones <head>activated</head> ( the phenomenon of recruitment ) and , more specifically , in the firing 

In [67]:
pairs_train[0]

array([ 'It is quite a hefty spade , with bicycle-type handlebars and a sprung lever at the rear , which you step on to activate it',
       'It is quite a hefty spade , with bicycle-type handlebars and a sprung lever at the rear , which you step on to activate_38201 it',
       'activate_38201', '0'],
      dtype='<U1327')

In [39]:
l = [[len(pair[0].split()), int(pair[3])] for pair in pairs_train]

In [41]:
sorted(l, key = lambda x: x[0], reverse=True)[:3]

[[241, 7258], [142, 1888], [121, 5070]]

In [43]:
pairs_train[7258]

array([ 'The caves gave shelter to a new generation of troglodytes European nomads , Australian girls ending their Grand Tours by going native , those few English women who had felt the lure of the desert so intently that they now nursed fair-haired , olive-skinned babies and hung their washing-lines from one eroded pinnacle to another',
       'The caves gave shelter_shelter12600 to a new generation of troglodytes European nomads , Australian girls ending their Grand Tours by going native , those few English women who had felt the lure of the desert so intently that they now nursed fair-haired , olive-skinned babies and hung their washing-lines from one eroded pinnacle to another',
       'shelter_shelter12600', '6652'],
      dtype='<U1327')

In [33]:
pairs_train[10][3]

'8'

In [44]:
#text = "My own conclusions , which in this case are probably completely worthless , are that this ticking represents a period of grace I mean that it cannot explode as long as the ticking lasts and that it is not designed to explode when the ticking stops but is then <head>activated</head> and ready to explode when triggered by passing engines"
text = 'The caves gave shelter to a new generation of troglodytes European nomads , Australian girls ending their Grand Tours by going native , those few English women who had felt the lure of the desert so intently that they now nursed fair-haired , olive-skinned babies and hung their washing-lines from one eroded pinnacle to another'
# We assume that you've downloaded Stanford CoreNLP and defined an environment
# variable $CORENLP_HOME that points to the unzipped directory.
# The code below will launch StanfordCoreNLPServer in the background
# and communicate with the server to annotate the sentence.
# The client is used as a context manager and only runs tokenization and
# sentence splitting; it talks to the endpoint http://localhost:9010.
with corenlp.CoreNLPClient(annotators="tokenize ssplit".split(), timeout=10000, endpoint='http://localhost:9010') as client:
  ann = client.annotate(text)

# You can access annotations using ann.
# Keep only the first detected sentence for inspection.
sentence = ann.sentence[0]

In [3]:
import numpy as np
# Start the stanfordcorenlp wrapper from the local CoreNLP folder.
# NOTE(review): `nlp` is not closed in this cell.
nlp = StanfordCoreNLP(r'data/lib/stanford-corenlp')

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'

# Keep the dependency-parse result as a NumPy array for inspection.
a = np.array(nlp.dependency_parse(sentence))

In [18]:
a = np.loadtxt('f.txt')

In [19]:

a

array([[  0.,   7.],
       [  2.,   1.],
       [  7.,   2.],
       [  5.,   3.],
       [  5.,   4.],
       [  2.,   5.],
       [  7.,   6.],
       [  9.,   8.],
       [  7.,   9.],
       [  7.,  10.]])

In [51]:
list(map(lambda x:[int(x[1]), int(x[2])], a))

[[0, 7],
 [2, 1],
 [7, 2],
 [5, 3],
 [5, 4],
 [2, 5],
 [7, 6],
 [9, 8],
 [7, 9],
 [7, 10]]

In [21]:
"""
Preprocessing script for SICK data.
"""



In [22]:
    # Earlier copy of the preprocessing driver, kept with its recorded output.
    # NOTE(review): split() and parse() as defined earlier in this notebook
    # require a `client` argument; the calls below omit it, so this cell
    # presumably ran against older definitions -- confirm before re-running.
    print('=' * 80)
    print('Preprocessing SICK dataset')
    print('=' * 80)

    base_dir = ''
    data_dir = os.path.join(base_dir, 'data')
    all_dir = os.path.join(data_dir, 'translation/all_data')
    lib_dir = os.path.join(base_dir, 'lib')
    train_dir = os.path.join(data_dir, 'translation/train')
    #dev_dir = os.path.join(data_dir, 'translation/dev')
    #test_dir = os.path.join(data_dir, 'translation/test')
    make_dirs([train_dir])

    # java classpath for calling Stanford parser
    classpath = ':'.join([
        lib_dir,
        os.path.join(lib_dir, 'stanford-parser/stanford-parser.jar'),
        os.path.join(lib_dir, 'stanford-parser/stanford-parser-3.5.1-models.jar')])

    # split into separate files
    split(os.path.join(all_dir, 'en-spa.txt'), train_dir)
    #split(os.path.join(all_dir, 'SICK_trial.txt'), dev_dir)
    #split(os.path.join(all_dir, 'SICK_test_annotated.txt'), test_dir)

    # parse sentences
    parse(train_dir, cp=classpath)
    #parse(dev_dir, cp=classpath)
    #parse(test_dir, cp=classpath)


0it [00:00, ?it/s]

Preprocessing SICK dataset

Dependency parsing data/translation/train/a.txt


115244it [16:37, 115.58it/s]
25it [00:00, 241.35it/s]


Dependency parsing data/translation/train/b.txt


115244it [27:51, 68.94it/s]


In [33]:
import corenlp
text = "Chris wrote a simple gotta sentence that he parsed with Stanford CoreNLP."

# We assume that you've downloaded Stanford CoreNLP and defined an environment
# variable $CORENLP_HOME that points to the unzipped directory.
# NOTE(review): `client` is not created in this cell; it must already exist
# in the kernel (a CoreNLPClient is built in a separate cell of this notebook).
ann = client.annotate(text)
# Space-join the tokens of the first detected sentence.
s = ' '.join([w.word for w in ann.sentence[0].token])



In [32]:
client =  corenlp.CoreNLPClient(annotators="tokenize ssplit".split())


In [4]:
import os
os.environ["CORENLP_HOME"] = '/home/krivas/projects/neural-wsd/new_experiments/data/lib/stanford-corenlp'

In [None]:
import subprocess
com = 'java -cp "/home/krivas/projects/neural-wsd/new_experiments/data/lib/stanford-corenlp/stanford-corenlp-3.9.2-models.jar:/home/krivas/projects/neural-wsd/new_experiments/data/lib/stanford-corenlp/stanford-corenlp-3.9.2.jar"  -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize -file input.txt -outputFormat json'
# NOTE(review): the CoreNLP command string `com` built above is never executed;
# this runs `ls -la` and prints its captured standard output.
output = subprocess.Popen(['ls', '-la'], stdout=subprocess.PIPE)
response=output.communicate()[0]
print (response.decode())

In [17]:
t.shape

(115244,)

In [19]:
t[0]

[[0, 1], [1, 2]]