In [102]:
import re
import nltk
import data_helpers
import numpy as np
import pickle as pk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
#import spacy
#from spacy import displacy
#% matplotlib inline

## Load Data

In [103]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [104]:
def labels_mapping(relation):
    labelsMapping = {'Other': 0,
                 'Message-Topic(e1,e2)': 1, 'Message-Topic(e2,e1)': 2,
                 'Product-Producer(e1,e2)': 3, 'Product-Producer(e2,e1)': 4,
                 'Instrument-Agency(e1,e2)': 5, 'Instrument-Agency(e2,e1)': 6,
                 'Entity-Destination(e1,e2)': 7, 'Entity-Destination(e2,e1)': 8,
                 'Cause-Effect(e1,e2)': 9, 'Cause-Effect(e2,e1)': 10,
                 'Component-Whole(e1,e2)': 11, 'Component-Whole(e2,e1)': 12,
                 'Entity-Origin(e1,e2)': 13, 'Entity-Origin(e2,e1)': 14,
                 'Member-Collection(e1,e2)': 15, 'Member-Collection(e2,e1)': 16,
                 'Content-Container(e1,e2)': 17, 'Content-Container(e2,e1)': 18}
    return labelsMapping[relation]

In [105]:
def find_between( s, first, last ):
    try:
        start = s.index( first ) + len( first )
        end = s.index( last, start )
        return s[start:end].strip()
    except ValueError:
        return ""

In [106]:
def read_data(path):
    with open(path, "r") as f:
        lines = [line.strip() for line in f.readlines()]
        #lines = lines[:10]
    return lines

def load_train_data(path):
    data = []
    sentence_list = []
    data_label = []
    position = []
    lines = read_data(path)     
    for idx in range(0, len(lines), 4):
        ID = lines[idx].split("\t")[0]
        
        sentence = lines[idx].split("\t")[1][1:-1]
        sentence_list.append(sentence.replace('<e1>', "").replace('<e2>', "").replace('</e1>', "").replace('</e2>', ""))
        sentence = clean_str(sentence)
        entity_1 = find_between(sentence, 'e1', 'e1')
        entity_2 = find_between(sentence, 'e2', 'e2')

        if len(entity_1.split(' ')) > 1:
            entity_1 = entity_1.split(' ')[-1]
        if len(entity_2.split(' ')) > 1:
            entity_2 = entity_2.split(' ')[-1]
        sentence = sentence.replace('e1 ', "").replace(' e2', "").replace('e2 ', "").strip()

        p1 = sentence.split(' ').index(entity_1)
        p2 = sentence.split(' ').index(entity_2)
        if p1 == p2:
            p2 = [index for index in range(len(sentence.split(' '))) if sentence.split(' ')[index] == entity_2][1]
        position.append([p1,p2])
        
        relation = lines[idx + 1]
        relation_label = labels_mapping(relation)
        
        data.append([ID,sentence, entity_1, entity_2])
        data_label.append([ID, relation, relation_label])
        
    return data, data_label, position, sentence_list

def load_test_data(path):
    data = []
    position = []
    sentence_list = []
    lines = read_data(path)
    for idx in range(0, len(lines)):
        ID = lines[idx].split("\t")[0]
        
        sentence = lines[idx].split("\t")[1][1:-1]
        sentence_list.append(sentence.replace('<e1>', "").replace('<e2>', "").replace('</e1>', "").replace('</e2>', ""))
        sentence = clean_str(sentence)
        entity_1 = find_between(sentence, 'e1', 'e1')
        entity_2 = find_between(sentence, 'e2', 'e2')
        if len(entity_1.split(' ')) > 1:
            entity_1 = entity_1.split(' ')[-1]
        if len(entity_2.split(' ')) > 1:
            entity_2 = entity_2.split(' ')[-1]
        sentence = sentence.replace('e1 ', "").replace(' e2', "").replace('e2 ', "").strip()
        
        p1 = sentence.split(' ').index(entity_1)
        p2 = sentence.split(' ').index(entity_2)
        if p1 == p2:
            p2 = [index for index in range(len(sentence.split(' '))) if sentence.split(' ')[index] == entity_2][1]
        position.append([p1,p2])
        data.append([ID, sentence, entity_1, entity_2])
    return data, position, sentence_list

def load_test_answer(path):
    lines = read_data(path)
    data = []
    for idx in range(0, len(lines)):
        ID = lines[idx].split("\t")[0]
        relation = lines[idx].split("\t")[1]
        relation_label = labels_mapping(relation)
        data.append([ID, relation, relation_label])
    return data

In [107]:
train, train_label, train_position, train_sentence = load_train_data('data/TRAIN_FILE.txt')
test, test_position, test_sentence = load_test_data('data/TEST_FILE.txt')
test_label = load_test_answer('data/answer_key.txt')
sentence_list = train_sentence + test_sentence
print("number of training instances:", len(train))
print("number of testing instances:", len(test))

number of training instances: 8000
number of testing instances: 2717


In [69]:
print(train[514])
print(train_label[514])
print(train_position[514])

['515', 'the small box was placed inside the large box and the 4 inch of surrounding space filled with either shredded fabric , fiberglass insulation or no insulation', 'box', 'box']
['515', 'Entity-Destination(e1,e2)', 7]
[2, 8]


In [70]:
print(train[5120])
print(train_label[5120])
print(train_position[5120])

['5121', 'the mp3 player was inside a silicone case that covered everything but the click wheel and the screen', 'player', 'case']
['5121', 'Content-Container(e1,e2)', 17]
[2, 7]


In [71]:
print(test[0])
print(test_label[0])
print(test_position[0])

['8001', 'the most common audits were about waste and recycling', 'audits', 'waste']
['8001', 'Message-Topic(e1,e2)', 1]
[3, 6]


In [85]:
np.save('data/sentence_list.npy', sentence_list)  #used for propBank parser

### features

1. dependency parse

In [23]:
nlp = spacy.load('en_core_web_sm')

In [70]:
demo_id = 5120
demo_sentence = train[demo_id][1]
demo_sentence

'the mp3 player was inside a silicone case e2 that covered everything but the click wheel and the screen'

In [64]:
train_position[demo_id]

[1, 2]

In [68]:
doc = nlp(demo_sentence)

for pos, token in enumerate(doc):
    if pos in train_position[demo_id]:
        ancestors = [i for i in token.ancestors]
        print(token.text, token.tag_, token.dep_,
              token.shape_, token.is_stop, token.ent_type,ancestors)

# displacy.serve(doc, style='dep')

student NN compound xxxx False 0 [association, is]
association NN nsubj xxxx False 0 [is]


In [31]:
position_id = 2
print([token.text for token in doc[position_id].lefts])  # ['bright', 'red']
print([token.text for token in doc[position_id].rights])  # ['on']
print(doc[position_id].n_lefts)  # 2
print(doc[position_id].n_rights)  # 1

['the', 'student']
[]
2
0


2 propBank (resource：https://github.com/google/sling)

In [91]:
with open('dict/propBank_dict.pk', 'rb') as f:
    propBank_dict = pk.load(f)

In [99]:
print(train[116])
print(test_label[116])
print(test_position[116])

['117', 'forward motion of the vehicle through the air caused a suction on the road draft tube', 'motion', 'suction']
['8117', 'Product-Producer(e1,e2)', 3]
[1, 8]


In [96]:
propBank_dict[116]['text']

'Forward motion of the vehicle through the air caused a suction on the road draft tube.'

In [97]:
propBank_dict[116]['propBank']

[['cause.01', 'caused', 8, 'ARG0_agent', 'Forward motion'],
 ['cause.01', 'caused', 8, 'ARG1_patient', 'suction']]

3 wordnet

In [100]:
def get_wordnet_tag(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return 'None'

In [114]:
def check(token, token_tag):
    print('-> ' + token + '  pos: ' + token_tag)
    if token_tag in (wn.NOUN, wn.ADJ, wn.ADV,  wn.VERB):            
        lemma = lemmatizer.lemmatize(word, pos=token_tag)
        if lemma:
            synsets = wn.synsets(lemma, pos=token_tag)
            print('wordnet:')
            print(synsets) 
        else:
            print(token + ' not in wordnet.')
    else:
        print(token + ' not in wordnet.')
    print()
        
    

In [115]:
for sentence in sentence_list[:2]:
    
    sentence_token = nltk.word_tokenize(sentence)
    sentence_tagged = nltk.pos_tag(sentence_token)
    print(sentence)
    for tagged in sentence_tagged:
        
        wn_tag = get_wordnet_tag(tagged[1])
        word = tagged[0].lower()
        check(word, wn_tag)
    print('------------------------------------------------------------------')

The system as described above has its greatest application in an arrayed configuration of antenna elements.
-> the  pos: None
the not in wordnet.

-> system  pos: n
wordnet:
[Synset('system.n.01'), Synset('system.n.02'), Synset('system.n.03'), Synset('system.n.04'), Synset('arrangement.n.03'), Synset('system.n.06'), Synset('system.n.07'), Synset('system.n.08'), Synset('organization.n.05')]

-> as  pos: None
as not in wordnet.

-> described  pos: v
wordnet:
[Synset('describe.v.01'), Synset('report.v.01'), Synset('trace.v.02'), Synset('identify.v.05')]

-> above  pos: None
above not in wordnet.

-> has  pos: v
wordnet:
[Synset('have.v.01'), Synset('have.v.02'), Synset('experience.v.03'), Synset('own.v.01'), Synset('get.v.03'), Synset('consume.v.02'), Synset('have.v.07'), Synset('hold.v.03'), Synset('have.v.09'), Synset('have.v.10'), Synset('have.v.11'), Synset('have.v.12'), Synset('induce.v.02'), Synset('accept.v.02'), Synset('receive.v.01'), Synset('suffer.v.02'), Synset('have.v.17'), S