In [2]:
from CorpusReader import *
from FeatureExtractor import *
import pandas as pd
from DataTransform import transform
from LSTM import train_model
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
import numpy as np
from utils import *

Loading Glove Model
1917494  words loaded!


In [3]:
# Class id for every relation label string. 'Other' is class 0; each of the
# nine relation types owns two consecutive ids, the (e1,e2) direction first
# and the (e2,e1) direction second.
labelMappings = {
    'Other': 0,
    'Instrument-Agency(e1,e2)': 1,
    'Instrument-Agency(e2,e1)': 2,
    'Cause-Effect(e1,e2)': 3,
    'Cause-Effect(e2,e1)': 4,
    'Member-Collection(e1,e2)': 5,
    'Member-Collection(e2,e1)': 6,
    'Entity-Destination(e1,e2)': 7,
    'Entity-Destination(e2,e1)': 8,
    'Content-Container(e1,e2)': 9,
    'Content-Container(e2,e1)': 10,
    'Message-Topic(e1,e2)': 11,
    'Message-Topic(e2,e1)': 12,
    'Product-Producer(e1,e2)': 13,
    'Product-Producer(e2,e1)': 14,
    'Entity-Origin(e1,e2)': 15,
    'Entity-Origin(e2,e1)': 16,
    'Component-Whole(e1,e2)': 17,
    'Component-Whole(e2,e1)': 18,
}

# Reverse lookup: class id -> (relation name, direction). The label string is
# split at its opening parenthesis, e.g. 'Cause-Effect(e1,e2)' becomes
# ('Cause-Effect', '(e1,e2)'). 'Other' has no direction and gets 'NA'.
classToLabel = {
    class_id: (
        (label[:label.index('(')], label[label.index('('):])
        if '(' in label
        else (label, 'NA')
    )
    for label, class_id in labelMappings.items()
}

In [4]:
import time

In [5]:
# Read the training split from 'semeval_train.txt' and run feature extraction
# on it. Later cells index dataset_train by 'labels' and new_dataset_train by
# 'dependents'.
CR_train = CorpusReader()
dataset_train = CR_train.read('semeval_train.txt')
featureExtractor_train = FeatureExtractor()
new_dataset_train = featureExtractor_train.getFeatures(dataset_train)
# Same two steps for the test split ('semeval_test.txt'), using a separate
# reader and a separate extractor instance.
CR_test = CorpusReader()
dataset_test = CR_test.read('semeval_test.txt')
featureExtractor_test = FeatureExtractor()
new_dataset_test = featureExtractor_test.getFeatures(dataset_test)

In [6]:
# Map each gold label string to its class id and one-hot encode it over all
# len(labelMappings) classes. Naming: x_* holds training-split data and y_*
# holds test-split data.
label_train = dense_to_one_hot(
    np.array([labelMappings[label] for label in dataset_train['labels']]),
    len(labelMappings),
)
x_label = np.asarray(label_train)

label_test = dense_to_one_hot(
    np.array([labelMappings[label] for label in dataset_test['labels']]),
    len(labelMappings),
)
y_label = np.asarray(label_test)

# The train=True call returns the tokenizer, embedding matrix, max_length,
# major_dep and word_index along with the training inputs.
(tokenizer, embedding_matrix, max_length, major_dep, word_index,
 x_text_seq, x_mut_ancestors_list) = transform(
    labelMappings, new_dataset_train, train=True)

# Those objects are handed back in for the test split (train=False); only the
# last two returned values are kept.
(_, _, _, _, _, y_text_seq, y_mut_ancestors_list) = transform(
    labelMappings, new_dataset_test, tokenizer, max_length, major_dep,
    word_index, train=False)

In [7]:
from keras.models import load_model

In [8]:
# Load a previously saved Keras model from the relative path 'LSTM'.
# NOTE(review): presumably the artifact written by LSTM.train_model — confirm.
model = load_model('LSTM')

In [9]:
# Build the dependency-label vocabulary handed to `dependency_encoder`: the
# most frequent values of the training 'dependents' column, each mapped to its
# frequency rank (0 = most frequent).
#
# NOTE(review): this rebinds `major_dep`, which was first returned by
# transform(..., train=True). Every later transform(...) call receives this
# version. Presumably both hold the same labels — confirm.
NUM_MAJOR_DEPS = 33  # NOTE(review): presumably has to match the value the saved model was trained with — confirm

dependency_list = new_dataset_train['dependents']
dep_values, dep_counts = np.unique(dependency_list, return_counts=True)

# Descending-count order. The sort expression is deliberately left exactly as
# it was: changing how ties are broken would change the label -> index mapping
# that the already-trained model expects.
deps_by_frequency = dep_values[np.argsort(dep_counts)[::-1]]

# Slicing tolerates fewer than NUM_MAJOR_DEPS distinct labels.
major_dep = deps_by_frequency[:NUM_MAJOR_DEPS]

# Enumerate what is actually present instead of indexing a fixed range(33),
# which raised IndexError whenever the slice above came back shorter.
major_deps = {dep: rank for rank, dep in enumerate(major_dep)}

In [10]:
# Time an end-to-end prediction for one raw sentence: parsing, feature
# extraction, input encoding, model inference and printing the result.
t0 = time.time()
# The two entities of interest are marked inline with <e1>...</e1> and <e2>...</e2> tags.
s = "The opening and closing of the <e1>heart</e1> <e2>valves</e2> produce the sound of the heartbeat."
df_sent = read_sentence(s)
featureExtractor_sent = FeatureExtractor()
new_data_sent = featureExtractor_sent.getFeatures(df_sent)
# Reuse the tokenizer / max_length / word_index obtained from the training
# data (train=False); only the last two returned values are kept.
(_,_,_,_,_,s_text_seq,s_mut_ancestors_list) = transform(labelMappings,new_data_sent,tokenizer,max_length,major_dep,word_index,train=False)
# Encode the sentence's 'dependents' column with the major_deps index built earlier.
dependency_list = new_data_sent['dependents']
s_dependency = dependency_encoder(major_deps,dependency_list)
# The model is fed three inputs, in this order: text sequence, mutual
# ancestors, encoded dependencies.
prediction = model.predict([s_text_seq,s_mut_ancestors_list,s_dependency],batch_size=1)
# Pick the highest-scoring class per row and print it as (relation, direction).
class_pred = np.argmax(prediction, axis=1)
print(classToLabel[class_pred[0]])
t1 = time.time()
print("\n total time taken : ", t1-t0)

('Component-Whole', '(e2,e1)')

 total time taken :  2.626636505126953


In [12]:
# Time the whole test-set pipeline, from reading the file through model
# inference. This repeats the test-split loading and encoding done in earlier
# cells, so the measured time covers every stage and rebinds the same names.
t0 = time.time()
CR_test = CorpusReader()
dataset_test = CR_test.read('semeval_test.txt')
featureExtractor_test = FeatureExtractor()
new_dataset_test = featureExtractor_test.getFeatures(dataset_test)
# Gold labels -> class ids -> one-hot rows over len(labelMappings) classes.
label_test = [labelMappings[ele] for ele in dataset_test['labels']]
label_test = dense_to_one_hot(np.array(label_test), len(labelMappings))
# Encode the 'dependents' column with the major_deps index built earlier.
test_dependency_list = new_dataset_test['dependents']
y_dependency_list_filter = dependency_encoder(major_deps,test_dependency_list)
y_label = np.asarray(label_test)
# Reuse the tokenizer / max_length / word_index obtained from the training
# data (train=False); only the last two returned values are kept.
(_,_,_,_,_,y_text_seq,y_mut_ancestors_list) = transform(labelMappings,new_dataset_test,tokenizer,max_length,major_dep,word_index,train=False)
# Same three-input order as the single-sentence cell, batched 1000 rows at a time.
prediction = model.predict([y_text_seq,y_mut_ancestors_list,y_dependency_list_filter],batch_size=1000)
t1 = time.time()
print("\n total time taken : ", t1-t0)


 total time taken :  34.083810329437256


In [13]:
# Score the test-set predictions: macro-averaged precision / recall / F1, plus
# a breakdown of how often the relation type is right with and without the
# correct argument direction.
class_pred = np.argmax(prediction, axis=1)
class_true = np.argmax(y_label, axis=1)

# Computed for interactive inspection; nothing below reads or displays it.
conf = confusion_matrix(class_true, class_pred)

# No `labels=` filter is passed, so the macro average runs over every class
# that occurs in the data (the 'Other' class included when present).
# zero_division=0 spells out what the default already did (score an undefined
# per-class metric as 0) without emitting an UndefinedMetricWarning, so the
# reported numbers are unchanged.
# `support` is None whenever `average` is set (per the scikit-learn docs).
precision, recall, fscore, support = precision_recall_fscore_support(
    class_true, class_pred, average='macro', zero_division=0)
print("precision: ",precision*100,"\nrecall: ",recall*100,"\nfscore: ",fscore*100)

# Turn class ids back into (relation name, direction) pairs.
y_true = [classToLabel[x] for x in class_true]
y_pred = [classToLabel[x] for x in class_pred]

# Among samples whose relation name is predicted correctly, count how many
# also get the direction right versus wrong. 'Other' carries the direction
# 'NA' on both sides, so a correct 'Other' counts as a correct edge.
cor_rel_cor_edge, cor_rel_wr_edge = 0, 0
for (true_rel, true_dir), (pred_rel, pred_dir) in zip(y_true, y_pred):
    if true_rel != pred_rel:
        continue
    if true_dir == pred_dir:
        cor_rel_cor_edge += 1
    else:
        cor_rel_wr_edge += 1

# Both figures are percentages of all test samples.
print("Correct relation,correct edges: ",100 * cor_rel_cor_edge/ len(y_true))
print("Correct relation, wrong edge: ", 100 * cor_rel_wr_edge / len(y_true))

precision:  71.71103576526859 
recall:  71.95760891693234 
fscore:  71.60816071745583
Correct relation,correct edges:  75.26683842473317
Correct relation, wrong edge:  0.9569377990430622


  _warn_prf(average, modifier, msg_start, len(result))
