In [1]:
from future.utils import iteritems
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time
import re
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
import gensim.downloader

In [7]:
model = gensim.downloader.load('word2vec-google-news-300')

In [291]:
train = pd.read_pickle("train.pkl")
test = pd.read_pickle("test.pkl")

In [292]:
train_x = train['token']
train_y = train['label']
train_id = train['sen_id']

In [293]:
test_x = test['token']
test_y = test['label']
test_id = test['sen_id']

In [294]:
vocab = set([item for sublist in train_x for item in sublist] + [item for sublist in test_x for item in sublist])
tag = ['O', 'C', 'E']

In [295]:
zero_300 = np.array([0] * 300)
zero_301 = np.concatenate((np.array([1]), zero_300))

In [296]:
word2idx = {}

not_present = []

In [297]:
for i in vocab:
    try:
        vec = model[i]
        vec1 = np.concatenate((np.array([0]), vec))
        word2idx[i]= vec1
        
    except:
        word2idx[i]= zero_301


In [298]:
len(vocab)

5177

In [299]:
len(word2idx['guitar'])

301

In [300]:

tag2idx = {t: i for i, t in enumerate(tag)}
idx2tag = {v: k for k, v in iteritems(tag2idx)}

In [301]:
def prepare_sequence(seq, to_idx):
    
    idxs = [to_idx[w] for w in seq]
    idxs = np.array(idxs)
    output_idxs = torch.from_numpy(idxs)
    output_idxs = output_idxs.type(torch.FloatTensor).cuda()
    
    return output_idxs

In [302]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        ''' Initialize the layers of this model.'''
        super(LSTMTagger, self).__init__()
        
        self.hidden_dim = hidden_dim
        
        #LAYER1 : EMBEDDING LAYER
        # embedding layer that turns words into a vector of a specified size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        #LAYER2 : LSTM
        # the LSTM takes embedded word vectors (of a specified size) as inputs and outputs hidden states of size hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        #LAYER3 : DENSE
        # the linear layer that maps the hidden state output dimension to the number of tags we want as output, tagset_size (in this case this is 3 tags)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)
        
        # initialize the hidden state (see code below)
        self.hidden = self.init_hidden()

        
    def init_hidden(self):

        # The axes dimensions are (n_layers, batch_size, hidden_dim)
        
        #return(torch.randn(2, 1, self.hidden_dim).cuda(),
        #    torch.randn(2, 1, self.hidden_dim).cuda())
    
        return (torch.zeros(2, 1, self.hidden_dim).cuda(),
                torch.zeros(2, 1, self.hidden_dim).cuda())
    
    def forward(self, sentence):
       
        embeds = sentence
        #print(embeds.shape)
        
        #print((embeds.view(len(sentence), 1, -1)).shape)
        
        lstm_out, self.hidden = self.lstm((embeds.view(len(sentence), 1, -1)), self.hidden)
        #print(lstm_out.shape)
        
        # get the scores for the most likely tag for a word
        tag_scores = self.hidden2tag(lstm_out.view(len(sentence), -1))

        return tag_scores

In [303]:
print(torch.cuda.is_available())
print(torch.cuda.current_device())
torch.cuda.set_device(3)
print(torch.cuda.current_device())

True
3
3


In [304]:
torch.manual_seed(0)

<torch._C.Generator at 0x7f814020bbf0>

In [305]:
# the embedding dimension defines the size of our word vectors
# for our simple vocabulary and training set, we will keep these small
EMBEDDING_DIM = 301
HIDDEN_DIM = 120

# instantiate our model
model1 = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word2idx), len(tag2idx))
model1.cuda()

# define our loss and optimizer
loss_function =  nn.CrossEntropyLoss()  #nn.NLLLoss()   
optimizer = optim.SGD(model1.parameters(), lr=0.1)       ##SGD


In [306]:
train_x

0      [i, used, to, get, terrible, headaches, from, ...
1      [peters, s, pain, and, symptoms, were, caused,...
2      [in, the, years, since, the, adm, scandal, of,...
3      [a, first, revolution, was, triggered, by, the...
4      [when, they, took, the, floor, against, a, tea...
                             ...                        
797    [using, solar, electricity, instead, of, conve...
798    [fatigue, corrosion, and, stress, corrosion, a...
799    [the, electromagnetic, em, radiation, from, th...
800    [it, gets, a, little, bit, chicken, first, or,...
801    [the, microphone, converts, sound, into, an, e...
Name: token, Length: 802, dtype: object

In [307]:
test_x

0      [many, adults, retain, scars, from, acne, brea...
1      [we, estimate, a, wind, speed, associated, wit...
2      [outbreaks, caused, by, the, oral, vaccine, s,...
3      [production, and, investigation, of, such, a, ...
4      [the, drugs, he, sold, had, caused, the, overd...
                             ...                        
196    [the, real, possibility, of, total, engulfment...
197    [we, find, evidence, that, ernst, suffered, a,...
198    [the, influx, caused, a, further, drain, on, t...
199    [ngu, nongonococcal, urethritis, is, an, infec...
200    [eye, discomfort, from, this, staring, effect,...
Name: token, Length: 201, dtype: object

In [308]:
training_input_data = []
for i in range(len(train_x)):
    sen = train_x[i]
    tag = train_y[i]
    sen_t = prepare_sequence(sen, word2idx)
    tag_t = prepare_sequence(tag, tag2idx)
    training_input_data.append((sen_t, tag_t))
    
    
testing_input_data = []
for i in range(len(test_x)):
    sen = test_x[i]
    tag = test_y[i]
    sen_t = prepare_sequence(sen, word2idx)
    tag_t = prepare_sequence(tag, tag2idx)
    testing_input_data.append((sen_t, tag_t))
    
    

In [309]:
len(train_x[0])

20

In [310]:
len(training_input_data[0][0])

20

In [311]:
n_epochs = 15
score_save = []

start_time = time.time()
for epoch in range(n_epochs):
    
    epoch_loss = 0.0
    count=0
    
    for sentence, tags in training_input_data:
        if len(sentence)>0:
            count+=1
            
            model1.zero_grad()

            # zero the hidden state of the LSTM, this detaches it from its history
            model1.hidden = model1.init_hidden()

   
            # forward 
           
            tags = tags.type(torch.LongTensor).cuda()
            
            tag_scores = model1(sentence)
            
            # Loss
            tag_scores = tag_scores.type(torch.FloatTensor).cuda()
            
            if epoch==n_epochs-1:
                score_save.append(tag_scores.tolist())
            
            loss = loss_function(tag_scores, tags)
            #print(loss)
            epoch_loss += loss.item()
            loss.backward()

            # update the model parameters with optimizer.step()
            optimizer.step()
        
    
    print("Epoch: %d, loss: %1.5f" % (epoch+1, epoch_loss/len(train_x)))
    print("--- %s seconds ---" % (time.time() - start_time))

print("Total time taken : ")
print("--- %s seconds ---" % (time.time() - start_time))

Epoch: 1, loss: 0.43536
--- 4.966158151626587 seconds ---
Epoch: 2, loss: 0.31675
--- 10.054753303527832 seconds ---
Epoch: 3, loss: 0.27250
--- 15.240069389343262 seconds ---
Epoch: 4, loss: 0.24484
--- 20.281968355178833 seconds ---
Epoch: 5, loss: 0.22212
--- 25.012757539749146 seconds ---
Epoch: 6, loss: 0.20271
--- 29.83329129219055 seconds ---
Epoch: 7, loss: 0.18662
--- 34.940752267837524 seconds ---
Epoch: 8, loss: 0.17309
--- 40.00309920310974 seconds ---
Epoch: 9, loss: 0.16132
--- 44.855435848236084 seconds ---
Epoch: 10, loss: 0.15103
--- 49.7520067691803 seconds ---
Epoch: 11, loss: 0.14170
--- 54.78516912460327 seconds ---
Epoch: 12, loss: 0.13287
--- 59.84591364860535 seconds ---
Epoch: 13, loss: 0.12430
--- 64.88073539733887 seconds ---
Epoch: 14, loss: 0.11588
--- 69.62064933776855 seconds ---
Epoch: 15, loss: 0.10745
--- 74.61324453353882 seconds ---
Total time taken : 
--- 74.61415505409241 seconds ---


In [312]:
#Evaluation for training data

pred_tag = []
actual_tag = []
for sen, tag in training_input_data:
    tag_scores = model1(sen)
    _, predicted_tags = torch.max(tag_scores, 1)
    pred = predicted_tags.cpu().tolist()
    pred = [idx2tag[idx] for idx in pred]
    pred_tag.append(pred)
    tag = tag.cpu().tolist()
    act = [idx2tag[idx] for idx in tag]
    actual_tag.append(act)


In [313]:
pred_tag_f = [item for sublist in pred_tag for item in sublist]
actual_tag_f = [item for sublist in actual_tag for item in sublist]

In [314]:
from sklearn.metrics import classification_report

label_names = ['O', 'C', 'E']

print(classification_report(actual_tag_f, pred_tag_f,target_names=label_names))

              precision    recall  f1-score   support

           O       0.88      0.52      0.65       890
           C       0.86      0.67      0.75       887
           E       0.95      0.99      0.97     13185

    accuracy                           0.94     14962
   macro avg       0.90      0.73      0.79     14962
weighted avg       0.94      0.94      0.94     14962



In [315]:
#Evaluation for testing data

pred_tag = []
actual_tag = []
for sen, tag in testing_input_data:
    tag_scores = model1(sen)
    _, predicted_tags = torch.max(tag_scores, 1)
    pred = predicted_tags.cpu().tolist()
    pred = [idx2tag[idx] for idx in pred]
    pred_tag.append(pred)
    tag = tag.cpu().tolist()
    act = [idx2tag[idx] for idx in tag]
    actual_tag.append(act)


In [316]:
pred_tag_f = [item for sublist in pred_tag for item in sublist]
actual_tag_f = [item for sublist in actual_tag for item in sublist]

In [317]:
from sklearn.metrics import classification_report

label_names = ['O', 'C', 'E']

print(classification_report(actual_tag_f, pred_tag_f,target_names=label_names))

              precision    recall  f1-score   support

           O       0.63      0.38      0.47       224
           C       0.70      0.62      0.66       217
           E       0.94      0.98      0.96      3516

    accuracy                           0.92      3957
   macro avg       0.76      0.66      0.70      3957
weighted avg       0.91      0.92      0.92      3957



In [None]:
def clean_x(sen):
    sen = re.sub('[^a-zA-Z0-9 \n\.]', ' ', sen)
    sen = sen.replace('.', ' ')
    sen = sen.lower()
    sen = sen.split()
    return sen

def clean_y(rel):
    cause = rel[0]
    cause = re.sub('[^a-zA-Z0-9 \n\.]', ' ', cause)
    cause = cause.lower().split()
    effect = rel[1]
    effect = re.sub('[^a-zA-Z0-9 \n\.]', ' ', effect)
    effect = effect.lower().split()

    return ((cause, effect))

In [None]:
#TEST WITH ANY INPUT SENTENCE


def test_with_sen(input_sen):
    #input_sen = 'Smoking causes cancer'
    #input_sen = 'For example , the book also described how to distinguish between a drowning (water in the lungs) and strangulation (broken neck cartilage), along with other evidence from examining corpses on determining if a death was caused by murder, suicide or an accident.'
    input_sen = clean_x(input_sen)
    #input_sen = input_sen.lower().replace(".", "")

    inputs = prepare_sequence(input_sen, word2idx)

    tag_scores = model1(inputs)


    _, predicted_tags = torch.max(tag_scores, 1)

    print("Input sentence: ", input_sen)
    for i in range(0,len(input_sen)):
        t = ''
        if predicted_tags[i].item()==0:
            t = 'Other'
        elif predicted_tags[i].item()==1:
            t = 'Cause'
        elif predicted_tags[i].item()==2:
            t = 'Effect'
        #elif predicted_tags[i].item==3:
        #    t = 'B-Effect'
        #else:
        #    t = 'I-Effect'
        print(predicted_tags[i].item())
        print(input_sen[i] , ': ', t )
    return
    

In [None]:
test_sen = 'Smoking causes cancer'
test_with_sen(test_sen)

In [None]:
a = ' '.join(test_x[46])

print(a)
test_with_sen(a)