In [0]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from google.colab import drive
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import itertools
import numpy as np

# Loading Data

In [0]:
# All three splits are CSV files hosted on Google Drive and share one text encoding.
CSV_ENCODING = "ISO-8859-1"
TRAIN_CSV_URL = 'https://drive.google.com/uc?id=17s-v7RkT7LTojqDDGkVa1dbi6GhukUu6'
VAL_CSV_URL = 'https://drive.google.com/uc?id=1afH0fbRM8w9N41R0o9WHHvr1nEso8UEb'
TEST_CSV_URL = 'https://drive.google.com/uc?id=1CRQv7ojJG0wSxaRZiotXo4Dwcb2ChKDH'

train_df = pd.read_csv(TRAIN_CSV_URL, encoding=CSV_ENCODING)
val_df = pd.read_csv(VAL_CSV_URL, encoding=CSV_ENCODING)
test_df = pd.read_csv(TEST_CSV_URL, encoding=CSV_ENCODING)

In [0]:
# Raw sentence / tag strings of each split as plain Python lists.
# Train and validation carry both 'Sentence' and 'NER'; only 'Sentence' is read from test.
sen_list, tag_list = (train_df[column].tolist() for column in ('Sentence', 'NER'))
sen_list_val, tag_list_val = (val_df[column].tolist() for column in ('Sentence', 'NER'))
sen_list_test = test_df['Sentence'].tolist()

# Number of training rows; used later to sample a random training example.
n_data = len(train_df)

In [0]:
def pre_process(sent_list):
    """Tokenise each sentence by splitting it on whitespace.

    Args:
        sent_list: iterable of strings.

    Returns:
        A list with one token list per input string, in the same order.
    """
    return [sentence.split() for sentence in sent_list]

# Pre-processing for GRU Encoder-Decoder with Attention

In [0]:
# Whitespace-tokenised sentences for every split.
input_token_list = pre_process(sen_list)
input_token_list_val = pre_process(sen_list_val)
# The test split is tokenised here only so that it is taken into account when
# MAX_LENGTH is established (memory constraints of Colab).
input_token_list_test = pre_process(sen_list_test)

# Whitespace-tokenised tag sequences.
answer_token_list = pre_process(tag_list)
answer_token_list_val = pre_process(tag_list_val)

# Decoder-side sequences: "<BOS>" prepended for the input, "<EOS>" appended for the target.
output_token_list = [["<BOS>", *tags] for tags in answer_token_list]
target_token_list = [[*tags, "<EOS>"] for tags in answer_token_list]

# Everything the vocabulary is built from, in this order: train sentences,
# validation sentences, <BOS>-prefixed train tags, test sentences.
# Test words are included so that no OOV word appears when test data is indexed.
total_token_list = (
    pre_process(sen_list)
    + input_token_list_val
    + output_token_list
    + input_token_list_test
)

# Flat stream of tokens, preserving the order above.
tot = [token for tokens in total_token_list for token in tokens]

In [6]:
# Length of the longest token sequence across the vocabulary source lists,
# the <EOS>-terminated targets and the test sentences.
MAX_LENGTH = max(
    len(tokens)
    for tokens in itertools.chain(total_token_list, target_token_list, input_token_list_test)
)
MAX_LENGTH

124

## Converting words and data to indices

In [0]:
# Token -> integer id. The two special tokens get the fixed ids 0 and 1;
# every other token receives the next free id in order of first appearance.
word_to_ix = {"<BOS>": 0, "<EOS>": 1}

for token in tot:
    # setdefault leaves an already-registered token untouched.
    word_to_ix.setdefault(token, len(word_to_ix))

# Inverse lookup: position in this list == id of the token.
word_list = list(word_to_ix)

In [0]:
def to_index(data, to_ix):
    """Map every token of every sequence to its integer id.

    Args:
        data: iterable of token sequences.
        to_ix: mapping from token to id.

    Returns:
        A list of id lists with the same nesting and order as `data`.

    Raises:
        KeyError: if a token is missing from `to_ix`.
    """
    return [[to_ix[token] for token in tokens] for tokens in data]

# Id-encoded training data: encoder inputs, <BOS>-prefixed decoder inputs
# and <EOS>-terminated decoder targets.
input_index, output_index, target_index = (
    to_index(token_lists, word_to_ix)
    for token_lists in (input_token_list, output_token_list, target_token_list)
)

# Input embedding

## Word embedding

In [9]:
import gensim.downloader as api

# Pre-trained GloVe vectors ("glove-twitter-25") fetched through gensim's downloader.
word_emb_model = api.load("glove-twitter-25")

embedding_size = 25

# One row per vocabulary entry, in word_list order, so that row i is the
# vector of the token whose id is i.
embedding_matrix = []
for word in word_list:
    try:
        # Index the loaded vectors directly. The `.wv` alias on KeyedVectors is
        # deprecated in gensim 3.x and removed in 4.0; combined with a bare
        # `except`, the resulting AttributeError would silently turn EVERY row
        # into zeros.
        embedding_matrix.append(word_emb_model[word])
    except KeyError:
        # Token unknown to the pre-trained model: fall back to an all-zero vector.
        embedding_matrix.append([0]*embedding_size)
embedding_matrix = np.array(embedding_matrix)



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  if __name__ == '__main__':


## PoS tags

In [10]:
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.tag.perceptron import PerceptronTagger

# Tag the whole vocabulary in one call; the result is converted to a 2-D
# string array whose column 0 holds the word and column 1 its PoS tag.
pretrain = PerceptronTagger()
POS_mat=pretrain.tag(word_list)
POS_mat=np.array(POS_mat)

# Distinct PoS tags in order of first appearance. Derived from the full tag
# column instead of a hard-coded row count, so it stays correct when the
# vocabulary size changes.
diff_pos = list(dict.fromkeys(POS_mat[:, 1]))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [0]:
from sklearn.preprocessing import OneHotEncoder

onehotencoder = OneHotEncoder()

# fit_transform expects a 2-D input, so the 1-D PoS-tag column is reshaped to (n, 1).
pos_tag_column = POS_mat[:, 1].reshape(-1, 1)
X = onehotencoder.fit_transform(pos_tag_column).toarray()

# One-hot PoS features as a DataFrame, ready to be concatenated with the word vectors.
pos_matrix = pd.DataFrame(X)

## Final embedding matrix

In [0]:
# Place the word-vector columns and the one-hot PoS columns side by side
# (row i still describes the token with id i), then go back to a NumPy array.
emb_mat_df = pd.DataFrame(embedding_matrix)
merged_mat = pd.concat([emb_mat_df, pos_matrix], axis=1)
merged_matrix = np.array(merged_mat)

# Encoder

In [0]:
class EncoderRNN(nn.Module):
    """Token-by-token encoder made of two chained GRU layers.

    Args:
        input_size: accepted for interface compatibility; not used by the layers.
        hidden_size: width of the embedding output and of both GRU states.
        embedding: embedding module applied to the incoming token id.
    """

    def __init__(self, input_size, hidden_size, embedding):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Both GRUs map hidden_size -> hidden_size.
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.gru2 = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode one token.

        The embedded token is reshaped to (1, 1, -1), run through the first
        GRU, and that GRU's output and hidden state are fed to the second GRU.

        Returns:
            The (output, hidden) pair produced by the second GRU.
        """
        token_vector = self.embedding(input).view(1, 1, -1)
        first_output, first_hidden = self.gru(token_vector, hidden)
        return self.gru2(first_output, first_hidden)

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        initial_state = torch.zeros(1, 1, self.hidden_size, device=device)
        return initial_state

# Decoder

In [0]:
class AttnDecoderRNN(nn.Module):
    """Single-step GRU decoder with (scaled) dot-product attention.

    Each call to `forward` consumes one token id, updates the hidden state
    through two chained GRUs, attends over the encoder states and returns
    log-probabilities over `output_size` classes.

    Args:
        hidden_size: width of the embedding output and of the GRU states.
        output_size: number of output classes of the final linear layer.
        embedding: embedding module applied to the incoming token id.
        dropout_p: dropout probability applied to the embedded token.
        max_length: stored on the instance; not used by the layers themselves.
    """
    ATTN_TYPE_DOT_PRODUCT = "Dot Product"
    ATTN_TYPE_SCALE_DOT_PRODUCT = "Scale Dot Product"

    def __init__(self, hidden_size, output_size, embedding, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = embedding
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.gru2 = nn.GRU(self.hidden_size, self.hidden_size)
        # The classifier sees the attention context concatenated with the hidden state.
        self.out = nn.Linear(self.hidden_size*2, self.output_size)

    def cal_attention(self, hidden, encoder_hiddens, method):
        """Attend over the encoder states with the current decoder state.

        Args:
            hidden: decoder hidden state, used as a (1, 1, hidden_size) batch.
            encoder_hiddens: 2-D tensor with one encoder state per row.
            method: ATTN_TYPE_DOT_PRODUCT or ATTN_TYPE_SCALE_DOT_PRODUCT.

        Returns:
            The attention context concatenated with the hidden state along dim 1.

        Raises:
            ValueError: if `method` is not one of the two supported types.
        """
        if method == AttnDecoderRNN.ATTN_TYPE_DOT_PRODUCT:
            scale = 1.0
        elif method == AttnDecoderRNN.ATTN_TYPE_SCALE_DOT_PRODUCT:
            # Use the instance's own size rather than a module-level global,
            # so the class works independently of notebook execution order.
            scale = 1.0 / self.hidden_size ** 0.5
        else:
            raise ValueError("Unknown attention type: %r" % (method,))

        # NOTE(review): every row of encoder_hiddens takes part in the softmax,
        # including rows the caller left as zero padding.
        scores = torch.bmm(hidden, encoder_hiddens.T.unsqueeze(0))
        attn_weights = F.softmax(scale * scores, dim=-1)
        attn_output = torch.bmm(attn_weights, encoder_hiddens.unsqueeze(0))
        return torch.cat((attn_output[0], hidden[0]), 1)

    def forward(self, input, hidden, encoder_hiddens):
        """Run one decoding step.

        Returns:
            (output, hidden): log-softmax scores over the output classes
            (taken along dim 1) and the updated hidden state.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        emb, hidden = self.gru(embedded, hidden)
        _, hidden = self.gru2(emb, hidden)

        concat_output = self.cal_attention(hidden, encoder_hiddens, AttnDecoderRNN.ATTN_TYPE_SCALE_DOT_PRODUCT)

        output = F.log_softmax(self.out(concat_output), dim=1)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Train function

In [0]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one teacher-forced optimisation step on a single example.

    Args:
        input_tensor: source token ids, indexed along dim 0.
        target_tensor: target token ids, indexed along dim 0.
        encoder, decoder: the two networks to update.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss applied to each decoder output / target pair.
        max_length: number of rows allocated for the encoder states.

    Returns:
        The summed loss divided by the number of target steps.
    """
    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    source_len = input_tensor.size(0)
    target_len = target_tensor.size(0)

    # Encode the source token by token, keeping the hidden state after each one.
    # Rows beyond source_len stay zero.
    state = encoder.initHidden()
    memory = torch.zeros(max_length, encoder.hidden_size, device=device)
    for pos in range(source_len):
        _, state = encoder(input_tensor[pos], state)
        memory[pos] = state[0, 0]

    # Decoding starts from token id 0 and from the final encoder state.
    previous_token = torch.tensor([[0]], device=device)
    total_loss = 0
    for step in range(target_len):
        log_probs, state = decoder(previous_token, state, memory)
        gold_token = target_tensor[step]
        total_loss += criterion(log_probs, gold_token)
        # Teacher forcing: the gold token, not the prediction, is fed next.
        previous_token = gold_token

    total_loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return total_loss.item() / target_len

# Evaluate function

In [0]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode an output sequence for one raw sentence.

    Both networks are switched to eval mode for the duration of the call
    (so dropout is disabled during inference) and restored to their previous
    mode afterwards, which keeps an enclosing training loop unaffected.

    Args:
        encoder, decoder: trained networks.
        sentence: raw sentence string; it is tokenised with `pre_process`.
        max_length: rows allocated for encoder states and the maximum number
            of decoding steps.

    Returns:
        The decoded words (looked up in `word_list`), without the end marker.

    Raises:
        KeyError: if a token of `sentence` is missing from `word_to_ix`.
        IndexError: if the sentence has more than `max_length` tokens.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_sent = pre_process([sentence])[0]
            input_ids = [word_to_ix[word] for word in input_sent]
            input_tensor = torch.LongTensor([[ind] for ind in input_ids]).to(device)

            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_hiddens = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
                encoder_hiddens[ei] += encoder_hidden[0, 0]

            decoder_input = torch.tensor([[0]], device=device)  # id 0 is "<BOS>"

            decoder_hidden = encoder_hidden

            decoded_words = []

            for di in range(max_length):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_hiddens)
                topv, topi = decoder_output.data.topk(1)
                predicted_ix = topi.item()
                if predicted_ix == 1:  # id 1 is "<EOS>": stop decoding
                    break
                decoded_words.append(word_list[predicted_ix])

                # Greedy decoding: the prediction becomes the next input.
                decoder_input = topi.squeeze().detach()

            return decoded_words
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

# Validation accuracy function

In [0]:
from sklearn.metrics import accuracy_score

# Raw validation sentences, decoded one by one during validation.
test_words_input = val_df['Sentence'].tolist()

# Gold tag sequence of every validation sentence.
gr_tags = [i.split() for i in tag_list_val]

# Flat list of all gold validation tags. Built from the split sequences:
# iterating the raw tag strings directly would yield single characters.
val_true_tags = [tag for sentence_tags in gr_tags for tag in sentence_tags]

def calc_acc(pred,actual):
    """Average per-sentence tag accuracy after aligning prediction lengths.

    Each predicted sequence is aligned with its reference: it is padded with
    'O' when too short and truncated when too long. The accuracy of a
    sentence is the fraction of positions whose tags match (the same value
    sklearn's `accuracy_score` yields for equal-length label lists).

    The input lists are not modified; aligned copies are returned instead.

    Args:
        pred: list of predicted tag lists.
        actual: list of reference tag lists; `actual[i]` belongs to `pred[i]`.

    Returns:
        (mean_accuracy, aligned_predictions). For an empty `pred` the result
        is `(0.0, [])`. An empty reference sentence counts as fully correct,
        since its aligned prediction is empty as well.
    """
    if len(pred) == 0:
        return 0.0, []

    total = 0.0
    pred_tags = []
    for i, predicted in enumerate(pred):
        reference = actual[i]
        # Work on a copy so the caller's prediction list is left untouched.
        aligned = list(predicted[:len(reference)])
        aligned.extend(['O'] * (len(reference) - len(aligned)))
        if reference:
            hits = sum(1 for p, r in zip(aligned, reference) if p == r)
            total += hits / len(reference)
        else:
            total += 1.0
        pred_tags.append(aligned)
    return (total / len(pred)), pred_tags

# Train-Iterations function

In [0]:
import random
def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01, val_after=20000):
    """Train on randomly sampled examples and periodically report progress.

    Every iteration draws one training example at random (with replacement)
    and performs a single `train` step. The average training loss is printed
    every `print_every` iterations. Once more than `val_after` iterations
    have run, validation accuracy is also printed every `4 * print_every`
    iterations.

    Args:
        encoder, decoder: networks to train; validation uses these same objects.
        n_iters: number of training iterations.
        print_every: reporting period for the training loss.
        learning_rate: learning rate of both SGD optimizers.
        val_after: validation is skipped until this many iterations have run,
            to save computation early in training.
    """
    print_loss_total = 0  # Reset every print_every

    # NOTE(review): if encoder and decoder share a sub-module (e.g. the same
    # nn.Embedding), its parameters are registered with both optimizers and
    # are therefore stepped twice per iteration - confirm this is intended.
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()

    # Validation period; a multiple of print_every, so print_loss_avg is
    # always freshly computed when validation is reported.
    val_print = print_every*4

    for step in range(1, n_iters + 1):
        random_choice_ix = random.choice(range(n_data))
        input_index_r = [[ind] for ind in input_index[random_choice_ix]]
        target_index_r = [[ind] for ind in target_index[random_choice_ix]]

        input_tensor = torch.LongTensor(input_index_r).to(device)
        target_tensor = torch.LongTensor(target_index_r).to(device)

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('Epochs - (%d %d%%) %.4f' % (step, step / n_iters * 100, print_loss_avg))

        if step > val_after and step % val_print == 0:
            # Decode with the networks passed in, not with notebook globals.
            val_predicted = [
                evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH)
                for sentence in test_words_input
            ]
            val_acc, val_tags_pred = calc_acc(val_predicted, gr_tags)
            print('Epochs - (%d %d%%) Train loss - %.4f Validation accuracy - %.4f' % (step, step / n_iters * 100, print_loss_avg, val_acc))

# Training

In [24]:
# Rows of the merged matrix = vocabulary size; columns = feature width per
# token, which is also used as the hidden size of both networks.
input_size, hidden_size = merged_matrix.shape

# Embedding table initialised from the merged feature matrix and shared by
# encoder and decoder.
embedding = nn.Embedding(input_size, hidden_size)
pretrained_weights = torch.from_numpy(merged_matrix)
embedding.weight.data.copy_(pretrained_weights)

encoder1 = EncoderRNN(input_size, hidden_size, embedding).to(device)
# The decoder predicts over the same vocabulary, hence output_size == input_size.
attn_decoder1 = AttnDecoderRNN(hidden_size, input_size, embedding, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 48000, print_every=1000)

Epochs - (1000 2%) 1.3608
Epochs - (2000 4%) 0.9279
Epochs - (3000 6%) 0.7922
Epochs - (4000 8%) 0.7285
Epochs - (5000 10%) 0.6494
Epochs - (6000 12%) 0.6228
Epochs - (7000 14%) 0.6106
Epochs - (8000 16%) 0.5596
Epochs - (9000 18%) 0.5134
Epochs - (10000 20%) 0.5091
Epochs - (11000 22%) 0.4954
Epochs - (12000 25%) 0.4612
Epochs - (13000 27%) 0.4330
Epochs - (14000 29%) 0.4274
Epochs - (15000 31%) 0.4073
Epochs - (16000 33%) 0.3902
Epochs - (17000 35%) 0.3627
Epochs - (18000 37%) 0.3366
Epochs - (19000 39%) 0.3294
Epochs - (20000 41%) 0.3273
Epochs - (21000 43%) 0.2964
Epochs - (22000 45%) 0.2965
Epochs - (23000 47%) 0.2736
Epochs - (24000 50%) 0.2738
Epochs - (24000 50%) Train loss - 0.2738 Validation accuracy - 0.9081
Epochs - (25000 52%) 0.2555
Epochs - (26000 54%) 0.2542
Epochs - (27000 56%) 0.2332
Epochs - (28000 58%) 0.2355
Epochs - (28000 58%) Train loss - 0.2355 Validation accuracy - 0.8979
Epochs - (29000 60%) 0.2443
Epochs - (30000 62%) 0.2227
Epochs - (31000 64%) 0.1971
Epoch