In [149]:
import sys
import os
import re
import string
import json
import urllib.request
import numpy as np

from tqdm import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

torch.manual_seed(1)

<torch._C.Generator at 0x115ba73d0>

In [150]:
device = torch.device("cpu")

In [151]:
with open('Data/labels.json') as f:
    labels = json.load(f)

In [152]:
def read_file_line_by_line(filename):
    with open(filename, 'r') as file:
        text = file.readlines()
    return text

In [153]:
train_data = read_file_line_by_line('Data/train.txt')
val_data = read_file_line_by_line('Data/dev.txt')

In [154]:
embeddings = {}
emb_dim = 50
with open('glove.6B/glove.6B.50d.txt','r') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:],'float32')
    embeddings[word]=vector

In [155]:
full_vocab_keys = list(embeddings.keys())
full_vocab_keys.append("<unk>")
full_vocab = {k: v for v, k in enumerate(full_vocab_keys)}

In [156]:
matrix_len = len(full_vocab)
weights_matrix = np.zeros((matrix_len, emb_dim))

for i, word in enumerate(full_vocab):
    try: 
        weights_matrix[i] = embeddings[word]
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))

In [157]:
weights_matrix = torch.from_numpy(weights_matrix).float()

In [158]:
def get_data(data):
    sent_labels = []
    all_labels = []
    sent_idx = []
    all_idx = []
    for line in (data):
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            tag = split_line[1]
            tag = tag.replace("\n", "")
            word = word.lower()
            if word in full_vocab:
                sent_idx.append(full_vocab[word])
            else:
                sent_idx.append(full_vocab["<unk>"])
            tag_idx = labels[tag]
            sent_labels.append(tag_idx)
        elif line=="\n":
            sent_idx = np.array(sent_idx)
            sent_labels = np.array(sent_labels)
            all_idx.append(sent_idx)
            all_labels.append(sent_labels)
            sent_idx = []
            sent_labels = []
    return np.asarray(all_idx, dtype=object), np.asarray(all_labels, dtype=object)

In [159]:
trainX, trainY = get_data(train_data)
valX, valY = get_data(val_data)

In [160]:
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [161]:
class BiLSTM(nn.Module):
    def __init__(self, weights_matrix, hidden_dim, tagset_size):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)
        
    def init_hidden(self):
        return (torch.zeros(2, 1, self.hidden_dim),
                torch.zeros(2, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.softmax(tag_space, dim=1)
        return tag_scores

In [162]:
def train_model(model, trainX, trainY, valX, valY, epochs):
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    for epoch in (range(epochs)):
        print("Training Epoch {}".format(epoch))
        for sentence, tags in tqdm(zip(trainX, trainY)):
            model.zero_grad()
            sentence_in = torch.tensor(sentence)
            tag_scores = model(sentence_in)
            tags = torch.tensor(tags, dtype=torch.long)
            loss = loss_function(tag_scores, tags)
            loss.backward()
            optimizer.step()
        
        val_loss = 0
        for sentence, tags in tqdm(zip(valX, valY)):
            sentence_in = torch.tensor(sentence)
            tag_scores = model(sentence_in)
            tags = torch.tensor(tags, dtype=torch.long)
            loss = loss_function(tag_scores, tags)
            val_loss += loss.item()
        print("Validation Loss: {}".format(val_loss/len(valX)))

    return model

In [163]:
ner = BiLSTM(weights_matrix, 64, 39)

In [164]:
trained_model = train_model(ner, trainX, trainY, valX, valY, 10)

Training Epoch 0


7198it [00:31, 226.41it/s]
2267it [00:02, 763.89it/s]


Validation Loss: 3.2738102113406566
Training Epoch 1


7198it [00:31, 227.90it/s]
2267it [00:02, 795.46it/s]


Validation Loss: 3.157193200200152
Training Epoch 2


7198it [00:31, 227.26it/s]
2267it [00:02, 791.41it/s]


Validation Loss: 3.1458017985937508
Training Epoch 3


7198it [00:31, 227.70it/s]
2267it [00:02, 788.84it/s]


Validation Loss: 3.1423842746043027
Training Epoch 4


7198it [00:31, 227.94it/s]
2267it [00:02, 791.14it/s]


Validation Loss: 3.1404846714728194
Training Epoch 5


7198it [00:31, 226.45it/s]
2267it [00:02, 757.95it/s]


Validation Loss: 3.138112233607423
Training Epoch 6


7198it [00:31, 225.56it/s]
2267it [00:02, 794.31it/s]


Validation Loss: 3.1360698358361607
Training Epoch 7


7198it [00:31, 227.79it/s]
2267it [00:02, 786.06it/s]


Validation Loss: 3.1345885845339136
Training Epoch 8


7198it [00:31, 229.29it/s]
2267it [00:02, 784.11it/s]


Validation Loss: 3.133643036115837
Training Epoch 9


7198it [00:31, 228.40it/s]
2267it [00:02, 793.44it/s]

Validation Loss: 3.1329308340714586





In [171]:
correct = 0
total = 0
for i in range(len(valX)):
    inp = torch.tensor(valX[i])
    out = ner(inp)
    # take the argmax of the output
    out = out.detach().numpy()
    out = np.argmax(out, axis=1)
    
    gold = valY[i]
    correct += np.sum(out == gold)
    total += len(gold)

In [172]:
correct/total *100

57.628238662958786