In [202]:
import sys
import os
import re
import string
import json
import urllib.request
import numpy as np

from tqdm import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader, TensorDataset

torch.manual_seed(1)
device = torch.device("cpu")

In [203]:
BATCH_SIZE = 4

In [204]:
with open('Data/labels.json') as f:
    labels = json.load(f)

In [205]:
def read_file(filename):
    with open(filename, 'r') as file:
        text = file.readlines()
    return text

In [206]:
train_data = read_file('Data/train.txt')
val_data = read_file('Data/dev.txt')

In [207]:
embeddings = {}
emb_dim = 50
with open('glove.6B/glove.6B.50d.txt','r') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:],'float32')
    embeddings[word]=vector

In [208]:
vocab_keys = list(embeddings.keys())
vocab_keys.append("<unk>")
vocab_keys.append("<pad>")
vocab = {k: v for v, k in enumerate(vocab_keys)}

In [209]:
matrix_len = len(vocab)
weights_matrix = np.zeros((matrix_len, emb_dim))

for i, word in enumerate(vocab):
    try: 
        weights_matrix[i] = embeddings[word]
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))

In [210]:
weights_matrix = torch.from_numpy(weights_matrix).float()

In [211]:
def get_data(data):
    sent_labels = []
    all_labels = []
    sent_idx = []
    all_idx = []
    for line in (data):
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            tag = split_line[1]
            tag = tag.replace("\n", "")
            word = word.lower()
            if word in vocab:
                sent_idx.append(vocab[word])
            else:
                sent_idx.append(vocab["<unk>"])
            tag_idx = labels[tag]
            sent_labels.append(tag_idx)
        elif line=="\n":
            sent_idx = np.array(sent_idx)
            sent_labels = np.array(sent_labels)
            all_idx.append(sent_idx)
            all_labels.append(sent_labels)
            sent_idx = []
            sent_labels = []
    return np.asarray(all_idx, dtype=object), np.asarray(all_labels, dtype=object)

In [212]:
trainX, trainY = get_data(train_data)
valX, valY = get_data(val_data)

In [213]:
trainData = []
valData = []
for i in range(len(trainX)):
    trainData.append((trainX[i], trainY[i]))
for i in range(len(valX)):
    valData.append((valX[i], valY[i]))
trainData = np.array(trainData, dtype=object)
valData = np.array(valData, dtype=object)

In [214]:
def custom_collate(data):
    
    batch_size = len(data)
    
    max_len = -1
    for i in range(batch_size):
        if len(data[i][0]) > max_len:
            max_len = len(data[i][0])
    
    padded_data = []
    padded_labels = []
    for i in range(batch_size):
        padded_data.append(np.pad(data[i][0], (0, max_len-len(data[i][0])), 'constant', constant_values=(vocab["<pad>"])))
        padded_labels.append(np.pad(data[i][1], (0, max_len-len(data[i][1])), 'constant', constant_values=["37"]))
    
    padded_data = torch.from_numpy(np.array(padded_data))
    padded_labels = torch.from_numpy(np.array(padded_labels))

    return [padded_data, padded_labels]

In [215]:
trainDataLoader = DataLoader(trainData, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)
valDataLoader = DataLoader(valData, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)

In [216]:
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [217]:
class BiLSTM(nn.Module):
    def __init__(self, weights_matrix, hidden_dim, tagset_size):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)
        
    def init_hidden(self):
        return (torch.zeros(2, 1, self.hidden_dim),
                torch.zeros(2, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = F.softmax(tag_space, dim=1)
        return tag_scores

In [219]:
def train_one_epoch(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in tqdm(iterator):
        optimizer.zero_grad()
        X, y = batch
        predictions = model(X)
        predictions = predictions.permute(0, 2, 1)
        loss = criterion(predictions, y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [220]:
def train_model(model,epochs):
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    for epoch in (range(epochs)):
        print("Training Epoch {}".format(epoch))
        training_loss = train_one_epoch(model, trainDataLoader, optimizer, loss_function)
        print("Training Loss: {}".format(training_loss))

    return model

In [221]:
ner = BiLSTM(weights_matrix, 64, 39)

In [222]:
trained_model = train_model(ner, 10)

Training Epoch 0


100%|██████████| 1800/1800 [00:03<00:00, 457.80it/s]


Training Loss: 3.6430135454071895
Training Epoch 1


100%|██████████| 1800/1800 [00:03<00:00, 474.06it/s]


Training Loss: 3.6213050679365795
Training Epoch 2


100%|██████████| 1800/1800 [00:03<00:00, 471.71it/s]


Training Loss: 3.6108315796322294
Training Epoch 3


100%|██████████| 1800/1800 [00:03<00:00, 475.30it/s]


Training Loss: 3.5932151340113747
Training Epoch 4


100%|██████████| 1800/1800 [00:03<00:00, 476.63it/s]


Training Loss: 3.5796316646205053
Training Epoch 5


100%|██████████| 1800/1800 [00:03<00:00, 467.61it/s]


Training Loss: 3.569438490205341
Training Epoch 6


100%|██████████| 1800/1800 [00:03<00:00, 459.93it/s]


Training Loss: 3.5585711288452146
Training Epoch 7


100%|██████████| 1800/1800 [00:03<00:00, 459.21it/s]


Training Loss: 3.5481018348534903
Training Epoch 8


100%|██████████| 1800/1800 [00:03<00:00, 463.98it/s]


Training Loss: 3.5391218022505444
Training Epoch 9


100%|██████████| 1800/1800 [00:03<00:00, 479.55it/s]

Training Loss: 3.531314436859555





In [223]:
correct = 0
total = 0
for i in range(len(valX)):
    inp = torch.tensor(valX[i])
    out = ner(inp)
    # take the argmax of the output
    out = out.detach().numpy()
    out = np.argmax(out, axis=1)
    
    gold = valY[i]
    correct += np.sum(out == gold)
    total += len(gold)

In [224]:
correct/total *100

55.52465651587324