In [13]:
import sys
import os
import re
import string
import json
import urllib.request
import numpy as np

from tqdm import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

torch.manual_seed(1)

<torch._C.Generator at 0x115ba73d0>

In [14]:
device = torch.device("cpu")

In [23]:
with open('Data/labels.json') as f:
    labels = json.load(f)

In [24]:
def read_file_line_by_line(filename):
    with open(filename, 'r') as file:
        text = file.readlines()
    return text
train_data = read_file_line_by_line('Data/train.txt')

In [25]:
def build_vocab(data):
    vocab = {}
    num_words = 0
    for line in data:
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0].lower()
            if word not in vocab:
                vocab[word] = num_words
                num_words += 1
    return vocab

In [26]:
vocab = build_vocab(train_data)

In [28]:
embeddings = {}
emb_dim = 50
with open('glove.6B/glove.6B.50d.txt','r') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:],'float32')
    embeddings[word]=vector

In [45]:
full_vocab_keys = list(embeddings.keys())
full_vocab_keys.append("<unk>")
full_vocab = {k: v for v, k in enumerate(full_vocab_keys)}

In [62]:
matrix_len = len(full_vocab)
weights_matrix = np.zeros((matrix_len, emb_dim))

for i, word in enumerate(full_vocab):
    try: 
        weights_matrix[i] = embeddings[word]
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))

In [63]:
weights_matrix = torch.from_numpy(weights_matrix).float()

In [64]:
def get_data(data):
    sent_labels = []
    all_labels = []
    sent_embeddings = []
    all_embeddings = []
    for line in (data):
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            tag = split_line[1]
            tag = tag.replace("\n", "")
            word = word.lower()
            if word in full_vocab:
                sent_embeddings.append(full_vocab[word])
            else:
                sent_embeddings.append(full_vocab["<unk>"])
            tag_idx = labels[tag]
            sent_labels.append(tag_idx)
        elif line=="\n":
            sent_embeddings = np.array(sent_embeddings)
            sent_labels = np.array(sent_labels)
            all_embeddings.append(sent_embeddings)
            all_labels.append(sent_labels)
            sent_embeddings = []
            sent_labels = []
    return np.asarray(all_embeddings), np.asarray(all_labels)

In [70]:
trainX, trainY = get_data(train_data)

  return np.asarray(all_embeddings), np.asarray(all_labels)


In [66]:
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [108]:
class BiLSTM(nn.Module):
    def __init__(self, weights_matrix, hidden_dim, tagset_size):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)
        
    def init_hidden(self):
        return (torch.zeros(2, 1, self.hidden_dim),
                torch.zeros(2, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.softmax(tag_space, dim=1)
        return tag_scores

In [112]:
def train_model(model, train_data, train_labels, epochs):
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    for epoch in (range(epochs)):
        print("Training Epoch {}".format(epoch))
        for sentence, tags in tqdm(zip(train_data, train_labels)):
            model.zero_grad()
            # convert sentence to tensor
            sentence_in = torch.tensor(sentence)
            tag_scores = model(sentence_in)
            # convert tags to tensor
            tags = torch.tensor(tags, dtype=torch.long)
            loss = loss_function(tag_scores, tags)
            loss.backward()
            optimizer.step()
        # compute loss
        print("Loss: {}".format(loss.item()))
    return model

In [113]:
ner = BiLSTM(weights_matrix, 64, 39)

In [114]:
trained_model = train_model(ner, trainX, trainY, 10)

Currently on Epoch 0


698it [00:03, 228.21it/s]


KeyboardInterrupt: 

In [99]:
inp = torch.tensor(trainX[0])
out = ner(inp)

# take the argmax of the output
out = out.detach().numpy()
out = np.argmax(out, axis=1)
out

  input_tensor = torch.tensor(sentence)


array([0, 2, 0, 0, 6, 0, 2, 0, 0, 0, 0, 6, 0, 2, 2, 0])

In [102]:
correct = 0
total = 0
for i in range(len(trainX)):
    inp = torch.tensor(trainX[i])
    out = ner(inp)
    # take the argmax of the output
    out = out.detach().numpy()
    out = np.argmax(out, axis=1)
    gold = trainY[i]

    correct += np.sum(out == gold)
    total += len(gold)

  input_tensor = torch.tensor(sentence)


In [103]:
correct/total *100

58.164094958140865