In [1]:
import sys
import os
import re
import string
import json
import urllib.request
import numpy as np

from tqdm import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

torch.manual_seed(1)

<torch._C.Generator at 0x10779b410>

In [20]:
device = torch.device("cpu")

In [4]:
with open('Data/labels.json') as f:
    labels = json.load(f)

In [5]:
additional_labels = {
    "START_OF_SENTENCE": 37,
    "END_OF_SENTENCE": 38,
    "PAD": 39
}

In [6]:
labels.update(additional_labels)

In [7]:
def read_file_line_by_line(filename):
    with open(filename, 'r') as file:
        text = file.readlines()
    return text
train_data = read_file_line_by_line('Data/train.txt')

In [8]:
def build_vocab(data):
    vocab = {}
    num_words = 0
    for line in data:
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            if word not in vocab:
                vocab[word] = num_words
                num_words += 1
    return vocab

In [9]:
vocab = build_vocab(train_data)

In [10]:
emmbeddings = {}
with open('glove.6B/glove.6B.200d.txt','r') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:],'float32')
    emmbeddings[word]=vector

In [24]:
def get_data(data, max_length = 150):
    sent_labels = []
    all_labels = []
    sent_embeddings = []
    all_embeddings = []
    for line in (data):
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            tag = split_line[1]
            tag = tag.replace("\n", "")
            if word in emmbeddings:
                embed = np.array(emmbeddings[word])
                sent_embeddings.append(embed)
            else:
                sent_embeddings.append(np.zeros(200))
            tag_idx = labels[tag]
            sent_labels.append(tag_idx)
        elif line=="\n":
            # if len(sent_embeddings) < max_length:
            #     for i in range(max_length - len(sent_embeddings)):
            #         sent_embeddings.append(np.zeros(200))
            #         sent_labels.append(38)
            sent_embeddings = np.array(sent_embeddings)
            sent_labels = np.array(sent_labels)
            all_embeddings.append(sent_embeddings)
            all_labels.append(sent_labels)
            sent_embeddings = []
            sent_labels = []
    return np.asarray(all_embeddings), np.asarray(all_labels)

In [25]:
train_data_embed, train_labels = get_data(train_data)

  return np.asarray(all_embeddings), np.asarray(all_labels)


In [26]:
train_data_embed.shape

(7198,)

In [27]:
max_yet = -1
for i in range(len(train_data_embed)):
    if len(train_data_embed[i]) > max_yet:
        max_yet = len(train_data_embed[i])
max_yet

110

In [28]:
class BiLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, tagset_size):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)
        self.word_embedding = nn.from_pretrained_embeddings()    
        
    def init_hidden(self):
        return (torch.zeros(2, 1, self.hidden_dim),
                torch.zeros(2, 1, self.hidden_dim))

    def forward(self, sentence):
        # sentence: [[size 200],[size 200],[size 200]....]
        input_tensor = torch.tensor(sentence, dtype=torch.float)
        lstm_out, _ = self.lstm(sentence)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = F.softmax(tag_space, dim=1)
        return tag_scores

In [43]:
def train_model(model, train_data, train_labels, epochs):
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    for epoch in (range(epochs)):
        for sentence, tags in tqdm(zip(train_data, train_labels)):
            model.zero_grad()
            model.hidden = model.init_hidden()
            # convert sentence to tensor
            sentence_in = torch.tensor(sentence, dtype=torch.float)
            tag_scores = model(sentence_in)
            # convert tags to tensor
            tags = torch.tensor(tags, dtype=torch.long)
            loss = loss_function(tag_scores, tags)
            loss.backward()
            optimizer.step()
    return model

In [44]:
ner = BiLSTM(200, 64, 39)

In [45]:
# train_data_tensor = torch.tensor(train_data_embed, dtype=torch.float)
# train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)

# to device
# ner = ner.to(device)
# train_data_tensor = train_data_tensor.to(device)
# train_labels_tensor = train_labels_tensor.to(device)

trained_model = train_model(ner, train_data_embed, train_labels, 10)

  input_tensor = torch.tensor(sentence, dtype=torch.float)
7198it [00:26, 275.53it/s]
7198it [00:25, 277.63it/s]
7198it [00:26, 272.19it/s]
7198it [00:25, 277.08it/s]
7198it [00:25, 277.72it/s]
7198it [00:26, 275.22it/s]
7198it [00:26, 272.57it/s]
7198it [00:25, 284.44it/s]
7198it [00:26, 268.92it/s]
7198it [00:27, 264.14it/s]


In [47]:
inp = torch.tensor(train_data_embed[0], dtype=torch.float)
out = ner(inp)

# take the argmax of the output
out = out.detach().numpy()
out = np.argmax(out, axis=1)
out

  input_tensor = torch.tensor(sentence, dtype=torch.float)


array([6, 2, 0, 0, 6, 0, 2, 0, 0, 0, 0, 6, 0, 2, 2, 0])

In [48]:
train_labels[0]

array([11,  2,  3,  0,  6, 13,  0,  2,  0, 11, 12,  0,  0,  2,  3,  6])

In [49]:
correct = 0
total = 0
for i in range(len(train_data_embed)):
    inp = torch.tensor(train_data_embed[i], dtype=torch.float)
    out = ner(inp)
    # take the argmax of the output
    out = out.detach().numpy()
    out = np.argmax(out, axis=1)
    gold = train_labels[i]

    correct += np.sum(out == gold)
    total += len(gold)

  input_tensor = torch.tensor(sentence, dtype=torch.float)


In [50]:
correct/total *100

56.68093242675768