In [1]:
import sys
import os
import re
import string
import json
import urllib.request
import numpy as np

from tqdm import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader, TensorDataset

torch.manual_seed(1)
device = torch.device("cpu")

In [2]:
BATCH_SIZE = 32

In [3]:
with open('Data/labels.json') as f:
    labels = json.load(f)

In [4]:
def read_file(filename):
    with open(filename, 'r') as file:
        text = file.readlines()
    return text

In [5]:
train_data = read_file('Data/train.txt')
val_data = read_file('Data/dev.txt')

In [6]:
embeddings = {}
emb_dim = 50
with open('glove.6B/glove.6B.50d.txt','r') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:],'float32')
    embeddings[word]=vector

In [7]:
# vocab_keys = list(embeddings.keys())
vocab_keys = []
vocab_keys.append("<unk>")
vocab_keys.append("<pad>")
vocab = {k: v for v, k in enumerate(vocab_keys)}

In [8]:
len(vocab)

2

In [9]:
def build_train_vocab(data):
    vocab = {}
    num_words = 0
    for line in data:
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            word = word.lower()
            if word not in vocab:
                vocab[word] = 1
    return vocab

In [10]:
train_vocab = build_train_vocab(train_data)
# extend the vocab with the train_vocab
idx = len(vocab)
for word in train_vocab:
    if word not in vocab:
        vocab[word] = idx
        idx += 1

In [11]:
len(vocab)

7399

In [12]:
matrix_len = len(vocab)
weights_matrix = np.zeros((matrix_len, emb_dim))

for i, word in enumerate(vocab):
    try: 
        weights_matrix[i] = embeddings[word]
    except KeyError:
        print("Generating random embedding for word: ", word)
        weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))

Generating random embedding for word:  <unk>
Generating random embedding for word:  <pad>
Generating random embedding for word:  omniprep
Generating random embedding for word:  hotplate
Generating random embedding for word:  andcentrifuge
Generating random embedding for word:  bio-rad
Generating random embedding for word:  cyanophage
Generating random embedding for word:  microfuge
Generating random embedding for word:  di-tagged
Generating random embedding for word:  30-kd
Generating random embedding for word:  300ml
Generating random embedding for word:  parafilm
Generating random embedding for word:  ~20
Generating random embedding for word:  vivaspin20
Generating random embedding for word:  *without
Generating random embedding for word:  m2200
Generating random embedding for word:  m0202
Generating random embedding for word:  c-
Generating random embedding for word:  n-
Generating random embedding for word:  1-5x106
Generating random embedding for word:  trypsinized
Generating rand

In [13]:
weights_matrix = torch.from_numpy(weights_matrix).float()

In [14]:
def get_data(data):
    sent_labels = []
    all_labels = []
    sent_idx = []
    all_idx = []
    for line in (data):
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            tag = split_line[1]
            tag = tag.replace("\n", "")
            word = word.lower()
            if word in vocab:
                sent_idx.append(vocab[word])
            else:
                sent_idx.append(vocab["<unk>"])
            tag_idx = labels[tag]
            sent_labels.append(tag_idx)
        elif line=="\n":
            sent_idx = np.array(sent_idx)
            sent_labels = np.array(sent_labels)
            all_idx.append(sent_idx)
            all_labels.append(sent_labels)
            sent_idx = []
            sent_labels = []
        else:
            print(line)
    return np.asarray(all_idx, dtype=object), np.asarray(all_labels, dtype=object)

In [15]:
trainX, trainY = get_data(train_data)
valX, valY = get_data(val_data)

In [16]:
trainData = []
valData = []
for i in range(len(trainX)):
    trainData.append((trainX[i], trainY[i]))
for i in range(len(valX)):
    valData.append((valX[i], valY[i]))
trainData = np.array(trainData, dtype=object)
valData = np.array(valData, dtype=object)

In [17]:
def custom_collate(data):
    
    batch_size = len(data)
    
    max_len = -1
    for i in range(batch_size):
        if len(data[i][0]) > max_len:
            max_len = len(data[i][0])
    
    seq_lengths = []
    for i in range(batch_size):
        seq_lengths.append(len(data[i][0]))
    
    padded_data = []
    padded_labels = []
    for i in range(batch_size):
        padded_data.append(np.pad(data[i][0], (0, max_len-len(data[i][0])), 'constant', constant_values=(vocab["<pad>"])))
        padded_labels.append(np.pad(data[i][1], (0, max_len-len(data[i][1])), 'constant', constant_values=["37"]))
    
    padded_data = torch.from_numpy(np.array(padded_data))
    padded_labels = torch.from_numpy(np.array(padded_labels))

    return [padded_data, padded_labels, seq_lengths]

In [18]:
trainDataLoader = DataLoader(trainData, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)
valDataLoader = DataLoader(valData, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)

In [19]:
class BiLSTM(nn.Module):
    def __init__(self, weights_matrix, hidden_dim, tagset_size):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding.from_pretrained(weights_matrix, freeze=False)
        embedding_dim = weights_matrix.shape[1]
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.dropout_layer   = nn.Dropout(p=0.5)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)

    def forward(self, sentence):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout_layer(lstm_out)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [24]:
def train_one_epoch(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in tqdm(iterator):
        optimizer.zero_grad()
        X, y, seq_lens = batch
        predictions = model(X)
        y = y.view(-1)
        predictions = predictions.view(-1, predictions.shape[-1])
        loss = criterion(predictions, y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [25]:
def train_model(model,epochs):
    loss_function = nn.CrossEntropyLoss(ignore_index=37)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    for epoch in (range(epochs)):
        print("Training Epoch {}".format(epoch))
        training_loss = train_one_epoch(model, trainDataLoader, optimizer, loss_function)
        print("Training Loss: {}".format(training_loss))

    return model

In [26]:
ner = BiLSTM(weights_matrix, 256, 38)

In [27]:
ner = train_model(ner, 30)

Training Epoch 0


100%|██████████| 225/225 [00:15<00:00, 14.86it/s]


Training Loss: 1.7943354278140597
Training Epoch 1


100%|██████████| 225/225 [00:15<00:00, 14.26it/s]


Training Loss: 1.2295388123724196
Training Epoch 2


100%|██████████| 225/225 [00:15<00:00, 14.68it/s]


Training Loss: 1.098670114411248
Training Epoch 3


100%|██████████| 225/225 [00:16<00:00, 13.79it/s]


Training Loss: 1.0203860931926303
Training Epoch 4


100%|██████████| 225/225 [00:15<00:00, 14.17it/s]


Training Loss: 0.9654548282093472
Training Epoch 5


100%|██████████| 225/225 [00:15<00:00, 14.14it/s]


Training Loss: 0.9214399947060479
Training Epoch 6


100%|██████████| 225/225 [00:15<00:00, 14.08it/s]


Training Loss: 0.8842188506656223
Training Epoch 7


100%|██████████| 225/225 [00:16<00:00, 14.05it/s]


Training Loss: 0.8527624938223097
Training Epoch 8


100%|██████████| 225/225 [00:15<00:00, 14.10it/s]


Training Loss: 0.8254117295477126
Training Epoch 9


100%|██████████| 225/225 [00:16<00:00, 13.94it/s]


Training Loss: 0.7989896400769552
Training Epoch 10


100%|██████████| 225/225 [00:16<00:00, 13.53it/s]


Training Loss: 0.773981403377321
Training Epoch 11


100%|██████████| 225/225 [00:15<00:00, 14.50it/s]


Training Loss: 0.7495540144708421
Training Epoch 12


100%|██████████| 225/225 [00:15<00:00, 14.51it/s]


Training Loss: 0.7243207825554742
Training Epoch 13


100%|██████████| 225/225 [00:15<00:00, 14.32it/s]


Training Loss: 0.698889729446835
Training Epoch 14


100%|██████████| 225/225 [00:16<00:00, 14.03it/s]


Training Loss: 0.672646959622701
Training Epoch 15


100%|██████████| 225/225 [00:16<00:00, 13.94it/s]


Training Loss: 0.6435366476906671
Training Epoch 16


100%|██████████| 225/225 [00:16<00:00, 13.91it/s]


Training Loss: 0.6138216838571761
Training Epoch 17


100%|██████████| 225/225 [00:16<00:00, 13.94it/s]


Training Loss: 0.5815838042895
Training Epoch 18


100%|██████████| 225/225 [00:16<00:00, 13.87it/s]


Training Loss: 0.547906479967965
Training Epoch 19


100%|██████████| 225/225 [00:16<00:00, 13.87it/s]


Training Loss: 0.5150726624329884
Training Epoch 20


100%|██████████| 225/225 [00:16<00:00, 13.82it/s]


Training Loss: 0.4832207907570733
Training Epoch 21


100%|██████████| 225/225 [00:16<00:00, 13.70it/s]


Training Loss: 0.44985538005828857
Training Epoch 22


100%|██████████| 225/225 [00:15<00:00, 14.07it/s]


Training Loss: 0.4207250227530797
Training Epoch 23


100%|██████████| 225/225 [00:16<00:00, 14.00it/s]


Training Loss: 0.39355158726374306
Training Epoch 24


100%|██████████| 225/225 [00:16<00:00, 13.74it/s]


Training Loss: 0.37010228282875485
Training Epoch 25


100%|██████████| 225/225 [00:16<00:00, 13.64it/s]


Training Loss: 0.34822922547658286
Training Epoch 26


100%|██████████| 225/225 [00:16<00:00, 13.61it/s]


Training Loss: 0.3281098571750853
Training Epoch 27


100%|██████████| 225/225 [00:16<00:00, 13.73it/s]


Training Loss: 0.3116471676694022
Training Epoch 28


100%|██████████| 225/225 [00:16<00:00, 13.69it/s]


Training Loss: 0.29124262664053174
Training Epoch 29


100%|██████████| 225/225 [00:16<00:00, 14.01it/s]

Training Loss: 0.2738841743601693





In [28]:
ner.eval()

BiLSTM(
  (embedding): Embedding(7399, 50)
  (lstm): LSTM(50, 256, bidirectional=True)
  (dropout_layer): Dropout(p=0.5, inplace=False)
  (hidden2tag): Linear(in_features=512, out_features=38, bias=True)
)

In [29]:
correct = 0
total = 0
for i in range(len(trainX)):
    inp = torch.tensor(trainX[i])
    out = ner(inp)
    # take the argmax of the output
    out = out.detach().numpy()
    out = np.argmax(out, axis=1)
    
    gold = trainY[i]
    correct += np.sum(out == gold)
    total += len(gold)

In [30]:
correct/total *100

71.86101158832412

In [31]:
# get classwise accuracy
classwise_correct = np.zeros(38)
classwise_total = np.zeros(38)
for i in range(len(trainX)):
    inp = torch.tensor(trainX[i])
    out = ner(inp)
    # take the argmax of the output
    out = out.detach().numpy()
    out = np.argmax(out, axis=1)
    
    gold = trainY[i]
    for j in range(len(gold)):
        classwise_correct[gold[j]] += (out[j] == gold[j])
        classwise_total[gold[j]] += 1

In [32]:
from sklearn.metrics import f1_score

In [33]:
# get validation predictions using valDataloader
val_preds = []
for batch in valDataLoader:
    X, y, seq_lens = batch
    predictions = ner(X)
    predictions = predictions.detach().numpy()
    predictions = np.argmax(predictions, axis=2)
    for i in range(len(predictions)):
        val_preds.append(predictions[i][:seq_lens[i]])
val_preds = np.array(val_preds, dtype=object)

In [34]:
flatten_val_preds = []
flatten_valY = []
for i in range(len(val_preds)):
    for j in range(len(val_preds[i])):
        flatten_val_preds.append(val_preds[i][j])
        flatten_valY.append(valY[i][j])

In [35]:
f1_score(flatten_valY, flatten_val_preds, average='micro')

0.6833062584322256

In [36]:
f1_score(flatten_valY, flatten_val_preds, average='macro')

0.4349093615097704

In [39]:
# idx where flatten_valY is not 0
idx = np.where(np.array(flatten_valY) != 0)[0]
f1_score(np.array(flatten_valY)[idx], np.array(flatten_val_preds)[idx], average='micro')

0.5202719225314227

In [40]:
f1_score(np.array(flatten_valY)[idx], np.array(flatten_val_preds)[idx], average='macro')

0.4293210567820484