In [1]:
import sys
import os
import re
import string
import json
import urllib.request
import numpy as np

from tqdm import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader, TensorDataset

from torchcrf import CRF
from sklearn.metrics import f1_score

torch.manual_seed(1)
device = torch.device("cpu")

In [2]:
BATCH_SIZE = 32

In [3]:
with open('Data/labels.json') as f:
    labels = json.load(f)

In [50]:
# invert the labels
labels_inv = {v: k for k, v in labels.items()}

In [4]:
def read_file(filename):
    with open(filename, 'r') as file:
        text = file.readlines()
    return text

In [5]:
train_data = read_file('Data/train.txt')
val_data = read_file('Data/dev.txt')

In [6]:
embeddings = {}
emb_dim = 50
with open('glove.6B/glove.6B.50d.txt','r') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:],'float32')
    embeddings[word]=vector

In [7]:
NUMERIC_KEY = "<numeric>"
UNK_KEY = "<unk>"

ADDITIONAL_KEYS = [NUMERIC_KEY, UNK_KEY]

In [8]:
for k in ADDITIONAL_KEYS:
    embeddings[k] = np.random.normal(scale=0.6, size=(emb_dim, ))

In [9]:
# vocab_keys = list(embeddings.keys())
vocab_keys = []
vocab_keys.append("<unk>")
vocab_keys.append("<pad>")
vocab_keys.append("<numeric>")
vocab = {k: v for v, k in enumerate(vocab_keys)}

In [10]:
len(vocab)

3

In [11]:
def build_train_vocab(data):
    vocab = {}
    num_words = 0
    for line in data:
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            word = word.lower()
            if word not in vocab:
                vocab[word] = 1
    return vocab

In [12]:
train_vocab = build_train_vocab(train_data)
# extend the vocab with the train_vocab
idx = len(vocab)
for word in train_vocab:
    if word not in vocab:
        vocab[word] = idx
        idx += 1

In [13]:
len(vocab)

7400

In [14]:
def check_if_quantity(word):
    # check if the word is a quantity
    if re.match(r'^\d+\.?\d*[a-zA-Z]*$', word):
        return True
    return False

In [15]:
def get_quantity_vector(word):
    num = re.findall(r'\d+\.?\d*', word)
    if len(num) > 0:
        num = float(num[0])
    else:
        num = 0
    unit = re.findall(r'[a-zA-Z]+', word)
    if len(unit) > 0:
        unit = unit[0]
    else:
        unit = ""
    if unit in embeddings:
        return embeddings["<numeric>"] + embeddings[unit]
    else:
        return np.random.normal(scale=0.6, size=(emb_dim, ))

In [16]:
def get_vector(word):
    word = word.replace("~","")
    temp = word.replace(",", "")
    temp = temp.replace("-", "")
    if temp.replace(".", "", 1).isdigit():
        return embeddings[NUMERIC_KEY]
    elif word in embeddings:
        return embeddings[word]
    elif check_if_quantity(word):
        return get_quantity_vector(word)
    else:
        return np.random.normal(scale=0.6, size=(emb_dim, ))

In [17]:
matrix_len = len(vocab)
weights_matrix = np.zeros((matrix_len, emb_dim))

for i, word in enumerate(vocab):
    weights_matrix[i] = get_vector(word)
    # if word in embeddings:
    #     weights_matrix[i] = embeddings[word]
    # else:
    #     weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))

In [18]:
weights_matrix = torch.from_numpy(weights_matrix).float()

In [19]:
def get_data(data):
    sent_labels = []
    all_labels = []
    sent_idx = []
    all_idx = []
    for line in (data):
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            tag = split_line[1]
            tag = tag.replace("\n", "")
            word = word.lower()
            if word in vocab:
                sent_idx.append(vocab[word])
            else:
                sent_idx.append(vocab["<unk>"])
            tag_idx = labels[tag]
            sent_labels.append(tag_idx)
        elif line=="\n":
            sent_idx = np.array(sent_idx)
            sent_labels = np.array(sent_labels)
            all_idx.append(sent_idx)
            all_labels.append(sent_labels)
            sent_idx = []
            sent_labels = []
        else:
            print(line)
    return np.asarray(all_idx, dtype=object), np.asarray(all_labels, dtype=object)

In [20]:
trainX, trainY = get_data(train_data)
valX, valY = get_data(val_data)

In [21]:
trainData = []
valData = []
for i in range(len(trainX)):
    trainData.append((trainX[i], trainY[i]))
for i in range(len(valX)):
    valData.append((valX[i], valY[i]))
trainData = np.array(trainData, dtype=object)
valData = np.array(valData, dtype=object)

In [22]:
def custom_collate(data):
    
    batch_size = len(data)
    
    max_len = -1
    for i in range(batch_size):
        if len(data[i][0]) > max_len:
            max_len = len(data[i][0])
    
    seq_lengths = []
    for i in range(batch_size):
        seq_lengths.append(len(data[i][0]))
    
    padded_data = []
    padded_labels = []
    mask = []
    for i in range(batch_size):
        padded_data.append(np.pad(data[i][0], (0, max_len-len(data[i][0])), 'constant', constant_values=(vocab["<pad>"])))
        padded_labels.append(np.pad(data[i][1], (0, max_len-len(data[i][1])), 'constant', constant_values=["37"]))
        mask.append(np.pad(np.ones(len(data[i][0])), (0, max_len-len(data[i][0])), 'constant', constant_values=0).astype(bool))
    
    padded_data = torch.from_numpy(np.array(padded_data))
    padded_labels = torch.from_numpy(np.array(padded_labels))
    mask = torch.from_numpy(np.array(mask))

    return [padded_data, padded_labels, seq_lengths, mask]

In [23]:
trainDataLoader = DataLoader(trainData, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)
valDataLoader = DataLoader(valData, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)

In [24]:
for batch in trainDataLoader:
    X, y, seq_lens, mask = batch

In [25]:
class BiLSTM(nn.Module):
    def __init__(self, weights_matrix, hidden_dim, tagset_size):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding.from_pretrained(weights_matrix, freeze=False)
        embedding_dim = weights_matrix.shape[1]
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.dropout_layer   = nn.Dropout(p=0.5)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)

    def forward(self, sentence):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout_layer(lstm_out)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [26]:
class BiLSTMCRF(nn.Module):
    def __init__(self, weights_matrix, hidden_dim, tagset_size):
        super(BiLSTMCRF, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding.from_pretrained(weights_matrix, freeze=False)
        embedding_dim = weights_matrix.shape[1]
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.dropout_layer = nn.Dropout(p=0.3)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)
        

    def forward(self, sentence, labels, mask):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout_layer(lstm_out)
        emissions = self.hidden2tag(lstm_out)
        return -self.crf(emissions, labels, mask=mask)

    def predict(self, sentence, mask):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout_layer(lstm_out)
        scores = self.hidden2tag(lstm_out)
        return self.crf.decode(scores, mask=mask)

In [27]:
def train_one_epoch(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in tqdm(iterator):
        optimizer.zero_grad()
        X, y, seq_lens, mask = batch
        loss = model(X, y, mask)
        predictions = model.predict(X, mask)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [28]:
def get_scores(preds, gold):
    flatten_preds = []
    flatten_gold = []
    for i in range(len(preds)):
        for j in range(len(preds[i])):
            flatten_preds.append(preds[i][j])
            flatten_gold.append(gold[i][j])
    idx = np.where(np.array(flatten_gold) != 0)[0]
    micro_f1 =  f1_score(np.array(flatten_preds)[idx], np.array(flatten_gold)[idx], average='micro')
    macro_f1 =  f1_score(np.array(flatten_preds)[idx], np.array(flatten_gold)[idx], average='macro')
    return micro_f1, macro_f1

In [29]:
def train_model(model,epochs):
    loss_function = nn.CrossEntropyLoss(ignore_index=37)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    train_f1s = []
    val_f1s = []

    best_epoch = 0
    best_val_f1 = 0

    for epoch in (range(epochs)):
        print("Training Epoch {}".format(epoch))
        training_loss = train_one_epoch(model, trainDataLoader, optimizer, loss_function)
        print("Training Loss: {}".format(training_loss))

        model.eval()

        train_preds = []
        for batch in trainDataLoader:
            X, y, seq_lens, mask = batch
            predictions = model.predict(X, mask)
            train_preds.extend(predictions)
        train_preds = np.array(train_preds, dtype=object)

        train_micro_f1, train_macro_f1 = get_scores(train_preds, trainY)

        val_preds = []
        for batch in valDataLoader:
            X, y, seq_lens, mask = batch
            predictions = model.predict(X, mask)
            val_preds.extend(predictions)
        val_preds = np.array(val_preds, dtype=object)

        val_micro_f1, val_macro_f1 = get_scores(val_preds, valY)

        print("Training Micro F1: {}".format(train_micro_f1))
        print("Training Macro F1: {}".format(train_macro_f1))
        print("Validation Micro F1: {}".format(val_micro_f1))
        print("Validation Macro F1: {}".format(val_macro_f1))

        train_f1 = (train_micro_f1 + train_macro_f1) / 2
        val_f1 = (val_micro_f1 + val_macro_f1) / 2

        train_f1s.append(train_f1)
        val_f1s.append(val_f1)

        if val_f1 > best_val_f1:
            print("New Best Model at Epoch {}".format(epoch))
            print("Validation Micro F1: {}".format(val_micro_f1))
            print("Validation Macro F1: {}".format(val_macro_f1))
            best_val_f1 = val_f1
            best_epoch = epoch
            torch.save(model.state_dict(), 'best_model.pt')
        
        if epoch>=best_epoch + 3:
            break
        
        model.train()

    return model, train_f1s, val_f1s

In [30]:
ner = BiLSTMCRF(weights_matrix, 256, 38)

In [31]:
ner, train_f1s, val_f1s = train_model(ner, 30)

Training Epoch 0


100%|██████████| 225/225 [00:34<00:00,  6.47it/s]


Training Loss: 778.0048285590278
Training Micro F1: 0.5147155515200852
Training Macro F1: 0.2786839400521984
Validation Micro F1: 0.469359808284834
Validation Macro F1: 0.25728418251663
New Best Model at Epoch 0
Validation Micro F1: 0.469359808284834
Validation Macro F1: 0.25728418251663
Training Epoch 1


100%|██████████| 225/225 [00:35<00:00,  6.27it/s]


Training Loss: 504.4840157063802
Training Micro F1: 0.6020695701180375
Training Macro F1: 0.3917065572901869
Validation Micro F1: 0.5386609282535335
Validation Macro F1: 0.3561231410148542
New Best Model at Epoch 1
Validation Micro F1: 0.5386609282535335
Validation Macro F1: 0.3561231410148542
Training Epoch 2


100%|██████████| 225/225 [00:36<00:00,  6.24it/s]


Training Loss: 418.88360026041664
Training Micro F1: 0.6553899621152822
Training Macro F1: 0.47286465359222857
Validation Micro F1: 0.577297403042011
Validation Macro F1: 0.4230122446541476
New Best Model at Epoch 2
Validation Micro F1: 0.577297403042011
Validation Macro F1: 0.4230122446541476
Training Epoch 3


100%|██████████| 225/225 [00:36<00:00,  6.14it/s]


Training Loss: 361.76423597547745
Training Micro F1: 0.6858855944143524
Training Macro F1: 0.5234355206728373
Validation Micro F1: 0.5992566146622976
Validation Macro F1: 0.46791468300553307
New Best Model at Epoch 3
Validation Micro F1: 0.5992566146622976
Validation Macro F1: 0.46791468300553307
Training Epoch 4


100%|██████████| 225/225 [00:37<00:00,  6.05it/s]


Training Loss: 319.81729790581596
Training Micro F1: 0.7119665612573969
Training Macro F1: 0.5727905688176599
Validation Micro F1: 0.6136841590453367
Validation Macro F1: 0.5022277434466136
New Best Model at Epoch 4
Validation Micro F1: 0.6136841590453367
Validation Macro F1: 0.5022277434466136
Training Epoch 5


100%|██████████| 225/225 [00:34<00:00,  6.50it/s]


Training Loss: 286.6393647596571
Training Micro F1: 0.7354488243213627
Training Macro F1: 0.6133023665052525
Validation Micro F1: 0.6263510539443439
Validation Macro F1: 0.5317327344612656
New Best Model at Epoch 5
Validation Micro F1: 0.6263510539443439
Validation Macro F1: 0.5317327344612656
Training Epoch 6


100%|██████████| 225/225 [00:34<00:00,  6.60it/s]


Training Loss: 258.48229905870227
Training Micro F1: 0.7563793481323774
Training Macro F1: 0.6472196189698626
Validation Micro F1: 0.6340783489020394
Validation Macro F1: 0.5500900049461157
New Best Model at Epoch 6
Validation Micro F1: 0.6340783489020394
Validation Macro F1: 0.5500900049461157
Training Epoch 7


100%|██████████| 225/225 [00:34<00:00,  6.51it/s]


Training Loss: 235.02923075358072
Training Micro F1: 0.774413726165503
Training Macro F1: 0.6773496685954213
Validation Micro F1: 0.6348608597838313
Validation Macro F1: 0.5549235972223823
New Best Model at Epoch 7
Validation Micro F1: 0.6348608597838313
Validation Macro F1: 0.5549235972223823
Training Epoch 8


100%|██████████| 225/225 [00:35<00:00,  6.36it/s]


Training Loss: 214.86238762749565
Training Micro F1: 0.7912113716772597
Training Macro F1: 0.7031194958052785
Validation Micro F1: 0.6378930894507752
Validation Macro F1: 0.5620776782414733
New Best Model at Epoch 8
Validation Micro F1: 0.6378930894507752
Validation Macro F1: 0.5620776782414733
Training Epoch 9


100%|██████████| 225/225 [00:36<00:00,  6.24it/s]


Training Loss: 196.3770221625434
Training Micro F1: 0.8084786624502959
Training Macro F1: 0.7258235037678938
Validation Micro F1: 0.6380398102411111
Validation Macro F1: 0.5655844460530236
New Best Model at Epoch 9
Validation Micro F1: 0.6380398102411111
Validation Macro F1: 0.5655844460530236
Training Epoch 10


100%|██████████| 225/225 [00:35<00:00,  6.28it/s]


Training Loss: 179.55515228271486
Training Micro F1: 0.8262782178527819
Training Macro F1: 0.747272839104776
Validation Micro F1: 0.6371105785689832
Validation Macro F1: 0.5640966284028532
Training Epoch 11


100%|██████████| 225/225 [00:35<00:00,  6.30it/s]


Training Loss: 162.69178327772352
Training Micro F1: 0.8455962929334043
Training Macro F1: 0.7721255772769287
Validation Micro F1: 0.6373062062894312
Validation Macro F1: 0.5594101945032083
Training Epoch 12


100%|██████████| 225/225 [00:36<00:00,  6.25it/s]


Training Loss: 145.81724202473958
Training Micro F1: 0.8628948933905257
Training Macro F1: 0.7958880417243573
Validation Micro F1: 0.6349586736440553
Validation Macro F1: 0.5543933326860897


In [41]:
# load best model
ner.load_state_dict(torch.load('best_model.pt'))

<All keys matched successfully>

In [42]:
ner.eval()

BiLSTMCRF(
  (embedding): Embedding(7400, 50)
  (lstm): LSTM(50, 256, bidirectional=True)
  (dropout_layer): Dropout(p=0.3, inplace=False)
  (hidden2tag): Linear(in_features=512, out_features=38, bias=True)
  (crf): CRF(num_tags=38)
)

In [43]:
from sklearn.metrics import f1_score

In [44]:
# get validation predictions using valDataloader
val_preds = []
for batch in valDataLoader:
    X, y, seq_lens, mask = batch
    predictions = ner.predict(X, mask)
    val_preds.extend(predictions)
val_preds = np.array(val_preds, dtype=object)

In [45]:
flatten_val_preds = []
flatten_valY = []
for i in range(len(val_preds)):
    for j in range(len(val_preds[i])):
        flatten_val_preds.append(val_preds[i][j])
        flatten_valY.append(valY[i][j])

In [46]:
f1_score(flatten_valY, flatten_val_preds, average='micro')

0.7689363693934305

In [47]:
f1_score(flatten_valY, flatten_val_preds, average='macro')

0.5756355319572172

In [48]:
# idx where flatten_valY is not 0
idx = np.where(np.array(flatten_valY) != 0)[0]
f1_score(np.array(flatten_valY)[idx], np.array(flatten_val_preds)[idx], average='micro')

0.6380398102411111

In [49]:
f1_score(np.array(flatten_valY)[idx], np.array(flatten_val_preds)[idx], average='macro')

0.5655844460530236

In [52]:
val_preds

array([list([6, 14, 15, 2, 3, 0, 0, 22, 0, 0, 26, 0]),
       list([6, 2, 0, 0, 0, 6, 14, 15, 0, 16, 17, 2, 0, 0, 0, 0, 0, 2, 3, 3, 3, 0, 0, 0]),
       list([6, 14, 15, 12, 27, 0, 2, 3, 0]), ...,
       list([6, 0, 22, 0, 11, 12, 0, 6, 2, 0, 0, 22, 0]),
       list([6, 4, 0, 1, 0, 6, 0, 22, 0]),
       list([6, 0, 18, 19, 0, 11, 12, 0, 0, 0, 4, 6, 0, 6, 0, 2, 0])],
      dtype=object)

In [56]:
# write predictions to file
with open('val_preds.txt', 'w') as f:
    for i in range(len(val_preds)):
        for j in range(len(val_preds[i])):
            f.write(labels_inv[val_preds[i][j]] + '\n')
        f.write("\n")