In [33]:
import sys
import os
import re
import string
import json
import urllib.request
import numpy as np

from tqdm import tqdm

import torch
torch.manual_seed(42)
np.random.seed(42)
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader, TensorDataset

from torchcrf import CRF
from sklearn.metrics import f1_score

torch.manual_seed(1)
device = torch.device("cpu")

In [34]:
BATCH_SIZE = 32

In [35]:
with open('Data/labels.json') as f:
    labels = json.load(f)

In [36]:
# invert the labels
labels_inv = {v: k for k, v in labels.items()}

In [37]:
def read_file(filename):
    with open(filename, 'r') as file:
        text = file.readlines()
    return text

In [38]:
train_data = read_file('Data/train.txt')
val_data = read_file('Data/dev.txt')

In [39]:
embeddings = {}
emb_dim = 50
with open('glove.6B/glove.6B.50d.txt','r') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:],'float32')
    embeddings[word]=vector

In [40]:
NUMERIC_KEY = "<numeric>"
UNK_KEY = "<unk>"

ADDITIONAL_KEYS = [NUMERIC_KEY, UNK_KEY]

In [41]:
for k in ADDITIONAL_KEYS:
    embeddings[k] = np.random.normal(scale=0.6, size=(emb_dim, ))

In [42]:
# vocab_keys = list(embeddings.keys())
vocab_keys = []
vocab_keys.append("<unk>")
vocab_keys.append("<pad>")
vocab_keys.append("<numeric>")
vocab = {k: v for v, k in enumerate(vocab_keys)}

In [43]:
len(vocab)

3

In [44]:
def build_train_vocab(data):
    vocab = {}
    num_words = 0
    for line in data:
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            word = word.lower()
            if word not in vocab:
                vocab[word] = 1
    return vocab

In [45]:
train_vocab = build_train_vocab(train_data)
# extend the vocab with the train_vocab
idx = len(vocab)
for word in train_vocab:
    if word not in vocab:
        vocab[word] = idx
        idx += 1

In [46]:
len(vocab)

7400

In [47]:
# dump vocab
with open('vocab.json', 'w') as fp:
    json.dump(vocab, fp)

In [48]:
def check_if_quantity(word):
    # check if the word is a quantity
    if re.match(r'^\d+\.?\d*[a-zA-Z]*$', word):
        return True
    return False

In [49]:
def get_quantity_vector(word):
    num = re.findall(r'\d+\.?\d*', word)
    if len(num) > 0:
        num = float(num[0])
    else:
        num = 0
    unit = re.findall(r'[a-zA-Z]+', word)
    if len(unit) > 0:
        unit = unit[0]
    else:
        unit = ""
    if unit in embeddings:
        return embeddings["<numeric>"] + embeddings[unit]
    else:
        return np.random.normal(scale=0.6, size=(emb_dim, ))

In [50]:
def get_vector(word):
    word = word.replace("~","")
    temp = word.replace(",", "")
    temp = temp.replace("-", "")
    if temp.replace(".", "", 1).isdigit():
        return embeddings[NUMERIC_KEY]
    elif word in embeddings:
        return embeddings[word]
    # elif check_if_quantity(word):
    #     return get_quantity_vector(word)
    else:
        return np.random.normal(scale=0.6, size=(emb_dim, ))

In [51]:
matrix_len = len(vocab)
weights_matrix = np.zeros((matrix_len, emb_dim))

for i, word in enumerate(vocab):
    weights_matrix[i] = get_vector(word)
    # if word in embeddings:
    #     weights_matrix[i] = embeddings[word]
    # else:
    #     weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))

In [52]:
weights_matrix = torch.from_numpy(weights_matrix).float()

In [53]:
def get_data(data):
    sent_labels = []
    all_labels = []
    sent_idx = []
    all_idx = []
    for line in (data):
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            tag = split_line[1]
            tag = tag.replace("\n", "")
            word = word.lower()
            if word in vocab:
                sent_idx.append(vocab[word])
            else:
                sent_idx.append(vocab["<unk>"])
            tag_idx = labels[tag]
            sent_labels.append(tag_idx)
        elif line=="\n":
            sent_idx = np.array(sent_idx)
            sent_labels = np.array(sent_labels)
            all_idx.append(sent_idx)
            all_labels.append(sent_labels)
            sent_idx = []
            sent_labels = []
        else:
            print(line)
    return np.asarray(all_idx, dtype=object), np.asarray(all_labels, dtype=object)

In [54]:
trainX, trainY = get_data(train_data)
valX, valY = get_data(val_data)

In [55]:
trainData = []
valData = []
for i in range(len(trainX)):
    trainData.append((trainX[i], trainY[i]))
for i in range(len(valX)):
    valData.append((valX[i], valY[i]))
trainData = np.array(trainData, dtype=object)
valData = np.array(valData, dtype=object)

In [56]:
def custom_collate(data):
    
    batch_size = len(data)
    
    max_len = -1
    for i in range(batch_size):
        if len(data[i][0]) > max_len:
            max_len = len(data[i][0])
    
    seq_lengths = []
    for i in range(batch_size):
        seq_lengths.append(len(data[i][0]))
    
    padded_data = []
    padded_labels = []
    mask = []
    for i in range(batch_size):
        padded_data.append(np.pad(data[i][0], (0, max_len-len(data[i][0])), 'constant', constant_values=(vocab["<pad>"])))
        padded_labels.append(np.pad(data[i][1], (0, max_len-len(data[i][1])), 'constant', constant_values=["37"]))
        mask.append(np.pad(np.ones(len(data[i][0])), (0, max_len-len(data[i][0])), 'constant', constant_values=0).astype(bool))
    
    padded_data = torch.from_numpy(np.array(padded_data))
    padded_labels = torch.from_numpy(np.array(padded_labels))
    mask = torch.from_numpy(np.array(mask))

    return [padded_data, padded_labels, seq_lengths, mask]

In [57]:
trainDataLoader = DataLoader(trainData, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)
valDataLoader = DataLoader(valData, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)

In [58]:
for batch in trainDataLoader:
    X, y, seq_lens, mask = batch

In [59]:

class BiLSTMCRF(nn.Module):
    def __init__(self, weights_matrix, hidden_dim, tagset_size):
        super(BiLSTMCRF, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding.from_pretrained(weights_matrix, freeze=False)
        embedding_dim = weights_matrix.shape[1]
        # self.embedding = nn.Embedding(len(vocab), embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.dropout_layer = nn.Dropout(p=0.5)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)
        

    def forward(self, sentence, labels, mask):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout_layer(lstm_out)
        emissions = self.hidden2tag(lstm_out)
        return -self.crf(emissions, labels, mask=mask)

    def predict(self, sentence, mask):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout_layer(lstm_out)
        scores = self.hidden2tag(lstm_out)
        return self.crf.decode(scores, mask=mask)

In [60]:
def train_one_epoch(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in tqdm(iterator):
        optimizer.zero_grad()
        X, y, seq_lens, mask = batch
        loss = model(X, y, mask)
        predictions = model.predict(X, mask)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [61]:
def get_scores(preds, gold):
    flatten_preds = []
    flatten_gold = []
    for i in range(len(preds)):
        for j in range(len(preds[i])):
            flatten_preds.append(preds[i][j])
            flatten_gold.append(gold[i][j])
    idx = np.where(np.array(flatten_gold) != 0)[0]
    micro_f1 =  f1_score(np.array(flatten_preds)[idx], np.array(flatten_gold)[idx], average='micro')
    macro_f1 =  f1_score(np.array(flatten_preds)[idx], np.array(flatten_gold)[idx], average='macro')
    return micro_f1, macro_f1

In [62]:
def train_model(model,epochs):
    loss_function = nn.CrossEntropyLoss(ignore_index=37)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    train_f1s = []
    val_f1s = []

    best_epoch = 0
    best_val_f1 = 0

    for epoch in (range(epochs)):
        print("Training Epoch {}".format(epoch))
        training_loss = train_one_epoch(model, trainDataLoader, optimizer, loss_function)
        print("Training Loss: {}".format(training_loss))

        model.eval()

        train_preds = []
        for batch in trainDataLoader:
            X, y, seq_lens, mask = batch
            predictions = model.predict(X, mask)
            train_preds.extend(predictions)
        train_preds = np.array(train_preds, dtype=object)

        train_micro_f1, train_macro_f1 = get_scores(train_preds, trainY)

        val_preds = []
        for batch in valDataLoader:
            X, y, seq_lens, mask = batch
            predictions = model.predict(X, mask)
            val_preds.extend(predictions)
        val_preds = np.array(val_preds, dtype=object)

        val_micro_f1, val_macro_f1 = get_scores(val_preds, valY)

        print("Training Micro F1: {}".format(train_micro_f1))
        print("Training Macro F1: {}".format(train_macro_f1))
        print("Validation Micro F1: {}".format(val_micro_f1))
        print("Validation Macro F1: {}".format(val_macro_f1))

        train_f1 = (train_micro_f1 + train_macro_f1) / 2
        val_f1 = (val_micro_f1 + val_macro_f1) / 2

        train_f1s.append(train_f1)
        val_f1s.append(val_f1)

        if val_f1 > best_val_f1:
            print("New Best Model at Epoch {}".format(epoch))
            print("Validation Micro F1: {}".format(val_micro_f1))
            print("Validation Macro F1: {}".format(val_macro_f1))
            best_val_f1 = val_f1
            best_epoch = epoch
            torch.save(model.state_dict(), 'best_model.pt')
        
        if epoch>=best_epoch + 3:
            break
        
        model.train()

    return model, train_f1s, val_f1s

In [63]:
ner = BiLSTMCRF(weights_matrix, 256, 38)

In [64]:
ner, train_f1s, val_f1s = train_model(ner, 30)

Training Epoch 0


100%|██████████| 225/225 [00:35<00:00,  6.27it/s]


Training Loss: 700.6398346625434
Training Micro F1: 0.5683490403581828
Training Macro F1: 0.33070636444002394
Validation Micro F1: 0.4954272020345283
Validation Macro F1: 0.3003557460059168
New Best Model at Epoch 0
Validation Micro F1: 0.4954272020345283
Validation Macro F1: 0.3003557460059168
Training Epoch 1


100%|██████████| 225/225 [00:36<00:00,  6.17it/s]


Training Loss: 449.2090400526259
Training Micro F1: 0.6529791164407152
Training Macro F1: 0.47871753764206376
Validation Micro F1: 0.5592507458306842
Validation Macro F1: 0.4269260284001654
New Best Model at Epoch 1
Validation Micro F1: 0.5592507458306842
Validation Macro F1: 0.4269260284001654
Training Epoch 2


100%|██████████| 225/225 [00:37<00:00,  6.07it/s]


Training Loss: 370.2796192762587
Training Micro F1: 0.7004132878299258
Training Macro F1: 0.541361979989149
Validation Micro F1: 0.5840465593974666
Validation Macro F1: 0.47147133233288335
New Best Model at Epoch 2
Validation Micro F1: 0.5840465593974666
Validation Macro F1: 0.47147133233288335
Training Epoch 3


100%|██████████| 225/225 [00:37<00:00,  5.97it/s]


Training Loss: 317.7520138888889
Training Micro F1: 0.7331788722251794
Training Macro F1: 0.6052369469656443
Validation Micro F1: 0.5987186384310657
Validation Macro F1: 0.520977014940782
New Best Model at Epoch 3
Validation Micro F1: 0.5987186384310657
Validation Macro F1: 0.520977014940782
Training Epoch 4


100%|██████████| 225/225 [00:38<00:00,  5.81it/s]


Training Loss: 278.20322068956165
Training Micro F1: 0.7596981746454178
Training Macro F1: 0.6469984366848226
Validation Micro F1: 0.6078642343620091
Validation Macro F1: 0.5428483985564891
New Best Model at Epoch 4
Validation Micro F1: 0.6078642343620091
Validation Macro F1: 0.5428483985564891
Training Epoch 5


100%|██████████| 225/225 [00:38<00:00,  5.78it/s]


Training Loss: 247.06235643174912
Training Micro F1: 0.7818184664516736
Training Macro F1: 0.6795716418920611
Validation Micro F1: 0.6135374382550007
Validation Macro F1: 0.5505621362916454
New Best Model at Epoch 5
Validation Micro F1: 0.6135374382550007
Validation Macro F1: 0.5505621362916454
Training Epoch 6


100%|██████████| 225/225 [00:37<00:00,  6.03it/s]


Training Loss: 220.76384677463108
Training Micro F1: 0.8021697611071105
Training Macro F1: 0.7112822149883017
Validation Micro F1: 0.6186237589866485
Validation Macro F1: 0.5636699360371269
New Best Model at Epoch 6
Validation Micro F1: 0.6186237589866485
Validation Macro F1: 0.5636699360371269
Training Epoch 7


100%|██████████| 225/225 [00:36<00:00,  6.11it/s]


Training Loss: 199.53359419080945
Training Micro F1: 0.819155264723379
Training Macro F1: 0.737946391774091
Validation Micro F1: 0.6182325035457524
Validation Macro F1: 0.5691037896672797
New Best Model at Epoch 7
Validation Micro F1: 0.6182325035457524
Validation Macro F1: 0.5691037896672797
Training Epoch 8


100%|██████████| 225/225 [00:37<00:00,  6.04it/s]


Training Loss: 179.40003217909072
Training Micro F1: 0.8390525689595794
Training Macro F1: 0.7625995983913573
Validation Micro F1: 0.6184281312662004
Validation Macro F1: 0.5685909140715285
Training Epoch 9


100%|██████████| 225/225 [00:37<00:00,  6.04it/s]


Training Loss: 161.19162170410155
Training Micro F1: 0.856069382259933
Training Macro F1: 0.7846236403116618
Validation Micro F1: 0.6178412481048564
Validation Macro F1: 0.5695699345564489
New Best Model at Epoch 9
Validation Micro F1: 0.6178412481048564
Validation Macro F1: 0.5695699345564489
Training Epoch 10


100%|██████████| 225/225 [00:37<00:00,  6.06it/s]


Training Loss: 144.00107245551214
Training Micro F1: 0.8758884122859201
Training Macro F1: 0.8133150892297829
Validation Micro F1: 0.6156404362498166
Validation Macro F1: 0.5638314051110899
Training Epoch 11


 34%|███▍      | 77/225 [00:13<00:25,  5.77it/s]


KeyboardInterrupt: 

In [65]:
# load best model
ner.load_state_dict(torch.load('best_model.pt'))

<All keys matched successfully>

In [None]:
torch.save(ner, "model.pt")

In [66]:
ner.eval()

BiLSTMCRF(
  (embedding): Embedding(7400, 200)
  (lstm): LSTM(200, 256, bidirectional=True)
  (dropout_layer): Dropout(p=0.5, inplace=False)
  (hidden2tag): Linear(in_features=512, out_features=38, bias=True)
  (crf): CRF(num_tags=38)
)

In [79]:
torch.save(ner.state_dict(), "kshitij.pt")

In [67]:
from sklearn.metrics import f1_score

In [68]:
# get validation predictions using valDataloader
val_preds = []
for batch in valDataLoader:
    X, y, seq_lens, mask = batch
    predictions = ner.predict(X, mask)
    val_preds.extend(predictions)
val_preds = np.array(val_preds, dtype=object)

In [69]:
flatten_val_preds = []
flatten_valY = []
for i in range(len(val_preds)):
    for j in range(len(val_preds[i])):
        flatten_val_preds.append(val_preds[i][j])
        flatten_valY.append(valY[i][j])

In [70]:
f1_score(flatten_valY, flatten_val_preds, average='micro')

0.7604834934882568

In [71]:
f1_score(flatten_valY, flatten_val_preds, average='macro')

0.5791153088548913

In [72]:
# idx where flatten_valY is not 0
idx = np.where(np.array(flatten_valY) != 0)[0]
f1_score(np.array(flatten_valY)[idx], np.array(flatten_val_preds)[idx], average='micro')

0.6178412481048564

In [73]:
f1_score(np.array(flatten_valY)[idx], np.array(flatten_val_preds)[idx], average='macro')

0.5695699345564489

In [74]:
val_preds

array([list([6, 14, 15, 2, 3, 0, 0, 22, 0, 0, 26, 0]),
       list([6, 2, 0, 0, 0, 6, 14, 15, 0, 16, 17, 2, 0, 0, 0, 0, 0, 2, 3, 3, 3, 0, 0, 0]),
       list([6, 14, 15, 12, 27, 0, 2, 3, 0]), ...,
       list([6, 0, 22, 0, 11, 12, 0, 6, 2, 0, 0, 22, 0]),
       list([6, 4, 0, 1, 0, 6, 0, 22, 0]),
       list([6, 0, 18, 19, 0, 11, 12, 0, 0, 0, 4, 6, 0, 6, 0, 2, 0])],
      dtype=object)

In [75]:
# write predictions to file
with open('val_preds.txt', 'w') as f:
    for i in range(len(val_preds)):
        for j in range(len(val_preds[i])):
            f.write(labels_inv[val_preds[i][j]] + '\n')
        f.write("\n")

In [76]:
!python3 eval.py Data/dev.txt val_preds.txt

CLASSIFICATION Report
                 precision    recall  f1_score  true_entities  pred_entities
Reagent           0.808605  0.757997  0.782484         5033.0         4718.0
Action            0.838700  0.794231  0.815860         3640.0         3447.0
Modifier          0.584369  0.329329  0.421255         1998.0         1126.0
Location          0.779203  0.580191  0.665130         1989.0         1481.0
Amount            0.876660  0.806752  0.840255         1718.0         1581.0
Time              0.902857  0.862602  0.882271         1099.0         1050.0
Device            0.563830  0.528239  0.545455          903.0          846.0
Method            0.536210  0.385809  0.448743          902.0          649.0
Concentration     0.784411  0.668079  0.721587          708.0          603.0
Temperature       0.907716  0.895522  0.901578          670.0          661.0
Measure-Type      0.563187  0.476744  0.516373          430.0          364.0
Generic-Measure   0.503759  0.223333  0.309469        

In [None]:
len(trainX)

7198

In [None]:
len(valX)

2267