In [1]:
import sys
import os
import re
import string
import json
import urllib.request
import numpy as np

from tqdm import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader, TensorDataset

from torchcrf import CRF
from sklearn.metrics import f1_score

torch.manual_seed(1)
device = torch.device("cpu")

In [2]:
BATCH_SIZE = 32

In [3]:
with open('Data/labels.json') as f:
    labels = json.load(f)

In [4]:
# invert the labels
labels_inv = {v: k for k, v in labels.items()}

In [5]:
def read_file(filename):
    with open(filename, 'r') as file:
        text = file.readlines()
    return text

In [6]:
train_data = read_file('Data/train.txt')
val_data = read_file('Data/dev.txt')

In [7]:
w2v_file = read_file("W2V/patent_w2v.txt")
def get_w2v_embeddings(w2v_file):
    w2v_embeddings = {}
    for line in tqdm(w2v_file[1:]):
        values = line.split()
        word = values[0]
        for i in range(1, len(values)):
            values[i] = float(values[i])
        vector = np.asarray(values[1:], 'float32')
        w2v_embeddings[word]=vector
    return w2v_embeddings
embeddings = get_w2v_embeddings(w2v_file)

100%|██████████| 1252586/1252586 [00:33<00:00, 37630.26it/s]


In [8]:
NUMERIC_KEY = "<numeric>"
UNK_KEY = "<unk>"

ADDITIONAL_KEYS = [NUMERIC_KEY, UNK_KEY]

In [9]:
emb_dim = 200

In [10]:
for k in ADDITIONAL_KEYS:
    embeddings[k] = np.random.normal(scale=0.6, size=(emb_dim, ))

In [11]:
# vocab_keys = list(embeddings.keys())
vocab_keys = []
vocab_keys.append("<unk>")
vocab_keys.append("<pad>")
vocab_keys.append("<numeric>")
vocab = {k: v for v, k in enumerate(vocab_keys)}

In [12]:
len(vocab)

3

In [13]:
def build_train_vocab(data):
    vocab = {}
    num_words = 0
    for line in data:
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            word = word.lower()
            if word not in vocab:
                vocab[word] = 1
    return vocab

In [14]:
train_vocab = build_train_vocab(train_data)
# extend the vocab with the train_vocab
idx = len(vocab)
for word in train_vocab:
    if word not in vocab:
        vocab[word] = idx
        idx += 1

In [15]:
len(vocab)

7400

In [16]:
def check_if_quantity(word):
    # check if the word is a quantity
    if re.match(r'^\d+\.?\d*[a-zA-Z]*$', word):
        return True
    return False

In [17]:
def get_quantity_vector(word):
    num = re.findall(r'\d+\.?\d*', word)
    if len(num) > 0:
        num = float(num[0])
    else:
        num = 0
    unit = re.findall(r'[a-zA-Z]+', word)
    if len(unit) > 0:
        unit = unit[0]
    else:
        unit = ""
    if unit in embeddings:
        return embeddings["<numeric>"] + embeddings[unit]
    else:
        return np.random.normal(scale=0.6, size=(emb_dim, ))

In [18]:
def get_vector(word):
    word = word.replace("~","")
    temp = word.replace(",", "")
    temp = temp.replace("-", "")
    if temp.replace(".", "", 1).isdigit():
        return embeddings[NUMERIC_KEY]
    elif word in embeddings:
        return embeddings[word]
    elif check_if_quantity(word):
        return get_quantity_vector(word)
    else:
        return np.random.normal(scale=0.6, size=(emb_dim, ))

In [19]:
matrix_len = len(vocab)
weights_matrix = np.zeros((matrix_len, emb_dim))

for i, word in enumerate(vocab):
    # weights_matrix[i] = get_vector(word)
    if word in embeddings:
        weights_matrix[i] = embeddings[word]
    else:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))

In [20]:
weights_matrix = torch.from_numpy(weights_matrix).float()

In [21]:
def get_data(data):
    sent_labels = []
    all_labels = []
    sent_idx = []
    all_idx = []
    for line in (data):
        split_line = line.split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            tag = split_line[1]
            tag = tag.replace("\n", "")
            word = word.lower()
            if word in vocab:
                sent_idx.append(vocab[word])
            else:
                sent_idx.append(vocab["<unk>"])
            tag_idx = labels[tag]
            sent_labels.append(tag_idx)
        elif line=="\n":
            sent_idx = np.array(sent_idx)
            sent_labels = np.array(sent_labels)
            all_idx.append(sent_idx)
            all_labels.append(sent_labels)
            sent_idx = []
            sent_labels = []
        else:
            print(line)
    return np.asarray(all_idx, dtype=object), np.asarray(all_labels, dtype=object)

In [22]:
trainX, trainY = get_data(train_data)
valX, valY = get_data(val_data)

In [23]:
trainData = []
valData = []
for i in range(len(trainX)):
    trainData.append((trainX[i], trainY[i]))
for i in range(len(valX)):
    valData.append((valX[i], valY[i]))
trainData = np.array(trainData, dtype=object)
valData = np.array(valData, dtype=object)

In [24]:
def custom_collate(data):
    
    batch_size = len(data)
    
    max_len = -1
    for i in range(batch_size):
        if len(data[i][0]) > max_len:
            max_len = len(data[i][0])
    
    seq_lengths = []
    for i in range(batch_size):
        seq_lengths.append(len(data[i][0]))
    
    padded_data = []
    padded_labels = []
    mask = []
    for i in range(batch_size):
        padded_data.append(np.pad(data[i][0], (0, max_len-len(data[i][0])), 'constant', constant_values=(vocab["<pad>"])))
        padded_labels.append(np.pad(data[i][1], (0, max_len-len(data[i][1])), 'constant', constant_values=["37"]))
        mask.append(np.pad(np.ones(len(data[i][0])), (0, max_len-len(data[i][0])), 'constant', constant_values=0).astype(bool))
    
    padded_data = torch.from_numpy(np.array(padded_data))
    padded_labels = torch.from_numpy(np.array(padded_labels))
    mask = torch.from_numpy(np.array(mask))

    return [padded_data, padded_labels, seq_lengths, mask]

In [25]:
trainDataLoader = DataLoader(trainData, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)
valDataLoader = DataLoader(valData, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)

In [26]:
for batch in trainDataLoader:
    X, y, seq_lens, mask = batch

In [27]:
class BiLSTMCRF(nn.Module):
    def __init__(self, weights_matrix, hidden_dim, tagset_size):
        super(BiLSTMCRF, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding.from_pretrained(weights_matrix, freeze=False)
        embedding_dim = weights_matrix.shape[1]
        # self.embedding = nn.Embedding(len(vocab), embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.dropout_layer = nn.Dropout(p=0.5)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)
        

    def forward(self, sentence, labels, mask):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout_layer(lstm_out)
        emissions = self.hidden2tag(lstm_out)
        return -self.crf(emissions, labels, mask=mask)

    def predict(self, sentence, mask):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout_layer(lstm_out)
        scores = self.hidden2tag(lstm_out)
        return self.crf.decode(scores, mask=mask)

In [28]:
def train_one_epoch(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in tqdm(iterator):
        optimizer.zero_grad()
        X, y, seq_lens, mask = batch
        loss = model(X, y, mask)
        predictions = model.predict(X, mask)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [29]:
def get_scores(preds, gold):
    flatten_preds = []
    flatten_gold = []
    for i in range(len(preds)):
        for j in range(len(preds[i])):
            flatten_preds.append(preds[i][j])
            flatten_gold.append(gold[i][j])
    idx = np.where(np.array(flatten_gold) != 0)[0]
    micro_f1 =  f1_score(np.array(flatten_preds)[idx], np.array(flatten_gold)[idx], average='micro')
    macro_f1 =  f1_score(np.array(flatten_preds)[idx], np.array(flatten_gold)[idx], average='macro')
    return micro_f1, macro_f1

In [30]:
def train_model(model,epochs):
    loss_function = nn.CrossEntropyLoss(ignore_index=37)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    train_f1s = []
    val_f1s = []

    best_epoch = 0
    best_val_f1 = 0

    for epoch in (range(epochs)):
        print("Training Epoch {}".format(epoch))
        training_loss = train_one_epoch(model, trainDataLoader, optimizer, loss_function)
        print("Training Loss: {}".format(training_loss))

        model.eval()

        train_preds = []
        for batch in trainDataLoader:
            X, y, seq_lens, mask = batch
            predictions = model.predict(X, mask)
            train_preds.extend(predictions)
        train_preds = np.array(train_preds, dtype=object)

        train_micro_f1, train_macro_f1 = get_scores(train_preds, trainY)

        val_preds = []
        for batch in valDataLoader:
            X, y, seq_lens, mask = batch
            predictions = model.predict(X, mask)
            val_preds.extend(predictions)
        val_preds = np.array(val_preds, dtype=object)

        val_micro_f1, val_macro_f1 = get_scores(val_preds, valY)

        print("Training Micro F1: {}".format(train_micro_f1))
        print("Training Macro F1: {}".format(train_macro_f1))
        print("Validation Micro F1: {}".format(val_micro_f1))
        print("Validation Macro F1: {}".format(val_macro_f1))

        train_f1 = (train_micro_f1 + train_macro_f1) / 2
        val_f1 = (val_micro_f1 + val_macro_f1) / 2

        train_f1s.append(train_f1)
        val_f1s.append(val_f1)

        if val_f1 > best_val_f1:
            print("New Best Model at Epoch {}".format(epoch))
            print("Validation Micro F1: {}".format(val_micro_f1))
            print("Validation Macro F1: {}".format(val_macro_f1))
            best_val_f1 = val_f1
            best_epoch = epoch
            torch.save(model.state_dict(), 'best_model.pt')
        
        if epoch>=best_epoch + 3:
            break
        
        model.train()

    return model, train_f1s, val_f1s

In [31]:
ner = BiLSTMCRF(weights_matrix, 256, 38)

In [32]:
ner, train_f1s, val_f1s = train_model(ner, 30)

Training Epoch 0


100%|██████████| 225/225 [00:35<00:00,  6.32it/s]


Training Loss: 701.8913517252604
Training Micro F1: 0.5908293935314193
Training Macro F1: 0.3521464893854044
Validation Micro F1: 0.5181200176064948
Validation Macro F1: 0.31601508382971394
New Best Model at Epoch 0
Validation Micro F1: 0.5181200176064948
Validation Macro F1: 0.31601508382971394
Training Epoch 1


100%|██████████| 225/225 [00:35<00:00,  6.28it/s]


Training Loss: 423.997747124566
Training Micro F1: 0.6684930649049751
Training Macro F1: 0.5018955487330797
Validation Micro F1: 0.5733848486330513
Validation Macro F1: 0.4461281038749001
New Best Model at Epoch 1
Validation Micro F1: 0.5733848486330513
Validation Macro F1: 0.4461281038749001
Training Epoch 2


100%|██████████| 225/225 [00:37<00:00,  6.06it/s]


Training Loss: 352.8575948079427
Training Micro F1: 0.7104480415792604
Training Macro F1: 0.5792843978570342
Validation Micro F1: 0.5954907810436739
Validation Macro F1: 0.5096407090655473
New Best Model at Epoch 2
Validation Micro F1: 0.5954907810436739
Validation Macro F1: 0.5096407090655473
Training Epoch 3


100%|██████████| 225/225 [00:37<00:00,  6.04it/s]


Training Loss: 305.7823756917318
Training Micro F1: 0.7366542471586461
Training Macro F1: 0.6155601989476462
Validation Micro F1: 0.6078642343620091
Validation Macro F1: 0.5318397594251538
New Best Model at Epoch 3
Validation Micro F1: 0.6078642343620091
Validation Macro F1: 0.5318397594251538
Training Epoch 4


100%|██████████| 225/225 [00:37<00:00,  5.93it/s]


Training Loss: 270.1366017659505
Training Micro F1: 0.7630796205266289
Training Macro F1: 0.6531222763096136
Validation Micro F1: 0.6174499926639605
Validation Macro F1: 0.5502004454772701
New Best Model at Epoch 4
Validation Micro F1: 0.6174499926639605
Validation Macro F1: 0.5502004454772701
Training Epoch 5


100%|██████████| 225/225 [00:37<00:00,  6.02it/s]


Training Loss: 241.45596455891928
Training Micro F1: 0.7830551989730423
Training Macro F1: 0.684747996623141
Validation Micro F1: 0.6202376876803443
Validation Macro F1: 0.5549680092710757
New Best Model at Epoch 5
Validation Micro F1: 0.6202376876803443
Validation Macro F1: 0.5549680092710757
Training Epoch 6


100%|██████████| 225/225 [00:37<00:00,  6.00it/s]


Training Loss: 217.9472102186415
Training Micro F1: 0.8029994677353706
Training Macro F1: 0.7187061656594725
Validation Micro F1: 0.6262043331540079
Validation Macro F1: 0.5647671580072361
New Best Model at Epoch 6
Validation Micro F1: 0.6262043331540079
Validation Macro F1: 0.5647671580072361
Training Epoch 7


100%|██████████| 225/225 [00:37<00:00,  6.00it/s]


Training Loss: 197.44150319417318
Training Micro F1: 0.8186386549359717
Training Macro F1: 0.7406751096316901
Validation Micro F1: 0.6268890301755758
Validation Macro F1: 0.5670369771806291
New Best Model at Epoch 7
Validation Micro F1: 0.6268890301755758
Validation Macro F1: 0.5670369771806291
Training Epoch 8


100%|██████████| 225/225 [00:37<00:00,  5.95it/s]


Training Loss: 178.04374047173394
Training Micro F1: 0.8375183944394002
Training Macro F1: 0.7627680588508928
Validation Micro F1: 0.6265955885949039
Validation Macro F1: 0.5667955184151072
Training Epoch 9


100%|██████████| 225/225 [00:38<00:00,  5.88it/s]


Training Loss: 160.5964633517795
Training Micro F1: 0.854973543316948
Training Macro F1: 0.7810440002560258
Validation Micro F1: 0.6263021470142319
Validation Macro F1: 0.5681951553772734
New Best Model at Epoch 9
Validation Micro F1: 0.6263021470142319
Validation Macro F1: 0.5681951553772734
Training Epoch 10


100%|██████████| 225/225 [00:38<00:00,  5.82it/s]


Training Loss: 143.616334499783
Training Micro F1: 0.8752778734462568
Training Macro F1: 0.8058845438233927
Validation Micro F1: 0.624688218320536
Validation Macro F1: 0.562903551299026
Training Epoch 11


100%|██████████| 225/225 [00:36<00:00,  6.13it/s]


Training Loss: 126.6094692993164
Training Micro F1: 0.8987601365102226
Training Macro F1: 0.8408620233958692
Validation Micro F1: 0.6268890301755758
Validation Macro F1: 0.5683359908960649
New Best Model at Epoch 11
Validation Micro F1: 0.6268890301755758
Validation Macro F1: 0.5683359908960649
Training Epoch 12


100%|██████████| 225/225 [00:36<00:00,  6.10it/s]


Training Loss: 109.27097744411893
Training Micro F1: 0.9144932527630796
Training Macro F1: 0.859049206608165
Validation Micro F1: 0.6230742896268401
Validation Macro F1: 0.562653780384435
Training Epoch 13


100%|██████████| 225/225 [00:36<00:00,  6.14it/s]


Training Loss: 94.40848739624023
Training Micro F1: 0.9290052913366103
Training Macro F1: 0.8805823542462607
Validation Micro F1: 0.6235633589279601
Validation Macro F1: 0.5616466842785937
Training Epoch 14


100%|██████████| 225/225 [00:36<00:00,  6.15it/s]


Training Loss: 80.76393747965494
Training Micro F1: 0.9359716960455869
Training Macro F1: 0.8927338437048564
Validation Micro F1: 0.6184770381963124
Validation Macro F1: 0.5493059976613026


In [33]:
# load best model
ner.load_state_dict(torch.load('best_model.pt'))

<All keys matched successfully>

In [34]:
ner.eval()

BiLSTMCRF(
  (embedding): Embedding(7400, 200)
  (lstm): LSTM(200, 256, bidirectional=True)
  (dropout_layer): Dropout(p=0.5, inplace=False)
  (hidden2tag): Linear(in_features=512, out_features=38, bias=True)
  (crf): CRF(num_tags=38)
)

In [35]:
from sklearn.metrics import f1_score

In [36]:
# get validation predictions using valDataloader
val_preds = []
for batch in valDataLoader:
    X, y, seq_lens, mask = batch
    predictions = ner.predict(X, mask)
    val_preds.extend(predictions)
val_preds = np.array(val_preds, dtype=object)

In [37]:
flatten_val_preds = []
flatten_valY = []
for i in range(len(val_preds)):
    for j in range(len(val_preds[i])):
        flatten_val_preds.append(val_preds[i][j])
        flatten_valY.append(valY[i][j])

In [38]:
f1_score(flatten_valY, flatten_val_preds, average='micro')

0.7564635590186954

In [39]:
f1_score(flatten_valY, flatten_val_preds, average='macro')

0.5750447567361079

In [40]:
# idx where flatten_valY is not 0
idx = np.where(np.array(flatten_valY) != 0)[0]
f1_score(np.array(flatten_valY)[idx], np.array(flatten_val_preds)[idx], average='micro')

0.6268890301755758

In [41]:
f1_score(np.array(flatten_valY)[idx], np.array(flatten_val_preds)[idx], average='macro')

0.5683359908960649

In [42]:
val_preds

array([list([6, 14, 15, 2, 3, 0, 0, 22, 0, 0, 26, 0]),
       list([6, 2, 0, 4, 0, 6, 14, 15, 0, 16, 17, 2, 0, 0, 0, 0, 0, 2, 3, 3, 3, 0, 0, 0]),
       list([6, 14, 15, 26, 27, 0, 2, 3, 0]), ...,
       list([6, 0, 22, 0, 11, 12, 0, 6, 2, 0, 0, 22, 0]),
       list([6, 4, 0, 6, 0, 6, 0, 22, 0]),
       list([6, 0, 18, 19, 0, 11, 12, 0, 0, 0, 4, 6, 0, 6, 0, 2, 0])],
      dtype=object)

In [43]:
# write predictions to file
with open('val_preds.txt', 'w') as f:
    for i in range(len(val_preds)):
        for j in range(len(val_preds[i])):
            f.write(labels_inv[val_preds[i][j]] + '\n')
        f.write("\n")

In [44]:
!python3 eval.py Data/dev.txt val_preds.txt

CLASSIFICATION Report
                 precision    recall  f1_score  true_entities  pred_entities
Reagent           0.795750  0.766342  0.780769         5033.0         4847.0
Action            0.834977  0.790934  0.812359         3640.0         3448.0
Modifier          0.316424  0.380881  0.345673         1998.0         2405.0
Location          0.743812  0.604324  0.666852         1989.0         1616.0
Amount            0.868325  0.832945  0.850267         1718.0         1648.0
Time              0.888889  0.866242  0.877419         1099.0         1071.0
Device            0.558859  0.520487  0.538991          903.0          841.0
Method            0.560633  0.353659  0.433719          902.0          569.0
Concentration     0.751857  0.714689  0.732802          708.0          673.0
Temperature       0.904048  0.900000  0.902019          670.0          667.0
Measure-Type      0.560000  0.455814  0.502564          430.0          350.0
Generic-Measure   0.439189  0.216667  0.290179        