In [4]:
# here go all the imports
!pip install seqeval
!pip install torchtext==0.6.0
import csv
from pprint import pprint
from collections import Counter,OrderedDict
import random
import numpy as np
from tqdm import tqdm
from torchtext.vocab import Vocab
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from typing import Dict, Iterator, List, Union, Optional
from sklearn.metrics import f1_score 
from seqeval.metrics import f1_score as f1
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
SEED = 36
# Seed every random number generator the notebook touches so runs are repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# Use the GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
class NerDataset(Dataset):
    """Sentence-level dataset read from a tab-separated token/tag file.

    Each item is a dict ``{'word': [tokens...], 'tag': [tags...]}`` holding one
    sentence. Sentences are separated in the file by an empty row.
    """

    def __init__(self,
                 input_file: str,
                 device="cuda"
                 ):
        """
        Args:
            input_file: path of the TSV file to read.
            device: stored on the instance; not used by this class itself.
        """
        super().__init__()
        self.data_file = input_file
        self.device = device
        self.data = self.data_load()
        self.encoded_data = None

    def data_load(self):
        """Parse ``self.data_file`` into a list of sentence dicts.

        Rows that contain a field equal to ``'id'`` are skipped.
        NOTE(review): presumably these are per-sentence header rows; a token
        row whose word is literally ``id`` would be skipped too -- confirm
        against the data format.
        NOTE(review): csv's default quote handling is active, so a token that
        is a bare double quote may swallow following rows -- verify against
        the data.

        Returns:
            A list of ``{'word': List[str], 'tag': List[str]}`` dicts.
        """
        sentences = []
        words = []
        tags = []

        def flush():
            # Store the sentence collected so far. Empty sentences (e.g. from
            # consecutive blank rows) are dropped instead of being appended.
            if words:
                sentences.append({'word': list(words), 'tag': list(tags)})
            words.clear()
            tags.clear()

        with open(self.data_file) as tsv_file:
            for row in csv.reader(tsv_file, delimiter="\t"):
                if 'id' in row:
                    continue
                if (row == []) or (row == ['', '']):
                    flush()
                    continue
                words.append(row[0])
                tags.append(row[1])
        # The last sentence has no terminating blank row when the file does not
        # end with one; without this flush it would be silently lost.
        flush()
        return sentences

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, index) -> Dict:
        return self.data[index]
    

      

In [6]:
# Load the pre-trained GloVe vectors: each line is "<token> <v1> <v2> ...".
GLOVE_PATH = '../input/glove-embeddings/glove.6B.300d.txt'
vocab, embeddings = [], []
# Stream the file line by line instead of reading it into one huge string and
# splitting it, which held two full copies of the text in memory.
with open(GLOVE_PATH, 'rt', encoding='utf-8') as fi:
    for line in fi:
        line = line.rstrip('\n')
        if not line:
            continue
        # Split once: the first field is the token, the rest are its vector.
        token, *values = line.split(' ')
        vocab.append(token)
        embeddings.append([float(val) for val in values])

vocab_npa = np.array(vocab)
embs_npa = np.array(embeddings)

# Insert the '<pad>' and '<unk>' tokens at the start of vocab_npa
# ('<pad>' ends up at index 0 and '<unk>' at index 1).
vocab_npa = np.insert(vocab_npa, 0, ['<pad>', '<unk>'])
print(vocab_npa[:10])

pad_emb_npa = np.zeros((1, embs_npa.shape[1]))             # embedding for the '<pad>' token: all zeros
unk_emb_npa = np.mean(embs_npa, axis=0, keepdims=True)     # embedding for the '<unk>' token: mean of all vectors

# Stack the pad and unk rows on top of embs_npa, in the same order as vocab_npa.
embs_npa = np.vstack((pad_emb_npa, unk_emb_npa, embs_npa))
print(embs_npa.shape)

# Token -> row index in embs_npa.
word2ind = {k: v for v, k in enumerate(vocab_npa)}

# The Python lists are no longer needed once the arrays exist.
del vocab, embeddings

In [7]:
# Load the training and validation splits. The shared `device` setting is
# passed along instead of repeating the "cuda" literal.
input_file="../input/ner-dataset/train_p.tsv"
train_data=NerDataset(input_file,device=device)
input_file="../input/ner-dataset/dev_p.tsv"
val_data=NerDataset(input_file,device=device)

In [9]:
# Alternative: use the code below to build the word lookup index from the training data when pre-trained embeddings are not used.
# def build_vocab(dataset, min_freq=1):
#     all_words = [item for sublist in dataset for item in sublist['word']]
#     # [word for sample in dataset for word in sample["word"]]
#     counter = Counter(all_words)
#     # we add special tokens for handling padding and unknown words at testing time.
#     return Vocab(counter, min_freq=min_freq,specials=['<pad>', '<unk>'])
# word2ind = build_vocab(train_data,min_freq=5)

In [10]:
def build_vocab_tag(dataset):
    """Build the tag vocabulary from every ``'tag'`` list in ``dataset``.

    Args:
        dataset: iterable of samples, each a dict with a ``'tag'`` list.

    Returns:
        A ``Vocab`` over the observed tags with ``'<pad>'`` as its only
        special symbol (no unknown-tag entry is added).
    """
    tag_counts = Counter(tag for sample in dataset for tag in sample['tag'])
    # Only the padding symbol is registered as a special token.
    # NOTE(review): the rest of the notebook assumes '<pad>' gets index 0 -- confirm.
    return Vocab(tag_counts, specials=['<pad>'])
vocabulary_tag = build_vocab_tag(train_data)

In [None]:
# Partially adapted from the course notebook.
def prepare_batch(batch: List[Dict]) -> Dict[str, torch.Tensor]:
  """Collate a list of sentence dicts into padded index tensors.

  Args:
    batch: samples of the form ``{'word': [tokens...], 'tag': [tags...]}``.

  Returns:
    ``{'word': LongTensor, 'tag': LongTensor}``, both padded with
    ``pad_sequence`` in its default layout (sequence dimension first).

  Raises:
    KeyError: if a tag is not present in ``vocabulary_tag``.
  """
  PAD_TOKEN = "<pad>"
  UNK_TOKEN = "<unk>"
  unk_index = word2ind[UNK_TOKEN]
  tag_to_index = vocabulary_tag.stoi

  def word_index(word):
    # Try the exact token first, then its lowercase form. The GloVe 6B
    # vocabulary is uncased, so without this fallback capitalised tokens
    # would all collapse to <unk>.
    index = word2ind.get(word)
    if index is None:
      index = word2ind.get(word.lower(), unk_index)
    return index

  def tag_index(label):
    # Fail with a readable message instead of passing None on to torch.
    index = tag_to_index.get(label)
    if index is None:
      raise KeyError(f"tag {label!r} is not in the tag vocabulary")
    return index

  # Explicit dtype so that even an empty sample yields an integer tensor.
  x = [
    torch.as_tensor([word_index(word) for word in sample["word"]], dtype=torch.long)
    for sample in batch
  ]
  y = [
    torch.as_tensor([tag_index(label) for label in sample["tag"]], dtype=torch.long)
    for sample in batch
  ]
  # Pad words and tags with the pad index of their own vocabulary.
  x = pad_sequence(x, padding_value=word2ind[PAD_TOKEN])
  y = pad_sequence(y, padding_value=tag_to_index[PAD_TOKEN])
  return {"word": x, "tag": y}

In [15]:
# Settings shared by the training and validation data loaders.
collate_fn = prepare_batch  # turns a list of samples into padded index tensors
batch_sizes = 32
# num_workers = min(os.cpu_count(), 4)  # it is usually 4 workers per GPU
is_train_dataloader = True  # kept for reference; shuffling is set explicitly per loader below

# Only the training split is shuffled; the validation order stays fixed.
train_data_loader = DataLoader(train_data, batch_size=batch_sizes, shuffle=True, collate_fn=collate_fn)
val_data_loader = DataLoader(val_data, batch_size=batch_sizes, shuffle=False, collate_fn=collate_fn)

In [16]:
class HParams():
    """Hyperparameters of the tagger, stored as class-level attributes."""
    # --- vocabulary / label space ---
    vocab_size = len(word2ind)          # number of entries in the word lookup index
    num_classes = len(vocabulary_tag)   # size of the tag vocabulary (the '<pad>' special included)
    # --- network dimensions ---
    embedding_dim = 300
    hidden_dim = 264
    num_layers = 2
    bidirectional = True
    # --- dropout rates ---
    emb_dropout = 0.6
    lstm_dropout = 0.5
    fc_dropout = 0.6
params = HParams()

In [17]:
class NERModel(nn.Module):
    """Tagger: pre-trained embeddings -> dropout -> (Bi)LSTM -> dropout -> linear layer.

    The embedding matrix is taken from the notebook-level ``embs_npa`` array and
    is fine-tuned during training (``freeze=False``).
    """

    # we provide the hyperparameters as input
    def __init__(self, hparams):
        """
        Args:
            hparams: object exposing ``embedding_dim``, ``hidden_dim``,
                ``bidirectional``, ``num_layers``, ``num_classes`` and the
                ``emb_dropout`` / ``lstm_dropout`` / ``fc_dropout`` rates.
        """
        super().__init__()
        # vars(hparams) only lists instance attributes and is therefore empty
        # when the hyperparameters are declared at class level; collect every
        # public, non-callable attribute instead so the dump is informative.
        hparam_values = {
            name: getattr(hparams, name)
            for name in dir(hparams)
            if not name.startswith('_') and not callable(getattr(hparams, name))
        }
        pprint(hparam_values)
        # Alternative without pre-trained vectors:
        #   self.word_embedding = nn.Embedding(hparams.vocab_size, hparams.embedding_dim)
        self.word_embedding = torch.nn.Embedding.from_pretrained(
            torch.from_numpy(embs_npa).float(), freeze=False)
        self.emb_dropout = nn.Dropout(hparams.emb_dropout)
        self.lstm = nn.LSTM(input_size=hparams.embedding_dim,
                            hidden_size=hparams.hidden_dim,
                            bidirectional=hparams.bidirectional,
                            num_layers=hparams.num_layers,
                            dropout=hparams.lstm_dropout)
        # A bidirectional LSTM concatenates both directions, doubling the feature size.
        lstm_output_dim = hparams.hidden_dim * 2 if hparams.bidirectional else hparams.hidden_dim
        self.fc_dropout = nn.Dropout(hparams.fc_dropout)
        self.classifier = nn.Linear(lstm_output_dim, hparams.num_classes)

    def forward(self, x):
        """Return per-token class scores for a tensor of word indices.

        The LSTM is built with its default (sequence-first) layout, so ``x`` is
        expected as (seq_len, batch); the output appends a class dimension.
        """
        embeddings = self.emb_dropout(self.word_embedding(x))
        o, _ = self.lstm(embeddings)  # final hidden/cell states are not needed
        output = self.classifier(self.fc_dropout(o))
        return output


In [18]:
class Trainer():
    """Utility class to train and evaluate a model."""

    def __init__(
        self,
        model: nn.Module,
        loss_function,
        optimizer,
        label_vocab: Union['Vocab', Dict[str, int]],
        log_steps:int=10_000,
        log_level:int=2):
        """
        Args:
            model: the model we want to train.
            loss_function: the loss_function to minimize.
            optimizer: the optimizer used to minimize the loss_function.
            label_vocab: the label vocabulary (or its string-to-index mapping);
                only stored on the instance.
            log_steps: print the running average loss every ``log_steps`` batches
                (only when ``log_level > 1``).
            log_level: 0 is silent, 1 prints per-epoch losses, 2 also prints
                the intermediate step losses.
        """
        self.model = model
        self.loss_function = loss_function
        self.optimizer = optimizer

        self.label_vocab = label_vocab
        self.log_steps = log_steps
        self.log_level = log_level

    def _model_device(self):
        """Return the device holding the model parameters (CPU if it has none)."""
        first_param = next(self.model.parameters(), None)
        return first_param.device if first_param is not None else torch.device('cpu')

    def _batch_loss(self, sample):
        """Run the model on one batch and return the loss over all its tokens."""
        target_device = self._model_device()
        inputs = sample['word'].to(target_device)
        labels = sample['tag'].to(target_device)
        predictions = self.model(inputs)
        # Flatten to (tokens, classes) and (tokens,) as the loss expects.
        predictions = predictions.reshape(-1, predictions.shape[-1])
        labels = labels.reshape(-1)
        return self.loss_function(predictions, labels)

    def train(self, train_dataset:Dataset, 
              valid_dataset:Dataset, 
              epochs:int=1):
        """
        Args:
            train_dataset: a Dataset or DatasetLoader instance containing
                the training instances.
            valid_dataset: a Dataset or DatasetLoader instance used to evaluate
                learning progress.
            epochs: the number of times to iterate over train_dataset.

        Returns:
            A tuple ``(model, train_losses, valid_losses)``: the trained model
            and the average training / validation loss of every epoch.
        """
        # The default of a single epoch is valid, hence ">= 1".
        assert isinstance(epochs, int) and epochs >= 1, "epochs must be a positive integer"
        if self.log_level > 0:
            print('---------------Training  Started-----------')
        plot_loss_train=[]
        plot_loss_val=[]
        for epoch in range(epochs):
            if self.log_level > 0:
                print(' Epoch {:03d}'.format(epoch + 1))

            epoch_loss = 0.0
            self.model.train()

            # for each batch 
            for step, sample in enumerate(tqdm(train_dataset)):
                self.optimizer.zero_grad()
                sample_loss = self._batch_loss(sample)
                sample_loss.backward()
                self.optimizer.step()

                epoch_loss += sample_loss.item()

                if self.log_level > 1 and step % self.log_steps == self.log_steps - 1:
                    print('[Epoch: {:2d} @ step {}] current avg loss = {:0.4f} '.format(epoch, step, epoch_loss / (step + 1)))
            avg_epoch_loss = epoch_loss / len(train_dataset)
            # Record the history regardless of verbosity so the returned curves
            # are complete even when logging is switched off.
            plot_loss_train.append(avg_epoch_loss)
            if self.log_level > 0:
                print('\t[Epoch: {:2d}] train loss = {:0.4f}'.format(epoch, avg_epoch_loss))

            valid_loss = self.evaluate(valid_dataset)
            plot_loss_val.append(valid_loss)
            if self.log_level > 0:
                print('\t[Epoch: {:2d}] valid loss = {:0.4f}'.format(epoch, valid_loss))

        if self.log_level > 0:
            print('... Done!')

        return self.model,plot_loss_train,plot_loss_val

    def evaluate(self, valid_dataset):
        """
        Args:
            valid_dataset: the dataset to use to evaluate the model.

        Returns:
            avg_valid_loss: the average validation loss over valid_dataset.
        """
        valid_loss = 0.0
        # set dropout to 0!! Needed when we are in inference mode.
        self.model.eval()
        with torch.no_grad():
            for sample in valid_dataset:
                valid_loss += self._batch_loss(sample).item()

        return valid_loss / len(valid_dataset)

    def predict(self, x):
        """
        Args:
            x: a tensor of indices.
        Returns: 
            A tensor with the index of the highest-scoring tag for each token
            of the input sentences.
        """
        self.model.eval()
        with torch.no_grad():
            # Move the input next to the model so callers may pass CPU tensors.
            logits = self.model(x.to(self._model_device()))
            predictions = torch.argmax(logits, -1)
            return predictions

    

    
    


In [21]:
# Build the tagger from the shared hyperparameters, then move it to the target device.
tagger = NERModel(params)
tagger = tagger.to(device)

In [22]:
# The loss targets are tag indices, so the index to ignore must come from the
# tag vocabulary rather than from the word vocabulary.
trainer = Trainer(
    model = tagger,
    loss_function = nn.CrossEntropyLoss(ignore_index=vocabulary_tag.stoi['<pad>']),
    optimizer = optim.Adam(tagger.parameters()),
    label_vocab=vocabulary_tag.stoi)

In [None]:
# Train for 10 epochs; returns the trained model plus the per-epoch
# training-loss and validation-loss lists.
trained_model, t_l, v_l = trainer.train(
    train_data_loader,
    val_data_loader,
    epochs=10,
)

In [None]:
# Plot the per-epoch training (red) and validation (blue) loss curves.
fig, ax = plt.subplots(figsize=[8, 6])
ax.plot(t_l, 'r', linewidth=3.0)
ax.plot(v_l, 'b', linewidth=3.0)
ax.legend(['Training loss', 'Validation Loss'], fontsize=18)
ax.set_xlabel('Epochs ', fontsize=16)
ax.set_ylabel('Loss', fontsize=16)
ax.set_title('Loss Curves With Glove Embeddings', fontsize=16)
plt.show()

In [43]:
# Partially adapted from the course notebook.
def to_labels(xx:list):
  """Map a list of tag indices to tag strings, wrapped in a single outer list.

  Args:
    xx: tag indices taken from ``vocabulary_tag.stoi``.

  Returns:
    ``[[tag, tag, ...]]`` -- one sequence containing the decoded tags.

  Raises:
    ValueError: if an index has no tag in the vocabulary.
  """
  # Build the reverse mapping once instead of rebuilding two lists and doing a
  # linear search for every label. setdefault keeps the first tag for an
  # index, which is what the linear search returned.
  index_to_tag = {}
  for tag, index in vocabulary_tag.stoi.items():
    index_to_tag.setdefault(index, tag)
  try:
    return [[index_to_tag[w] for w in xx]]
  except KeyError as err:
    raise ValueError(f"{err.args[0]!r} is not in list") from None
def compute_precision(model, dataset):
    """Evaluate ``model`` on ``dataset`` and return F1 scores and a confusion matrix.

    Despite its name, the function reports F1 scores, not precision.

    Args:
        model: the tagger; called on the ``'word'`` tensor of each batch.
        dataset: iterable of batches with ``'word'`` and ``'tag'`` tensors in
            the sequence-first layout produced by ``prepare_batch``.

    Returns:
        A tuple ``(precision, seqeval, cm)``:
          * ``precision``: token-level F1 per tag (sklearn, ``average=None``),
            one entry per non-pad tag in vocabulary order;
          * ``seqeval``: F1 computed by seqeval over the per-sentence tag
            sequences;
          * ``cm``: confusion matrix over the non-pad tags, normalised over
            the predicted labels.
    """
    pad_index = vocabulary_tag.stoi['<pad>']
    # Fix the class set to every non-pad tag so the scores and the matrix line
    # up with the vocabulary even if a tag is missing from this split or the
    # model predicts the pad tag for a real token.
    class_ids = [index for index in range(len(vocabulary_tag.itos)) if index != pad_index]

    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    all_predictions = list()
    all_labels = list()
    gold_sequences = list()
    predicted_sequences = list()

    was_training = model.training
    model.eval()  # dropout must be off while measuring
    with torch.no_grad():
        for sample in dataset:
            inputs = sample['word'].to(model_device)
            labels = sample['tag'].to(model_device)
            predictions = torch.argmax(model(inputs), -1)
            # Batches are (seq_len, batch): walk them sentence by sentence so
            # tokens of different sentences are never interleaved, which would
            # break entity boundaries in the sequence-level score.
            for gold_column, predicted_column in zip(labels.transpose(0, 1),
                                                     predictions.transpose(0, 1)):
                keep = gold_column != pad_index
                gold = gold_column[keep].tolist()
                if not gold:
                    continue
                predicted = predicted_column[keep].tolist()
                all_labels.extend(gold)
                all_predictions.extend(predicted)
                gold_sequences.append(to_labels(gold)[0])
                predicted_sequences.append(to_labels(predicted)[0])
    if was_training:
        model.train()  # leave the model in the mode the caller had it in

    precision = f1_score(all_labels, all_predictions, labels=class_ids, average=None, zero_division=0)
    # NOTE(review): seqeval appears to apply mode='strict' only when a `scheme`
    # is also given -- confirm against the seqeval documentation.
    seqeval = f1(gold_sequences, predicted_sequences, mode='strict',)
    cm = confusion_matrix(all_labels, all_predictions, labels=class_ids, normalize='pred')

    return precision,seqeval,cm

In [None]:
# Per-tag F1 scores, seqeval F1 and confusion matrix on the validation set.
pre, seqP, cm = compute_precision(
    trained_model,
    val_data_loader,
)

In [None]:
# Bar chart of the per-tag F1 scores (the '<pad>' entry is skipped).
plt.figure(figsize=(16, 8)) 
plt.title('F1 scores of each Class with Glove Embeddings',fontsize=16)
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.barplot(x=vocabulary_tag.itos[1:], y=pre.tolist());

In [None]:
# Heat-map of the normalised confusion matrix, labelled with the tag names
# (the '<pad>' entry is skipped).
fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(cm, fmt='g', cmap="OrRd", ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title(' Normalized Confusion Matrix with Glove Embeddings')
tag_names = vocabulary_tag.itos[1:]
ax.xaxis.set_ticklabels(tag_names)
ax.yaxis.set_ticklabels(tag_names, rotation=0);
