# Assignment 3
Training a neural named entity recognition (NER) tagger 

In [None]:
import torch
import torch.nn as nn
import numpy as np
from random import shuffle
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
pd.set_option('display.max_columns', None)  
from sklearn.metrics import classification_report
cuda = torch.device('cuda')
import numpy as np
np.set_printoptions(linewidth=201)

import warnings
warnings.filterwarnings("ignore")

In this assignment you are required to build a full training and testing pipeline for a neural sequentail tagger for named entities, using LSTM.

The dataset that you will be working on is called ReCoNLL 2003, which is a corrected version of the CoNLL 2003 dataset: https://www.clips.uantwerpen.be/conll2003/ner/

[Train data](https://drive.google.com/file/d/1hG66e_OoezzeVKho1w7ysyAx4yp0ShDz/view?usp=sharing)

[Dev data](https://drive.google.com/file/d/1EAF-VygYowU1XknZhvzMi2CID65I127L/view?usp=sharing)

[Test data](https://drive.google.com/file/d/16gug5wWnf06JdcBXQbcICOZGZypgr4Iu/view?usp=sharing)

As you can see, the annotated texts are labeled according to the IOB annotation scheme, for 3 entity types: Person, Organization, Location.

**Task 1:** Write a funtion for reading the data from a single file (of the ones that are provided above). The function recieves a filepath and then it encodes every sentence individually using a pair of lists, one list contains the words and one list contains the tags. Each list pair will be added to a general list (data), which will be returned back from the function.

In [None]:
!git clone https://github.com/kfirbar/nlp-course

fatal: destination path 'nlp-course' already exists and is not an empty directory.


In [None]:
def read_data(filepath):
    data = []
    
    with open(filepath) as file:
        words = []
        labels = []

        for index, line in enumerate(file, start=1):
            if line != '\n':
                word, label = line.split()
                words.append(word)
                labels.append(label)
            else:
                data.append((words, labels))
                words = []
                labels = []
    return data

train = read_data('/content/nlp-course/connl03_train.txt')
test = read_data('/content/nlp-course/connl03_test.txt')
dev = read_data('/content/nlp-course/connl03_dev.txt')


The following Vocab class can be served as a dictionary that maps words and tags into Ids. The UNK_TOKEN should be used for words that are not part of the training data.

In [None]:
UNK_TOKEN = 0

class Vocab:
    def __init__(self):
        self.word2id = {"__unk__": UNK_TOKEN}
        self.id2word = {UNK_TOKEN: "__unk__"}
        self.n_words = 1
        
        self.tag2id = {"O":0, "B-PER":1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4, "B-ORG": 5, "I-ORG": 6}
        self.id2tag = {0:"O", 1:"B-PER", 2:"I-PER", 3:"B-LOC", 4:"I-LOC", 5:"B-ORG", 6:"I-ORG"}
        
    def index_words(self, words):
      word_indexes = [self.index_word(w) for w in words]
      return word_indexes

    def index_tags(self, tags):
      tag_indexes = [self.tag2id[t] for t in tags]
      return tag_indexes
    
    def index_word(self, w):
        if w not in self.word2id:
            self.word2id[w] = self.n_words
            self.id2word[self.n_words] = w
            self.n_words += 1
        return self.word2id[w]
            

**Task 2:** Write a function prepare_data that takes one of the [train, dev, test] and the Vocab instance, for converting each pair of (words,tags) to a pair of indexes. Each pair should be added to data_sequences, which will be returned back from the function.

In [None]:
vocab = Vocab()

def prepare_data(data, vocab):
    data_sequences = []
    for words, tags in data:
      words_indexes_tensor = torch.tensor(vocab.index_words(words), dtype=torch.long)
      tags_indexes_tensor = torch.tensor(vocab.index_tags(tags), dtype=torch.long)
      data_sequences.append((words_indexes_tensor, tags_indexes_tensor))
    
    return data_sequences, vocab

train_sequences, train_vocab = prepare_data(train, vocab)
dev_sequences, dev_vocab = prepare_data(dev, vocab)
test_sequences, test_vocab = prepare_data(test, vocab)

**Task 3:** Write NERNet, a PyTorch Module for labeling words with NER tags. 

*input_size:* the size of the vocabulary

*embedding_size:* the size of the embeddings

*hidden_size:* the LSTM hidden size

*output_size:* the number tags we are predicting for

*n_layers:* the number of layers we want to use in LSTM

*directions:* could 1 or 2, indicating unidirectional or bidirectional LSTM, respectively

The input for your forward function should be a single sentence tensor.

*note:* the embeddings in this section are learned embedding. That means that you don't need to use pretrained embedding like the one used in class. You will use them in part 5

In [None]:
class NERNet(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, n_layers, directions):
        super(NERNet, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, bidirectional=(True if directions==2 else False))
        self.out = nn.Linear(hidden_size * directions, output_size)
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.directions = directions
    
    def forward(self, input_sentence):
        lstm = self._get_lstm(input_sentence,)
        _, tag_seq = torch.max(lstm, 1)
        return tag_seq.tolist()
    
    def _get_lstm(self, input_sentence):
        embedded_input = self.embedding(input_sentence).view(len(input_sentence), 1, -1)
        lstm_out, _ = self.lstm(embedded_input)
        lstm_out = lstm_out.view(-1, lstm_out.shape[2]).cuda()
        output = self.out(lstm_out).cuda()
        return output

**Task 4:** write a training loop, which takes a model (instance of NERNet) and number of epochs to train on. The loss is always CrossEntropyLoss and the optimizer is always Adam.

In [None]:
def train_loop(model, n_epochs):
  shuffle(train_sequences)
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr=0.0001)
  
  for e in range(1, n_epochs + 1):
    for sentence, tags in train_sequences:
        optimizer.zero_grad()
        output = model._get_lstm(torch.tensor(sentence).cuda())
        loss = criterion(output, torch.tensor(tags).cuda())
        loss.backward()
        optimizer.step()

**Task 5:** write an evaluation loop on a trained model, using the dev and test datasets. This function print the true positive rate (TPR), also known as Recall and the opposite to false positive rate (FPR), also known as precision, of each label seperately (7 labels in total), and for all the 6 labels (except O) together. The caption argument for the function should be served for printing, so that when you print include it as a prefix.

In [None]:
def evaluate(model, prefix):
  total_count =float(0)
  total_currect = float(0)
  y_true = []
  y_pred = []
  datasets = {'dev file': dev_sequences,
              'test file': test_sequences}
  
  print(prefix)
  for dataset_name, dataset in datasets.items():
    for words, labels in dataset:
      output = model(torch.tensor(words).cuda())
      y_true += labels
      y_pred += output
      for (tag, label) in zip(output, labels):
        if label != 0:
          total_count += 1
          if tag == label:
            total_currect +=1

    labels_names = vocab.tag2id.keys()
    res = dict()
    classification_report_res = classification_report(y_true, y_pred, target_names=labels_names, output_dict=True)
    for key, value in classification_report_res.items():
      if key in labels_names:
        res[key]=  {'Recall': value['recall'], 'Precision': value['precision']}
      

    print(dataset_name)
    print(pd.DataFrame(res).head())
    print(f'lables accuracy: {total_currect/total_count}\n')

**Task 6:** Train and evaluate a few models, all with embedding_size=300, and with the following hyper parameters (you may use that as captions for the models as well):

Model 1: (hidden_size: 500, n_layers: 1, directions: 1)

Model 2: (hidden_size: 500, n_layers: 2, directions: 1)

Model 3: (hidden_size: 500, n_layers: 3, directions: 1)

Model 4: (hidden_size: 500, n_layers: 1, directions: 2)

Model 5: (hidden_size: 500, n_layers: 2, directions: 2)

Model 6: (hidden_size: 500, n_layers: 3, directions: 2)

Model 4: (hidden_size: 800, n_layers: 1, directions: 2)

Model 5: (hidden_size: 800, n_layers: 2, directions: 2)

Model 6: (hidden_size: 800, n_layers: 3, directions: 2)

In [None]:
embedding_size = 300
epoch = 10
output_size = 7

models = list()
models.append(NERNet(input_size=vocab.n_words, embedding_size=embedding_size, hidden_size=500, output_size=len(vocab.tag2id), n_layers=1, directions=1))
models.append(NERNet(input_size=vocab.n_words, embedding_size=embedding_size, hidden_size=500, output_size=len(vocab.tag2id), n_layers=2, directions=1))
models.append(NERNet(input_size=vocab.n_words, embedding_size=embedding_size, hidden_size=500, output_size=len(vocab.tag2id), n_layers=3, directions=1))
models.append(NERNet(input_size=vocab.n_words, embedding_size=embedding_size, hidden_size=500, output_size=len(vocab.tag2id), n_layers=1, directions=2))
models.append(NERNet(input_size=vocab.n_words, embedding_size=embedding_size, hidden_size=500, output_size=len(vocab.tag2id), n_layers=2, directions=2))
models.append(NERNet(input_size=vocab.n_words, embedding_size=embedding_size, hidden_size=500, output_size=len(vocab.tag2id), n_layers=3, directions=2))
models.append(NERNet(input_size=vocab.n_words, embedding_size=embedding_size, hidden_size=800, output_size=len(vocab.tag2id), n_layers=1, directions=2))
models.append(NERNet(input_size=vocab.n_words, embedding_size=embedding_size, hidden_size=800, output_size=len(vocab.tag2id), n_layers=2, directions=2))
models.append(NERNet(input_size=vocab.n_words, embedding_size=embedding_size, hidden_size=800, output_size=len(vocab.tag2id), n_layers=3, directions=2))

order = [1,2,3,4,5,6,4,5,6]
def run_models(embeddings=None):
  for index, model in zip(order, models): 
    if embeddings is not None:
      model.embedding.weight.data.copy_(torch.from_numpy(embeddings))
    model = model.to(cuda)
    train_loop(model, epoch)
    evaluate(model, f'Model {index}: (hidden_size: {model.hidden_size}, n_layers: {model.n_layers}, directions: {model.directions})')
    print('\n\n')

run_models()

Model 1: (hidden_size: 500, n_layers: 1, directions: 1)
dev file
                  O     B-PER     I-PER     B-LOC     I-LOC     B-ORG  \
Recall     0.964147  0.625000  0.617834  0.693989  0.434783  0.601190   
Precision  0.917615  0.722543  0.829060  0.819355  0.833333  0.608434   

              I-ORG  
Recall     0.387931  
Precision  0.671642  
lables accuracy: 0.5962219598583235

test file
                  O     B-PER     I-PER     B-LOC     I-LOC     B-ORG  \
Recall     0.958812  0.608833  0.644592  0.709125  0.539474  0.581081   
Precision  0.921982  0.714815  0.813370  0.790254  0.820000  0.583333   

              I-ORG  
Recall     0.370253  
Precision  0.585000  
lables accuracy: 0.5984938565200159




Model 2: (hidden_size: 500, n_layers: 2, directions: 1)
dev file
                  O     B-PER     I-PER    B-LOC     I-LOC     B-ORG     I-ORG
Recall     0.969315  0.665000  0.681529  0.73224  0.478261  0.571429  0.448276
Precision  0.928815  0.658416  0.869919  0.82716  0.7

**Task 6:** Download the GloVe embeddings from https://nlp.stanford.edu/projects/glove/ (use the 300-dim vectors from glove.6B.zip). Then intialize the nn.Embedding module in your NERNet with these embeddings, so that you can start your training with pre-trained vectors. Repeat Task 6 and print the results for each model.

Note: make sure that vectors are aligned with the IDs in your Vocab, in other words, make sure that for example the word with ID 0 is the first vector in the GloVe matrix of vectors that you initialize nn.Embedding with. For a dicussion on how to do that, check it this link:
https://discuss.pytorch.org/t/can-we-use-pre-trained-word-embeddings-for-weight-initialization-in-nn-embedding/1222

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
glove_path = 'glove.6B.300d.txt'

--2022-06-01 07:13:36--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-06-01 07:13:36--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-06-01 07:13:37--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-0

In [None]:
embeddings = np.zeros((vocab.n_words, embedding_size))
with open(glove_path) as f:
  lines = f.readlines()
for line in lines:
  line = line.strip().split()
  word_id = vocab.word2id.get(line[0], -1)
  if word_id != -1:
    words_id_list = list()
    for item in line[1:]:
      words_id_list.append(float(item))
    embeddings[word_id] = words_id_list

run_models(embeddings)

Model 1: (hidden_size: 500, n_layers: 1, directions: 1)
dev file
                  O     B-PER     I-PER     B-LOC     I-LOC     B-ORG  \
Recall     0.967700  0.810000  0.834395  0.852459  0.565217  0.779762   
Precision  0.967076  0.880435  0.935714  0.857143  0.541667  0.639024   

              I-ORG  
Recall     0.637931  
Precision  0.672727  
lables accuracy: 0.7874852420306966

test file
                  O     B-PER     I-PER     B-LOC     I-LOC     B-ORG  \
Recall     0.965125  0.815457  0.834437  0.853612  0.684211  0.799228   
Precision  0.973385  0.854545  0.919708  0.860153  0.712329  0.637904   

              I-ORG  
Recall     0.664557  
Precision  0.608696  
lables accuracy: 0.8006341656757828




Model 2: (hidden_size: 500, n_layers: 2, directions: 1)
dev file
                  O     B-PER     I-PER     B-LOC     I-LOC     B-ORG  \
Recall     0.969315  0.820000  0.866242  0.863388  0.608696  0.821429   
Precision  0.986522  0.877005  0.951049  0.863388  0.560000  0.63

**Good luck!**