# Task 1
### What are the precision, recall, and F1 score on the validation data?

Precision: 82.36%
Recall: 76.39%
F1 score: 79.04
on the best run

### What are the precision, recall, and F1 score on the test data?

Precision: 74.25%
Recall: 68.25%
F1 score: 71.13
with the model from the best run

# Task 2
### What are the precision, recall, and F1 score on the validation data?

Precision: 89.27%
Recall: 89.48%
F1 score: 89.37
on the best run

### What are the precision, recall, and F1 score on the test data?

Precision: 84.33%
Recall: 85.09%
F1 score: 84.71
with the model from the best run

### BiLSTM with Glove Embeddings outperforms the model without. Can you provide a rationale for this?

There are two main reasons for this observation.
- The GloVe embeddings cover a vocabulary of around 400,000 words (learned from a corpus of roughly 6 billion tokens), compared to the 23,623 words from the training dataset. This means the GloVe model will recognize many more words and replace far fewer of them with the unknown tag.
- The GloVe embeddings also encode semantic information. Because they are learned from the contexts in which words co-occur, the embeddings carry contextual information and will therefore, in a sense, be able to capture the meaning and context of words.

In [None]:
# Install the `datasets` and `accelerate` packages imported further down.
!pip install datasets accelerate
# Fetch conlleval.py, the script that provides the `evaluate` function used for scoring.
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/493.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/493.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━

In [None]:
import datasets
import numpy as np
import torch
import torch.nn as nn
import copy
from torch.utils.data import DataLoader
from accelerate import Accelerator
from torch.optim.lr_scheduler import ReduceLROnPlateau
import itertools
from collections import Counter
from conlleval import evaluate

In [None]:
# Dataset identifier passed to `datasets.load_dataset`.
DATASET_NAME = 'conll2003'
# Placeholder token for words that are not in the vocabulary.
UNKNOWN = '[UNK]'
# Token used to pad shorter sequences in a batch.
PAD = '[PAD]'
# Vocabulary index reserved for the padding token.
PAD_TOKEN_IDX = 0

In [None]:
# NER tag names, ordered so that a tag's position in the list is its label id.
tags = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
tag2idx = {tag: index for index, tag in enumerate(tags)}
# The padding label takes the next free id and is deliberately not part of `tags`.
tag2idx[PAD] = len(tags)

In [None]:
# Load the dataset named by DATASET_NAME; its splits are accessed by name below (e.g. dataset['train']).
dataset = datasets.load_dataset(DATASET_NAME)

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

## Task 1 training

In [None]:
# Count how often each token occurs in the training split.
token_counts = Counter(itertools.chain.from_iterable(dataset['train']['tokens']))

# Only tokens seen at least 3 times get their own vocabulary entry; the rest
# are looked up as the unknown token later on.
frequent_tokens = [token for token, count in token_counts.items() if count >= 3]

# Indices 0 and 1 are reserved for the padding and unknown tokens, so regular
# words are numbered from 2 upwards.
word2idx = dict(zip(frequent_tokens, itertools.count(2)))
word2idx[PAD] = PAD_TOKEN_IDX
word2idx[UNKNOWN] = 1

In [None]:
def preprocess_sample_task1(sample):
    """Turn one raw sample into the fields used for Task 1 training.

    Adds ``input_ids`` (vocabulary indices, with tokens missing from
    ``word2idx`` mapped to the ``UNKNOWN`` index), renames ``ner_tags`` to
    ``labels`` and removes the other columns. The sample is modified in place
    and returned.
    """
    unknown_index = word2idx[UNKNOWN]
    sample['input_ids'] = [word2idx.get(token, unknown_index) for token in sample['tokens']]

    # Drop the columns that are not used after this point.
    for column in ('pos_tags', 'chunk_tags', 'tokens', 'id'):
        del sample[column]

    # Expose the NER tag ids under the key 'labels'.
    sample['labels'] = sample['ner_tags']
    del sample['ner_tags']

    return sample


dataset_task1 = dataset.map(preprocess_sample_task1)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
# Vocabulary size, i.e. the number of rows of the Task 1 embedding table.
INPUT_DIMENSION = len(word2idx)
# Size of each word embedding vector.
EMBEDDING_DIMENSION = 100
# Number of stacked LSTM layers.
NUM_LSTM_LAYER = 1
# Hidden size of the LSTM in each direction.
LSTM_HIDDEN_DIMENSION = 256
# Dropout probability applied to the LSTM output.
LSTM_DROPOUT = 0.33
# Output size of the linear layer that follows the LSTM.
LINEAR_OUTPUT_DIMENSION = 128
# Number of tag classes the classifier predicts (the padding label is not one of them).
NUM_LABELS = 9

In [None]:
def collate_fn(batch):
    """Pad a list of samples to a common length and assemble one batch.

    Returns a dict holding the padded ``input_ids`` and ``labels`` tensors
    (batch dimension first) together with ``input_lengths``, the unpadded
    length of every sample.
    """
    token_tensors = []
    label_tensors = []
    lengths = []
    for sample in batch:
        token_tensors.append(torch.LongTensor(sample['input_ids']))
        label_tensors.append(torch.LongTensor(sample['labels']))
        # Remember the true length so the padding can be cut off again when
        # predictions are evaluated.
        lengths.append(len(sample['input_ids']))

    pad = nn.utils.rnn.pad_sequence
    return {
        'input_ids': pad(token_tensors, batch_first=True, padding_value=PAD_TOKEN_IDX),
        'labels': pad(label_tensors, batch_first=True, padding_value=tag2idx[PAD]),
        'input_lengths': lengths,
    }

In [None]:
# Mini-batch size used by both the training and the validation loader.
batch_size = 32
train_dataset = dataset_task1['train']
val_dataset = dataset_task1['validation']

# Only the training loader shuffles; both loaders pad their batches through collate_fn.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)

In [None]:
class BiLSTMModel(nn.Module):
    """Token tagger: embedding -> BiLSTM -> dropout -> linear -> ELU -> classifier."""

    def __init__(self, embedding_dim, embedding_layer, lstm_hidden_dim, lstm_num_layers, lstm_dropout, linear_output_dim, num_labels):
        """Build the layers.

        Args:
            embedding_dim: Size of the vectors produced by ``embedding_layer``.
            embedding_layer: Module that maps token ids to embedding vectors.
            lstm_hidden_dim: Hidden size of the LSTM per direction.
            lstm_num_layers: Number of stacked LSTM layers.
            lstm_dropout: Dropout probability applied to the LSTM output.
            linear_output_dim: Output size of the intermediate linear layer.
            num_labels: Number of classes scored for every token.
        """
        super().__init__()

        self.embedding = embedding_layer
        self.bilstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_hidden_dim,
            num_layers=lstm_num_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(p=lstm_dropout)
        # A bidirectional LSTM emits both directions' states, hence twice the hidden size.
        self.linear = nn.Linear(2 * lstm_hidden_dim, linear_output_dim)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(linear_output_dim, num_labels)

    def forward(self, input_ids):
        """Return per-token class logits for a batch of token ids."""
        hidden_states, _ = self.bilstm(self.embedding(input_ids))
        projected = self.linear(self.dropout(hidden_states))
        return self.classifier(self.elu(projected))

In [None]:
learning_rate = 0.001
# Task 1 uses a freshly initialised embedding table.
embedding_layer = nn.Embedding(num_embeddings=INPUT_DIMENSION, embedding_dim=EMBEDDING_DIMENSION)
model = BiLSTMModel(
    embedding_dim=EMBEDDING_DIMENSION,
    embedding_layer=embedding_layer,
    lstm_hidden_dim=LSTM_HIDDEN_DIMENSION,
    lstm_num_layers=NUM_LSTM_LAYER,
    lstm_dropout=LSTM_DROPOUT,
    linear_output_dim=LINEAR_OUTPUT_DIMENSION,
    num_labels=NUM_LABELS,
)
# Positions whose label is the padding id are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tag2idx[PAD])
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
# Task 1 training: fit the model, score it on the validation split after every
# epoch, and keep the weights of the epoch with the highest validation F1.
accelerator = Accelerator()
model, optimizer, train_loader, val_loader = accelerator.prepare(model, optimizer, train_loader, val_loader)

# The scheduler watches validation F1 (mode='max') and multiplies the learning
# rate by 0.1 once F1 stops improving.
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=2)
num_epochs = 30
best_f1_score = 0
best_precision = 0
best_recall = 0
best_model_state = {}
for epoch in range(num_epochs):
    model.train()
    # Accumulate the per-batch losses so the epoch log shows the mean training
    # loss rather than the loss of whichever mini-batch happened to come last.
    total_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        input_ids = batch['input_ids']
        labels = batch['labels']

        optimizer.zero_grad()
        logits = model(input_ids)
        # Flatten batch and sequence dimensions so each token is one
        # classification example for the criterion.
        loss = criterion(logits.view(-1, NUM_LABELS), labels.view(-1))
        accelerator.backward(loss)
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
    mean_loss = total_loss / max(num_batches, 1)

    # Validation
    model.eval()
    true_ner_tags = []
    predicted_ner_tags = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids']
            labels = batch['labels']
            logits = model(input_ids)
            predicted_labels = logits.argmax(dim=2)
            # Convert each tensor to nested Python lists once, then cut every
            # row back to its unpadded length before mapping ids to tag names.
            for gold_row, predicted_row, length in zip(
                    labels.tolist(), predicted_labels.tolist(), batch['input_lengths']):
                true_ner_tags.append([tags[index] for index in gold_row[:length]])
                predicted_ner_tags.append([tags[index] for index in predicted_row[:length]])

    # conlleval receives the gold and predicted tags as two flat, aligned streams.
    precision, recall, f1 = evaluate(
        itertools.chain.from_iterable(true_ner_tags),
        itertools.chain.from_iterable(predicted_ner_tags),
    )
    scheduler.step(f1)

    # `not best_model_state` makes sure real weights are captured even when F1
    # never rises above 0, so the saved checkpoint is never an empty dict.
    if f1 > best_f1_score or not best_model_state:
        best_f1_score = f1
        best_precision = precision
        best_recall = recall
        # Deep copy: state_dict() returns references that later epochs would overwrite.
        best_model_state = copy.deepcopy(model.state_dict())
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {mean_loss}, precision: {precision}, recall: {recall}, F1: {f1}")

print(f"Best precision, recall, F1 score is {best_precision}, {best_recall}, {best_f1_score}")
# Store the vocabulary next to the weights so the checkpoint is self-contained.
torch.save({
    'word2idx': word2idx,
    'model_state_dict': best_model_state
}, 'task1.pt')

processed 51362 tokens with 5942 phrases; found: 4385 phrases; correct: 2951.
accuracy:  53.77%; (non-O)
accuracy:  91.54%; precision:  67.30%; recall:  49.66%; FB1:  57.15
              LOC: precision:  84.77%; recall:  58.46%; FB1:  69.20  1267
             MISC: precision:  67.09%; recall:  34.71%; FB1:  45.75  477
              ORG: precision:  48.00%; recall:  41.98%; FB1:  44.79  1173
              PER: precision:  67.71%; recall:  53.96%; FB1:  60.06  1468
Epoch 1/30, Loss: 0.3174291253089905, precision: 67.2976054732041, recall: 49.6634129922585, F1: 57.15115716084051
processed 51362 tokens with 5942 phrases; found: 5215 phrases; correct: 3954.
accuracy:  70.46%; (non-O)
accuracy:  94.46%; precision:  75.82%; recall:  66.54%; FB1:  70.88
              LOC: precision:  89.92%; recall:  70.44%; FB1:  79.00  1439
             MISC: precision:  81.38%; recall:  62.58%; FB1:  70.75  709
              ORG: precision:  60.82%; recall:  61.60%; FB1:  61.21  1358
              PER: prec

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2023-11-11 00:56:49--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-11-11 00:56:49--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-11-11 00:56:50--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Path to the 100-dimensional GloVe vectors from the glove.6B archive;
# adjust this to point to the location of the GloVe embeddings.
GLOVE_EMBEDDINGS_PATH = './glove.6B.100d.txt'

In [None]:

# Parse the GloVe text file, where every line is "<word> <v1> <v2> ... <vN>".
# The file is read from GLOVE_EMBEDDINGS_PATH, streamed line by line instead of
# being loaded into memory at once, and decoded explicitly as UTF-8 so parsing
# does not depend on the platform's default encoding.
vocab, embeddings = [], []
with open(GLOVE_EMBEDDINGS_PATH, 'rt', encoding='utf-8') as fi:
    for line in fi:
        line = line.rstrip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        i_word, *i_values = line.split(' ')
        vocab.append(i_word)
        embeddings.append([float(val) for val in i_values])

# Rows 0 and 1 are reserved for the padding and unknown tokens, ahead of the GloVe words.
vocab_npa = np.array([PAD, UNKNOWN, *vocab])
embs_npa = np.array(embeddings)

# Padding is the zero vector; unknown words get the mean of all GloVe vectors.
pad_emb_npa = np.zeros((1, embs_npa.shape[1]))
unk_emb_npa = np.mean(embs_npa, axis=0, keepdims=True)

# Stack in the same order as vocab_npa so row i is the vector of vocab_npa[i].
embs_npa = np.vstack((pad_emb_npa, unk_emb_npa, embs_npa))

# NOTE: Embedding.from_pretrained freezes the weights unless freeze=False is passed.
glove_embedding_layer = torch.nn.Embedding.from_pretrained(torch.from_numpy(embs_npa).float())

## Task 2 training

In [None]:
# Map every entry of vocab_npa to its position, which is also its row in the embedding matrix.
word2idx_glove = dict(zip(vocab_npa, itertools.count()))

In [None]:
def preprocess_sample_glove(sample):
    """Turn one raw sample into the fields used for Task 2 training.

    Adds ``input_ids`` by looking up the lower-cased tokens in
    ``word2idx_glove`` (tokens that are missing map to the ``UNKNOWN`` index),
    renames ``ner_tags`` to ``labels`` and removes the other columns. The
    sample is modified in place and returned.
    """
    unknown_index = word2idx_glove[UNKNOWN]
    sample['input_ids'] = [word2idx_glove.get(token.lower(), unknown_index) for token in sample['tokens']]

    # Drop the columns that are not used after this point.
    for column in ('pos_tags', 'chunk_tags', 'tokens', 'id'):
        del sample[column]

    # Expose the NER tag ids under the key 'labels'.
    sample['labels'] = sample['ner_tags']
    del sample['ner_tags']

    return sample


dataset_task2 = dataset.map(preprocess_sample_glove)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
# Rebuild the datasets and loaders on the GloVe-indexed data; the Task 1 names are reused.
train_dataset = dataset_task2['train']
val_dataset = dataset_task2['validation']

# Only the training loader shuffles; both loaders pad their batches through collate_fn.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)

In [None]:
# Same architecture as Task 1, but fed by the pretrained GloVe embedding layer.
model = BiLSTMModel(
    embedding_dim=EMBEDDING_DIMENSION,
    embedding_layer=glove_embedding_layer,
    lstm_hidden_dim=LSTM_HIDDEN_DIMENSION,
    lstm_num_layers=NUM_LSTM_LAYER,
    lstm_dropout=LSTM_DROPOUT,
    linear_output_dim=LINEAR_OUTPUT_DIMENSION,
    num_labels=NUM_LABELS,
)
# Positions whose label is the padding id are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tag2idx[PAD])
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
# Task 2 training: fit the GloVe-based model, score it on the validation split
# after every epoch, and keep the weights of the epoch with the highest
# validation F1.
accelerator = Accelerator()
model, optimizer, train_loader, val_loader = accelerator.prepare(model, optimizer, train_loader, val_loader)

# The scheduler watches validation F1 (mode='max') and multiplies the learning
# rate by 0.1 once F1 stops improving.
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=2)
num_epochs = 20
best_f1_score = 0
best_precision = 0
best_recall = 0
best_model_state = {}
for epoch in range(num_epochs):
    model.train()
    # Accumulate the per-batch losses so the epoch log shows the mean training
    # loss rather than the loss of whichever mini-batch happened to come last.
    total_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        input_ids = batch['input_ids']
        labels = batch['labels']

        optimizer.zero_grad()
        logits = model(input_ids)
        # Flatten batch and sequence dimensions so each token is one
        # classification example for the criterion.
        loss = criterion(logits.view(-1, NUM_LABELS), labels.view(-1))
        accelerator.backward(loss)
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
    mean_loss = total_loss / max(num_batches, 1)

    # Validation
    model.eval()
    true_ner_tags = []
    predicted_ner_tags = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids']
            labels = batch['labels']
            logits = model(input_ids)
            predicted_labels = logits.argmax(dim=2)
            # Convert each tensor to nested Python lists once, then cut every
            # row back to its unpadded length before mapping ids to tag names.
            for gold_row, predicted_row, length in zip(
                    labels.tolist(), predicted_labels.tolist(), batch['input_lengths']):
                true_ner_tags.append([tags[index] for index in gold_row[:length]])
                predicted_ner_tags.append([tags[index] for index in predicted_row[:length]])

    # conlleval receives the gold and predicted tags as two flat, aligned streams.
    precision, recall, f1 = evaluate(
        itertools.chain.from_iterable(true_ner_tags),
        itertools.chain.from_iterable(predicted_ner_tags),
    )
    scheduler.step(f1)

    # `not best_model_state` makes sure real weights are captured even when F1
    # never rises above 0, so the saved checkpoint is never an empty dict.
    if f1 > best_f1_score or not best_model_state:
        best_f1_score = f1
        best_precision = precision
        best_recall = recall
        # Deep copy: state_dict() returns references that later epochs would overwrite.
        best_model_state = copy.deepcopy(model.state_dict())
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {mean_loss}, precision: {precision}, recall: {recall}, F1: {f1}")

print(f"Best precision, recall, F1 score is {best_precision}, {best_recall}, {best_f1_score}")
# Store the vocabulary next to the weights so the checkpoint is self-contained.
torch.save({
    'word2idx_glove': word2idx_glove,
    'glove_model_dict': best_model_state
}, 'task2.pt')

processed 51362 tokens with 5942 phrases; found: 5731 phrases; correct: 4726.
accuracy:  79.54%; (non-O)
accuracy:  96.13%; precision:  82.46%; recall:  79.54%; FB1:  80.97
              LOC: precision:  87.39%; recall:  84.10%; FB1:  85.71  1768
             MISC: precision:  72.80%; recall:  64.75%; FB1:  68.54  820
              ORG: precision:  71.99%; recall:  68.61%; FB1:  70.26  1278
              PER: precision:  89.22%; recall:  90.34%; FB1:  89.78  1865
Epoch 1/20, Loss: 0.20355863869190216, precision: 82.46379340429245, recall: 79.53550992931673, F1: 80.97318598475113
processed 51362 tokens with 5942 phrases; found: 5948 phrases; correct: 5039.
accuracy:  85.21%; (non-O)
accuracy:  96.90%; precision:  84.72%; recall:  84.80%; FB1:  84.76
              LOC: precision:  89.64%; recall:  89.98%; FB1:  89.81  1844
             MISC: precision:  81.83%; recall:  70.82%; FB1:  75.93  798
              ORG: precision:  73.01%; recall:  74.42%; FB1:  73.71  1367
              PER: p