<a href="https://colab.research.google.com/github/jadriant/CSCI544/blob/main/Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
!pip install datasets accelerate



In [53]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2023-11-10 04:12:44--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-11-10 04:12:44--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-11-10 04:12:44--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.2’


2

In [54]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
replace glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [55]:
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

--2023-11-10 04:18:32--  https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7502 (7.3K) [text/plain]
Saving to: ‘conlleval.py.2’


2023-11-10 04:18:32 (92.2 MB/s) - ‘conlleval.py.2’ saved [7502/7502]



# Load Dataset

In [56]:
import datasets

dataset = datasets.load_dataset("conll2003")

# Create the Vocabulary

In [57]:
import itertools
from collections import Counter

word_frequency = Counter(itertools.chain(*dataset['train']['tokens']))  # type: ignore

# Remove words below threshold 3
word_frequency = {
    word: frequency
    for word, frequency in word_frequency.items()
    if frequency >= 3
}

word2idx = {
    word: index
    for index, word in enumerate(word_frequency.keys(), start=2)
}

word2idx['[PAD]'] = 0
word2idx['[UNK]'] = 1

# Tokenize to ids

In [58]:
dataset = (
    dataset
    .map(lambda x: {
            'input_ids': [
                word2idx.get(word, word2idx['[UNK]'])
                for word in x['tokens']
            ]
        }
    )
)

dataset['train']['input_ids'][:3]

[[2, 1, 3, 4, 5, 6, 7, 8, 9], [10, 11], [12, 13]]

In [59]:
# Rename 'ner_tags' to labels
dataset = dataset.rename_column("ner_tags", "labels")

# Remove 'pos_tag' and 'chunk_tags'
dataset = dataset.remove_columns(["pos_tags", "chunk_tags"])

# Check before moving on
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 3453
    })
})


## Model Architecture: BiLSTM Model

Embedding dim 100 \
Num LSTM layers 1 \
LSTM hidden dim 256 \
LSTM Dropout 0.33 \
Linear output dim 128

In [60]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [61]:
class BiLSTMForNER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, linear_output_dim, num_labels, dropout=0.33):
        super(BiLSTMForNER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, lstm_hidden_dim // 2, num_layers=1,
                              bidirectional=True, batch_first=True, dropout=dropout)
        self.linear = nn.Linear(lstm_hidden_dim, linear_output_dim)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(linear_output_dim, num_labels)

    def forward(self, input_ids):
        embeddings = self.embedding(input_ids)
        bilstm_output, _ = self.bilstm(embeddings)
        linear_output = self.elu(self.linear(bilstm_output))
        logits = self.classifier(linear_output)
        return logits

In [62]:
# unique_input_ids = set()
# for split in ['train', 'validation', 'test']:
#     for input_id_list in dataset[split]['input_ids']:
#         unique_input_ids.update(input_id_list)

def get_max_label(dataset):
    max_label = 0
    for split in dataset:
        split_max = max([max(labels) for labels in dataset[split]['labels']])
        max_label = max(max_label, split_max)
    return max_label + 1  # Adding 1 because labels are zero-indexed


vocab_size = len(word2idx)
num_labels = get_max_label(dataset)

model = BiLSTMForNER(
    vocab_size,
    embedding_dim=100,
    lstm_hidden_dim=256,
    linear_output_dim=128,
    num_labels=num_labels,
    dropout=0.33
    )

# Define an optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

# Checking the number of labels
print(num_labels)

9




In [70]:
# Function to pad sequences and create TensorDataset
def create_dataset2(input_ids, labels, pad_label_value=-100):
    # Pad the input sequences and the labels
    input_ids_padded = pad_sequence([torch.tensor(s) for s in input_ids],
                                    batch_first=True, padding_value=0)
    labels_padded = pad_sequence([torch.tensor(l) for l in labels],
                                 batch_first=True, padding_value=pad_label_value)
    return TensorDataset(input_ids_padded, labels_padded)

train_dataset = create_dataset2(dataset['train']['input_ids'], dataset['train']['labels'])
val_dataset = create_dataset2(dataset['validation']['input_ids'], dataset['validation']['labels'])

# Create DataLoader objects
batch_size = 32
num_epochs = 40
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [73]:
# Set the device to GPU (cuda) if available, otherwise stick with CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

# Move the model to the specified device
model.to(device)


# Training Loop
for epoch in range(1, num_epochs + 1):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Transfer batch to the device
        input_ids, labels = batch[0].to(device), batch[1].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = loss_fn(outputs.view(-1, num_labels), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch} total loss: {total_loss}")

    # Evaluation Loop
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            # Transfer batch to the device
            input_ids, labels = batch[0].to(device), batch[1].to(device)

            outputs = model(input_ids)
            loss = loss_fn(outputs.view(-1, num_labels), labels.view(-1))
            total_eval_loss += loss.item()
    print(f"Validation loss: {total_eval_loss}")

    if epoch > 10 and float(total_loss) < 1.0:
      break

Using cuda device
Epoch 1 total loss: 0.7959661375311953
Validation loss: 41.22647560811069
Epoch 2 total loss: 0.8046723342886253
Validation loss: 38.85210501128768
Epoch 3 total loss: 0.7769911385724981
Validation loss: 39.608064562282834
Epoch 4 total loss: 0.807534541399832
Validation loss: 40.8617992223962
Epoch 5 total loss: 0.8615019235458021
Validation loss: 41.15451389513774
Epoch 6 total loss: 2.5866465404233168
Validation loss: 37.80479847879906
Epoch 7 total loss: 1.987935914317859
Validation loss: 37.034843524097596
Epoch 8 total loss: 0.8977204087004793
Validation loss: 37.86874765544053
Epoch 9 total loss: 0.7332256262634473
Validation loss: 38.85467227516983
Epoch 10 total loss: 0.7119645048001075
Validation loss: 39.50379347663966
Epoch 11 total loss: 0.7155515567244493
Validation loss: 39.47706316190761


In [None]:
# Define the list of NER tags
ner_tags = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

# Create ner_to_index and index_to_tag dictionaries
ner_to_index = {tag: index for index, tag in enumerate(ner_tags)}
index_to_tag = {index: tag for tag, index in ner_to_index.items()}

In [74]:
from conlleval import evaluate
import itertools

# Evaluation Loop
model.eval()
all_true_tags = []
all_pred_tags = []
with torch.no_grad():
    for batch in val_loader:
        # Transfer batch to the device
        input_ids, labels = batch[0].to(device), batch[1].to(device)

        outputs = model(input_ids)

        # Get the model's predictions
        predictions = torch.argmax(outputs, dim=2)

        # Exclude padding from evaluation
        for i in range(labels.size(0)):  # Batch size
            true_labels = labels[i]
            pred_labels = predictions[i]
            for j in range(true_labels.size(0)):  # Sequence length
                if true_labels[j] != -100:  # Assuming -100 is used for padding
                    true_tag = index_to_tag[true_labels[j].item()]
                    pred_tag = index_to_tag[pred_labels[j].item()]
                    all_true_tags.append(true_tag)
                    all_pred_tags.append(pred_tag)

# Evaluate with conlleval
prec, rec, f1 = evaluate(all_true_tags, all_pred_tags, verbose=True)
print(f'Precision: {prec}, Recall: {rec}, F1 Score: {f1}')

processed 51362 tokens with 5942 phrases; found: 5723 phrases; correct: 4581.
accuracy:  79.41%; (non-O)
accuracy:  95.71%; precision:  80.05%; recall:  77.10%; FB1:  78.54
              LOC: precision:  87.85%; recall:  85.03%; FB1:  86.42  1778
             MISC: precision:  76.28%; recall:  74.30%; FB1:  75.27  898
              ORG: precision:  75.24%; recall:  70.25%; FB1:  72.66  1252
              PER: precision:  77.55%; recall:  75.57%; FB1:  76.55  1795
Precision: 80.0454307181548, Recall: 77.09525412319084, F1 Score: 78.54264894984998
