<a href="https://colab.research.google.com/github/jadriant/CSCI544/blob/main/task2_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets accelerate

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip glove.6B.zip

In [None]:
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

# Load Dataset

In [None]:
import datasets

# Load the "conll2003" dataset through the `datasets` library
dataset = datasets.load_dataset(path="conll2003")

# Create the Vocabulary

In [None]:
import itertools
from collections import Counter

# Count every token of the training split, one sentence at a time
token_counts = Counter()
for sentence_tokens in dataset['train']['tokens']:  # type: ignore
    token_counts.update(sentence_tokens)

# Drop words that occur fewer than 3 times
word_frequency = {}
for word, frequency in token_counts.items():
    if frequency >= 3:
        word_frequency[word] = frequency

# Ids 0 and 1 are reserved for the special tokens below, so real words are
# numbered from 2 in the order they were first seen.
word2idx = dict(zip(word_frequency, itertools.count(2)))

word2idx['[PAD]'] = 0
word2idx['[UNK]'] = 1

# Tokenize to ids

In [None]:
def _tokens_to_ids(example):
    """Return an 'input_ids' column for one example; words missing from word2idx get the [UNK] id."""
    unk_id = word2idx['[UNK]']
    return {'input_ids': [word2idx.get(token, unk_id) for token in example['tokens']]}


# Apply the lookup to every example of every split
dataset = dataset.map(_tokens_to_ids)

# Peek at the first three encoded training sentences
dataset['train']['input_ids'][:3]

In [None]:
# The steps below are guarded on the current column names so that this cell
# can be re-run safely: without the guards a second run fails because
# 'ner_tags', 'pos_tags' and 'chunk_tags' no longer exist.

# Rename 'ner_tags' to 'labels'
if "ner_tags" in dataset['train'].column_names:
    dataset = dataset.rename_column("ner_tags", "labels")

# Remove 'pos_tags' and 'chunk_tags' (whichever of them are still present)
columns_to_drop = [
    column
    for column in ("pos_tags", "chunk_tags")
    if column in dataset['train'].column_names
]
if columns_to_drop:
    dataset = dataset.remove_columns(columns_to_drop)

# Check before moving on
print(dataset)

# GloVe Embedding
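- Lookups are case-sensitive first: each vocabulary word is searched in GloVe exactly as written, and its lowercased form is used only when the exact form is missing.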

In [None]:
import numpy as np

# Function to load GloVe embeddings
def load_glove_embeddings(path, word2idx, embedding_dim):
    """Build an embedding matrix for ``word2idx`` from a GloVe text file.

    Each line of the file is expected to be ``word v1 v2 ... vN`` separated by
    whitespace. For every vocabulary word the exact (case-sensitive) GloVe
    entry is used when present, otherwise the entry of the lowercased word.
    Words found under neither form keep an all-zero row.

    Args:
        path: Path to the GloVe text file (read as UTF-8).
        word2idx: Mapping from word to row index in the returned matrix.
        embedding_dim: Number of values expected per GloVe vector.

    Returns:
        A NumPy array of shape ``(len(word2idx), embedding_dim)``.

    Raises:
        ValueError: If a needed GloVe line does not carry exactly
            ``embedding_dim`` values.
    """
    embeddings = np.zeros((len(word2idx), embedding_dim))

    # Only these keys can ever be looked up below, so vectors for all other
    # GloVe words are neither parsed nor kept in memory.
    wanted = set(word2idx)
    wanted.update(word.lower() for word in word2idx)

    glove_index = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at the end of the file)
                continue
            word = values[0]
            if word not in wanted:
                continue
            if len(values) - 1 != embedding_dim:
                raise ValueError(
                    f"{path}:{line_number}: expected {embedding_dim} values for "
                    f"{word!r}, found {len(values) - 1}"
                )
            glove_index[word] = np.asarray(values[1:], dtype='float32')

    # Exact form first, lowercased form as the fallback
    for word, idx in word2idx.items():
        vector = glove_index.get(word)
        if vector is None:
            vector = glove_index.get(word.lower())
        if vector is not None:
            embeddings[idx] = vector

    return embeddings

# Width of the GloVe vectors and the file that holds them
embedding_dim = 100
glove_path = 'glove.6B.100d.txt'

# Build the embedding matrix whose rows follow the ids in word2idx
glove_embeddings = load_glove_embeddings(
    path=glove_path,
    word2idx=word2idx,
    embedding_dim=embedding_dim,
)

## Model Architecture: BiLSTM Model

Embedding dim 100 \
Num LSTM layers 1 \
LSTM hidden dim 256 \
LSTM Dropout 0.33 \
Linear output dim 128

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [None]:
class BiLSTMForNER(nn.Module):
    """BiLSTM token classifier on top of frozen pretrained word embeddings.

    Forward pipeline: embedding -> BiLSTM -> dropout -> linear -> ELU -> classifier.

    Args:
        vocab_size: Number of rows expected in the embedding table.
        embedding_dim: Width of each embedding vector (the LSTM input size).
        lstm_hidden_dim: Width of the concatenated forward+backward LSTM
            output; each direction gets ``lstm_hidden_dim // 2`` units.
        linear_output_dim: Output width of the intermediate linear layer.
        num_labels: Number of classes scored for every token.
        dropout: Dropout probability applied to the BiLSTM output.
        pretrained_embeddings: Array-like of shape ``(vocab_size, embedding_dim)``.

    Raises:
        ValueError: If ``pretrained_embeddings`` does not have that shape.
    """

    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, linear_output_dim, num_labels, dropout, pretrained_embeddings):
        super().__init__()

        weights = torch.as_tensor(pretrained_embeddings, dtype=torch.float32)
        if tuple(weights.shape) != (vocab_size, embedding_dim):
            raise ValueError(
                f"pretrained_embeddings has shape {tuple(weights.shape)}, "
                f"expected {(vocab_size, embedding_dim)}"
            )
        # clone() so the layer never shares memory with the caller's array;
        # freeze=True keeps the pretrained vectors fixed during training.
        self.embedding = nn.Embedding.from_pretrained(weights.detach().clone(), freeze=True)

        # nn.LSTM's own `dropout` argument only acts between stacked layers, so
        # with num_layers=1 it does nothing except emit a warning. The requested
        # dropout is therefore applied explicitly to the BiLSTM output instead.
        self.bilstm = nn.LSTM(embedding_dim, lstm_hidden_dim // 2, num_layers=1,
                              bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(lstm_hidden_dim, linear_output_dim)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(linear_output_dim, num_labels)

    def forward(self, input_ids):
        """Return per-token logits; the last dimension has ``num_labels`` entries."""
        embeddings = self.embedding(input_ids)
        bilstm_output, _ = self.bilstm(embeddings)
        bilstm_output = self.dropout(bilstm_output)
        linear_output = self.elu(self.linear(bilstm_output))
        logits = self.classifier(linear_output)
        return logits

In [None]:
def get_max_label(dataset):
    """Return the number of label ids implied by the data: largest id + 1.

    Every split of ``dataset`` is scanned. Empty label sequences and empty
    splits are skipped instead of raising, and the largest id is never taken
    to be below 0, so the result is at least 1.

    Args:
        dataset: Mapping from split name to an object whose ``['labels']``
            entry is an iterable of per-sentence label-id sequences.

    Returns:
        The largest label id found across all splits, plus one (ids are
        zero-indexed).
    """
    all_labels = (
        label
        for split in dataset
        for labels in dataset[split]['labels']
        for label in labels
    )
    # default=0 covers the case where no label exists at all; the outer max
    # keeps the floor of 0 for data that contains only negative ids.
    max_label = max(max(all_labels, default=0), 0)
    return max_label + 1  # Adding 1 because labels are zero-indexed


# Seed PyTorch's global RNG so that weight initialisation (and anything drawn
# from that RNG afterwards) is repeatable from a fresh kernel.
torch.manual_seed(42)

vocab_size = len(word2idx)
num_labels = get_max_label(dataset)

model = BiLSTMForNER(
    vocab_size=vocab_size,
    embedding_dim=glove_embeddings.shape[1],  # Taken from the matrix itself so the two cannot drift apart
    lstm_hidden_dim=256,  # LSTM hidden layer dimensionality
    linear_output_dim=128,  # Linear layer output dimensionality
    num_labels=num_labels,  # The number of labels in your dataset
    dropout=0.33,  # The dropout rate for LSTM
    pretrained_embeddings=glove_embeddings
)

# Define an optimizer and a loss function.
# Only trainable parameters are handed to Adam; the embedding table is frozen.
optimizer = optim.Adam(
    (parameter for parameter in model.parameters() if parameter.requires_grad),
    lr=0.001,
)
loss_fn = nn.CrossEntropyLoss()

# Checking the number of labels
print(num_labels)

In [None]:
# Function to pad sequences and create TensorDataset
def create_dataset2(input_ids, labels, pad_label_value=-100, pad_token_id=0):
    """Pad token-id and label sequences to a common length and wrap them in a TensorDataset.

    Args:
        input_ids: Sequence of per-sentence token-id sequences.
        labels: Sequence of per-sentence label-id sequences, aligned with
            ``input_ids`` both in count and in per-sentence length.
        pad_label_value: Fill value for padded label positions.
        pad_token_id: Fill value for padded token positions.

    Returns:
        A ``TensorDataset`` of two int64 tensors ``(input_ids, labels)``, each
        of shape ``(number of sentences, longest sentence length)``.

    Raises:
        ValueError: If the number of sentences, or the length of any sentence,
            differs between ``input_ids`` and ``labels``.
    """
    if len(input_ids) != len(labels):
        raise ValueError(
            f"input_ids has {len(input_ids)} sequences but labels has {len(labels)}"
        )

    id_tensors = []
    label_tensors = []
    for position, (ids, tags) in enumerate(zip(input_ids, labels)):
        if len(ids) != len(tags):
            raise ValueError(
                f"sequence {position}: {len(ids)} input ids but {len(tags)} labels"
            )
        # Explicit dtype: an empty sequence would otherwise become a float32
        # tensor and could change the dtype of the whole padded batch.
        id_tensors.append(torch.as_tensor(ids, dtype=torch.long))
        label_tensors.append(torch.as_tensor(tags, dtype=torch.long))

    # Pad the input sequences and the labels
    input_ids_padded = pad_sequence(id_tensors, batch_first=True,
                                    padding_value=pad_token_id)
    labels_padded = pad_sequence(label_tensors, batch_first=True,
                                 padding_value=pad_label_value)
    return TensorDataset(input_ids_padded, labels_padded)

# Mini-batch size shared by both loaders
batch_size = 32

# Padded tensor datasets for the training and validation splits
train_split = dataset['train']
val_split = dataset['validation']
train_dataset = create_dataset2(train_split['input_ids'], train_split['labels'])
val_dataset = create_dataset2(val_split['input_ids'], val_split['labels'])

# Training batches are reshuffled; validation batches keep their order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
# Use CUDA when PyTorch reports a GPU, otherwise stay on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using {device} device")

# The model must live on the same device as the batches fed to it
model.to(device)

num_epochs = 50

for epoch in range(1, num_epochs + 1):
    # --- Training pass: one optimizer step per mini-batch ---
    model.train()
    total_loss = 0
    for input_ids, labels in train_loader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        # Flatten so that every token position is scored as one example
        loss = loss_fn(outputs.view(-1, num_labels), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch} total loss: {total_loss}")

    # --- Validation pass: same loss, no gradients, no weight updates ---
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for input_ids, labels in val_loader:
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            outputs = model(input_ids)
            loss = loss_fn(outputs.view(-1, num_labels), labels.view(-1))
            total_eval_loss += loss.item()
    print(f"Validation loss: {total_eval_loss}")

    # Stop early once more than 10 epochs have run and the summed training
    # loss has dropped below 1.0.
    # NOTE(review): the criterion looks at the training loss only; the
    # validation loss above is printed but does not influence stopping.
    if total_loss < 1.0 and epoch > 10:
      break

In [None]:
# NER tag names; a tag's position in this list is its integer label id
ner_tags = [
    'O',
    'B-PER', 'I-PER',
    'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC',
    'B-MISC', 'I-MISC',
]

# Lookup tables in both directions: id -> tag and tag -> id
index_to_tag = dict(enumerate(ner_tags))
ner_to_index = {tag: index for index, tag in index_to_tag.items()}

In [None]:
from conlleval import evaluate
import itertools

# Label id that marks padded positions: the fill value used for the label
# sequences when they were padded. Those positions must not be scored.
PAD_LABEL_ID = -100

# Evaluation Loop
model.eval()
all_true_tags = []
all_pred_tags = []
with torch.no_grad():
    for batch in val_loader:
        # Transfer batch to the device
        input_ids, labels = batch[0].to(device), batch[1].to(device)

        outputs = model(input_ids)

        # Get the model's predictions (highest-scoring label per token)
        predictions = torch.argmax(outputs, dim=2)

        # Exclude padding from evaluation. Boolean-mask indexing returns the
        # kept entries in row-major order (sentence by sentence, token by
        # token), i.e. the same order a nested loop would visit them, but with
        # one device-to-host transfer per batch instead of one per token.
        keep = labels != PAD_LABEL_ID
        all_true_tags.extend(index_to_tag[label_id] for label_id in labels[keep].tolist())
        all_pred_tags.extend(index_to_tag[label_id] for label_id in predictions[keep].tolist())

# Evaluate with conlleval
prec, rec, f1 = evaluate(all_true_tags, all_pred_tags, verbose=True)
print(f'Precision: {prec}, Recall: {rec}, F1 Score: {f1}')