<a href="https://colab.research.google.com/github/jadriant/CSCI544/blob/main/Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets accelerate

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip glove.6B.zip

In [None]:
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

# Load Dataset

In [5]:
import datasets

dataset = datasets.load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

# Create the Vocabulary

In [6]:
import itertools
from collections import Counter

word_frequency = Counter(itertools.chain(*dataset['train']['tokens']))  # type: ignore

# Remove words below threshold 3
word_frequency = {
    word: frequency
    for word, frequency in word_frequency.items()
    if frequency >= 3
}

word2idx = {
    word: index
    for index, word in enumerate(word_frequency.keys(), start=2)
}

word2idx['[PAD]'] = 0
word2idx['[UNK]'] = 1

# Tokenize to ids

In [7]:
dataset = (
    dataset
    .map(lambda x: {
            'input_ids': [
                word2idx.get(word, word2idx['[UNK]'])
                for word in x['tokens']
            ]
        }
    )
)

dataset['train']['input_ids'][:3]

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

[[2, 1, 3, 4, 5, 6, 7, 8, 9], [10, 11], [12, 13]]

In [8]:
# Rename 'ner_tags' to labels
dataset = dataset.rename_column("ner_tags", "labels")

# Remove 'pos_tag' and 'chunk_tags'
dataset = dataset.remove_columns(["pos_tags", "chunk_tags"])

# Check before moving on
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 3453
    })
})


## Model Architecture: BiLSTM Model

Embedding dim 100 \
Num LSTM layers 1 \
LSTM hidden dim 256 \
LSTM Dropout 0.33 \
Linear output dim 128

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [10]:
class BiLSTMForNER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, linear_output_dim, num_labels, dropout=0.33):
        super(BiLSTMForNER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, lstm_hidden_dim // 2, num_layers=1,
                              bidirectional=True, batch_first=True, dropout=dropout)
        self.linear = nn.Linear(lstm_hidden_dim, linear_output_dim)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(linear_output_dim, num_labels)

    def forward(self, input_ids):
        embeddings = self.embedding(input_ids)
        bilstm_output, _ = self.bilstm(embeddings)
        linear_output = self.elu(self.linear(bilstm_output))
        logits = self.classifier(linear_output)
        return logits

In [38]:
# unique_input_ids = set()
# for split in ['train', 'validation', 'test']:
#     for input_id_list in dataset[split]['input_ids']:
#         unique_input_ids.update(input_id_list)

def get_max_label(dataset):
    max_label = 0
    for split in dataset:
        split_max = max([max(labels) for labels in dataset[split]['labels']])
        max_label = max(max_label, split_max)
    return max_label + 1  # Adding 1 because labels are zero-indexed


vocab_size = len(word2idx)
num_labels = get_max_label(dataset)
model = BiLSTMForNER(vocab_size, embedding_dim=100, lstm_hidden_dim=256,
                     linear_output_dim=128, num_labels=num_labels, dropout=0.33)

# Define an optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# You should also define a loss function, such as CrossEntropyLoss
loss_fn = nn.CrossEntropyLoss()

# Checking the number of labels
print(num_labels)

9


In [43]:
# Function to pad sequences and create TensorDataset
def create_dataset2(input_ids, labels, pad_label_value=-100):
    # Pad the input sequences and the labels
    input_ids_padded = pad_sequence([torch.tensor(s) for s in input_ids],
                                    batch_first=True, padding_value=0)
    labels_padded = pad_sequence([torch.tensor(l) for l in labels],
                                 batch_first=True, padding_value=pad_label_value)
    return TensorDataset(input_ids_padded, labels_padded)

train_dataset = create_dataset2(dataset['train']['input_ids'], dataset['train']['labels'])
val_dataset = create_dataset2(dataset['validation']['input_ids'], dataset['validation']['labels'])

# Create DataLoader objects
batch_size = 32
num_epochs = 10
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [44]:
# Set the device to GPU (cuda) if available, otherwise stick with CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

# Move the model to the specified device
model.to(device)


# Training Loop
for epoch in range(1, num_epochs + 1):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Transfer batch to the device
        input_ids, labels = batch[0].to(device), batch[1].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = loss_fn(outputs.view(-1, num_labels), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch} total loss: {total_loss}")

    # Evaluation Loop
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            # Transfer batch to the device
            input_ids, labels = batch[0].to(device), batch[1].to(device)

            outputs = model(input_ids)
            loss = loss_fn(outputs.view(-1, num_labels), labels.view(-1))
            total_eval_loss += loss.item()
    print(f"Validation loss: {total_eval_loss}")

Using cuda device
Epoch 1 total loss: 1.8564759304535983
Validation loss: 36.219076077918544
Epoch 2 total loss: 1.694969797750673
Validation loss: 33.84328517808581
Epoch 3 total loss: 1.5344973409628437
Validation loss: 37.34701970632159
Epoch 4 total loss: 1.3758148565011652
Validation loss: 35.17377464380297
Epoch 5 total loss: 1.892049454516382
Validation loss: 35.0866580131551
Epoch 6 total loss: 2.4993663750174164
Validation loss: 35.318455004993666
Epoch 7 total loss: 2.269344789405295
Validation loss: 36.574614559823885
Epoch 8 total loss: 1.5959238123577961
Validation loss: 37.843096348896324
Epoch 9 total loss: 1.2776238178876156
Validation loss: 37.01592907751274
Epoch 10 total loss: 1.1379936382022606
Validation loss: 38.452581290628586


In [45]:
# Define the list of NER tags
ner_tags = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

# Create ner_to_index and index_to_tag dictionaries
ner_to_index = {tag: index for index, tag in enumerate(ner_tags)}
index_to_tag = {index: tag for tag, index in ner_to_index.items()}

In [46]:
from conlleval import evaluate
import itertools

# Evaluation Loop
model.eval()
all_true_tags = []
all_pred_tags = []
with torch.no_grad():
    for batch in val_loader:
        # Transfer batch to the device
        input_ids, labels = batch[0].to(device), batch[1].to(device)

        outputs = model(input_ids)

        # Get the model's predictions
        predictions = torch.argmax(outputs, dim=2)

        # Exclude padding from evaluation
        for i in range(labels.size(0)):  # Batch size
            true_labels = labels[i]
            pred_labels = predictions[i]
            for j in range(true_labels.size(0)):  # Sequence length
                if true_labels[j] != -100:  # Assuming -100 is used for padding
                    true_tag = index_to_tag[true_labels[j].item()]
                    pred_tag = index_to_tag[pred_labels[j].item()]
                    all_true_tags.append(true_tag)
                    all_pred_tags.append(pred_tag)

# Evaluate with conlleval
prec, rec, f1 = evaluate(all_true_tags, all_pred_tags, verbose=True)
print(f'Precision: {prec}, Recall: {rec}, F1 Score: {f1}')

processed 51362 tokens with 5942 phrases; found: 5623 phrases; correct: 4502.
accuracy:  78.34%; (non-O)
accuracy:  95.57%; precision:  80.06%; recall:  75.77%; FB1:  77.86
              LOC: precision:  85.16%; recall:  83.40%; FB1:  84.27  1799
             MISC: precision:  83.09%; recall:  73.54%; FB1:  78.02  816
              ORG: precision:  72.97%; recall:  68.23%; FB1:  70.52  1254
              PER: precision:  78.51%; recall:  74.76%; FB1:  76.59  1754
Precision: 80.0640227636493, Recall: 75.76573544261191, F1 Score: 77.85559878945094
