## Install Dependencies


In [None]:
%pip install torch==1.11.0+cu113 torchdata==0.3.0 torchtext==0.12.0 -f https://download.pytorch.org/whl/cu113/torch_stable.html
%pip install datasets spacy tqdm

**CHECKING VERSIONS**


In [4]:
import torch
import torchtext

SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

PyTorch Version:  2.2.0+cpu
torchtext Version:  0.16.2+cpu
Using CPU.


In [5]:
from datasets import load_dataset, load_metric
dataset = load_dataset("surrey-nlp/PLOD-CW")

# Experiment 1: Pre-trained BERT model vs fine-tuned 
In this first experiment I am going to use the BERT model from the 'transformers' library \
The point of this experiment is to see the difference in performance between a pre-trained model and a fine-tuned model.



In [7]:
#start by fetching the training part of the dataset
X_training = dataset["train"]
X_training

Dataset({
    features: ['tokens', 'pos_tags', 'ner_tags'],
    num_rows: 1072
})

In [8]:
example = X_training[0]
print(example['tokens'])
print(example['pos_tags'])
print(example['ner_tags'])


['For', 'this', 'purpose', 'the', 'Gothenburg', 'Young', 'Persons', 'Empowerment', 'Scale', '(', 'GYPES', ')', 'was', 'developed', '.']
['ADP', 'DET', 'NOUN', 'DET', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'AUX', 'VERB', 'PUNCT']
['B-O', 'B-O', 'B-O', 'B-O', 'B-LF', 'I-LF', 'I-LF', 'I-LF', 'I-LF', 'B-O', 'B-AC', 'B-O', 'B-O', 'B-O', 'B-O']


## Load BERT model and Tokenizer from 'transformers' library

In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
#seperate different splits of datasets
short_dataset = dataset["train"][:200]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

# Testing the model without fine-tuning
I first want to evaluate the performance of the pre-trained BERT model from HuggingFace \
We must first tokenize the test data

In [11]:
#tokenizing the test data
tokenized_test = tokenizer(test_dataset["tokens"], is_split_into_words=True)
print(tokenized_test)
for token in tokenized_test["input_ids"]:
  print(tokenizer.convert_ids_to_tokens(token))
  break

{'input_ids': [[101, 22498, 2015, 1024, 20296, 1010, 3795, 4607, 2594, 4800, 13013, 2121, 2817, 1025, 21722, 1010, 18834, 11733, 3064, 5301, 6770, 1012, 102], [101, 12884, 2015, 2013, 1042, 24759, 2278, 28406, 2020, 5845, 2007, 2474, 6633, 19968, 2072, 17698, 1031, 6445, 1033, 2007, 2184, 3461, 1015, 1010, 1018, 1011, 4487, 15222, 14573, 2890, 9956, 2140, 1006, 26718, 2102, 1007, 1998, 9685, 2005, 1019, 1049, 2012, 5594, 1080, 1039, 2059, 16578, 2006, 1037, 1018, 1003, 2000, 2321, 1003, 12532, 16778, 11231, 3560, 17371, 2015, 21500, 2007, 1037, 1020, 1003, 9991, 2075, 21500, 2448, 2012, 17093, 4860, 2012, 1037, 5377, 2531, 1058, 1012, 2048, 4958, 8939, 24587, 22330, 18715, 10586, 2060, 2084, 6335, 22394, 1010, 6335, 17788, 1010, 1998, 15177, 7712, 2358, 21716, 2389, 1048, 24335, 8458, 7361, 10448, 20624, 2078, 1006, 24529, 14277, 1007, 2024, 2124, 2000, 20544, 6335, 2278, 2475, 1999, 1996, 11192, 1031, 2570, 1010, 2484, 1033, 1012, 102], [101, 2057, 2764, 1037, 8349, 1997, 4962, 2275, 

### Pre-processing the PLOD labels

In [12]:
label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}

label_list = []
for sample in short_dataset["ner_tags"]:
    label_list.append([label_encoding[tag] for tag in sample])

val_label_list = []
for sample in val_dataset["ner_tags"]:
    val_label_list.append([label_encoding[tag] for tag in sample])

test_label_list = []
for sample in test_dataset["ner_tags"]:
    test_label_list.append([label_encoding[tag] for tag in sample])


Because we used the BERT tokenizer (which inserts special tokens) we have to go through and re-align the labels

In [13]:
def tokenize_and_align_labels(short_dataset, list_name):
    tokenized_inputs = tokenizer(short_dataset["tokens"], truncation=True, is_split_into_words=True) ## For some models, you may need to set max_length to approximately 500.

    labels = []
    for i, label in enumerate(list_name):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [33]:
tokenized_train = tokenize_and_align_labels(short_dataset, label_list)
tokenized_val = tokenize_and_align_labels(val_dataset, val_label_list)
tokenized_testset = tokenize_and_align_labels(test_dataset, test_label_list)


In [34]:
# BERT's tokenizer returns the dataset in the form of a dictionary of lists (sentences). 
# we have to convert it into a list of dictionaries for training.
def turn_dict_to_list_of_dict(d):
    new_list = []

    for labels, inputs in zip(d["labels"], d["input_ids"]):
        entry = {"input_ids": inputs, "labels": labels}
        new_list.append(entry)

    return new_list

In [35]:
tokenized_train = turn_dict_to_list_of_dict(tokenized_train)
tokenized_val = turn_dict_to_list_of_dict(tokenized_val)
tokenised_test = turn_dict_to_list_of_dict(tokenized_testset)

In [36]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

### 4. Training and metrics


In [37]:
import numpy as np

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


## Testing pre-trained model against the test set
I define a trainer for the model. Note that for testing the pre-trained model, I will not be running the training loop, and only using it to evaluate.

In [40]:

from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 7000,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator = data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Evaluating our pre-trained (non fine-tuned) BERT model against test set
At this point we have not fine-tuned the model, and therefore can expect bad performance from the model

In [41]:
# Prepare the test data for evaluation in the same format as the training data

predictions, labels, _ = trainer.predict(tokenised_test)
predictions = np.argmax(predictions, axis=2)

# Remove the predictions for the [CLS] and [SEP] tokens 
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

# Compute multiple metrics on the test restuls
results = metric.compute(predictions=true_predictions, references=true_labels)
results

100%|██████████| 39/39 [00:12<00:00,  3.15it/s]


{'0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0, 0, 2, 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0]': {'precision': 0.036281179138321996,
  'recall': 0.1797752808988764,
  'f1': 0.060377358490566045,
  'number': 267},
 '0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 1, 0, 0, 0, 0]': {'precision': 0.026068821689259645,
  'recall': 0.04664179104477612,
  'f1': 0.033444816053511704,
  'number': 536},
 '0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0]': {'precision': 0.00596252129471891,
  'recall': 0.04697986577181208,
  'f1': 0.010582010582010583,
  'number': 149},
 '1, 0, 2, 3, 3, 0]': {'precision': 0.0012919896640826874,
  'recall': 0.007751937984496124,
  'f1': 0.00221483942414175,
  'number': 129},
 'overall_precision': 0.019148936170212766,
 'overall_recall': 0.07493061979648474,
 'overall_f1': 0.030502730182639805,
 'overall_accuracy': 0.25063900165388664}

# Results of pre-trained model
From the accuracy above, we can see that the pre-trained model is not very accurate at making predictions. For this reason, we want to use fine tuning to suit it more to our use case. \
We already have testing and validation sets formatted from when we setup the test set for testing.



## Fine tuning the model with the training data

In [42]:
#To fine tune the model, we just train the model with dataset
trainer.train()


100%|██████████| 300/300 [06:57<00:00,  1.39s/it]

{'train_runtime': 417.5803, 'train_samples_per_second': 2.874, 'train_steps_per_second': 0.718, 'train_loss': 0.24068687438964845, 'epoch': 6.0}





TrainOutput(global_step=300, training_loss=0.24068687438964845, metrics={'train_runtime': 417.5803, 'train_samples_per_second': 2.874, 'train_steps_per_second': 0.718, 'train_loss': 0.24068687438964845, 'epoch': 6.0})

## Now that we have trained the model with our dataset, it is 'fine-tuned' for our use case. We can expect much higher accuracy when we run our test set against it now.

In [43]:
#Once training is finished, we can re-run predictions for the test set and then measure the accuracy again. We expect much higher accuracy this time
# Prepare the test data for evaluation in the same format as the training data

predictions, labels, _ = trainer.predict(tokenised_test)
predictions = np.argmax(predictions, axis=2)

# Remove the predictions for the [CLS] and [SEP] tokens 
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

# Compute multiple metrics on the test restuls
results = metric.compute(predictions=true_predictions, references=true_labels)
results


100%|██████████| 39/39 [00:13<00:00,  2.99it/s]


{'0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0, 0, 2, 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0]': {'precision': 0.6371951219512195,
  'recall': 0.7827715355805244,
  'f1': 0.7025210084033614,
  'number': 267},
 '0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 1, 0, 0, 0, 0]': {'precision': 0.5723270440251572,
  'recall': 0.6791044776119403,
  'f1': 0.621160409556314,
  'number': 536},
 '0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0]': {'precision': 0.5511363636363636,
  'recall': 0.6510067114093959,
  'f1': 0.5969230769230769,
  'number': 149},
 '1, 0, 2, 3, 3, 0]': {'precision': 0.4968944099378882,
  'recall': 0.6201550387596899,
  'f1': 0.5517241379310344,
  'number': 129},
 'overall_precision': 0.5764796310530361,
 'overall_recall': 0.6938020351526365,
 'overall_f1': 0.6297229219143577,
 'overall_accuracy': 0.914298601714028}