

# Experiment 1: Comparing BERT to ROBERTA model in both pre-trained and fine-tuned form



# Install dependencies

In [None]:
%pip install datasets
%pip install transformers
%pip install spacy
%pip install spacy-transformers
%pip install transformers[torch]
%pip install seqeval

In [None]:
!pip install datasets
!pip install seqeval

In [20]:
import torch
import torchtext

SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

PyTorch Version:  2.2.1+cu121
torchtext Version:  0.17.1+cpu
Using GPU.


In [None]:
from datasets import load_dataset, load_metric
dataset = load_dataset("surrey-nlp/PLOD-CW")

In [21]:
def conv_label_indexes(training, valid, test ):
	label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}

	label_list = []
	for sample in training:
		label_list.append([label_encoding[tag] for tag in sample])

	val_label_list = []
	for sample in valid:
		val_label_list.append([label_encoding[tag] for tag in sample])

	test_label_list = []
	for sample in test:
		test_label_list.append([label_encoding[tag] for tag in sample])
	return label_list, val_label_list, test_label_list





In [22]:
def tokenize_and_align_labels(dataset, tokenizer, label_list):
    tokenized_inputs = tokenizer(dataset["tokens"], truncation=True, is_split_into_words=True) ## For some models, you may need to set max_length to approximately 500.

    labels = []
    for i, label in enumerate(label_list):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [23]:
# BERT's tokenizer returns the dataset in the form of a dictionary of lists (sentences).
# we have to convert it into a list of dictionaries for training.
def turn_dict_to_list_of_dict(d):
    new_list = []

    for labels, inputs in zip(d["labels"], d["input_ids"]):
        entry = {"input_ids": inputs, "labels": labels}
        new_list.append(entry)

    return new_list

In [24]:
import numpy as np

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [31]:
import numpy as np
label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}
inverse_label_map = {v: k for k, v in label_encoding.items()}

def calculate_results(trainer, dataset):

    predictions, labels, _ = trainer.predict(dataset)
    predictions = np.argmax(predictions, axis=2)

    textual_true_predictions = [
        [inverse_label_map[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    textual_true_labels = [
        [inverse_label_map[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=textual_true_predictions, references=textual_true_labels)

    return results



# Load BERT model
The first stage of this experiment is to view the performance of the BERT model in both its pre-trained and fine-tuned (transfer learning) form\
The BERT model uses the Encoder part of the transformer architecture, and its bidirectional nature makes it suitable for token classification

In [26]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#Pre-process data (like in other experiments)


In [27]:
training = dataset["train"]
valid = dataset["validation"]
test = dataset["test"]

training_labels = training["ner_tags"]
valid_labels = valid["ner_tags"]
test_labels = test["ner_tags"]

In [28]:
#convert label lists to indexes
label_list, val_label_list, test_label_list = conv_label_indexes(training_labels, valid_labels, test_labels)


tokenized_train = tokenize_and_align_labels(training, bert_tokenizer, label_list)
tokenized_val_datasets = tokenize_and_align_labels(valid, bert_tokenizer, val_label_list)
tokenized_test_datasets = tokenize_and_align_labels(test, bert_tokenizer, test_label_list)

tokenized_train = turn_dict_to_list_of_dict(tokenized_train)
tokenized_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenized_test = turn_dict_to_list_of_dict(tokenized_test_datasets)

from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(bert_tokenizer)


# Setup trainer, don't train but use to run test set against pre-trained model

In [29]:

from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 7000,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    bert_model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator = data_collator,
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [33]:
results = calculate_results(trainer, tokenized_test)
results



{'AC': {'precision': 0.09341368919500646,
  'recall': 0.396709323583181,
  'f1': 0.15121951219512197,
  'number': 547},
 'LF': {'precision': 0.018369175627240143,
  'recall': 0.1357615894039735,
  'f1': 0.03235990528808209,
  'number': 302},
 'O': {'precision': 0.8756046993780235,
  'recall': 0.23476005188067445,
  'f1': 0.37025131502045583,
  'number': 5397},
 'overall_precision': 0.25408197267577476,
 'overall_recall': 0.24415626000640409,
 'overall_f1': 0.24902024820378837,
 'overall_accuracy': 0.25063900165388664}

# Results of pre-trained BERT
From the above f1 score of 40%, we can see that pre-trained BERT without fine tuning does not perform well for our token classification task

# Fine tune BERT and run test set against it

In [34]:
trainer.train()


Step,Training Loss,Validation Loss


TrainOutput(global_step=1608, training_loss=0.16912953829883937, metrics={'train_runtime': 188.3494, 'train_samples_per_second': 34.149, 'train_steps_per_second': 8.537, 'total_flos': 278431018433184.0, 'train_loss': 0.16912953829883937, 'epoch': 6.0})

In [35]:
results = calculate_results(trainer, tokenized_test)
results




{'AC': {'precision': 0.7469458987783595,
  'recall': 0.7824497257769653,
  'f1': 0.7642857142857142,
  'number': 547},
 'LF': {'precision': 0.6897590361445783,
  'recall': 0.7582781456953642,
  'f1': 0.722397476340694,
  'number': 302},
 'O': {'precision': 0.9656853553347085,
  'recall': 0.9542338336112656,
  'f1': 0.9599254426840633,
  'number': 5397},
 'overall_precision': 0.9309073420968259,
 'overall_recall': 0.9297150176112712,
 'overall_f1': 0.9303107978212112,
 'overall_accuracy': 0.9272289881220869}

# Roberta

Above we can see that fine tuned BERT is much better

# Run test set against model before fine tuning
We do this to gauge the performance of the models pre-trained state, i.e. testing how good it is for generalised use cases

In [39]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the RoBERTa tokenizer
rob_tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

# Load the RoBERTa model for token classification with the desired number of labels
pretrained_rob_model = AutoModelForTokenClassification.from_pretrained("roberta-base", num_labels=4)




Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
#tokenize training set
label_list, val_label_list, test_label_list = conv_label_indexes(training_labels, valid_labels, test_labels)
tokenized_train = tokenize_and_align_labels(training, rob_tokenizer, label_list)
tokenized_val_datasets = tokenize_and_align_labels(valid, rob_tokenizer,  val_label_list)
tokenized_test_datasets = tokenize_and_align_labels(test, rob_tokenizer, test_label_list)

tokenized_train = turn_dict_to_list_of_dict(tokenized_train)
tokenized_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenized_test = turn_dict_to_list_of_dict(tokenized_test_datasets)

from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(rob_tokenizer)

# Pre trained ROBERTA performance

In [41]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "roberta-base"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args = TrainingArguments(
    f"ROBERTA-finedtuned-ner",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 100,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    pretrained_rob_model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator = data_collator,
    tokenizer=rob_tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

pretrain_results = calculate_results(trainer, tokenized_test)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
pretrain_results

{'AC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 559},
 'LF': {'precision': 0.04336043360433604,
  'recall': 0.05517241379310345,
  'f1': 0.048558421851289835,
  'number': 290},
 'O': {'precision': 0.807876882343207,
  'recall': 0.925673113386424,
  'f1': 0.8627728196518513,
  'number': 5274},
 'overall_precision': 0.7620974015870546,
 'overall_recall': 0.7999346725461375,
 'overall_f1': 0.7805577689243028,
 'overall_accuracy': 0.7554531490015362}

# Fine tune ROBERTA with training and look at result

In [42]:
trainer.train()


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.232008,0.573084,0.614136,0.5929,0.909663
200,No log,0.197183,0.619614,0.796562,0.697033,0.928549
300,No log,0.184607,0.685545,0.774594,0.727354,0.934057
400,No log,0.189672,0.729901,0.771729,0.750232,0.937834
500,0.291700,0.194144,0.719364,0.734479,0.726843,0.931539
600,0.291700,0.204416,0.709091,0.819484,0.760301,0.934372
700,0.291700,0.198121,0.722222,0.819484,0.767785,0.939408
800,0.291700,0.220244,0.748815,0.754537,0.751665,0.935159
900,0.291700,0.252546,0.753861,0.745941,0.74988,0.932798
1000,0.140200,0.200415,0.762072,0.829035,0.794145,0.942713


TrainOutput(global_step=1608, training_loss=0.1671992943654606, metrics={'train_runtime': 193.7732, 'train_samples_per_second': 33.193, 'train_steps_per_second': 8.298, 'total_flos': 270367418028384.0, 'train_loss': 0.1671992943654606, 'epoch': 6.0})

In [43]:
results = calculate_results(trainer, tokenized_test)
results

{'AC': {'precision': 0.8464285714285714,
  'recall': 0.8479427549194991,
  'f1': 0.8471849865951743,
  'number': 559},
 'LF': {'precision': 0.7354838709677419,
  'recall': 0.7862068965517242,
  'f1': 0.76,
  'number': 290},
 'O': {'precision': 0.972259422230725,
  'recall': 0.9635949943117179,
  'f1': 0.9679078183030188,
  'number': 5274},
 'overall_precision': 0.948663277021486,
 'overall_recall': 0.9446349828515433,
 'overall_f1': 0.9466448445171849,
 'overall_accuracy': 0.9419354838709677}

# Results of fine-tuned ROBERTA Model
Performance of fine tuned roberta is very similar to that of the fine-tuned BERT model.


*   BERT (fine tuned) F1 score: 0.9303741687364795
*   ROBERTA (fine tuned) F1 score: 0.9421501147164864
The difference is negligible, but Roberta is more computationally expensive to train / fine tune.
It is worth noting that ROBERTA pre-trained out performs pre-trained BERT with an f1 score of


*   BERT (pre-trained): 0.4012605042016807
*   ROBERTA (pre-trained) 0.7805577689243028



