# Experiment 4?: Hyperparameter tuning on BERT model


# Requirements / Dependencies


In [None]:
%pip install datasets
%pip install transformers
%pip install spacy


In [None]:
%pip install spacy-transformers
%pip install transformers[torch]
%pip install seqeval

In [3]:
import torch
import torchtext

SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

PyTorch Version:  2.2.1+cu121
torchtext Version:  0.17.1+cpu
Using GPU.


# Load in Dataset (regular coursework version)

In [4]:
from datasets import load_dataset, load_metric
dataset = load_dataset("surrey-nlp/PLOD-CW")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.37k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1072 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/126 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/153 [00:00<?, ? examples/s]

## Defining various helper functions for tokenizing / training


In [5]:
def conv_label_indexes(training, valid, test ):
	label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}

	label_list = []
	for sample in training:
		label_list.append([label_encoding[tag] for tag in sample])

	val_label_list = []
	for sample in valid:
		val_label_list.append([label_encoding[tag] for tag in sample])

	test_label_list = []
	for sample in test:
		test_label_list.append([label_encoding[tag] for tag in sample])
	return label_list, val_label_list, test_label_list





In [6]:
def tokenize_and_align_labels(tokens, tokenizer, list_name):
    tokenized_inputs = tokenizer(tokens, truncation=True, is_split_into_words=True) ## For some models, you may need to set max_length to approximately 500.

    labels = []
    for i, label in enumerate(list_name):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [7]:
# BERT's tokenizer returns the dataset in the form of a dictionary of lists (sentences).
# we have to convert it into a list of dictionaries for training.
def turn_dict_to_list_of_dict(d):
    new_list = []

    for labels, inputs in zip(d["labels"], d["input_ids"]):
        entry = {"input_ids": inputs, "labels": labels}
        new_list.append(entry)

    return new_list

In [8]:
import numpy as np

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [9]:
# Prepare the test data for evaluation in the same format as the training data
def calculate_results(trainer, data):
	predictions, labels, _ = trainer.predict(data)
	predictions = np.argmax(predictions, axis=2)

	# Remove the predictions for the [CLS] and [SEP] tokens
	true_predictions = [
			[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
			for prediction, label in zip(predictions, labels)
	]
	true_labels = [
			[label_list[l] for (p, l) in zip(prediction, label) if l != -100]
			for prediction, label in zip(predictions, labels)
	]

	# Compute multiple metrics on the test restuls
	results = metric.compute(predictions=true_predictions, references=true_labels)
	return results

#Load RoBERTa model

In [10]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the RoBERTa tokenizer
rob_tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

# Load the RoBERTa model for token classification with the desired number of labels
rob_model = AutoModelForTokenClassification.from_pretrained("roberta-base", num_labels=4)




  _torch_pytree._register_pytree_node(


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
training = dataset["train"]
valid = dataset["validation"]
test = dataset["test"]
training_tokens = training["tokens"]
training_labels = training["ner_tags"]
valid_tokens = valid["tokens"]
valid_labels = valid["ner_tags"]
test_tokens = test["tokens"]
test_labels = test["ner_tags"]

In [12]:
label_list, val_label_list, test_label_list = conv_label_indexes(training_labels, valid_labels, test_labels)

In [13]:
tokenized_datasets = tokenize_and_align_labels(training_tokens, rob_tokenizer, label_list)
tokenized_val_datasets = tokenize_and_align_labels(valid_tokens, rob_tokenizer,  val_label_list)
tokenized_test_datasets = tokenize_and_align_labels(test_tokens, rob_tokenizer, test_label_list)

In [14]:
tokenised_train = turn_dict_to_list_of_dict(tokenized_datasets)
tokenised_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenised_test = turn_dict_to_list_of_dict(tokenized_test_datasets)

In [15]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(rob_tokenizer)

In [16]:
import numpy as np

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


# Training with roBERTA
Here we fine tune the roberta model to the training set

In [27]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 100,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    rob_model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator = data_collator,
    tokenizer=rob_tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [29]:
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.19784,0.703734,0.809933,0.753108,0.938621
200,No log,0.193255,0.729073,0.773639,0.750695,0.933428
300,No log,0.202126,0.727438,0.762178,0.744403,0.939723
400,No log,0.183905,0.730769,0.780325,0.754734,0.940038
500,0.193100,0.196047,0.745961,0.793696,0.769088,0.938464
600,0.193100,0.188264,0.756098,0.799427,0.777159,0.944759
700,0.193100,0.18283,0.763251,0.825215,0.793024,0.94649
800,0.193100,0.216929,0.768817,0.819484,0.793343,0.944287
900,0.193100,0.210817,0.736181,0.839542,0.784471,0.942713
1000,0.111300,0.230213,0.752848,0.820439,0.785192,0.943658


TrainOutput(global_step=1608, training_loss=0.11999777330094902, metrics={'train_runtime': 194.5871, 'train_samples_per_second': 33.055, 'train_steps_per_second': 8.264, 'total_flos': 308868558189024.0, 'train_loss': 0.11999777330094902, 'epoch': 6.0})

In [30]:
results = calculate_results(trainer, tokenised_test)
results

{'0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0, 0, 2, 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0]': {'precision': 0.8044280442804428,
  'recall': 0.8164794007490637,
  'f1': 0.8104089219330854,
  'number': 267},
 '0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 1, 0, 0, 0, 0]': {'precision': 0.7089285714285715,
  'recall': 0.7406716417910447,
  'f1': 0.7244525547445255,
  'number': 536},
 '0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0]': {'precision': 0.7361963190184049,
  'recall': 0.8053691275167785,
  'f1': 0.7692307692307692,
  'number': 149},
 '1, 0, 2, 3, 3, 0]': {'precision': 0.6688741721854304,
  'recall': 0.7829457364341085,
  'f1': 0.7214285714285714,
  'number': 129},
 'overall_precision': 0.7301310043668122,
 'overall_recall': 0.7733580018501388,
 'overall_f1': 0.7511230907457322,
 'overall_accuracy': 0.938863287250384}

# Learning rate calculation loop

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the RoBERTa tokenizer
rob_tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

# Load the RoBERTa model for token classification with the desired number of labels
rob_model = AutoModelForTokenClassification.from_pretrained("roberta-base", num_labels=4)



learning_rates = [2e-5, 2.1e-5, 2.3e-5, 2.7e-5, 2.9e-5, 1.9e-5, 1.7e-5, 1.5e-5, 1.3e-5, 1.1e-5]
best_learning = 0
best_f1 = 0
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5
for i in learning_rates:
  args = TrainingArguments(
      f"BERT-finetuned-NER",
      # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
      evaluation_strategy ='steps',
      eval_steps = 1000,
      save_total_limit = 3,
      learning_rate=i,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      num_train_epochs=epochs,
      weight_decay=0.001,
      save_steps=35000,
      metric_for_best_model = 'f1',
      load_best_model_at_end=True
  )

  trainer = Trainer(
      rob_model,
      args,
      train_dataset=tokenised_train,
      eval_dataset=tokenised_val,
      data_collator = data_collator,
      tokenizer=rob_tokenizer,
      compute_metrics=compute_metrics,
      callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
  )
  trainer.train()
  result = calculate_results(trainer, tokenised_test)
  f1 = result["overall_f1"]
  if f1 > best_f1:
    best_learning = i
    best_f1 = f1


In [33]:
print(best_learning)
print(best_f1)

1.7e-05
0.7816399286987522


## Results of learning rate testing
After testing different learning rates above, I have seen that F1 score does not vary hugely as learning rate changes with this algorithm.

# Testing different batch sizes
Now that we have discerned that learning rate does not appear to have a huge impact on the fine-tuned BERT models performance, I will now try to 'optimize' another hyperparameter - batch size.
For this I will do less iterations due to the relatively small dataset and already small initial batch size

In [20]:
from transformers import AutoTokenizer, AutoModelForTokenClassification,TrainingArguments, Trainer, EarlyStoppingCallback

# Load the RoBERTa tokenizer
rob_tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

# Load the RoBERTa model for token classification with the desired number of labels
rob_model = AutoModelForTokenClassification.from_pretrained("roberta-base", num_labels=4)



batches = [2, 4,8]
best_batch = None
best_f1 = 0
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 1.7e-05

for i in batches:
  args = TrainingArguments(
      f"BERT-finetuned-NER",
      # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
      evaluation_strategy ='steps',
      eval_steps = 1000,
      save_total_limit = 3,
      learning_rate=learning_rate,
      per_device_train_batch_size=i,
      per_device_eval_batch_size=i,
      num_train_epochs=epochs,
      weight_decay=0.001,
      save_steps=35000,
      metric_for_best_model = 'f1',
      load_best_model_at_end=True
  )

  trainer = Trainer(
      rob_model,
      args,
      train_dataset=tokenised_train,
      eval_dataset=tokenised_val,
      data_collator = data_collator,
      tokenizer=rob_tokenizer,
      compute_metrics=compute_metrics,
      callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
  )
  trainer.train()
  result = calculate_results(trainer, tokenised_test)
  f1 = result["overall_f1"]
  if f1 > best_f1:
    best_batch = i
    best_f1 = f1


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


OutOfMemoryError: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 73.06 MiB is free. Process 1883 has 14.67 GiB memory in use. Of the allocated memory 14.33 GiB is allocated by PyTorch, and 221.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
print(best_batch)
print(best_f1)

# Train model with batch size of 2 and learning rate of 2e-5

# Result of batch fine tuning
Above we can see the 'preferable' batch value between 1-9 is 2
## Using scheduler for learning rate optimisation
When we 'optimized' our learning rate earlier, we used a constant learning rate instead of a scheduler

In [None]:
from transformers import TrainingArguments, Trainer, get_linear_schedule_with_warmup

from transformers import AdamW


batch = 4
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
initial_learning_rate = 2e-5

args = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 1000,
    save_total_limit = 3,
    learning_rate=best_learning,
    per_device_train_batch_size=batch,
    per_device_eval_batch_size=batch,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    warmup_steps = 50
)

optimizer = AdamW(roberta_model.parameters(), lr = initial_learning_rate)
total_steps = (len(tokenised_train) // (batch_size * args.gradient_accumulation_steps)) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=args.warmup_steps,
    num_training_steps=total_steps
)

trainer = Trainer(
    roberta_model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator = data_collator,
    tokenizer=rob_tokenizer,
    compute_metrics=compute_metrics,
    optimizers = (optimizer, scheduler),
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()
result = calculate_results(trainer, tokenised_test)
f1 = result["overall_f1"]
result


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0017,0.785201,0.714795,0.765998,0.739511,0.924111






{'AC': {'precision': 0.7883597883597884,
  'recall': 0.8171846435100548,
  'f1': 0.8025134649910234,
  'number': 547},
 'LF': {'precision': 0.667590027700831,
  'recall': 0.7980132450331126,
  'f1': 0.726998491704374,
  'number': 302},
 'O': {'precision': 0.9715985415467281,
  'recall': 0.9381137669075412,
  'f1': 0.9545625942684767,
  'number': 5397},
 'overall_precision': 0.9367975240267145,
 'overall_recall': 0.920749279538905,
 'overall_f1': 0.9287040775131207,
 'overall_accuracy': 0.9198616749360998}

In [None]:
from transformers import TrainingArguments, Trainer

batch = 2
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
initial_learning_rate = 2e-5

args = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 1000,
    save_total_limit = 3,
    learning_rate=best_learning,
    per_device_train_batch_size=batch,
    per_device_eval_batch_size=batch,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    warmup_steps = 50
)


trainer = Trainer(
    roberta_model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator = data_collator,
    tokenizer=rob_tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()
result = calculate_results(trainer, tokenised_test)
f1 = result["overall_f1"]
result
