# Experiment 4?: Hyperparameter tuning on BERT model


# Requirements / Dependencies


In [None]:
%pip install datasets
%pip install transformers
%pip install spacy


In [None]:
%pip install spacy-transformers
%pip install transformers[torch]
%pip install seqeval

In [None]:
import torch
import torchtext

SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

PyTorch Version:  2.2.1+cu121
torchtext Version:  0.17.1+cpu
Using GPU.


# Load in Dataset (regular coursework version)

In [None]:
from datasets import load_dataset, load_metric
dataset = load_dataset("surrey-nlp/PLOD-CW")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.37k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1072 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/126 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/153 [00:00<?, ? examples/s]

## Defining various helper functions for tokenizing / training


In [None]:
def conv_label_indexes(training, valid, test ):
	label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}

	label_list = []
	for sample in training:
		label_list.append([label_encoding[tag] for tag in sample])

	val_label_list = []
	for sample in valid:
		val_label_list.append([label_encoding[tag] for tag in sample])

	test_label_list = []
	for sample in test:
		test_label_list.append([label_encoding[tag] for tag in sample])
	return label_list, val_label_list, test_label_list





In [None]:
def tokenize_and_align_labels(tokens, tokenizer, list_name):
    tokenized_inputs = tokenizer(tokens, truncation=True, is_split_into_words=True) ## For some models, you may need to set max_length to approximately 500.

    labels = []
    for i, label in enumerate(list_name):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# BERT's tokenizer returns the dataset in the form of a dictionary of lists (sentences).
# we have to convert it into a list of dictionaries for training.
def turn_dict_to_list_of_dict(d):
    new_list = []

    for labels, inputs in zip(d["labels"], d["input_ids"]):
        entry = {"input_ids": inputs, "labels": labels}
        new_list.append(entry)

    return new_list

In [None]:
import numpy as np

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
# Prepare the test data for evaluation in the same format as the training data
def calculate_results(trainer, data):
	predictions, labels, _ = trainer.predict(data)
	predictions = np.argmax(predictions, axis=2)

	# Remove the predictions for the [CLS] and [SEP] tokens 
	true_predictions = [
			[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
			for prediction, label in zip(predictions, labels)
	]
	true_labels = [
			[label_list[l] for (p, l) in zip(prediction, label) if l != -100]
			for prediction, label in zip(predictions, labels)
	]

	# Compute multiple metrics on the test restuls
	results = metric.compute(predictions=true_predictions, references=true_labels)
	return results

#Load BERT model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

  _torch_pytree._register_pytree_node(


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training = dataset["train"]
valid = dataset["validation"]
test = dataset["test"]
training_tokens = training["tokens"]
training_labels = training["ner_tags"]
valid_tokens = valid["tokens"]
valid_labels = valid["ner_tags"]
test_tokens = test["tokens"]
test_labels = test["ner_tags"]

In [None]:
label_list, val_label_list, test_label_list = conv_label_indexes(training_labels, valid_labels, test_labels)

In [None]:
tokenized_datasets = tokenize_and_align_labels(training_tokens, bert_tokenizer, label_list)
tokenized_val_datasets = tokenize_and_align_labels(valid_tokens, bert_tokenizer,  val_label_list)
tokenized_test_datasets = tokenize_and_align_labels(test_tokens, bert_tokenizer, test_label_list)

In [None]:
tokenised_train = turn_dict_to_list_of_dict(tokenized_datasets)
tokenised_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenised_test = turn_dict_to_list_of_dict(tokenized_test_datasets)

In [None]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(bert_tokenizer)

In [None]:
import numpy as np

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


# Training with BERT
Here we fine tune the bert model to the training set

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 100,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    bert_model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator = data_collator,
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [None]:
trainer.train()

In [None]:
results = calculate_results(trainer, tokenised_test)
results

In [None]:
results["overall_f1"]

# Learning rate calculation loop

In [None]:
learning_rates = [2e-5, 2.1e-5, 2.3e-5, 2.7e-5, 2.9e-5, 1.9e-5, 1.7e-5, 1.5e-5, 1.3e-5, 1.1e-5]
best_learning = 0
best_f1 = 0
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5
for i in learning_rates:
  args = TrainingArguments(
      f"BERT-finetuned-NER",
      # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
      evaluation_strategy ='steps',
      eval_steps = 1000,
      save_total_limit = 3,
      learning_rate=i,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      num_train_epochs=epochs,
      weight_decay=0.001,
      save_steps=35000,
      metric_for_best_model = 'f1',
      load_best_model_at_end=True
  )

  trainer = Trainer(
      bert_model,
      args,
      train_dataset=tokenised_train,
      eval_dataset=tokenised_val,
      data_collator = data_collator,
      tokenizer=bert_tokenizer,
      compute_metrics=compute_metrics,
      callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
  )
  trainer.train()
  result = calculate_results(trainer, tokenised_test)
  f1 = result["overall_f1"]
  if f1 > best_f1:
    best_learning = i
    best_f1 = f1


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0013,0.607418,0.759243,0.804202,0.781076,0.935868




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0013,0.7065,0.746643,0.796562,0.770795,0.934647




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.001,0.744601,0.746631,0.793696,0.769444,0.930218




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0025,0.600843,0.773272,0.801337,0.787054,0.93938




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0025,0.600065,0.746296,0.769819,0.757875,0.931135




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0015,0.762254,0.757009,0.773639,0.765234,0.929302




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0011,0.74821,0.734321,0.805158,0.768109,0.928844




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0006,0.710279,0.755773,0.812798,0.783249,0.934494




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0,0.953534,0.753153,0.798472,0.775151,0.932967




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0004,0.852955,0.763451,0.786055,0.774588,0.933883






In [None]:
print(best_learning)
print(best_f1)

2e-05
0.935616108029901


## Results of learning rate testing
After testing different learning rates above, I have seen that F1 score does not vary hugely as learning rate changes with this algorithm.

# Testing different batch sizes
Now that we have discerned that learning rate does not appear to have a huge impact on the fine-tuned BERT models performance, I will now try to 'optimize' another hyperparameter - batch size.
For this I will do less iterations due to the relatively small dataset and already small initial batch size

In [None]:
batches = [2, 3, 4, 5, 6, 7]
best_batch = None
best_f1 = 0
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5
for i in batches:
  args = TrainingArguments(
      f"BERT-finetuned-NER",
      # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
      evaluation_strategy ='steps',
      eval_steps = 1000,
      save_total_limit = 3,
      learning_rate=best_learning,
      per_device_train_batch_size=i,
      per_device_eval_batch_size=i,
      num_train_epochs=epochs,
      weight_decay=0.001,
      save_steps=35000,
      metric_for_best_model = 'f1',
      load_best_model_at_end=True
  )

  trainer = Trainer(
      bert_model,
      args,
      train_dataset=tokenised_train,
      eval_dataset=tokenised_val,
      data_collator = data_collator,
      tokenizer=bert_tokenizer,
      compute_metrics=compute_metrics,
      callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
  )
  trainer.train()
  result = calculate_results(trainer, tokenised_test)
  f1 = result["overall_f1"]
  if f1 > best_f1:
    best_batch = i
    best_f1 = f1


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0067,0.728957,0.754821,0.7851,0.769663,0.932661
2000,0.004,0.686585,0.749097,0.792741,0.770302,0.932661
3000,0.0021,0.682079,0.751111,0.807068,0.778085,0.931593




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0026,0.714718,0.743659,0.784145,0.763366,0.928539
2000,0.0002,0.742892,0.738667,0.793696,0.765193,0.930066




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0029,0.551833,0.734291,0.78128,0.757057,0.929149




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0002,0.749919,0.747126,0.807068,0.775941,0.933272




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0007,0.69402,0.745961,0.793696,0.769088,0.933425




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss




In [None]:
print(best_batch)
print(best_f1)

2
0.935616108029901


# Train model with batch size of 2 and learning rate of 2e-5

In [None]:
from transformers import TrainingArguments, Trainer

batch = 2
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
initial_learning_rate = 2e-5

args = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 1000,
    save_total_limit = 3,
    learning_rate=best_learning,
    per_device_train_batch_size=batch,
    per_device_eval_batch_size=batch,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    warmup_steps = 50
)


trainer = Trainer(
    bert_model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator = data_collator,
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()
result = calculate_results(trainer, tokenised_test)
f1 = result["overall_f1"]
result


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0036,0.73316,0.722272,0.765043,0.743043,0.926554
2000,0.0033,0.844851,0.716007,0.794651,0.753282,0.926096
3000,0.0016,0.664062,0.739209,0.7851,0.761464,0.930066






{'AC': {'precision': 0.7612687813021702,
  'recall': 0.8336380255941499,
  'f1': 0.7958115183246074,
  'number': 547},
 'LF': {'precision': 0.6880466472303207,
  'recall': 0.7814569536423841,
  'f1': 0.731782945736434,
  'number': 302},
 'O': {'precision': 0.9704029024250526,
  'recall': 0.9416342412451362,
  'f1': 0.9558021440662028,
  'number': 5397},
 'overall_precision': 0.9344554134973296,
 'overall_recall': 0.9244316362471982,
 'overall_f1': 0.9294164989939637,
 'overall_accuracy': 0.9230190948729514}

# Result of batch fine tuning
Above we can see the 'preferable' batch value between 1-9 is 2
## Using scheduler for learning rate optimisation
When we 'optimized' our learning rate earlier, we used a constant learning rate instead of a scheduler

In [None]:
from transformers import TrainingArguments, Trainer, get_linear_schedule_with_warmup

from transformers import AdamW


batch = 4
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
initial_learning_rate = 2e-5

args = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 1000,
    save_total_limit = 3,
    learning_rate=best_learning,
    per_device_train_batch_size=batch,
    per_device_eval_batch_size=batch,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    warmup_steps = 50
)

optimizer = AdamW(bert_model.parameters(), lr = initial_learning_rate)
total_steps = (len(tokenised_train) // (batch_size * args.gradient_accumulation_steps)) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=args.warmup_steps,
    num_training_steps=total_steps
)

trainer = Trainer(
    bert_model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator = data_collator,
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics,
    optimizers = (optimizer, scheduler),
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()
result = calculate_results(trainer, tokenised_test)
f1 = result["overall_f1"]
result


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.0017,0.785201,0.714795,0.765998,0.739511,0.924111






{'AC': {'precision': 0.7883597883597884,
  'recall': 0.8171846435100548,
  'f1': 0.8025134649910234,
  'number': 547},
 'LF': {'precision': 0.667590027700831,
  'recall': 0.7980132450331126,
  'f1': 0.726998491704374,
  'number': 302},
 'O': {'precision': 0.9715985415467281,
  'recall': 0.9381137669075412,
  'f1': 0.9545625942684767,
  'number': 5397},
 'overall_precision': 0.9367975240267145,
 'overall_recall': 0.920749279538905,
 'overall_f1': 0.9287040775131207,
 'overall_accuracy': 0.9198616749360998}