# Experiment 3: Hyper-parameter optimisation
In this experiment I am going to experiment with various different optimisers and loss functions with the same BERT model from the previous experiment \
As shown in the last experiment, it is a good idea to start by fine-tuning the model to our dataset


In [None]:
%pip install datasets
%pip install transformers
%pip install spacy


BELOW TAKES FOREVER!

In [None]:
%pip install torch

In [None]:
%pip install spacy-transformers
%pip install transformers[torch]
%pip install seqeval

**CHECKING VERSIONS**


In [4]:
import torch
import torchtext

SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

PyTorch Version:  2.2.1+cu121
torchtext Version:  0.17.1+cpu
Using GPU.


In [5]:
from datasets import load_dataset, load_metric
dataset = load_dataset("surrey-nlp/PLOD-CW")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.37k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1072 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/126 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/153 [00:00<?, ? examples/s]

In [23]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
def conv_label_indexes(training, valid, test ):
	label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}

	label_list = []
	for sample in training:
		label_list.append([label_encoding[tag] for tag in sample])

	val_label_list = []
	for sample in valid:
		val_label_list.append([label_encoding[tag] for tag in sample])

	test_label_list = []
	for sample in test:
		test_label_list.append([label_encoding[tag] for tag in sample])
	return label_list, val_label_list, test_label_list





In [30]:
def tokenize_and_align_labels(train_dataset, tokenizer, list_name):
    tokenized_inputs = tokenizer(train_dataset["tokens"], truncation=True, is_split_into_words=True) ## For some models, you may need to set max_length to approximately 500.

    labels = []
    for i, label in enumerate(list_name):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [19]:
# BERT's tokenizer returns the dataset in the form of a dictionary of lists (sentences).
# we have to convert it into a list of dictionaries for training.
def turn_dict_to_list_of_dict(d):
    new_list = []

    for labels, inputs in zip(d["labels"], d["input_ids"]):
        entry = {"input_ids": inputs, "labels": labels}
        new_list.append(entry)

    return new_list

In [20]:
import numpy as np

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [21]:
# Prepare the test data for evaluation in the same format as the training data
def calculate_results(trainer, data):
	predictions, labels, _ = trainer.predict(data)
	predictions = np.argmax(predictions, axis=2)

	# Remove the predictions for the [CLS] and [SEP] tokens 
	true_predictions = [
			[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
			for prediction, label in zip(predictions, labels)
	]
	true_labels = [
			[label_list[l] for (p, l) in zip(prediction, label) if l != -100]
			for prediction, label in zip(predictions, labels)
	]

	# Compute multiple metrics on the test restuls
	results = metric.compute(predictions=true_predictions, references=true_labels)
	return results

In [26]:
label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}
inverse_label_map = {v: k for k, v in label_encoding.items()}


In [27]:
training = dataset["train"]
valid = dataset["validation"]
test = dataset["test"]

training_labels = training["ner_tags"]
valid_labels = valid["ner_tags"]
test_labels = test["ner_tags"]

In [31]:
#convert label lists to indexes
label_list, val_label_list, test_label_list = conv_label_indexes(training_labels, valid_labels, test_labels)


tokenized_train = tokenize_and_align_labels(training, bert_tokenizer, label_list)
tokenized_val_datasets = tokenize_and_align_labels(valid, bert_tokenizer, val_label_list)
tokenized_test_datasets = tokenize_and_align_labels(test, bert_tokenizer, test_label_list)

tokenized_train = turn_dict_to_list_of_dict(tokenized_train)
tokenized_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenized_test = turn_dict_to_list_of_dict(tokenized_test_datasets)

from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(bert_tokenizer)


In [32]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Default optimisers
By default, the transformers Trainers use the AdamW Optimiser and is using cross-entropy for the loss function.
The learning rate change is also linear by default

In [33]:

from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 7000,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator = data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Train the model (fine tuning) with the default optimiser

In [34]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=1608, training_loss=0.16119336281249771, metrics={'train_runtime': 177.6098, 'train_samples_per_second': 36.214, 'train_steps_per_second': 9.054, 'total_flos': 278075811731808.0, 'train_loss': 0.16119336281249771, 'epoch': 6.0})

In [35]:
results = calculate_results(trainer, tokenized_test)
results



{'AC': {'precision': 0.7368421052631579,
  'recall': 0.7678244972577697,
  'f1': 0.7520143240823635,
  'number': 547},
 'LF': {'precision': 0.659942363112392,
  'recall': 0.7582781456953642,
  'f1': 0.7057010785824345,
  'number': 302},
 'O': {'precision': 0.9631717399473883,
  'recall': 0.949786918658514,
  'f1': 0.9564325030319992,
  'number': 5397},
 'overall_precision': 0.9256291072287226,
 'overall_recall': 0.9245917387127762,
 'overall_f1': 0.9251101321585904,
 'overall_accuracy': 0.92106450157871}

# Trying different optimisers

In [36]:
from torch.optim import SGD

In [37]:
SGD_Optimizer = SGD(model.parameters())


Reload the model

In [38]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model2 = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:

from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args2 = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 7000,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer2 = Trainer(
    model2,
    args2,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator = data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
    optimizers = (SGD_Optimizer, None)
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [40]:
trainer2.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=1608, training_loss=1.5025402562535224, metrics={'train_runtime': 121.6137, 'train_samples_per_second': 52.889, 'train_steps_per_second': 13.222, 'total_flos': 278075811731808.0, 'train_loss': 1.5025402562535224, 'epoch': 6.0})

In [41]:
results = calculate_results(trainer2, tokenized_test)
results



{'AC': {'precision': 0.14901960784313725,
  'recall': 0.06946983546617916,
  'f1': 0.09476309226932668,
  'number': 547},
 'LF': {'precision': 0.007751937984496124,
  'recall': 0.08609271523178808,
  'f1': 0.014223194748358862,
  'number': 302},
 'O': {'precision': 0.7790178571428571,
  'recall': 0.06466555493792848,
  'f1': 0.11941830624465356,
  'number': 5397},
 'overall_precision': 0.10179935913236382,
 'overall_recall': 0.06612231828370158,
 'overall_f1': 0.0801708240318354,
 'overall_accuracy': 0.10299203127349271}

# Performance of SGD optimizer over default (Adam)
As we can see from the above f1 scores, the SGD optimizer on its own is not a good fit for this model and does not perform well against the test set.

# Root Mean Square optimizer

In [42]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model3 = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
import torch
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)


In [44]:

from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args3 = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 7000,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer3 = Trainer(
    model3,
    args3,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator = data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
    optimizers = (optimizer, None)
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [45]:
trainer3.train()


Step,Training Loss,Validation Loss


TrainOutput(global_step=1608, training_loss=1.5025402562535224, metrics={'train_runtime': 109.7089, 'train_samples_per_second': 58.628, 'train_steps_per_second': 14.657, 'total_flos': 278075811731808.0, 'train_loss': 1.5025402562535224, 'epoch': 6.0})

In [46]:
results = calculate_results(trainer3, tokenized_test)
results



{'AC': {'precision': 0.14901960784313725,
  'recall': 0.06946983546617916,
  'f1': 0.09476309226932668,
  'number': 547},
 'LF': {'precision': 0.007751937984496124,
  'recall': 0.08609271523178808,
  'f1': 0.014223194748358862,
  'number': 302},
 'O': {'precision': 0.7790178571428571,
  'recall': 0.06466555493792848,
  'f1': 0.11941830624465356,
  'number': 5397},
 'overall_precision': 0.10179935913236382,
 'overall_recall': 0.06612231828370158,
 'overall_f1': 0.0801708240318354,
 'overall_accuracy': 0.10299203127349271}

# Results of Root Mean Square optimizer
Similar to the SGD Optimizer, the results are not great for this model. \
This is likely as a result of me not tuning hyperparameters when changing the optimizer.