

# Experiment #?: Comparing BERT to ROBERTA model with regular dataset and possibly larger version



# Install dependencies

In [None]:
%pip install datasets
%pip install transformers
%pip install spacy
%pip install spacy-transformers
%pip install transformers[torch]
%pip install seqeval

In [52]:
import torch
import torchtext

SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

PyTorch Version:  2.2.1+cu121
torchtext Version:  0.17.1+cpu
Using GPU.


In [53]:
from datasets import load_dataset, load_metric
dataset = load_dataset("surrey-nlp/PLOD-CW")

In [54]:
def conv_label_indexes(training, valid, test ):
	label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}

	label_list = []
	for sample in training:
		label_list.append([label_encoding[tag] for tag in sample])

	val_label_list = []
	for sample in valid:
		val_label_list.append([label_encoding[tag] for tag in sample])

	test_label_list = []
	for sample in test:
		test_label_list.append([label_encoding[tag] for tag in sample])
	return label_list, val_label_list, test_label_list





In [55]:
def tokenize_and_align_labels(tokens, tokenizer, list_name):
    tokenized_inputs = tokenizer(tokens, truncation=True, is_split_into_words=True) ## For some models, you may need to set max_length to approximately 500.

    labels = []
    for i, label in enumerate(list_name):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [56]:
# BERT's tokenizer returns the dataset in the form of a dictionary of lists (sentences).
# we have to convert it into a list of dictionaries for training.
def turn_dict_to_list_of_dict(d):
    new_list = []

    for labels, inputs in zip(d["labels"], d["input_ids"]):
        entry = {"input_ids": inputs, "labels": labels}
        new_list.append(entry)

    return new_list

In [57]:
import numpy as np

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [59]:
import numpy as np
label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}
inverse_label_map = {v: k for k, v in label_encoding.items()}

def calculate_results(trainer, tokenised_test):

    predictions, labels, _ = trainer.predict(tokenised_test)
    predictions = np.argmax(predictions, axis=2)

    textual_true_predictions = [
        [inverse_label_map[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    textual_true_labels = [
        [inverse_label_map[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=textual_true_predictions, references=textual_true_labels)

    return results



#Pre-process data (like in other experiments)


In [60]:
training = dataset["train"]
valid = dataset["validation"]
test = dataset["test"]
training_tokens = training["tokens"]
training_labels = training["ner_tags"]
valid_tokens = valid["tokens"]
valid_labels = valid["ner_tags"]
test_tokens = test["tokens"]
test_labels = test["ner_tags"]

In [61]:
label_list, val_label_list, test_label_list = conv_label_indexes(training_labels, valid_labels, test_labels)


In [62]:
#tokenize training set
tokenized_train = tokenize_and_align_labels(training_tokens, rob_tokenizer, label_list)
tokenized_val_datasets = tokenize_and_align_labels(valid_tokens, rob_tokenizer,  val_label_list)
tokenized_test_datasets = tokenize_and_align_labels(test_tokens, rob_tokenizer, test_label_list)

tokenised_train = turn_dict_to_list_of_dict(tokenized_train)
tokenised_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenised_test = turn_dict_to_list_of_dict(tokenized_test_datasets)

from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(rob_tokenizer)

In [63]:
len(tokenised_train)

1072

# Run test set against model before fine tuning
We do this to gauge the performance of the models pre-trained state, i.e. testing how good it is for generalised use cases

In [64]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the RoBERTa tokenizer
rob_tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

# Load the RoBERTa model for token classification with the desired number of labels
pretrained_rob_model = AutoModelForTokenClassification.from_pretrained("roberta-base", num_labels=4)




Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "roberta-base"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args = TrainingArguments(
    f"ROBERTA-finedtuned-ner",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 100,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    pretrained_rob_model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator = data_collator,
    tokenizer=rob_tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

pretrain_results = calculate_results(trainer, tokenised_test)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




In [66]:
pretrain_results

{'AC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 559},
 'LF': {'precision': 0.04336043360433604,
  'recall': 0.05517241379310345,
  'f1': 0.048558421851289835,
  'number': 290},
 'O': {'precision': 0.807876882343207,
  'recall': 0.925673113386424,
  'f1': 0.8627728196518513,
  'number': 5274},
 'overall_precision': 0.7620974015870546,
 'overall_recall': 0.7999346725461375,
 'overall_f1': 0.7805577689243028,
 'overall_accuracy': 0.7554531490015362}

# Load ROBERTA Model
Up to this point, all of my experiments have been done with the BERT model. I now want to compare the performance of BERT against roBERTA (both pre-trained and fine-tuned)

In [67]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the RoBERTa tokenizer
rob_tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

# Load the RoBERTa model for token classification with the desired number of labels
rob_model = AutoModelForTokenClassification.from_pretrained("roberta-base", num_labels=4)


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "roberta-base"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args = TrainingArguments(
    f"ROBERTA-finedtuned-ner",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 100,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    rob_model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator = data_collator,
    tokenizer=rob_tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [69]:
 trainer.train()
#  results = compute_results(trainer, tokenised_test)
#  results

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.241215,0.596296,0.615091,0.605548,0.911394
200,No log,0.189206,0.66804,0.774594,0.717382,0.929021
300,No log,0.198662,0.732255,0.778415,0.75463,0.940038
400,No log,0.178167,0.732507,0.789876,0.76011,0.938149
500,0.285700,0.174491,0.75045,0.795606,0.772369,0.941926
600,0.285700,0.186773,0.729515,0.790831,0.758937,0.939093
700,0.285700,0.205523,0.757402,0.757402,0.757402,0.937362
800,0.285700,0.202537,0.737295,0.817574,0.775362,0.940825
900,0.285700,0.194402,0.761046,0.806113,0.782931,0.942556
1000,0.130900,0.190822,0.75426,0.803247,0.777983,0.942398




TrainOutput(global_step=1608, training_loss=0.15977293964642197, metrics={'train_runtime': 310.5928, 'train_samples_per_second': 20.709, 'train_steps_per_second': 5.177, 'total_flos': 270589932571200.0, 'train_loss': 0.15977293964642197, 'epoch': 6.0})

In [71]:
 results = calculate_results(trainer, tokenised_test)
 results



{'AC': {'precision': 0.8180272108843537,
  'recall': 0.8604651162790697,
  'f1': 0.8387096774193549,
  'number': 559},
 'LF': {'precision': 0.7319277108433735,
  'recall': 0.8379310344827586,
  'f1': 0.7813504823151125,
  'number': 290},
 'O': {'precision': 0.9757564003103181,
  'recall': 0.9539249146757679,
  'f1': 0.9647171620325983,
  'number': 5274},
 'overall_precision': 0.9471691902567478,
 'overall_recall': 0.9398987424465132,
 'overall_f1': 0.9435199606525125,
 'overall_accuracy': 0.9385560675883257}