

# Experiment #?: Comparing BERT to RoBERTa on the regular dataset (and possibly a larger version)



# Install dependencies

In [None]:
# Install the third-party packages this notebook imports or relies on.
# NOTE(review): no versions are pinned, so a fresh install may pull releases that differ
# from the ones this notebook was originally run with — consider pinning.
%pip install datasets
%pip install transformers
%pip install spacy
%pip install spacy-transformers
# NOTE(review): `transformers` is already installed above; this line additionally
# requests its `torch` extra.
%pip install transformers[torch]
%pip install seqeval

In [1]:
import torch
import torchtext

# Reproducibility: seed PyTorch's RNG and set cuDNN's `deterministic` flag.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Report the library versions and the selected device.
print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if DEVICE.type == 'cuda' else 'CPU'}.")

  from .autonotebook import tqdm as notebook_tqdm


PyTorch Version:  2.2.0+cpu
torchtext Version:  0.16.2+cpu
Using CPU.


In [2]:
from datasets import load_dataset, load_metric
# Load the dataset identified as "surrey-nlp/PLOD-CW". Later cells read its "train",
# "validation" and "test" splits and the "tokens" / "ner_tags" columns of each.
dataset = load_dataset("surrey-nlp/PLOD-CW")

In [3]:
def conv_label_indexes(training, valid, test):
    """Encode the string tags of the three dataset splits as integer ids.

    Args:
        training: Iterable of tag sequences for the training split.
        valid: Iterable of tag sequences for the validation split.
        test: Iterable of tag sequences for the test split.

    Returns:
        A 3-tuple ``(train_ids, valid_ids, test_ids)``. Each element is a list
        holding one list of integer ids per input sequence, in input order.

    Raises:
        KeyError: If a tag is not one of "B-O", "B-AC", "B-LF" or "I-LF".
    """
    tag_to_id = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}

    def encode_split(split):
        # One inner list of ids per tag sequence of the split.
        return [[tag_to_id[tag] for tag in sequence] for sequence in split]

    return encode_split(training), encode_split(valid), encode_split(test)





In [4]:
def tokenize_and_align_labels(tokens, tokenizer, list_name, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level labels to the produced tokens.

    Args:
        tokens: Batch of sentences, each already split into words.
        tokenizer: Callable invoked as
            ``tokenizer(tokens, truncation=True, is_split_into_words=True)``. Its
            result must provide ``word_ids(batch_index=i)`` and support item
            assignment.
        list_name: Per-sentence label sequences, parallel to ``tokens``; entry
            ``list_name[i][w]`` is the label of word ``w`` in sentence ``i``.
        label_all_tokens: When True (the default, which matches the previous
            behaviour of this function) every token of a word receives the
            word's label. When False only the first token of each word is
            labelled and the remaining tokens of that word get -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of label
        ids per sentence, aligned token-by-token with the tokenized input.
    """
    ignore_index = -100

    ## For some models, you may need to set max_length to approximately 500.
    tokenized_inputs = tokenizer(tokens, truncation=True, is_split_into_words=True)

    labels = []
    for i, word_labels in enumerate(list_name):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens have a word id of None. They get -100 so they are
                # automatically ignored in the loss function.
                label_ids.append(ignore_index)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First token of a word, or a continuation token while every token
                # is being labelled: use the word's own label.
                label_ids.append(word_labels[word_idx])
            else:
                # Continuation token of a word when only first tokens are labelled.
                label_ids.append(ignore_index)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [5]:
# BERT's tokenizer returns the dataset in the form of a dictionary of lists (sentences).
# we have to convert it into a list of dictionaries for training.
def turn_dict_to_list_of_dict(d):
    """Convert a columnar encoding into one record per sentence.

    Args:
        d: Mapping with parallel ``"labels"`` and ``"input_ids"`` sequences.

    Returns:
        A list of ``{"input_ids": ..., "labels": ...}`` dicts, one per paired
        entry. Only these two keys are carried over; pairing stops at the
        shorter of the two sequences.
    """
    return [
        {"input_ids": sentence_ids, "labels": sentence_labels}
        for sentence_labels, sentence_ids in zip(d["labels"], d["input_ids"])
    ]

In [6]:
import numpy as np

metric = load_metric("seqeval")
def compute_metrics(p):
    """Compute overall seqeval scores for a batch of token-classification predictions.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores with the classes on axis 2; ``labels`` holds the
            reference ids, with -100 marking positions to ignore.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, taken from the metric's overall results.
    """
    # Position i is the tag name for class id i. The table lives in the function
    # on purpose: elsewhere in this notebook the global name `label_list` is bound
    # to the encoded training labels (lists of ints), so indexing it here would
    # not yield tag names.
    id_to_label = ("B-O", "B-AC", "B-LF", "I-LF")

    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        # Drop ignored positions (reference id -100, e.g. special tokens).
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(prediction, label)
            if gold_id != -100
        ]
        true_predictions.append([id_to_label[pred_id] for pred_id, _ in kept])
        true_labels.append([id_to_label[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [7]:
import numpy as np
# Lookup tables between tag names and integer ids; the inverse one decodes ids to names.
label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}
inverse_label_map = dict((tag_id, tag) for tag, tag_id in label_encoding.items())


def calculate_results(trainer, tokenised_test):
    """Run ``trainer.predict`` on a tokenised set and score the decoded tags.

    Args:
        trainer: Object whose ``predict(tokenised_test)`` returns a 3-tuple; the
            first item holds per-token class scores (classes on axis 2) and the
            second the reference ids, with -100 marking positions to ignore.
        tokenised_test: The data handed to ``trainer.predict``.

    Returns:
        Whatever ``metric.compute`` returns for the decoded predictions and
        references.
    """
    logits, gold_ids, _ = trainer.predict(tokenised_test)
    predicted_ids = np.argmax(logits, axis=2)

    textual_true_predictions = []
    textual_true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        predicted_tags = []
        gold_tags = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions whose reference id is -100 are excluded from scoring.
            if gold_id == -100:
                continue
            predicted_tags.append(inverse_label_map[predicted_id])
            gold_tags.append(inverse_label_map[gold_id])
        textual_true_predictions.append(predicted_tags)
        textual_true_labels.append(gold_tags)

    return metric.compute(
        predictions=textual_true_predictions,
        references=textual_true_labels,
    )



# Load RoBERTa Model
Up to this point, all of my experiments have been done with the BERT model. I now want to compare the performance of BERT against RoBERTa (both pre-trained and fine-tuned).

In [30]:
#load roberta model
from transformers import RobertaConfig, RobertaModel

# Build a RoBERTa configuration using the constructor's default arguments.
rob_configuration = RobertaConfig()

# Instantiate a model from that configuration. The weights are random here:
# this call does not load a pre-trained checkpoint.
# NOTE(review): `rob_model` is not used by the cells visible in this part of the
# notebook; if pre-trained weights are intended, presumably a `from_pretrained`
# load is needed instead — confirm against the later cells.
rob_model = RobertaModel(rob_configuration)

# Read the configuration back from the model instance.
configuration = rob_model.config


In [35]:
from transformers import RobertaTokenizerFast

# Fast tokenizer loaded from the "roberta-base" checkpoint name.
# add_prefix_space=True is set because sentences are passed in already split into
# words (tokenize_and_align_labels calls the tokenizer with is_split_into_words=True).
# NOTE(review): the Hugging Face docs describe this flag as required in that case —
# confirm for the installed version.
rob_tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)


# Pre-process data (as in the other experiments)


In [36]:
# Pull the three splits out of the loaded dataset.
training, valid, test = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# For every split keep the word sequences ("tokens") and their tags ("ner_tags").
training_tokens, training_labels = training["tokens"], training["ner_tags"]
valid_tokens, valid_labels = valid["tokens"], valid["ner_tags"]
test_tokens, test_labels = test["tokens"], test["ner_tags"]

In [37]:
# Encode the string tags of every split as integer ids.
# NOTE: `label_list` holds the encoded *training* labels (one list of ids per
# sentence), not the list of tag names.
label_list, val_label_list, test_label_list = conv_label_indexes(training_labels, valid_labels, test_labels)


In [38]:
# Tokenize the training sentences with the RoBERTa tokenizer and attach the
# label ids aligned to the produced tokens.
tokenized_train = tokenize_and_align_labels(training_tokens, rob_tokenizer, label_list)