

# Experiment 1: Comparing BERT to ROBERTA model in both pre-trained and fine-tuned form



# Install dependencies

In [None]:
%pip install datasets
%pip install transformers
%pip install spacy
%pip install spacy-transformers
%pip install transformers[torch]
%pip install seqeval

In [None]:
!pip install datasets
!pip install seqeval

In [3]:
import torch
import torchtext

SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

PyTorch Version:  2.2.1+cu121
torchtext Version:  0.17.1+cpu
Using GPU.


In [4]:
from datasets import load_dataset, load_metric
dataset = load_dataset("surrey-nlp/PLOD-CW")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.37k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1072 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/126 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/153 [00:00<?, ? examples/s]

In [5]:
def conv_label_indexes(training, valid, test ):
	label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}

	label_list = []
	for sample in training:
		label_list.append([label_encoding[tag] for tag in sample])

	val_label_list = []
	for sample in valid:
		val_label_list.append([label_encoding[tag] for tag in sample])

	test_label_list = []
	for sample in test:
		test_label_list.append([label_encoding[tag] for tag in sample])
	return label_list, val_label_list, test_label_list





In [6]:
def tokenize_and_align_labels(dataset, tokenizer, label_list):
    tokenized_inputs = tokenizer(dataset["tokens"], truncation=True, is_split_into_words=True) ## For some models, you may need to set max_length to approximately 500.

    labels = []
    for i, label in enumerate(label_list):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [7]:
# BERT's tokenizer returns the dataset in the form of a dictionary of lists (sentences).
# we have to convert it into a list of dictionaries for training.
def turn_dict_to_list_of_dict(d):
    new_list = []

    for labels, inputs in zip(d["labels"], d["input_ids"]):
        entry = {"input_ids": inputs, "labels": labels}
        new_list.append(entry)

    return new_list

In [8]:
import numpy as np

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [9]:
# Prepare the test data for evaluation in the same format as the training data
def calculate_results(trainer, data):
	predictions, labels, _ = trainer.predict(data)
	predictions = np.argmax(predictions, axis=2)

	# Remove the predictions for the [CLS] and [SEP] tokens
	true_predictions = [
			[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
			for prediction, label in zip(predictions, labels)
	]
	true_labels = [
			[label_list[l] for (p, l) in zip(prediction, label) if l != -100]
			for prediction, label in zip(predictions, labels)
	]

	# Compute multiple metrics on the test restuls
	results = metric.compute(predictions=true_predictions, references=true_labels)
	return results

# Load BERT model
The first stage of this experiment is to view the performance of the BERT model in both its pre-trained and fine-tuned (transfer learning) form\
The BERT model uses the Encoder part of the transformer architecture, and its bidirectional nature makes it suitable for token classification

In [10]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

  _torch_pytree._register_pytree_node(


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#Pre-process data (like in other experiments)


In [11]:
training = dataset["train"]
valid = dataset["validation"]
test = dataset["test"]

training_labels = training["ner_tags"]
valid_labels = valid["ner_tags"]
test_labels = test["ner_tags"]

In [12]:
#convert label lists to indexes
label_list, val_label_list, test_label_list = conv_label_indexes(training_labels, valid_labels, test_labels)


tokenized_train = tokenize_and_align_labels(training, bert_tokenizer, label_list)
tokenized_val_datasets = tokenize_and_align_labels(valid, bert_tokenizer, val_label_list)
tokenized_test_datasets = tokenize_and_align_labels(test, bert_tokenizer, test_label_list)

tokenized_train = turn_dict_to_list_of_dict(tokenized_train)
tokenized_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenized_test = turn_dict_to_list_of_dict(tokenized_test_datasets)

from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(bert_tokenizer)


In [None]:
print(len(label_list))
print(len(valid_labels))

1072
126


# Setup trainer, don't train but use to run test set against pre-trained model

In [13]:

from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 7000,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    bert_model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator = data_collator,
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [14]:
results = calculate_results(trainer, tokenized_test)
results

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




{'0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0, 0, 2, 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0]': {'precision': 0.036281179138321996,
  'recall': 0.1797752808988764,
  'f1': 0.060377358490566045,
  'number': 267},
 '0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 1, 0, 0, 0, 0]': {'precision': 0.026068821689259645,
  'recall': 0.04664179104477612,
  'f1': 0.033444816053511704,
  'number': 536},
 '0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0]': {'precision': 0.00596252129471891,
  'recall': 0.04697986577181208,
  'f1': 0.010582010582010583,
  'number': 149},
 '1, 0, 2, 3, 3, 0]': {'precision': 0.0012919896640826874,
  'recall': 0.007751937984496124,
  'f1': 0.00221483942414175,
  'number': 129},
 'overall_precision': 0.019148936170212766,
 'overall_recall': 0.07493061979648474,
 'overall_f1': 0.030502730182639805,
 'overall_accuracy': 0.25063900165388664}

# Results of pre-trained BERT
From the above f1 score of 0.03  and overall accuracy of 25%, we can see that pre-trained BERT without fine tuning does not perform well for our token classification task

# Fine tune BERT and run test set against it

In [None]:
trainer.train()


Step,Training Loss,Validation Loss


TrainOutput(global_step=1608, training_loss=0.16453654048454702, metrics={'train_runtime': 167.6305, 'train_samples_per_second': 38.37, 'train_steps_per_second': 9.593, 'total_flos': 278780100881088.0, 'train_loss': 0.16453654048454702, 'epoch': 6.0})

In [None]:
results = calculate_results(trainer, tokenized_test)
results


{'0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0, 0, 2, 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0]': {'precision': 0.6983050847457627,
  'recall': 0.7715355805243446,
  'f1': 0.7330960854092526,
  'number': 267},
 '0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 1, 0, 0, 0, 0]': {'precision': 0.6660929432013769,
  'recall': 0.7220149253731343,
  'f1': 0.6929274843330349,
  'number': 536},
 '0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0]': {'precision': 0.7197452229299363,
  'recall': 0.7583892617449665,
  'f1': 0.738562091503268,
  'number': 149},
 '1, 0, 2, 3, 3, 0]': {'precision': 0.6643835616438356,
  'recall': 0.751937984496124,
  'f1': 0.7054545454545454,
  'number': 129},
 'overall_precision': 0.6810856658184903,
 'overall_recall': 0.7428307123034228,
 'overall_f1': 0.7106194690265487,
 'overall_accuracy': 0.9276800481130657}

# Roberta

Above we can see that fine tuned BERT performs much better with an f1 score of 0.71 and 92.7% accuracy overall.

# Run test set against model before fine tuning
We do this to gauge the performance of the models pre-trained state, i.e. testing how good it is for generalised use cases

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the RoBERTa tokenizer
rob_tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

# Load the RoBERTa model for token classification with the desired number of labels
pretrained_rob_model = AutoModelForTokenClassification.from_pretrained("roberta-base", num_labels=4)




Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#tokenize training set
label_list, val_label_list, test_label_list = conv_label_indexes(training_labels, valid_labels, test_labels)
tokenized_train = tokenize_and_align_labels(training, rob_tokenizer, label_list)
tokenized_val_datasets = tokenize_and_align_labels(valid, rob_tokenizer,  val_label_list)
tokenized_test_datasets = tokenize_and_align_labels(test, rob_tokenizer, test_label_list)

tokenized_train = turn_dict_to_list_of_dict(tokenized_train)
tokenized_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenized_test = turn_dict_to_list_of_dict(tokenized_test_datasets)

from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(rob_tokenizer)

# Pre trained ROBERTA performance

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "roberta-base"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args = TrainingArguments(
    f"ROBERTA-finedtuned-ner",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 100,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    pretrained_rob_model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator = data_collator,
    tokenizer=rob_tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)



dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [None]:
pretrain_results = calculate_results(trainer, tokenized_test)
pretrain_results


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




{'0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0, 0, 2, 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0]': {'precision': 0.028985507246376812,
  'recall': 0.1348314606741573,
  'f1': 0.047713717693836984,
  'number': 267},
 '0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 1, 0, 0, 0, 0]': {'precision': 0.014345991561181435,
  'recall': 0.03171641791044776,
  'f1': 0.019755955839628123,
  'number': 536},
 '0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0]': {'precision': 0.006644518272425249,
  'recall': 0.013422818791946308,
  'f1': 0.008888888888888889,
  'number': 149},
 '1, 0, 2, 3, 3, 0]': {'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0,
  'number': 129},
 'overall_precision': 0.015660592255125286,
 'overall_recall': 0.05087881591119334,
 'overall_f1': 0.02394948835183976,
 'overall_accuracy': 0.3278033794162826}

# Fine tune ROBERTA with training and look at result

In [None]:
trainer.train()


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.229626,0.607735,0.630372,0.618847,0.914227
200,No log,0.196862,0.64556,0.798472,0.71392,0.93028
300,No log,0.173979,0.711522,0.772684,0.740842,0.935946
400,No log,0.177296,0.735507,0.775549,0.754998,0.939093
500,0.289500,0.196222,0.734004,0.766953,0.750117,0.931224
600,0.289500,0.228715,0.694888,0.830946,0.756851,0.93091
700,0.289500,0.198644,0.74216,0.813754,0.77631,0.939251
800,0.289500,0.195292,0.75046,0.778415,0.764182,0.93752
900,0.289500,0.2179,0.751135,0.789876,0.770019,0.937834
1000,0.140300,0.20617,0.754296,0.838586,0.794211,0.943343


TrainOutput(global_step=1608, training_loss=0.16509925310884543, metrics={'train_runtime': 195.9923, 'train_samples_per_second': 32.818, 'train_steps_per_second': 8.204, 'total_flos': 270367418028384.0, 'train_loss': 0.16509925310884543, 'epoch': 6.0})

In [None]:
results = calculate_results(trainer, tokenized_test)
results

{'0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0, 0, 2, 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0]': {'precision': 0.7922535211267606,
  'recall': 0.8426966292134831,
  'f1': 0.8166969147005445,
  'number': 267},
 '0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 1, 0, 0, 0, 0]': {'precision': 0.7202797202797203,
  'recall': 0.7686567164179104,
  'f1': 0.7436823104693141,
  'number': 536},
 '0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0]': {'precision': 0.7453416149068323,
  'recall': 0.8053691275167785,
  'f1': 0.7741935483870969,
  'number': 149},
 '1, 0, 2, 3, 3, 0]': {'precision': 0.6821192052980133,
  'recall': 0.7984496124031008,
  'f1': 0.7357142857142858,
  'number': 129},
 'overall_precision': 0.7363013698630136,
 'overall_recall': 0.7955596669750231,
 'overall_f1': 0.7647843485993775,
 'overall_accuracy': 0.9382488479262673}

# Results of fine-tuned ROBERTA Model
Performance of fine tuned roberta is very similar to that of the fine-tuned BERT model.


*   BERT (fine tuned) F1 score: 0.71, total accuracy 0.93
*   ROBERTA (fine tuned) F1 score: 0.76, total accuracy 0.94
The difference is negligible, but Roberta is more computationally expensive to train / fine tune.
It is worth noting that ROBERTA pre-trained out performs pre-trained BERT with an f1 score of


*   BERT (pre-trained): 'overall_f1': 0.030502730182639805,
 'overall_accuracy': 0.25063900165388664}
*   ROBERTA (pre-trained) 'overall_f1': 0.02394948835183976,
 'overall_accuracy': 0.3278033794162826}





# Experimenting with distilBERT


In [16]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the RoBERTa tokenizer
distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", add_prefix_space=True)

# Load the RoBERTa model for token classification with the desired number of labels
distilbert_model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=4)




tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
#tokenize training set
label_list, val_label_list, test_label_list = conv_label_indexes(training_labels, valid_labels, test_labels)
tokenized_train = tokenize_and_align_labels(training, distilbert_tokenizer, label_list)
tokenized_val_datasets = tokenize_and_align_labels(valid, distilbert_tokenizer,  val_label_list)
tokenized_test_datasets = tokenize_and_align_labels(test, distilbert_tokenizer, test_label_list)

tokenized_train = turn_dict_to_list_of_dict(tokenized_train)
tokenized_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenized_test = turn_dict_to_list_of_dict(tokenized_test_datasets)

from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(distilbert_tokenizer)

In [18]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "distilbert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args = TrainingArguments(
    f"distilbert-base-uncased",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 100,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    distilbert_model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator = data_collator,
    tokenizer=distilbert_tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)



dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [19]:
results = calculate_results(trainer, tokenized_test)
results

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




{'0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0, 0, 2, 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0]': {'precision': 0.028605482717520857,
  'recall': 0.0898876404494382,
  'f1': 0.04339963833634719,
  'number': 267},
 '0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 1, 0, 0, 0, 0]': {'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0,
  'number': 536},
 '0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0]': {'precision': 0.002105263157894737,
  'recall': 0.006711409395973154,
  'f1': 0.0032051282051282055,
  'number': 149},
 '1, 0, 2, 3, 3, 0]': {'precision': 0.001549186676994578,
  'recall': 0.015503875968992248,
  'f1': 0.0028169014084507044,
  'number': 129},
 'overall_precision': 0.009189925119128658,
 'overall_recall': 0.02497687326549491,
 'overall_f1': 0.013436178153769594,
 'overall_accuracy': 0.11517065102992032}

#Fine Tune distilBERT

In [20]:
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.318679,0.446062,0.497612,0.470429,0.87708
200,No log,0.267222,0.508576,0.651385,0.571189,0.896778
300,No log,0.248551,0.582716,0.676218,0.625995,0.912811
400,No log,0.229657,0.628183,0.706781,0.665169,0.918919
500,0.336300,0.226141,0.635808,0.69532,0.664234,0.922278
600,0.336300,0.248158,0.592907,0.734479,0.656143,0.917239
700,0.336300,0.239613,0.643101,0.721108,0.679874,0.923805
800,0.336300,0.233315,0.649043,0.680038,0.664179,0.920904
900,0.336300,0.25189,0.673285,0.712512,0.692343,0.928233
1000,0.158400,0.2468,0.64315,0.748806,0.691968,0.926401




TrainOutput(global_step=1608, training_loss=0.18808375602930932, metrics={'train_runtime': 106.4152, 'train_samples_per_second': 60.443, 'train_steps_per_second': 15.111, 'total_flos': 138874977028800.0, 'train_loss': 0.18808375602930932, 'epoch': 6.0})

In [21]:
results = calculate_results(trainer, tokenized_test)
results

{'0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0, 0, 2, 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0]': {'precision': 0.6777408637873754,
  'recall': 0.7640449438202247,
  'f1': 0.7183098591549295,
  'number': 267},
 '0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 1, 0, 0, 0, 0]': {'precision': 0.6417657045840407,
  'recall': 0.7052238805970149,
  'f1': 0.6719999999999999,
  'number': 536},
 '0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0]': {'precision': 0.6848484848484848,
  'recall': 0.7583892617449665,
  'f1': 0.7197452229299361,
  'number': 149},
 '1, 0, 2, 3, 3, 0]': {'precision': 0.6308724832214765,
  'recall': 0.7286821705426356,
  'f1': 0.6762589928057553,
  'number': 129},
 'overall_precision': 0.6553156146179402,
 'overall_recall': 0.7298797409805735,
 'overall_f1': 0.6905908096280087,
 'overall_accuracy': 0.9272289881220869}