# **Example Fine-tuning on MRPC Dataset**

In [1]:
# Install required libraries
!pip install transformers
!pip install datasets
!pip install evaluate

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [2]:
# Hugging Face Libraries
import transformers
import evaluate

# Standard Libraries
import numpy as np

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load the MRPC task of the GLUE benchmark (sentence pairs in "sentence1" / "sentence2").
# The download provides train, validation and test splits.
raw_datasets = load_dataset("glue", "mrpc")

# Initialize tokenizer --- use distilbert here since there is limited GPU memory on colab
# `checkpoint` is reused later so that the model is loaded from the same pretrained weights.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Define a tokenization function
def tokenize_function(example):
    """Tokenize the two sentences of each example as a single paired input.

    Args:
        example: A mapping with "sentence1" and "sentence2" entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the sentence pair,
        with truncation enabled and no padding requested here.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)

# Tokenize the raw data; batched=True hands tokenize_function batches of examples instead of one at a time
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Initialize a data collator with our tokenizer so that we can dynamically pad by batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [4]:
#Here we define a compute metrics function that will be used to assess the model performance during training. For MRPC we use accuracy and F1-score.
def compute_metrics(eval_preds):
    """Score a set of predictions with the GLUE MRPC metric.

    Args:
        eval_preds: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The output of the metric's ``compute`` call, comparing the
        highest-scoring class of each logit row against the reference labels.
    """
    mrpc_metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    return mrpc_metric.compute(predictions=predicted_classes, references=labels)

In [5]:
# Initialize training arguments for the Trainer API. The only required argument is the
# directory where checkpoints are saved; the defaults work well for basic fine-tuning.
from transformers import TrainingArguments

# eval_strategy="epoch" computes our metrics on the validation set after each epoch.
# NOTE: this argument was called `evaluation_strategy` before transformers 4.41; the old
# name is deprecated and rejected by newer releases, so use `eval_strategy`
# (requires transformers>=4.41).
training_args = TrainingArguments("test-trainer", eval_strategy="epoch")

# Instantiate the model with a fresh 2-label classification head on top of the
# pretrained checkpoint.
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Note: We get a warning after instantiating this pretrained model. This is because DistilBERT has not been pretrained on classifying pairs of sentences (the task we are attempting here), so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been added instead. The warning indicates that some weights were randomly initialized (the ones for the new head, i.e. the `pre_classifier` and `classifier` layers); the weights of the dropped pretraining head are simply not used. It concludes by encouraging you to train the model, which is exactly what we are going to do now.

In [7]:
from transformers import Trainer

# Bundle the model, training arguments, tokenized splits, padding collator and
# metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; as the last expression of the cell, the training summary is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.392367,0.828431,0.87931
2,0.512700,0.433396,0.852941,0.896907
3,0.316000,0.747794,0.845588,0.893401


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

TrainOutput(global_step=1377, training_loss=0.3464753754171844, metrics={'train_runtime': 114.7102, 'train_samples_per_second': 95.929, 'train_steps_per_second': 12.004, 'total_flos': 203961502572816.0, 'train_loss': 0.3464753754171844, 'epoch': 3.0})

In [14]:
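# Zip the checkpoint saved at step 1000 of the training run (optional, for saving locally).
# Note: no best-model selection is configured above, so this is not necessarily the best-scoring checkpoint.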
!zip -r /content/distilbert-finetuning-mrpc.zip /content/test-trainer/checkpoint-1000

  adding: content/test-trainer/checkpoint-1000/ (stored 0%)
  adding: content/test-trainer/checkpoint-1000/config.json (deflated 46%)
  adding: content/test-trainer/checkpoint-1000/scheduler.pt (deflated 55%)
  adding: content/test-trainer/checkpoint-1000/special_tokens_map.json (deflated 42%)
  adding: content/test-trainer/checkpoint-1000/trainer_state.json (deflated 61%)
  adding: content/test-trainer/checkpoint-1000/training_args.bin (deflated 51%)
  adding: content/test-trainer/checkpoint-1000/rng_state.pth (deflated 25%)
  adding: content/test-trainer/checkpoint-1000/vocab.txt (deflated 53%)
  adding: content/test-trainer/checkpoint-1000/model.safetensors (deflated 8%)
  adding: content/test-trainer/checkpoint-1000/optimizer.pt (deflated 28%)
  adding: content/test-trainer/checkpoint-1000/tokenizer_config.json (deflated 76%)
  adding: content/test-trainer/checkpoint-1000/tokenizer.json (deflated 71%)


In [None]:
# Download zip file locally (optional for saving locally)
# This relies on the google.colab module, so run it from a Google Colab session,
# after the zip archive has been created at the path below.
from google.colab import files
files.download("/content/distilbert-finetuning-mrpc.zip")