# Fine-tuning a model with the Trainer API or Keras

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece] accelerate>=0.20.1

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
# Checkpoint name shared by the tokenizer and, later, the model
checkpoint = "bert-base-uncased"
# Fetch the "mrpc" configuration of the GLUE dataset
raw_datasets = load_dataset("glue", "mrpc")
# Instantiate the tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the sentence pair(s) held in ``example``.

    Reads ``example["sentence1"]`` and ``example["sentence2"]`` and passes them
    together to the notebook-level ``tokenizer`` with ``truncation=True``.
    Returns whatever the tokenizer call returns.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Collator for dynamic padding, built from the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Data preprocessing: apply tokenize_function to the datasets in batches
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

**Training**

1. `TrainingArguments` – the class that contains all the hyperparameters the `Trainer` will use for training and evaluation.
2. `Trainer` – the class that handles the training.


In [3]:
from transformers import TrainingArguments

# Only the output directory is given; every other hyperparameter keeps its default
training_args = TrainingArguments(output_dir="test-trainer")

In [4]:
# Display the repr of training_args (the full list of hyperparameters) as the cell output
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=au

BERT has not been pretrained on classifying pairs of sentences, so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been added instead. The warnings printed when the model is loaded below indicate that some weights were not used (the ones corresponding to the dropped pretraining head) and that some others were randomly initialized (the ones for the new head).

In [5]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a sequence-classification head configured for two labels
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import Trainer

# Bundle the model, arguments, datasets, collator and tokenizer into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [7]:
# Run training; the returned TrainOutput is shown as the cell result
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.5183
1000,0.2829


TrainOutput(global_step=1377, training_loss=0.32767084602350416, metrics={'train_runtime': 205.8842, 'train_samples_per_second': 53.448, 'train_steps_per_second': 6.688, 'total_flos': 405324636337200.0, 'train_loss': 0.32767084602350416, 'epoch': 3.0})

In [None]:
# Run the trainer's model over the validation split and keep the result for inspection
predictions = trainer.predict(tokenized_datasets["validation"])


In [10]:
# trainer.predict returns a PredictionOutput object; inspect its type,
# the type of its `predictions` field, and the shapes of the prediction/label arrays
print(type(predictions))
print(type(predictions.predictions))
print(predictions.predictions.shape, predictions.label_ids.shape)

<class 'transformers.trainer_utils.PredictionOutput'>
<class 'numpy.ndarray'>
(408, 2) (408,)


In [11]:
import numpy as np
# predictions.predictions holds logits; the index of the largest logit
# along the last axis is the predicted class
preds = predictions.predictions.argmax(axis=-1)

**Evaluate** library

**F1 score** – the harmonic mean of precision and recall.

In [12]:
import evaluate

# Metric object for the GLUE "mrpc" configuration
metric = evaluate.load("glue", "mrpc")
# Compare the argmax predictions with the reference labels; the resulting dict is the cell output
metric.compute(references=predictions.label_ids, predictions=preds)

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.8651960784313726, 'f1': 0.9056603773584906}

In [13]:
def compute_metrics(eval_preds):
    """Score evaluation predictions with the GLUE "mrpc" metric.

    ``eval_preds`` is unpacked into ``(logits, labels)``. The predicted class is
    the argmax of the logits over the last axis. Returns the result of the
    metric's ``compute`` call.
    """
    mrpc_metric = evaluate.load("glue", "mrpc")
    scores, gold_labels = eval_preds
    return mrpc_metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=gold_labels,
    )

In [14]:
# Same output directory as before, now with evaluation_strategy="epoch"
training_args = TrainingArguments(output_dir="test-trainer", evaluation_strategy="epoch")
# Reload the model from the checkpoint rather than reusing the one trained above
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Trainer configured as before, plus compute_metrics for the evaluation passes
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Train again; with compute_metrics set, the per-epoch table also reports accuracy and F1
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.409875,0.838235,0.884211
2,0.544000,0.581461,0.82598,0.884553
3,0.340600,0.548044,0.855392,0.895944


TrainOutput(global_step=1377, training_loss=0.3781580706827002, metrics={'train_runtime': 212.0301, 'train_samples_per_second': 51.898, 'train_steps_per_second': 6.494, 'total_flos': 405540469624800.0, 'train_loss': 0.3781580706827002, 'epoch': 3.0})

In [16]:
# Predict on the validation split with the retrained model (rebinds `predictions`)
predictions = trainer.predict(tokenized_datasets["validation"])


In [19]:
# Display the metrics dict attached to the prediction output
predictions.metrics

{'test_loss': 0.5480440855026245,
 'test_accuracy': 0.8553921568627451,
 'test_f1': 0.8959435626102292,
 'test_runtime': 3.8964,
 'test_samples_per_second': 104.711,
 'test_steps_per_second': 13.089}

# Exercise
Fine-tune a model on the GLUE SST-2 dataset, using the data processing you did in section 2.

Steps
1. Load tokenizer and dataset
2. Data Preprocessing
3. Prepare evaluation metrics
4. Prepare trainer and trainer arguments
5. Train and evaluate

In [20]:
from datasets import load_dataset

# Same checkpoint name as in the MRPC part of the notebook
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Fetch the "sst2" configuration of the GLUE dataset
sst2_datasets = load_dataset("glue", "sst2")

Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [21]:
def tokenize_function(example):
    """Tokenize the ``sentence`` field of ``example``.

    Passes ``example["sentence"]`` to the notebook-level ``tokenizer`` with
    ``truncation=True`` and ``return_token_type_ids=False``, and returns the
    tokenizer's result.

    Note: this shadows the sentence-pair ``tokenize_function`` defined earlier
    in the notebook.
    """
    sentences = example["sentence"]
    return tokenizer(sentences, truncation=True, return_token_type_ids=False)

# Apply tokenize_function to the SST-2 datasets in batches.
# Note: this rebinds `tokenized_datasets`, which previously held the tokenized MRPC data.
tokenized_datasets = sst2_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [22]:
# Rebuild the padding collator from the current tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [24]:

def sst2_compute_metrics(eval_preds):
    """Score evaluation predictions with the GLUE "sst2" metric.

    ``eval_preds`` is unpacked into ``(logits, labels)``. The predicted class is
    the argmax of the logits over the last axis. Returns the result of the
    metric's ``compute`` call.
    """
    glue_sst2 = evaluate.load("glue", "sst2")
    scores, gold_labels = eval_preds
    return glue_sst2.compute(
        predictions=np.argmax(scores, axis=-1),
        references=gold_labels,
    )


In [25]:
# Separate output directory for the SST-2 run, with evaluation_strategy="epoch" and save_steps=5000
training_args = TrainingArguments(
    output_dir="sst2-test-trainer",
    evaluation_strategy="epoch",
    save_steps=5000,
)
# Load the model from the checkpoint with a two-label classification head
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Trainer for SST-2: tokenized train/validation splits plus the SST-2 metric function
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=sst2_compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# NOTE(review): the original note here was cut off ("On stopping and running again, it
# will continue from the pre..."); presumably it meant that re-running this cell after an
# interruption continues from the previous state. As written, train() is called without
# a resume_from_checkpoint argument — confirm whether checkpoint resumption was intended.
trainer.train()

In [43]:
# Predict on the SST-2 validation split (rebinds `predictions`)
predictions = trainer.predict(tokenized_datasets["validation"])


Epoch,Training Loss,Validation Loss


In [44]:
# Display the metrics dict attached to the SST-2 prediction output
predictions.metrics

{'test_loss': 0.6699222326278687,
 'test_accuracy': 0.8715596330275229,
 'test_runtime': 8.2075,
 'test_samples_per_second': 106.244,
 'test_steps_per_second': 13.28}