In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import Trainer
import evaluate
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
from accelerate import Accelerator

2025-05-29 12:04:58.633128: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748513098.690452    2158 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748513098.708221    2158 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748513098.843567    2158 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748513098.843605    2158 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748513098.843607    2158 computation_placer.cc:177] computation placer alr

---------

### Handling multiple sequences - Part 2

Datasets: https://huggingface.co/datasets

In [3]:
# Downloading dataset
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [5]:
# Accessing dataset
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]
raw_train_dataset.features

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

Labels are already integers, so not preprocessing needed.  
Behind the scenes, label is of type ClassLabel, and the mapping of integers to label name is stored in the names folder. 0 corresponds to not_equivalent, and 1 corresponds to equivalent.

From previous notebook, however, tokenizer returns a dictionary (a lot of RAM), so a new function is defined to allow ```batched=True``` in ```map()``` function to speed up tokenization since:
- Applies multiprocessing to go faster than applying the function on each element of the dataset.
- Saves results in cache as soon as one element is processed, so memory is not overloaded.

In [12]:
# Speeding up tokenization
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(
        example["sentence1"], example["sentence2"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=drug_dataset["train"].column_names)
tokenized_datasets

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

```padding``` is skipped because it is not efficient: it’s better to pad the samples when we’re building a batch, as then we only need to pad to the maximum length in that batch, and not the maximum length in the entire dataset.  
```num_proc``` argument allows multiprocessing; not used here because Tokenizers library already uses multiple threads to tokenize our samples faster.
```return_overflowing_tokens=True``` is useful for long documents to not drop the truncated remainder of the sequence, but rather split it into multiple input chunks for the model. It also returns additional information about the overflow.
```remove_columns``` removes the columns from the old dataset, which is necessary because the Tokenizer extends the number of columns (due to the chunks) regarding the original dataset

In [None]:
# The mismatched length problem can be also solved by making the old columns the same size as the new ones
def tokenize_function2(examples):
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Extract mapping between new and old indices
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

tokenized_datasets = raw_datasets.map(tokenize_function2, batched=True)

In [23]:
# Dynamic padding on each batch with collate function (not for TPU, which prefers all batches with same size)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
samples = tokenized_datasets["train"][:8] # let’s grab a few samples
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
print('Different lengths', [len(x) for x in samples["input_ids"]])
batch = data_collator(samples)
print('After padding', {k: v.shape for k, v in batch.items()})

Different lengths [50, 59, 47, 67, 59, 50, 62, 32]
After padding {'input_ids': torch.Size([8, 67]), 'token_type_ids': torch.Size([8, 67]), 'attention_mask': torch.Size([8, 67]), 'labels': torch.Size([8])}


--------

### Fine-tuning with Trainer class

In [14]:
# Getting configuration object that defines how your model will be trained, and the pretrained model
training_args = TrainingArguments("test-trainer", no_cuda=True) # Using CPU instead of GPU, which is not recommended, but I lack resources
checkpoint = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


About the warning: BERT has not been pretrained on classifying pairs of sentences, so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been added instead. The warnings indicate that some weights were not used (the ones corresponding to the dropped pretraining head) and that some others were randomly initialized (the ones for the new head).

In [26]:
# Defining evaluation function, trainer, and fine-tuning

def compute_metrics(eval_preds): # Input is a tuple with a predictions field and a label_ids field
    """Look text below for explanation"""
    metric = evaluate.load("glue", "mrpc") # Loads evaluation metric used for the GLUE benchmark, specifically for the MRPC task
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1) # Gives the predicted class for each row (sample
    return metric.compute(predictions=predictions, references=labels) # Dict mapping strings (metrics) to floats (values)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"].select(range(8)), # Using a fraction of the dataset, so training is fast
    eval_dataset=tokenized_datasets["validation"].select(range(8)),
    # data_collator=data_collator, # when you pass a tokenizer as the processing_class, the default data_collator used by the Trainer will be a DataCollatorWithPadding, so this line is skipped
    processing_class=tokenizer,
    compute_metrics=compute_metrics, # Optional, but necessary to report metrics at the end of each epoch
)

trainer.train()

Step,Training Loss


TrainOutput(global_step=3, training_loss=0.5873920520146688, metrics={'train_runtime': 30.1609, 'train_samples_per_second': 0.796, 'train_steps_per_second': 0.099, 'total_flos': 826333158240.0, 'train_loss': 0.5873920520146688, 'epoch': 3.0})

-------------

### Fine-tuning without Trainer class

In [27]:
# Postprocessing tokenized_datasets to take care of some things that the Trainer did automatically
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [51]:
# Defining dataloaders and inspecting them
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    tokenized_datasets["train"].select(range(8)), shuffle=True, batch_size=4, collate_fn=data_collator # batch_size <= dataset size
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"].select(range(8)), batch_size=4, collate_fn=data_collator
)

for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([4]),
 'input_ids': torch.Size([4, 67]),
 'token_type_ids': torch.Size([4, 67]),
 'attention_mask': torch.Size([4, 67])}

Shapes will probably be slightly different since ```shuffle=True``` and padding to the maximum length inside the batch.

In [52]:
# Checking if everything is alright
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.4203, grad_fn=<NllLossBackward0>) torch.Size([4, 2])


Yes! Loss and logits (two for each input in our batch, so a tensor of size 8 x 2) obtained.

In [53]:
# Defining an optimizer and a learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler( # learning rate scheduler used is a linear decay from the maximum value (5e-5) to 0
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [49]:
# Setting CPU or GPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cpu')

In [54]:
# Training finally!
progress_bar = tqdm(range(num_training_steps))

model.train() # Sets the model to training mode
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()} # Moves the batch (a dict of inputs, attention masks, labels, etc.) to the GPU or CPU.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward() # Computes gradients (how much each parameter contributed to the loss)
        optimizer.step() # Updates the model’s weights using the gradients: model parameters are tensors stored in the model, to which the Optimizer holds references
        lr_scheduler.step() # Adjusts the learning rate based on the current step (optional but improves training)
        optimizer.zero_grad() # Clears the gradients from the previous step (essential to avoid accumulation)
        progress_bar.update(1)

  0%|          | 0/6 [00:00<?, ?it/s]

In [55]:
# Evaluating
metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad(): # No gradients are computed, which speeds up evaluation
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.375, 'f1': 0.5454545454545454}

-----------

### Fine-tuning with Accelerate library

In [None]:
# Enabling distributed training on multiple GPUs or TPUs. Lines with # are then not needed
accelerator = Accelerator() # New!

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)

train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(train_dataloader, eval_dataloader, model, optimizer) # New!

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
      # batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      loss = outputs.loss
      # loss.backward()
      accelerator.backward(loss) # New!
      optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()
      progress_bar.update(1)

In order to benefit from the speed-up offered by Cloud TPUs, samples should be padded to a fixed length with the ```padding="max_length"``` and ```max_length``` arguments of the tokenizer.

- Previous code in a train.py make it runnable in any distributed setup.
- ```accelerate config``` command creates a configuration file after asking some questions, so later ```accelerate launch train.py``` command can be launched.
- This can be run in a Notebook using ```from accelerate import notebook_launcher``` and ```notebook_launcher(training_function)```