# NB_050124T1057_fine_tuning_full_training_with_accelerator

# 1.GOAL

Fine-tune the "bert-base-uncased" model on the GLUE MRPC dataset with a full (hand-written) training loop driven by Hugging Face Accelerate.

references:
-  [A full training hugging face article](https://huggingface.co/learn/nlp-course/chapter3/4#a-full-training)

# 2.Steps
    - tokenizer
    - data collator
    - model
    - data_loaders
    - accelerator
    - optimizer and learning rate
    - train
    - evaluation


# 3.Tools

# 4.Implementation

## 4.0.Initializing raw dataset and checkpoint

In [3]:
from datasets import load_dataset

# Name of the pretrained model on the Hugging Face Hub that will be fine-tuned.
checkpoint = "bert-base-uncased"

# MRPC (Microsoft Research Paraphrase Corpus) configuration of the GLUE benchmark.
raw_datasets = load_dataset(path="glue", name="mrpc")

## 4.1.Tokenizer

In [4]:
from transformers import AutoTokenizer

# Tokenizer that belongs to `checkpoint` ("bert-base-uncased"), so the text is
# encoded the same way as for the pretrained model.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

## 4.2.Data collator

In [5]:
from transformers import DataCollatorWithPadding

# Collator used by the dataloaders: pads the inputs of each batch on the fly
# to the longest sequence in that batch.
data_collator = DataCollatorWithPadding(tokenizer)

## 4.3.Model

In [6]:
from transformers import AutoModelForSequenceClassification

# Pretrained encoder plus a sequence-classification head with two output labels.
# The head's weights are newly initialized (see the warning printed below), which
# is why the model has to be trained before it can be used for predictions.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4.4.Data_loaders: 
- train_dataloader, 
- eval_dataloader

### 4.4.1. Make tokenized datasets

In [9]:
def tokenize_function(example):
    """Tokenize the two sentence columns of ``example`` as sentence pairs.

    ``example`` must provide the keys ``"sentence1"`` and ``"sentence2"``. Their
    values are forwarded unchanged to the notebook-level ``tokenizer`` with
    ``truncation=True``, and whatever the tokenizer returns is returned as is.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Run the tokenizer over every split. With batched=True, `tokenize_function`
# receives a batch of examples per call instead of a single example.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

Map: 100%|██████████| 408/408 [00:00<00:00, 14067.56 examples/s]


### 4.4.2.Formatting tokenized datasets

In [10]:
# Drop the raw text and index columns and rename "label" to "labels", so that a
# batch can later be passed to the model as keyword arguments (`model(**batch)`).
tokenized_datasets = tokenized_datasets.remove_columns(
    ["sentence1", "sentence2", "idx"]
).rename_column("label", "labels")

# Have the datasets hand back PyTorch tensors.
tokenized_datasets.set_format("torch")

# Show which columns are left in the training split.
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

### 4.4.3.Create dataloaders

In [11]:
from torch.utils.data import DataLoader

# Training split: shuffled, batches of 8, padded per batch by `data_collator`.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation split: same batch size and collator, original order (no shuffle).
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

## 4.5.Accelerator

In [12]:
from accelerate import Accelerator

accelerator = Accelerator()

# Let Accelerate wrap the dataloaders and the model (device placement, and
# distributed setup when more than one process is used).
#
# Only objects that already exist can be prepared here. The optimizer is first
# created in section 4.6 from the prepared model's parameters, so listing it in
# this call raised a NameError when the notebook was run top to bottom on a
# fresh kernel.
train_dl, eval_dl, model = accelerator.prepare(
    train_dataloader, eval_dataloader, model
)

## 4.6. Optimizer and learning rate

In [14]:
from transformers import AdamW, get_scheduler

# Build the optimizer from the parameters of the model prepared in section 4.5
# and hand it to the accelerator right away, so that the optimizer actually used
# for training is the one managed by Accelerate (previously this assignment
# produced a plain, unprepared optimizer).
# NOTE(review): `transformers.AdamW` is deprecated in favour of
# `torch.optim.AdamW`; consider switching (the two have different default
# `eps` / `weight_decay`, so set them explicitly when doing so).
optimizer = accelerator.prepare(AdamW(model.parameters(), lr=3e-5))

num_epochs = 3
# One optimizer/scheduler step per batch: the schedule length is the number of
# batches of the prepared training dataloader times the number of epochs.
num_training_steps = num_epochs * len(train_dl)

# Linear decay of the learning rate from 3e-5 down to 0 over the whole run,
# without warmup.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)



## 4.7.Train

In [16]:
from tqdm import tqdm


def training_function():
    """Run the fine-tuning loop over ``train_dl`` for ``num_epochs`` epochs.

    Takes no arguments and returns nothing; it works on the notebook-level
    ``model``, ``train_dl``, ``accelerator``, ``optimizer`` and ``lr_scheduler``.
    For every batch it computes the loss, back-propagates it through the
    accelerator, steps the optimizer and the scheduler, clears the gradients
    and advances the progress bar by one (``num_training_steps`` ticks in total).
    """
    progress_bar = tqdm(range(num_training_steps))

    # Switch the model to training mode once, before the first batch.
    model.train()
    for _ in range(num_epochs):
        for batch in train_dl:
            # The batch contains "labels", so the model output carries the loss.
            loss = model(**batch).loss
            # Backward pass goes through the accelerator rather than loss.backward().
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            progress_bar.update(1)

In [19]:
from accelerate import notebook_launcher

# Start `training_function` through Accelerate's notebook launcher, using a
# single process.
notebook_launcher(training_function, num_processes=1)

Launching training on one GPU.


  0%|          | 0/1377 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 1377/1377 [00:42<00:00, 32.52it/s]


## 4.8.Evaluation

In [21]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The evaluation loop moves its batches to `device`, so the model must be there too.
model.to(device)
device

device(type='cuda')

In [22]:
import evaluate

# Metric for the GLUE MRPC task; the result printed below reports accuracy and F1.
metric = evaluate.load("glue", "mrpc")

# Evaluation mode for the model, and no gradient tracking during the forward pass.
model.eval()
for batch in eval_dataloader:
    # `eval_dataloader` is the plain (unprepared) loader, so every tensor of the
    # batch is moved to the model's device by hand.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        logits = outputs.logits
        # Predicted class = index of the largest logit.
        predictions = logits.argmax(dim=-1)

    metric.add_batch(references=batch["labels"], predictions=predictions)

metric.compute()

{'accuracy': 0.8553921568627451, 'f1': 0.8977469670710572}