# Fine Tuning a Pretrained Model
* Previous Chapter: Use Tokenizers & Pretrained Models to make predictions
* This Chapter: Fine-tune a pretrained model on a custom dataset

In [32]:
import torch

from datasets import load_dataset
from torch.optim import AdamW
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding
)
from rich import print

In [2]:
# Two example sentences, then the checkpoint's tokenizer and classification model
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!"
]
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Encode both sentences as one padded, truncated batch of PyTorch tensors
batch = tokenizer(
    text=sequences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [15]:
# Run one optimization step on the two example sentences.
# `from_pretrained` returns the model in evaluation mode (dropout disabled),
# so switch to training mode before computing the training loss.
model.train()
batch["labels"] = torch.tensor([1, 1])
optimizer = AdamW(model.parameters())
# Passing `labels` makes the model output carry a `.loss`
loss = model(**batch).loss
loss.backward()
optimizer.step()
# Clear gradients so re-running this cell does not accumulate stale gradients
optimizer.zero_grad()

## Datasets

In [16]:
# Load the MRPC configuration of the GLUE benchmark
raw_datasets = load_dataset(path="glue", name="mrpc")
# Overview of the loaded dataset object
print("Dataset Description:", raw_datasets)
# First item of the training split
print("Training Example:", raw_datasets["train"][0])
# Feature schema of the training split
print("Training Features:", raw_datasets["train"].features)

Found cached dataset glue (/n/fs/nlp-jy1682/hf_datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

## Dataset Preprocessing

In [17]:
# Tokenize strings from DatasetDict object
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Each sentence column is tokenized on its own and kept under its own name
# (previously the second call overwrote the result of the first).
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

Quick revisit of tokens -> ids, ids -> tokens

In [18]:
# Encode two sentences together as a single pair
inputs = tokenizer(
    text="This is the first sentence.",
    text_pair="This is the second one.",
)
print(inputs)
# input_ids: integer ids, one per token of the encoded pair
# attention_mask: (0/1) whether the model should attend to the token (0 marks padding)
# token_type_ids: which sentence of the pair each token belongs to (only returned for some models)

In [19]:
# Map the ids back to token strings (the special tokens added by the tokenizer show up too)
print(tokenizer.convert_ids_to_tokens(ids=inputs["input_ids"]))

Now, let's tokenize the entire dataset

In [35]:
# - Approach 1 -
# Tokenize every training sentence pair with one direct tokenizer call.
# The result is stored as `tokenized_dataset` (singular) so the prints below
# refer to a defined name and the result of Approach 2 (`tokenized_datasets`)
# is kept separate.
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True)
print("Dataset Object Type:", type(tokenized_dataset))
print("Dataset Keys:", tokenized_dataset.keys())
# Disadvantages: returns a dictionary-like object and requires holding the
# whole tokenized dataset in RAM

In [36]:
# - Approach 2 -
def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` fields of `example` as sentence pairs.

    `example` is a dataset item (dict); it may also hold several samples at
    once, which is how `map(..., batched=True)` calls this function.
    Returns the tokenizer output (input_ids, attention_mask, token_type_ids).
    `padding` is left out on purpose because it may take a lot of time here.
    """
    return tokenizer(
        text=example["sentence1"],
        text_pair=example["sentence2"],
        truncation=True,
    )

# Run the tokenization over every split of the dataset, in batches
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

Loading cached processed dataset at /n/fs/nlp-jy1682/hf_datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-b5b0894350f13edc.arrow
Loading cached processed dataset at /n/fs/nlp-jy1682/hf_datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-604d48d6b86f2095.arrow


  0%|          | 0/2 [00:00<?, ?ba/s]

In [37]:
# Inspect the result of Approach 2 (`raw_datasets.map`), which is stored in
# `tokenized_datasets` (plural)
print("Dataset Object Type:", type(tokenized_datasets))
print("Dataset Keys:", tokenized_datasets.keys())

**Dynamic Padding**: Instead of padding every example in the dataset to one global length, pad the examples of each batch to the length of the longest element in that batch. This is done at batching time by the `collate` function.

In [38]:
# Collate function used for dynamic padding; it is given the tokenizer and is
# applied to a list/dict of tokenized samples in the cells below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [51]:
# Take the first 8 training samples for a `data_collator` trial, dropping the
# index and raw-sentence columns.
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ("idx", "sentence1", "sentence2")
}
# Length of each tokenized sample before any padding
list(map(len, samples["input_ids"]))

[50, 59, 47, 67, 59, 50, 62, 32]

In [53]:
# Let the collator pad the selected samples into a single batch
batch = data_collator(samples)
# Show the shape of every field in the collated batch
print({name: tensor.shape for name, tensor in batch.items()})