# DPO Fine-Tuning: Preference-Tuned Summarizer

In [1]:
# Install dependencies (correct dev version of trl)
!pip install git+https://github.com/huggingface/trl@main
!pip install -q transformers datasets accelerate

Collecting git+https://github.com/huggingface/trl@main
  Cloning https://github.com/huggingface/trl (to revision main) to /tmp/pip-req-build-h2a6y2qp
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/trl /tmp/pip-req-build-h2a6y2qp
  Resolved https://github.com/huggingface/trl to commit d98d53983b6cf6b3381cb084c75b93d8e1ba4c52
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting datasets>=3.0.0 (from trl==0.20.0.dev0)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=3.0.0->trl==0.20.0.dev0)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl==0.20.0.dev0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x

In [None]:
# Upload the DPO dataset: a JSON file named `dpo_format.json`
# (JSON format with prompt, chosen, rejected).
from pathlib import Path

from google.colab import files

uploaded = files.upload()  # Opens the Colab file picker; select `dpo_format.json`

# Fail here with a clear message, rather than later inside `load_dataset`,
# if the expected file did not end up in the working directory.
if not Path("dpo_format.json").is_file():
    raise FileNotFoundError(
        "Expected 'dpo_format.json' in the working directory; "
        f"uploaded files: {sorted(uploaded)}"
    )

Saving dpo_format.json to dpo_format.json


In [3]:
# DPO Training Script (DistilGPT2)
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOTrainer, DPOConfig
import torch

In [None]:
# Load the tokenizer for the base checkpoint.
# `model_name` is reused by the model-loading cell, so both come from the same checkpoint.
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token as the padding token; `tokenizer.pad_token_id`
# is later passed to DPOConfig as `padding_value`.
tokenizer.pad_token = tokenizer.eos_token  # Ensure padding

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
# Load two independent copies of the same pretrained checkpoint:
# `model` is handed to DPOTrainer as the model to train,
# `ref_model` is handed to it as the reference model.
model, ref_model = (
    AutoModelForCausalLM.from_pretrained(model_name) for _ in range(2)
)

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [6]:
# Load dataset
# The uploaded JSON file is read as a single `train` split.
dataset = load_dataset("json", data_files="dpo_format.json", split="train")

# Sanity-check the data before building the trainer, so a malformed upload
# fails here with a readable message instead of deep inside DPOTrainer.
assert len(dataset) > 0, "dpo_format.json contains no examples"
missing_columns = {"chosen", "rejected"} - set(dataset.column_names)
assert not missing_columns, f"dpo_format.json is missing columns: {sorted(missing_columns)}"

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Disabled alternative: build the DPOConfig from a JSON file instead of the
# inline definition in the next cell. Left commented out on purpose; to enable
# it, `json` must be imported (it is not imported in the cells shown here) and
# `configs/dpo_config.json` must exist in the runtime.
# with open("configs/dpo_config.json", "r") as f:
#     config_dict = json.load(f)

# training_args = DPOConfig(**config_dict)

In [7]:
# Define DPO config
# NOTE(review): the logged training loss drops to ~0.001 within the first 100
# steps; confirm that learning_rate=5e-5 is not too aggressive for this data.
training_args = DPOConfig(
    output_dir="./distilgpt2-dpo-checkpoint",
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
    save_total_limit=1,  # only the latest checkpoint
    bf16=False,
    # fp16 mixed precision needs a CUDA device. Enable it only when one is
    # present so the notebook does not depend on a GPU runtime being selected.
    fp16=torch.cuda.is_available(),
    remove_unused_columns=False,
    report_to="none",
    padding_value=tokenizer.pad_token_id,
    # Some examples tokenize to more tokens than the model accepts (the
    # tokenizer warns about 1198 > 1024). Cap sequences explicitly at the
    # tokenizer's limit instead of relying on the library default.
    max_length=tokenizer.model_max_length,
)

In [8]:
# Initialize trainer
# `processing_class=tokenizer` makes the trainer tokenize with the tokenizer
# configured above (pad token set to EOS). Without it, DPOTrainer loads its
# own tokenizer from the model's checkpoint name. The trainer also saves this
# tokenizer alongside the model.
trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)

Extracting prompt in train dataset:   0%|          | 0/2871 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/2871 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2871 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1198 > 1024). Running this sequence through the model will result in indexing errors


In [9]:
# Run DPO training; keep the returned summary in a variable and echo it as
# the cell's output.
train_output = trainer.train()
train_output

Step,Training Loss
10,0.4537
20,0.0524
30,0.0225
40,0.0066
50,0.0064
60,0.0029
70,0.0112
80,0.0032
90,0.0022
100,0.001


TrainOutput(global_step=2154, training_loss=0.0029919418935477433, metrics={'train_runtime': 1067.5043, 'train_samples_per_second': 8.068, 'train_steps_per_second': 2.018, 'total_flos': 0.0, 'train_loss': 0.0029919418935477433, 'epoch': 3.0})

In [10]:
# Save final model
# The target is the same directory as `output_dir` in the DPOConfig, so the
# final model files sit next to the `checkpoint-*` folder written during training.
trainer.save_model("./distilgpt2-dpo-checkpoint")

In [11]:
# Step 1: Zip the folder
!zip -r distilgpt2-dpo-checkpoint.zip distilgpt2-dpo-checkpoint/

# Step 2: Download the zip file
from google.colab import files
files.download("distilgpt2-dpo-checkpoint.zip")

  adding: distilgpt2-dpo-checkpoint/ (stored 0%)
  adding: distilgpt2-dpo-checkpoint/vocab.json (deflated 59%)
  adding: distilgpt2-dpo-checkpoint/merges.txt (deflated 53%)
  adding: distilgpt2-dpo-checkpoint/README.md (deflated 47%)
  adding: distilgpt2-dpo-checkpoint/special_tokens_map.json (deflated 52%)
  adding: distilgpt2-dpo-checkpoint/training_args.bin (deflated 52%)
  adding: distilgpt2-dpo-checkpoint/tokenizer.json (deflated 82%)
  adding: distilgpt2-dpo-checkpoint/model.safetensors (deflated 7%)
  adding: distilgpt2-dpo-checkpoint/tokenizer_config.json (deflated 52%)
  adding: distilgpt2-dpo-checkpoint/config.json (deflated 52%)
  adding: distilgpt2-dpo-checkpoint/checkpoint-2154/ (stored 0%)
  adding: distilgpt2-dpo-checkpoint/checkpoint-2154/vocab.json (deflated 59%)
  adding: distilgpt2-dpo-checkpoint/checkpoint-2154/merges.txt (deflated 53%)
  adding: distilgpt2-dpo-checkpoint/checkpoint-2154/rng_state.pth (deflated 25%)
  adding: distilgpt2-dpo-checkpoint/checkpoint-215

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>