# GRPO Fine-Tuning: Preference-Tuned Summarizer

In [None]:
# Install required libraries
!pip install git+https://github.com/huggingface/trl@main
!pip install -q trl accelerate datasets evaluate transformers rouge_score

Collecting git+https://github.com/huggingface/trl@main
  Cloning https://github.com/huggingface/trl (to revision main) to /tmp/pip-req-build-swfw_scu
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/trl /tmp/pip-req-build-swfw_scu
  Resolved https://github.com/huggingface/trl to commit 68db24e01051d9990cfb93bf7c8abc896462513a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting datasets>=3.0.0 (from trl==0.20.0.dev0)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=3.0.0->trl==0.20.0.dev0)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl==0.20.0.dev0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x

In [None]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOTrainer, GRPOConfig
import evaluate

In [None]:
# Load dataset (uploaded JSON format)
import json
from google.colab import files

uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload result only shows up as an opaque
    # IndexError/StopIteration when the first file name is looked up.
    raise ValueError("No file was uploaded; expected one JSON file of preference records.")

# Only the first uploaded file is used. Read it through a context manager so
# the file handle is closed deterministically.
uploaded_name = next(iter(uploaded))
with open(uploaded_name, encoding="utf-8") as json_file:
    raw_data = json.load(json_file)

# Add a 'reference' field (a copy of 'chosen') intended for ROUGE computation
for item in raw_data:
    item["reference"] = item["chosen"]

Saving dpo_format.json to dpo_format.json


In [None]:
# Load base language model
# Prefer the GPU when one is visible to torch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model_name = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
def is_short_enough(sample):
    """Tell whether a sample's prompt plus chosen text fits in 1024 tokens.

    Args:
        sample: Mapping with string values under "prompt" and "chosen".

    Returns:
        bool: True when the concatenated text encodes to at most 1024 token
        ids with the notebook-level ``tokenizer``, False otherwise.
    """
    combined_text = sample["prompt"] + sample["chosen"]
    token_count = len(tokenizer.encode(combined_text))
    return not token_count > 1024

In [None]:
# Convert to HF dataset
# Build the dataset from the in-memory list of records, then keep only the
# rows that pass the is_short_enough length check.
dataset = Dataset.from_list(raw_data)
train_dataset = dataset.filter(function=is_short_enough)

Filter:   0%|          | 0/2871 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1253 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# Load Hugging Face ROUGE scorer
rouge = evaluate.load("rouge")


# Define reward function using ROUGE-L
def rouge_reward_func(prompts, completions, completion_ids=None, **kwargs):
    """Reward each completion with its ROUGE-L F-measure against a reference text.

    Args:
        prompts: Prompts the completions were generated from, aligned
            one-to-one with ``completions``.
        completions: Generated texts to score.
        completion_ids: Unused; accepted so the trainer may pass it.
        **kwargs: Extra per-sample columns forwarded by the trainer. When a
            ``reference`` entry is present (one reference text per
            completion) it is the ROUGE target; otherwise the prompts are
            used, which was this function's previous behaviour.

    Returns:
        list[float]: One ROUGE-L score per completion, in input order. A plain
        list of floats is the return type TRL documents for custom reward
        functions.
    """
    # Score against the reference summary rather than the prompt: overlap with
    # the prompt rewards copying the input instead of summarizing it.
    references = kwargs.get("reference")
    if references is None:
        references = prompts

    if not completions:
        # Nothing to score; also avoids indexing into an empty result.
        return []

    # One batched call with the aggregator disabled returns a per-sample list
    # of F-measures and skips the bootstrap resampling that a separate
    # aggregated call per sample would run.
    per_sample = rouge.compute(
        predictions=list(completions),
        references=list(references),
        rouge_types=["rougeL"],
        use_aggregator=False,
    )
    return [float(score) for score in per_sample["rougeL"]]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Define GRPO Config
grpo_config = GRPOConfig(
    output_dir="./models/distilgpt2-grpo-checkpoint",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    warmup_steps=50,
    weight_decay=0.01,
    num_train_epochs=1,
    bf16=False,
    # Request fp16 mixed precision only when a CUDA GPU is present. The rest of
    # the notebook falls back to CPU when there is none, and a hard-coded
    # fp16=True would not follow that fallback.
    fp16=torch.cuda.is_available(),
    save_strategy="epoch",
    logging_steps=10,
    report_to=["tensorboard"],
    beta=0.1,
)

In [None]:
# Initialize GRPO Trainer
# Train on at most the first 1000 filtered examples. The count is clamped to
# the dataset size so select() is never asked for rows that do not exist.
num_train_examples = min(1000, len(train_dataset))
trainer = GRPOTrainer(
    model=model,
    args=grpo_config,
    train_dataset=train_dataset.select(range(num_train_examples)),
    reward_funcs=[rouge_reward_func],
)

In [None]:
# Start training
# The call is left as the cell's last expression so the object it returns
# (the TrainOutput summary) is displayed as the cell result.
trainer.train()

Step,Training Loss
10,-0.6478
20,-0.2897
30,-0.2849
40,-0.0515
50,-0.0659
60,0.0734
70,-0.0406
80,-0.2234
90,-0.0329
100,0.2546


TrainOutput(global_step=1000, training_loss=0.011520708978176118, metrics={'train_runtime': 4433.7671, 'train_samples_per_second': 0.226, 'train_steps_per_second': 0.226, 'total_flos': 0.0, 'train_loss': 0.011520708978176118})

In [None]:
# Save final model
# Written to the same directory that is configured as the trainer's output_dir.
trainer.save_model("./models/distilgpt2-grpo-checkpoint")

In [None]:
# Step 1: Zip the folder that holds the saved model (recursively, via the shell)
!zip -r distilgpt2-grpo-checkpoint.zip ./models/distilgpt2-grpo-checkpoint

# Step 2: Hand the zip file to the Colab files helper for download
from google.colab import files
files.download('distilgpt2-grpo-checkpoint.zip')

  adding: models/distilgpt2-grpo-checkpoint/ (stored 0%)
  adding: models/distilgpt2-grpo-checkpoint/config.json (deflated 52%)
  adding: models/distilgpt2-grpo-checkpoint/runs/ (stored 0%)
  adding: models/distilgpt2-grpo-checkpoint/runs/Jul08_11-56-25_bea6d31edfd7/ (stored 0%)
  adding: models/distilgpt2-grpo-checkpoint/runs/Jul08_11-56-25_bea6d31edfd7/events.out.tfevents.1751975788.bea6d31edfd7.891.0 (deflated 76%)
  adding: models/distilgpt2-grpo-checkpoint/README.md (deflated 47%)
  adding: models/distilgpt2-grpo-checkpoint/tokenizer.json (deflated 82%)
  adding: models/distilgpt2-grpo-checkpoint/training_args.bin (deflated 52%)
  adding: models/distilgpt2-grpo-checkpoint/vocab.json (deflated 59%)
  adding: models/distilgpt2-grpo-checkpoint/checkpoint-1000/ (stored 0%)
  adding: models/distilgpt2-grpo-checkpoint/checkpoint-1000/scheduler.pt (deflated 55%)
  adding: models/distilgpt2-grpo-checkpoint/checkpoint-1000/config.json (deflated 52%)
  adding: models/distilgpt2-grpo-checkpo

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>