# Finetune LLMs with GRPO

This notebook shows how to finetune an LLM with GRPO (Group Relative Policy Optimization), using the `trl` library.

It's by [Ben Burtenshaw](https://huggingface.co/burtenshaw) and [Maxime Labonne](https://huggingface.co/mlabonne).

This is a minimal example. For a complete example, refer to the GRPO chapter in the [course](https://huggingface.co/course/en/chapter12/1).

In [None]:
!pip install -qqq datasets==3.2.0 transformers==4.47.1 trl==0.14.0 peft==0.14.0 accelerate==1.2.1 bitsandbytes==0.45.2 wandb==0.19.7 --progress-bar off
!pip install -qqq flash-attn --no-build-isolation --progress-bar off

In [None]:
import torch
import wandb
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer

# Authenticate with Weights & Biases
wandb.login()

# Fetch the dataset from the Hub and print a summary of what was loaded
dataset_id = "mlabonne/smoltldr"
dataset = load_dataset(dataset_id)
print(dataset)

In [None]:
# Load the base model and its tokenizer
model_id = "HuggingFaceTB/SmolLM-135M-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="flash_attention_2",  # relies on the flash-attn package
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Wrap the model with LoRA adapters (rank 16, alpha 32) targeting all linear layers
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    target_modules="all-linear",
)
model = get_peft_model(model, lora_config)

# print_trainable_parameters() prints its own summary and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
model.print_trainable_parameters()

In [None]:
# Length-based reward
def reward_len(completions, **kwargs):
    """Score each completion by how close its length is to 50.

    Args:
        completions: Iterable of completions; each one is measured with
            ``len()`` (i.e. characters when the completions are strings).
        **kwargs: Any extra keyword arguments are accepted and ignored.

    Returns:
        A list with one reward per completion: ``0`` when the length is
        exactly 50, and increasingly negative the further it deviates.
    """
    target_length = 50
    rewards = []
    for text in completions:
        distance = abs(target_length - len(text))
        rewards.append(-distance)
    return rewards

# GRPO hyperparameters, grouped by purpose
training_args = GRPOConfig(
    output_dir="GRPO",
    # Optimisation
    learning_rate=2e-5,
    optim="adamw_8bit",
    num_train_epochs=1,
    bf16=True,
    # Batching
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    # Generation
    num_generations=8,
    max_prompt_length=512,
    max_completion_length=96,
    # Data handling and logging
    remove_unused_columns=False,
    logging_steps=1,
    report_to=["wandb"],
)

# Build the trainer, with the length reward as the only reward function
trainer = GRPOTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    reward_funcs=[reward_len],
)

# Open a W&B run in the "GRPO" project, then launch training
wandb.init(project="GRPO")
trainer.train()

In [None]:
# Generate a summary for the first prompt of the test split
prompt = dataset["test"][0]["prompt"]  # row access avoids loading the whole column
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(
        **inputs, max_new_tokens=256, do_sample=True, temperature=0.5, min_p=0.1
    )

# Keep only the newly generated tokens. Slicing the token ids by the prompt's
# token count is exact, whereas slicing the decoded string by len(prompt)
# breaks whenever decoding does not reproduce the prompt character-for-character
# (e.g. special tokens being skipped or whitespace being normalised).
prompt_token_count = inputs["input_ids"].shape[1]
generated_text = tokenizer.decode(
    output_ids[0][prompt_token_count:], skip_special_tokens=True
)
print(f"TL;DR: {generated_text.strip()}")

In [None]:
# Merge the LoRA adapters into the base model and upload the result to a
# private Hub repository. Note: this cell pushes only the model, not the tokenizer.
hub_repo_id = "SmolGRPO-135M"
merged_model = trainer.model.merge_and_unload()
merged_model.push_to_hub(hub_repo_id, private=True)