# RLHF with PPO (Toy Example)
Optimize a small LM against a sentiment-based reward for demonstration.

In [None]:
# This notebook uses the legacy TRL PPO API (PPOConfig(model_name=..., batch_size=...),
# PPOTrainer.step), which was removed in trl 0.12, so keep trl below that release.
# NOTE(review): confirm the newest transformers release still works with trl 0.11.x.
!pip -q install -U transformers datasets accelerate "trl<0.12"


In [None]:
import random

import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

base = "distilgpt2"
device = "cuda" if torch.cuda.is_available() else "cpu"

tok = AutoTokenizer.from_pretrained(base)
# GPT-2 style tokenizers ship without a pad token; reuse EOS so padding works.
tok.pad_token = tok.eos_token

# The legacy PPOTrainer needs a policy with a value head (used to estimate the
# value function during PPO); a plain AutoModelForCausalLM is rejected.
policy = AutoModelForCausalLMWithValueHead.from_pretrained(base).to(device)

# Reward: sentiment pipeline as a proxy (not an actual reward model).
# The checkpoint is named explicitly because the PPO loop relies on its
# "POSITIVE"/"NEGATIVE" labels; relying on the implicit default is fragile.
sent = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)


In [None]:
# Build the prompt dataset: four seed reviews, each prefixed with "Review: ",
# cycled 32 times (128 prompts in total, in the same repeating order).
reviews = (
    "The film was stunningly beautiful.",
    "The product broke after two days.",
    "The service was slow but friendly.",
    "The game mechanics were addictive.",
)
prompts = [f"Review: {text}" for _ in range(32) for text in reviews]
ds = Dataset.from_dict({"prompt": prompts})


In [None]:
# PPO hyperparameters: each optimisation step consumes `batch_size` samples,
# split into mini-batches of `mini_batch_size`.
ppo_cfg = PPOConfig(
    model_name=base,
    learning_rate=1e-5,
    batch_size=8,
    mini_batch_size=4,
    gradient_accumulation_steps=1,
    optimize_cuda_cache=True,
)

# Pass everything by keyword: the third positional parameter of the legacy
# PPOTrainer is `ref_model`, so a positional tokenizer would be taken for the
# reference model. With ref_model=None the trainer creates its own frozen
# reference copy of the policy.
trainer = PPOTrainer(config=ppo_cfg, model=policy, ref_model=None, tokenizer=tok)


In [None]:
# PPO loop (few iterations for demo)
# The legacy PPOTrainer works on per-sample data: `generate` and `step` take
# lists of 1-D tensors (one per sample), not padded batch tensors.
gen_kwargs = dict(max_new_tokens=32, do_sample=True, top_p=0.9, temperature=1.0, pad_token_id=tok.eos_token_id)

for i in range(3):
    batch = ds.shuffle().select(range(ppo_cfg.batch_size))
    texts = batch["prompt"]

    # Tokenize each prompt on its own so no pad tokens end up inside a query.
    queries = [
        tok(text, return_tensors="pt").input_ids.squeeze(0).to(trainer.current_device)
        for text in texts
    ]

    # return_prompt=False keeps only the newly generated tokens; PPO must
    # optimise the continuation, not the prompt it was conditioned on.
    responses = trainer.generate(queries, return_prompt=False, **gen_kwargs)

    # Score the full text (prompt + continuation), as the reward proxy expects.
    decoded = [
        text + tok.decode(response, skip_special_tokens=True)
        for text, response in zip(texts, responses)
    ]

    # compute rewards (positive sentiment => higher reward)
    rewards = [
        +1.0 if res["label"] == "POSITIVE" else -1.0
        for res in sent(decoded)
    ]

    # `step` expects one scalar tensor per sample.
    scores = [torch.tensor(reward) for reward in rewards]
    trainer.step(queries, responses, scores)
    print(f"Iter {i+1} done. Mean reward: {sum(rewards)/len(rewards):.3f}")


> Note: This is a didactic example. Real RLHF uses a trained reward model, careful KL control, and stronger baselines.