# Post‑Training for Reasoning with DPO (Direct Preference Optimization)
Train on synthetic preference pairs (chosen vs rejected) to steer outputs.

In [None]:
# Install or upgrade the libraries this notebook imports (-q: quiet output, -U: upgrade to the newest release).
!pip -q install -U transformers accelerate datasets trl


In [None]:
import torch, random, json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import DPOTrainer, DPOConfig

# Checkpoint shared by the tokenizer and both model copies.
base = "distilgpt2"

# Tokenizer: reuse the end-of-sequence token as the padding token.
tok = AutoTokenizer.from_pretrained(base)
tok.pad_token = tok.eos_token

# Load the same checkpoint twice so the two models do not share weights:
#   ref_model - handed to the trainer as the reference model
#   policy    - the copy that is trained and saved
ref_model, policy = (AutoModelForCausalLM.from_pretrained(base) for _ in range(2))


In [None]:
# Synthetic preference data (math reasoning toy).
#
# Rows use the explicit-prompt preference format: "prompt" holds the question,
# while "chosen" and "rejected" hold ONLY the completion text. The question is
# therefore not repeated when prompt and completion are joined for training.
# Both completions give the correct answer; they differ only in style, so the
# preference signal is "show the reasoning" vs. "answer tersely".
problems = [
    "Add 27 and 35.",
    "What is 9 times 7?",
    "If you have 12 apples and eat 5, how many remain?",
]
# Worked step and final answer for each problem, index-aligned with `problems`.
worked = [
    ("27 + 35 = 62.", "62"),
    ("9 * 7 = 63.", "63"),
    ("12 - 5 = 7.", "7"),
]
# Repeat the three problems 50 times (150 rows) so the toy run has enough
# examples for a few optimizer steps. Completions start with a space so they
# read naturally when appended to the prompt.
pairs = [
    {
        "prompt": problem,
        "chosen": f" Let's think step by step. {step} The answer is: {answer}",
        "rejected": f" Answer: {answer}",
    }
    for _ in range(50)
    for problem, (step, answer) in zip(problems, worked)
]
ds = Dataset.from_list(pairs)


In [None]:
# DPO hyper-parameters. In current TRL releases the DPO-specific settings
# (beta, max_length) are fields of DPOConfig rather than DPOTrainer keyword
# arguments, so they are set here once.
cfg = DPOConfig(
    output_dir="dpo-reasoning-demo",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size of 8 per device
    num_train_epochs=1,
    learning_rate=5e-6,
    beta=0.1,        # inverse temperature; tune per task
    max_length=256,  # cap on the tokenized sequence length
    logging_steps=20,
    save_steps=200,
    report_to="none",
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
)

# The tokenizer is passed as `processing_class`, the name used by TRL >= 0.12
# (the earlier `tokenizer=` keyword was deprecated and later removed).
trainer = DPOTrainer(
    model=policy,
    ref_model=ref_model,
    args=cfg,
    train_dataset=ds,
    processing_class=tok,
)
trainer.train()

# Persist the trained policy and its tokenizer next to the trainer checkpoints.
policy.save_pretrained("dpo-reasoning-demo/model")
tok.save_pretrained("dpo-reasoning-demo/tokenizer")


> In practice: curate real preference data, control the KL divergence from the SFT policy, and evaluate with blinded pairwise tests.