<a href="https://colab.research.google.com/github/johntango/DirectPolicyOptimization01/blob/main/DPOTransformerExample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Direct Preference Optimization

In [3]:
# 📘 SECTION 1: Install Dependencies
!pip install -q transformers datasets accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.12.0 which is incompatible.[0m[31m
[0m

In [4]:
# 📘 SECTION 2: Imports and Setup
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [28]:
# 📘 SECTION 3: Load Model and Tokenizer
model_name = "gpt2"  # Replace with instruction-tuned model if desired
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer.pad_token = tokenizer.eos_token



In [29]:
# 📘 SECTION 4: Tokenization Helper
def tokenize_pair(prompt, response, max_length=512):
    tokenized = tokenizer(prompt + response, truncation=True, max_length=max_length,
                          padding="max_length", return_tensors="pt")
    return {k: v.to(device) for k, v in tokenized.items()}


In [30]:
# 📘 SECTION 5: DPO Loss Function
def dpo_loss(chosen_logps, rejected_logps, beta=0.1):
    diff = (chosen_logps - rejected_logps) / beta
    return -F.logsigmoid(diff).mean()


In [31]:
# 📘 SECTION 6: Compute Log Probability of Sequence
def compute_logprob(model, input_ids, attention_mask):
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
    return -outputs.loss


In [32]:
dataset = load_dataset("Dahoas/synthetic-instruct-gptj-pairwise", split="train[:500]")
dataset[0]


{'prompt': 'I was wondering if you could walk me through the process of setting up a hydroponic garden for herbs.',
 'chosen': "Sure! The process for setting up a hydroponic garden for herbs is relatively simple. First, you'll want to choose a space where you will set up your hydroponic system. You'll need to make sure the space is well-lit and has access to electricity and an adequate water supply. Next, you'll need to choose the type of hydroponic system you want to use. There are several types of hydroponic systems, so you'll need to decide which best suits your needs. Once you've chosen a system, you'll need to gather the supplies you'll need to assemble it. This includes things like pumps, growing trays, grow lights, and nutrients. Once you've assembled the system, you'll need to add your choice of herbs to the system. Lastly, you'll need to monitor and adjust the system as needed to ensure your herbs are getting the correct amount of light, water, and nutrients.",
 'rejected': 'H

In [33]:
# 📘 SECTION 8: Dataloader for Mini-batching
def collate_fn(samples):
    return samples

dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)



In [34]:
# 📘 SECTION 9: Training Loop
optimizer = AdamW(model.parameters(), lr=1e-5)
model.train()

for epoch in range(3):
    loop = tqdm(dataloader, desc=f"Epoch {epoch}")
    for batch in loop:
        total_loss = 0.0

        for sample in batch:
            prompt = sample["prompt"]
            chosen = sample["chosen"]
            rejected = sample["rejected"]

            chosen_input = tokenize_pair(prompt, chosen)
            rejected_input = tokenize_pair(prompt, rejected)

            chosen_logp = compute_logprob(model, **chosen_input)
            rejected_logp = compute_logprob(model, **rejected_input)

            loss = dpo_loss(chosen_logp, rejected_logp)
            total_loss += loss

        avg_loss = total_loss / len(batch)
        optimizer.zero_grad()
        avg_loss.backward()
        optimizer.step()
        loop.set_postfix(loss=avg_loss.item())



Epoch 0: 100%|██████████| 125/125 [00:40<00:00,  3.06it/s, loss=0.00271]
Epoch 1: 100%|██████████| 125/125 [00:41<00:00,  3.04it/s, loss=0.000474]
Epoch 2: 100%|██████████| 125/125 [00:40<00:00,  3.06it/s, loss=1.81e-5]


In [27]:
# 📘 SECTION 10: Save Fine-Tuned Model
model.save_pretrained("dpo-finetuned-model")
tokenizer.save_pretrained("dpo-finetuned-model")
print("✅ Model saved to 'dpo-finetuned-model'")


✅ Model saved to 'dpo-finetuned-model'
