In [16]:
import os

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import DPOTrainer, DPOConfig

In [14]:
# Base checkpoint for everything below: DistilGPT2 (small and fast).
model_name = "distilgpt2"

# Two independent loads of the same checkpoint: `model` is passed to the
# trainer as the model to train, `ref_model` as its reference model.
model, ref_model = (
    AutoModelForCausalLM.from_pretrained(model_name) for _ in range(2)
)

# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
# Ensure the tokenizer can pad batches.
# Fall back to the EOS token only when the tokenizer has no pad token of its
# own, so switching `model_name` to a checkpoint that already defines one does
# not silently overwrite it. Re-running this cell is a no-op.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Read the DPO preference data from the local JSON file, train split only.
dataset = load_dataset(
    path="json",
    data_files="../data/dpo_format.json",
    split="train",
)

In [20]:
# Define training arguments (use DPOConfig, not TrainingArguments: options
# such as `padding_value` are only accepted by DPOConfig).
#
# fp16 mixed precision needs a CUDA device, so it is switched on only when one
# is available. On a CPU-only machine the run uses full precision instead of
# failing while the config is being built.
training_args = DPOConfig(
    output_dir="../models/distilgpt2-dpo-checkpoint",
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",  # write a checkpoint at the end of every epoch
    bf16=False,  # Set True (and fp16 to False) on a GPU with bfloat16 support
    fp16=torch.cuda.is_available(),  # Good for most GPU setups; off on CPU
    remove_unused_columns=False,
    report_to="none",
    # Pad with the tokenizer's pad token id; the pad-token cell must have been
    # run first, otherwise this is None.
    padding_value=tokenizer.pad_token_id,  # this is valid only in DPOConfig
)

In [None]:
# Initialize DPOTrainer.
# The tokenizer prepared earlier in this notebook (pad token set) is passed
# explicitly so the trainer tokenizes and pads with that exact object instead
# of resolving a tokenizer on its own.
# NOTE(review): `processing_class` is the argument name in recent TRL
# releases; older releases called it `tokenizer` - confirm against the
# installed TRL version.
trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)

KeyError: "Column train not in the dataset. Current columns in the dataset: ['prompt', 'chosen', 'rejected']"

In [None]:
# Train: run the trainer built above. The number of epochs, batch size and
# per-epoch checkpointing all come from `training_args`. The call's return
# value is left as the last expression so the notebook displays it.
trainer.train()

In [None]:
# Persist the final trained model. The target directory is the same path that
# `training_args` uses as its output_dir.
trainer.save_model(output_dir="../models/distilgpt2-dpo-checkpoint")