In [8]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig
)
from trl import DPOTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset

# Check GPU
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# Configuration
MODEL_NAME = "microsoft/DialoGPT-small"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  # Add pad token explicitly
    print("Pad token set to EOS token:", tokenizer.pad_token)

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
model.config.pad_token_id = tokenizer.pad_token_id  # Sync model with tokenizer pad token

# Prepare model for training
model = prepare_model_for_kbit_training(model)

# LoRA configuration
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Apply LoRA
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Create preference dataset
def create_preference_dataset():
    samples = [
        {"prompt": "What is the capital of France?", "chosen": "The capital of France is Paris.", "rejected": "I don't know about France."},
        {"prompt": "How to make tea?", "chosen": "To make tea, boil water and steep tea leaves for 3-5 minutes.", "rejected": "Tea is made with coffee beans."},
        {"prompt": "What is 2+2?", "chosen": "2 + 2 equals 4.", "rejected": "2 + 2 is 5."},
        {"prompt": "Translate 'hello' to Spanish:", "chosen": "Hello in Spanish is 'hola'.", "rejected": "Hello in Spanish is 'bonjour'."},
        {"prompt": "What is Python?", "chosen": "Python is a popular programming language.", "rejected": "Python is a type of snake."},
    ]
    return Dataset.from_list(samples)

# Preprocess dataset for DPO
def preprocess_dpo_dataset(dataset, tokenizer, max_length=128, max_prompt_length=64):
    def tokenize_example(example):
        prompt_tokens = tokenizer(example["prompt"], truncation=True, max_length=max_prompt_length, padding=False)
        chosen_tokens = tokenizer(example["chosen"], truncation=True, max_length=max_length, padding=False)
        rejected_tokens = tokenizer(example["rejected"], truncation=True, max_length=max_length, padding=False)
        return {
            "prompt_input_ids": prompt_tokens["input_ids"],
            "prompt_attention_mask": prompt_tokens["attention_mask"],
            "chosen_input_ids": chosen_tokens["input_ids"],
            "chosen_attention_mask": chosen_tokens["attention_mask"],
            "rejected_input_ids": rejected_tokens["input_ids"],
            "rejected_attention_mask": rejected_tokens["attention_mask"],
        }
    return dataset.map(tokenize_example, remove_columns=["prompt", "chosen", "rejected"])

# Create and preprocess dataset
train_dataset = create_preference_dataset()
train_dataset = preprocess_dpo_dataset(train_dataset, tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir="./dpo-lora-model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    num_train_epochs=2,
    logging_steps=1,
    save_steps=50,
    fp16=True,
    remove_unused_columns=False,
    report_to=[],
)

# Create DPO trainer
dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None,  # DPO can work without a reference model
    args=training_args,
    train_dataset=train_dataset,
    max_length=128,
    max_prompt_length=64,
)

# Start training
print("Starting DPO training with QLoRA...")
dpo_trainer.train()

# Save the model
dpo_trainer.save_model()
print("Training completed!")

# Test the model
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=100,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test
test_prompt = "What is the capital of Germany?"
response = generate_response(test_prompt)
print(f"\nPrompt: {test_prompt}")
print(f"Response: {response}")

GPU available: True
GPU: NVIDIA GeForce RTX 3050 6GB Laptop GPU
GPU memory: 6.0 GB
Pad token set to EOS token: <|endoftext|>
trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

TypeError: DPOTrainer.__init__() got an unexpected keyword argument 'max_length'

In [None]:
# SFT + DPO with QLoRA (single script)
# Run on RTX 3050 (6GB). Adjust batch/accumulation if OOM.

import os
import random
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from trl import DPOTrainer, DPOConfig

# -------------------------
# Settings (tweak these)
# -------------------------
model_id = "distilgpt2"   # small for prototyping
OUTPUT_DIR = "./sft_then_dpo_adapters"
device = "cuda" if torch.cuda.is_available() else "cpu"
assert device == "cuda", "CUDA required for this script."

# Training hyperparams
SFT_BATCH = 1
SFT_GRAD_ACC = 8
SFT_EPOCHS = 2
SFT_MAX_LEN = 128

DPO_BATCH = 1
DPO_GRAD_ACC = 8
DPO_EPOCHS = 2
DPO_MAX_LEN = 128

# -------------------------
# 1) Synthetic dataset
# -------------------------
def make_sft_examples():
    sft = [
        {"prompt": "Translate 'Hello' to French:", "response": "Bonjour"},
        {"prompt": "Translate 'Thank you' to French:", "response": "Merci"},
        {"prompt": "What is 2+2?", "response": "4"},
        {"prompt": "What is 7+5?", "response": "12"},
        {"prompt": "Summarize: Photosynthesis is the process by which green plants use sunlight.", "response": "Plants convert sunlight into chemical energy to make food."},
        {"prompt": "Translate 'Good morning' to German:", "response": "Guten Morgen"},
        {"prompt": "Translate 'I love you' to Italian:", "response": "Ti amo"},
        {"prompt": "What is 3x3?", "response": "9"},
        {"prompt": "Translate 'Goodbye' to Spanish:", "response": "Adiós"},
        {"prompt": "Summarize Newton's first law in one sentence.", "response": "An object in motion stays in motion unless acted on by an external force."},
    ]
    big = sft * 20   # 200 examples
    random.shuffle(big)
    return big

def make_dpo_pairs_from_sft(sft_list):
    pairs = []
    for ex in sft_list:
        prompt = ex["prompt"]
        chosen = ex["response"]
        if chosen.isdigit():
            rejected = str(int(chosen) + random.choice([1, -1, 2]))
        else:
            rejected = "Nope"
        pairs.append({"prompt": prompt, "chosen": chosen, "rejected": rejected})
    return pairs

sft_examples = make_sft_examples()
dpo_pairs = make_dpo_pairs_from_sft(sft_examples)

# train/eval splits
sft_train = sft_examples[:160]
sft_eval = sft_examples[160:]
dpo_train = dpo_pairs[:160]
dpo_eval = dpo_pairs[160:]

train_sft_ds = Dataset.from_list(sft_train)
eval_sft_ds = Dataset.from_list(sft_eval)
train_dpo_ds = Dataset.from_list(dpo_train)
eval_dpo_ds = Dataset.from_list(dpo_eval)

print("SFT train:", len(train_sft_ds), "eval:", len(eval_sft_ds))
print("DPO train:", len(train_dpo_ds), "eval:", len(eval_dpo_ds))

# -------------------------
# 2) Tokenizer + 4-bit config
# -------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# -------------------------
# 3) Load base model (4-bit) and prepare for k-bit training
# -------------------------
print("Loading base model in 4-bit (may take a bit)...")
base = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=False,
)

base = prepare_model_for_kbit_training(base)

# -------------------------
# 4) Attach LoRA adapters for SFT
# -------------------------
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],  # GPT-2 style
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(base, lora_cfg)
print("LoRA params (trainable):", sum(p.numel() for p in model.parameters() if p.requires_grad))

# -------------------------
# 5) SFT: tokenization
# -------------------------
def sft_tokenize(example):
    prompt = example["prompt"].strip()
    resp = example["response"].strip()
    text = prompt + " " + resp
    toks = tokenizer(text, truncation=True, max_length=SFT_MAX_LEN, padding="max_length")
    input_ids = toks["input_ids"]
    prompt_ids = tokenizer(prompt, truncation=True, max_length=SFT_MAX_LEN)["input_ids"]
    prompt_len = len(prompt_ids)
    labels = [-100] * prompt_len + input_ids[prompt_len:]
    labels = labels[: len(input_ids)]
    if len(labels) < len(input_ids):
        labels += [-100] * (len(input_ids) - len(labels))
    toks["labels"] = labels
    return toks

train_tok = train_sft_ds.map(sft_tokenize, remove_columns=train_sft_ds.column_names, batched=False)
eval_tok = eval_sft_ds.map(sft_tokenize, remove_columns=eval_sft_ds.column_names, batched=False)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# -------------------------
# 6) SFT training
# -------------------------
sft_args = TrainingArguments(
    output_dir="./sft_qLora_out",
    per_device_train_batch_size=SFT_BATCH,
    gradient_accumulation_steps=SFT_GRAD_ACC,
    num_train_epochs=SFT_EPOCHS,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    save_strategy="no",
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=sft_args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    data_collator=data_collator,
)
print("Starting SFT (QLoRA) ...")
trainer.train()
print("SFT done — saving adapters...")
model.save_pretrained(os.path.join(OUTPUT_DIR, "sft_adapters"))
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "sft_adapters"))

# -------------------------
# 7) Reload base model + SFT adapters for DPO
# -------------------------
print("Reloading base model and applying SFT adapters for DPO step...")
base2 = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=False,
)
model_with_adapters = PeftModel.from_pretrained(base2, os.path.join(OUTPUT_DIR, "sft_adapters"))

# IMPORTANT: enable training
model_with_adapters.train()

# sanity check: print trainable params
print("LoRA params (trainable after reload):",
      sum(p.numel() for p in model_with_adapters.parameters() if p.requires_grad))

# -------------------------
# 8) DPO config + Trainer
# -------------------------
dpo_args = DPOConfig(
    output_dir="./dpo_from_sft_adapters",
    per_device_train_batch_size=DPO_BATCH,
    per_device_eval_batch_size=DPO_BATCH,
    gradient_accumulation_steps=DPO_GRAD_ACC,
    num_train_epochs=DPO_EPOCHS,
    learning_rate=1e-4,
    logging_steps=50,
    save_strategy="no",
    report_to=[],

    beta=0.1,
    max_length=DPO_MAX_LEN,
    max_prompt_length=64,
    padding_value=tokenizer.pad_token_id,

    eval_strategy="steps",
    eval_steps=200,
)

print("Starting DPO training (preference optimization)...")
dpo_trainer = DPOTrainer(
    model=model_with_adapters,
    ref_model=None,   # TRL will clone frozen ref internally
    args=dpo_args,
    train_dataset=train_dpo_ds,
    eval_dataset=eval_dpo_ds,
    processing_class=tokenizer,
)
dpo_trainer.train()
print("DPO done — saving final adapters...")
dpo_trainer.model.save_pretrained(os.path.join(OUTPUT_DIR, "final_adapters"))
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "final_adapters"))

# -------------------------
# 9) Reload for inference
# -------------------------
print("Reloading final model + adapters for inference...")
base_inf = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
final_model = PeftModel.from_pretrained(base_inf, os.path.join(OUTPUT_DIR, "final_adapters"))
final_model.eval()

# -------------------------
# 10) Inference helper
# -------------------------
def generate_answer(prompt: str, max_new_tokens=40):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(final_model.device)
    with torch.no_grad():
        out = final_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    return decoded[len(prompt):].strip() if decoded.startswith(prompt) else decoded.strip()

# Quick tests
tests = [
    "Translate 'Hello' to French:",
    "What is 7 + 5?",
    "Summarize: Photosynthesis is the process by which green plants use sunlight."
]
for t in tests:
    print("PROMPT:", t)
    print("ANSWER:", generate_answer(t))
    print("-" * 40)
