# Fine-tune model with DPO

## Goal

Check whether fine-tuning the model with DPO (Direct Preference Optimization) produces a better model.

## Imports

In [None]:
import numpy as np
import pandas as pd

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig,
)

from trl import DPOConfig, DPOTrainer
from datasets import Dataset

## Load model

In [None]:
# Local checkpoint directory of the base model to fine-tune.
model_path = "/home/gbarbadillo/data/deepseekmath"

# Load the configuration separately so gradient checkpointing can be enabled
# on it before the weights are instantiated.
config = AutoConfig.from_pretrained(model_path)
config.gradient_checkpointing = True

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    quantization_config=None,   # load unquantized weights
    torch_dtype="auto",         # torch.bfloat16 does not show speed differences
    device_map="auto",          # let the library place the weights on the available devices
    trust_remote_code=True,     # permits custom modeling code shipped with the checkpoint
)

In [None]:
# Load the tokenizer that ships with the checkpoint at `model_path`.
# TODO: check pad token on prompt recovery notebook
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse the end-of-sequence token id as the padding token id.
# NOTE(review): presumably the checkpoint defines no usable pad token of its
# own - confirm before relying on this.
tokenizer.pad_token_id = tokenizer.eos_token_id

## Load data

In [None]:
# Read the DPO data (version v0) from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='/mnt/hdd0/Kaggle/aimo/external_data/dpo/v0.csv')
df.head(n=5)

In [None]:
# Distinct problem identifiers present in the data, and how many there are.
unique_problem_ids = pd.unique(df['problem_idx'])
len(unique_problem_ids)

In [None]:
# Split by problem id rather than by row, so that all rows belonging to one
# problem end up on the same side and no problem leaks from train into test.
# A seeded generator keeps the split identical across kernel restarts, which
# makes evaluation metrics from different runs comparable.
split_rng = np.random.default_rng(seed=42)
n_train_problems = int(0.9 * len(unique_problem_ids))  # 90% of the problems go to train
train_problem_ids = split_rng.choice(unique_problem_ids, n_train_problems, replace=False)

# Compute the row membership mask once and reuse it for both partitions.
is_train_row = df['problem_idx'].isin(train_problem_ids)
train_df = df[is_train_row]
test_df = df[~is_train_row]

# Sanity checks: the two partitions cover every row and share no problem.
assert len(train_df) + len(test_df) == len(df)
assert set(train_df['problem_idx'].unique()).isdisjoint(test_df['problem_idx'].unique())
print(len(train_df), len(test_df))

In [None]:
# Convert the pandas partitions into `datasets.Dataset` objects for the trainer.
# Both frames are row-filtered views of `df`, so their index is no longer a
# plain range; without `preserve_index=False` that index would be carried over
# as an extra `__index_level_0__` column in the datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(test_df, preserve_index=False)

## Fine-tuning

In [None]:
from peft import LoraConfig

# LoRA adapter settings, based on the QLoRA paper & Sebastian Raschka experiment.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",    # attach adapters to every linear layer
    r=256,                          # rank of the low-rank update
    lora_alpha=128,                 # LoRA scaling numerator (alpha)
    lora_dropout=0.05,              # dropout applied inside the adapters
    bias="none",                    # bias terms are not trained
)

In [None]:
from trl import DPOTrainer
from transformers import TrainingArguments

# Training configuration for DPO, grouped by concern.
args = DPOConfig(
    # --- output / checkpointing ---
    # NOTE(review): "doplhin" looks like a typo; the path is kept unchanged so
    # existing checkpoints and logs still match.
    output_dir="doplhin-dpo",               # directory to save and repository id
    save_steps=500,                         # write a checkpoint every 500 steps
    save_total_limit=2,                     # keep at most 2 checkpoints on disk
    push_to_hub=False,                      # do not push the model to the hub

    # --- schedule and batch sizes ---
    num_train_epochs=1,                     # single pass over the training data
    per_device_train_batch_size=2,          # training batch size per device
    gradient_accumulation_steps=2,          # accumulate 2 batches per optimizer update
    per_device_eval_batch_size=4,           # evaluation batch size per device

    # --- optimizer ---
    optim="adamw_torch_fused",              # fused AdamW optimizer
    learning_rate=5e-5,                     # original note: 10x higher LR than QLoRA paper (TODO confirm)
    lr_scheduler_type="cosine",             # cosine learning rate schedule
    warmup_ratio=0.1,                       # warmup ratio based on QLoRA paper
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper

    # --- memory / precision ---
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision

    # --- logging and evaluation ---
    logging_steps=25,                       # log every 25 steps
    report_to="tensorboard",                # report metrics to tensorboard
    eval_strategy="steps",                  # evaluate on a step-based schedule
    eval_steps=700,                         # run evaluation every 700 steps

    # --- DPO-specific settings ---
    beta=0.1,
    loss_type="sigmoid",
    max_length=1024,                        # length cap for the full sequence
    max_prompt_length=512,                  # length cap for the prompt part
    model_init_kwargs=None,                 # the model is passed already instantiated
)

trainer = DPOTrainer(
    model=model,
    ref_model=None,                         # set to none since we use peft
    args=args,
    peft_config=peft_config,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Launch the DPO fine-tuning run with the trainer configured above in this
# notebook; the value returned by `train()` is displayed as the cell output.
trainer.train()