# RL Finetuning with SmolLM2-135M for Math DPO

In [None]:
%%capture
# Colab setup cell. The %%capture magic above must stay on the first line of the cell;
# it suppresses the (very long) pip output.
# Step 1: install Unsloth straight from its GitHub repository with the "colab-new" extra.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
from torch import __version__ as torch_version
from packaging.version import Version as V
# Step 2: pick the xformers requirement from the installed torch version:
# torch older than 2.4.0 gets the pinned 0.0.27 build, anything newer gets the latest release.
xformers = "xformers==0.0.27" if V(torch_version) < V("2.4.0") else "xformers"
# Step 3: install the rest of the training stack. --no-deps installs only the packages
# listed here and leaves already-installed dependencies untouched.
# NOTE(review): apart from the conditional xformers pin, none of these packages are
# version-pinned, so a re-run may resolve to different releases.
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes datasets wandb huggingface_hub

In [None]:
import os, torch, random, numpy as np
from getpass import getpass
from huggingface_hub import login
import wandb

# Seed every random number generator used in this notebook with the same value
# so repeated runs start from the same state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Seed the CUDA generators as well when a GPU is present, and remember which
# device the rest of the notebook should target.
has_cuda = torch.cuda.is_available()
if has_cuda:
    torch.cuda.manual_seed_all(SEED)

device = "cuda" if has_cuda else "cpu"
print("Device:", device)

Device: cuda


# Authenticate to Hugging Face and Weights & Biases (W&B)

In [None]:
# --- Hugging Face -----------------------------------------------------------
# Tokens are read interactively with getpass so they never appear in the
# notebook source or its outputs. An empty answer skips the login.
hf_token = getpass("🔑 Enter your Hugging Face token (press Enter to skip): ").strip()
if not hf_token:
    print("HF login skipped.")
else:
    login(hf_token)

# --- Weights & Biases -------------------------------------------------------
# With a token: log in and open the run that training will report to.
# Without one: set WANDB_DISABLED so later cells can tell that logging is off.
wb_token = getpass("🔑 Enter your Weights & Biases token (press Enter to skip): ").strip()
if not wb_token:
    os.environ["WANDB_DISABLED"] = "true"
    print("W&B disabled.")
else:
    wandb.login(key=wb_token)
    run = wandb.init(
        project="SmolLM2-DPO-Math",
        job_type="training",
        anonymous="allow",
    )

🔑 Enter your Hugging Face token (press Enter to skip): ··········
🔑 Enter your Weights & Biases token (press Enter to skip): ··········


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maditya_rajpurohit[0m ([33maditya_rajpurohit-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Preference Dataset (Math-Focused)

In [None]:
from datasets import Dataset

# Hand-written preference pairs for DPO. Every record has three string fields:
#   "prompt"   - the math question posed to the model
#   "chosen"   - a correct worked solution (the preferred response)
#   "rejected" - a solution containing a mistake (the dispreferred response)
# NOTE(review): only five examples are defined here, so this is a demonstration-sized
# dataset rather than one a model can meaningfully learn from.
prefs = [
    {
        # Linear equation; the rejected answer subtracts and divides in the wrong order.
        "prompt": "Solve for x: 3x - 7 = 14. Show your steps and final answer.",
        "chosen": "3x - 7 = 14 → 3x = 21 → x = 7. Final answer: 7",
        "rejected": "3x - 7 = 14 → x = 14 - 7 = 7 → divide by 3 gives x = 2.3. Final answer: 2.3",
    },
    {
        # Triangle area; the rejected answer forgets the factor of one half.
        "prompt": "Compute the area of a triangle with base 10 and height 9.",
        "chosen": "A = ½·b·h = ½·10·9 = 45. Final answer: 45",
        "rejected": "A = b·h = 10·9 = 90. Final answer: 90",
    },
    {
        # Polynomial derivative; the rejected answer fails to lower the exponents.
        "prompt": "Differentiate f(x)=4x³−5x+2.",
        "chosen": "f'(x)=12x²−5. Final answer: 12x²−5",
        "rejected": "f'(x)=12x³−5x. Final answer: 12x³−5x",
    },
    {
        # Ratio problem; the rejected answer inverts the ratio.
        "prompt": "If the ratio a:b=2:5 and a=12, find b.",
        "chosen": "a:b=2:5 → a/b=2/5 → b=12·5/2=30. Final answer: 30",
        "rejected": "b=12·2/5=4.8. Final answer: 4.8",
    },
    {
        # Factored quadratic; the rejected answer expands correctly but reports a wrong root.
        "prompt": "Solve (x+2)(x−5)=0 for x.",
        "chosen": "x+2=0 or x−5=0 → x=−2, 5. Final answer: −2, 5",
        "rejected": "Expand to x²−3x−10=0 → x=3. Final answer: 3",
    },
]

# Wrap the records in a Hugging Face Dataset; the trailing expression displays its summary.
pref_ds = Dataset.from_list(prefs)
pref_ds

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 5
})

# Load Model (4-bit) and Attach Chat Template

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported, PatchDPOTrainer
from unsloth.chat_templates import get_chat_template
# Apply Unsloth's DPO patch before any trainer is constructed.
PatchDPOTrainer()

# Loading options. dtype=None leaves the choice of compute dtype to Unsloth;
# load_in_4bit=True loads the weights 4-bit quantized.
max_seq_length = 512
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/SmolLM2-135M-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Attach the ChatML template. The mapping makes the template read ShareGPT-style
# messages: each turn is {"from": "human" | "gpt", "value": <text>} rather than
# {"role": ..., "content": ...}.
sharegpt_mapping = {
    "role": "from",
    "content": "value",
    "user": "human",
    "assistant": "gpt",
}
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
    mapping=sharegpt_mapping,
    map_eos_token=True,
)
print("✅ Model and tokenizer ready.")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Unsloth: Will map <|im_end|> to EOS = <|im_end|>.


✅ Model and tokenizer ready.


# Add LoRA Adapters

In [None]:
# LoRA settings, collected in one place so they are easy to read and tune.
# Adapters are attached to the attention projections (q/k/v/o) and to the
# MLP projections (gate/up/down).
# NOTE(review): the recorded output of this cell warns that only dropout = 0 is
# supported for Unsloth's fast patching and that dropout = 0.05 causes a
# performance hit; consider 0 if speed matters more than regularisation.
lora_settings = dict(
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
)

# Wrap the base model so that only the adapter weights are trained.
model = FastLanguageModel.get_peft_model(model, **lora_settings)
print("✅ LoRA adapters attached.")

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.11.2 patched 30 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


✅ LoRA adapters attached.


# Configure and Run DPO Training

In [None]:
from trl import DPOTrainer, DPOConfig

# Make sure WANDB_DISABLED always has a value: keep "true" if the authentication
# cell set it, otherwise default to "false".
os.environ["WANDB_DISABLED"] = os.environ.get("WANDB_DISABLED", "false")
use_wandb = os.environ["WANDB_DISABLED"] != "true"

# Mixed precision follows the hardware: bf16 where supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# NOTE(review): with 5 examples, a per-device batch of 4 and 4 accumulation steps,
# the recorded run performs a single optimizer step per epoch (5 steps in total).
# Lower the batch size / accumulation or add data for a meaningful training signal.
dpo_args = DPOConfig(
    output_dir                  = "smollm2-dpo-math-output",
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    warmup_ratio                = 0.1,
    num_train_epochs            = 5,
    logging_steps               = 5,
    optim                       = "adamw_8bit",
    seed                        = 42,
    fp16                        = not use_bf16,
    bf16                        = use_bf16,
    # DPO-specific settings live on the config object in current TRL releases.
    beta                        = 0.1,
    max_length                  = 256,
    max_prompt_length           = 128,
    report_to                   = "wandb" if use_wandb else "none",
)

dpo_trainer = DPOTrainer(
    model            = model,
    # No explicit reference model is supplied; the trainer is left to derive the
    # reference log-probabilities itself.
    ref_model        = None,
    args             = dpo_args,
    train_dataset    = pref_ds,
    # `processing_class` is the current name of the former `tokenizer` argument.
    processing_class = tokenizer,
)

# Run training; the trailing expression displays the returned TrainOutput.
train_out = dpo_trainer.train()
train_out

num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.


Extracting prompt in train dataset (num_proc=5):   0%|          | 0/5 [00:00<?, ? examples/s]

num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.


Applying chat template to train dataset (num_proc=5):   0%|          | 0/5 [00:00<?, ? examples/s]

num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.


Tokenizing train dataset (num_proc=5):   0%|          | 0/5 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 5 | Num Epochs = 5 | Total steps = 5
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 4,884,480 of 139,400,064 (3.50% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
5,0.6836,0.006558,-0.011822,0.5,0.01838,-74.444733,-75.93998,11.101547,10.463902,0,0,0


TrainOutput(global_step=5, training_loss=0.6835691452026367, metrics={'train_runtime': 28.752, 'train_samples_per_second': 0.87, 'train_steps_per_second': 0.174, 'total_flos': 0.0, 'train_loss': 0.6835691452026367, 'epoch': 5.0})

# Inference Test

In [None]:
from unsloth import FastLanguageModel
# Switch Unsloth into inference mode. The model was loaded 4-bit quantized and is
# already placed on its device, so it is deliberately NOT moved or re-cast with
# `.to(...)` here: casting a bitsandbytes-quantized model to a new dtype is not
# supported.
FastLanguageModel.for_inference(model)

def dpo_chat(prompt, max_new_tokens=128):
    """Generate and print the model's answer to a single user prompt.

    The prompt is wrapped in the chat template configured on the tokenizer
    (ChatML with ShareGPT-style "from"/"value" keys) and a generation prompt is
    appended, so the instruct model sees a properly formatted user turn instead
    of bare text.

    Args:
        prompt: The user question as plain text.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        None. The prompt and the generated answer are printed.
    """
    # Keys follow the mapping given to get_chat_template when the tokenizer was set up.
    messages = [{"from": "human", "value": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt = True,
        tokenize              = True,
        return_dict           = True,   # yields input_ids and attention_mask
        return_tensors        = "pt",
    ).to(device)
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens = max_new_tokens,
            do_sample      = True,
            top_p          = 0.9,
            temperature    = 0.7,
            use_cache      = True,
        )

    # generate() returns prompt + continuation; decode only the newly generated part
    # so the template markup and the echoed question are not mixed into the answer.
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    print(prompt)
    print(answer.strip())

# test
dpo_chat("Solve for x: 2x + 5 = 19. Show your steps and final answer.")
dpo_chat("Find the area of a circle with radius 6. Show your steps and final answer.")

Solve for x: 2x + 5 = 19. Show your steps and final answer.
Find the area of a circle with radius 6. Show your steps and final answer.


# Save LoRA model

In [None]:
from huggingface_hub import get_token

save_dir = "smollm2-135m-dpo-math-final"
# NOTE(review): the repo id contains a specific account name; change it to a
# namespace you can write to before running this cell.
repo_id = "aditya-rajpurohit/smollm2-135m-dpo-math-final"

# Always keep a local copy of the model and the tokenizer (with its chat template).
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"✅ Model saved locally to {save_dir}")

# The authentication cell allows skipping the Hugging Face login, so only attempt
# the upload when a token is actually available (from login, environment or cache).
if get_token():
    model.push_to_hub(repo_id)
    tokenizer.push_to_hub(repo_id)
    print("✅ Uploaded to Hugging Face:", repo_id)
else:
    print("Hugging Face upload skipped: no access token available.")

✅ Model saved locally to smollm2-135m-dpo-math-final


README.md:   0%|          | 0.00/588 [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   5%|5         |  527kB / 9.82MB            

Saved model to https://huggingface.co/aditya-rajpurohit/smollm2-135m-dpo-math-final
✅ Uploaded to Hugging Face: aditya-rajpurohit/smollm2-135m-dpo-math-final
