In [4]:
# =========================
# Cell 1 — ENV (MUST be first, before torch/transformers)
# =========================
# apply_env() has to run before torch is imported (see the cell title), so the
# import order in this cell is deliberate — do not regroup these imports.
from ft_pipeline.env import apply_env
apply_env()

import os
import gc
import torch


def _print_cuda_memory():
    """Print the CUDA memory PyTorch reports as allocated and as reserved.

    Both byte counts are divided by 1024**2 and printed with an "MB" label.
    """
    mib = 1024**2
    print("allocated:", torch.cuda.memory_allocated() / mib, "MB")
    print("reserved:",  torch.cuda.memory_reserved() / mib, "MB")


# Allow TF32 for CUDA matmuls and select "high" float32 matmul precision.
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")

_cuda_ok = torch.cuda.is_available()
print("CUDA available:", _cuda_ok)
print("CUDA device:", torch.cuda.get_device_name(0) if _cuda_ok else None)

# Report memory, run the garbage collector and empty the CUDA cache,
# then report again so the two readings can be compared.
_print_cuda_memory()
gc.collect()
torch.cuda.empty_cache()
_print_cuda_memory()


# Pipeline entry points used by the cells below (imported only after apply_env()).
import logging
from ft_pipeline.logger import setup_logger
from ft_pipeline.config import FTConfig, DPOCfg
from ft_pipeline.run_sft import run_finetune
from ft_pipeline.run_dpo import run_dpo




CUDA available: True
CUDA device: NVIDIA A100-SXM4-40GB
allocated: 9656.4775390625 MB
reserved: 35986.0 MB
allocated: 3875.18896484375 MB
reserved: 3944.0 MB


In [2]:
# =========================
# Cell — DPO Config
# =========================
# DPO (Direct Preference Optimization) stage that follows SFT (QLoRA LoRA adapter).
# Base model: MamayLM-Gemma-3-12B (local path)
# GPU: A100 40GB, BF16, QLoRA 4-bit

from ft_pipeline.config import DPOCfg

cfg_dpo = DPOCfg(
    # ---------- base model + continuation from SFT ----------
    model_id="/home/jovyan/ai-models/MamayLM-Gemma-3-12B",  # base model: path or HF repo id
    # LoRA adapter produced by SFT; pass sft_adapter_dir=None to run DPO from the base model.
    sft_adapter_dir="MamayLM-Gemma-3-12b_QLoRA_SFT/lora_adapter",

    # ---------- datasets (JSONL) ----------
    dpo_train_jsonl="ft_datasets/dpo_train.jsonl",  # train dataset
    dpo_val_jsonl="ft_datasets/dpo_val.jsonl",      # validation dataset

    # ---------- output ----------
    # Alternative names kept for reference:
    #   "outputs_mamay12b_qlora_dpo", "MamayLM-Gemma-3-12b_QLoRA_DPO"
    out_dir="MamayLM-Gemma-3-12b_QLoRA_SFT_DPO",

    # ---------- sequence / batching ----------
    max_seq_len=5000,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,   # effective batch = batch_size * grad_accum

    # ---------- training schedule / optimizer ----------
    learning_rate=5e-7,
    weight_decay=0.05,               # L2 regularization
    num_train_epochs=2.0,            # ignored when max_steps is provided
    max_steps=None,                  # None here, so num_train_epochs is used instead
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",      # scheduler: "cosine", "linear", ...
    logging_steps=5,
    eval_steps=50,
    save_steps=200,
    save_total_limit=2,

    # ---------- DPO core ----------
    # Low beta as a safe start against overfitting; lower it further if
    # rewards/accuracies reaches 1.0 too quickly.
    beta=0.03,

    # ---------- precision / memory / reporting ----------
    use_bf16=True,
    use_fp16=False,
    load_in_4bit=True,               # QLoRA (bitsandbytes 4-bit)
    attn_implementation="sdpa",
    optim="paged_adamw_8bit",        # bitsandbytes optimizer, reduces memory
    report_to="none",
    max_new_tokens_eval=512,         # used in the A/B sanity check (before/after)
)

print(cfg_dpo)

# `logging` and `setup_logger` are imported in the first cell.
log_path = f"{cfg_dpo.out_dir}/ft_run_dpo.log"
setup_logger(level=logging.INFO, log_file=log_path)

DPOCfg(model_id='/home/jovyan/ai-models/MamayLM-Gemma-3-12B', sft_adapter_dir='MamayLM-Gemma-3-12b_QLoRA_SFT/lora_adapter', dpo_train_jsonl='ft_datasets/dpo_train.jsonl', dpo_val_jsonl='ft_datasets/dpo_val.jsonl', out_dir='MamayLM-Gemma-3-12b_QLoRA_SFT_DPO', max_seq_len=5000, per_device_train_batch_size=1, per_device_eval_batch_size=1, gradient_accumulation_steps=8, learning_rate=5e-07, weight_decay=0.05, num_train_epochs=2.0, max_steps=None, warmup_ratio=0.05, lr_scheduler_type='cosine', logging_steps=5, eval_steps=50, save_steps=200, save_total_limit=2, beta=0.03, use_bf16=True, use_fp16=False, load_in_4bit=True, attn_implementation='sdpa', report_to='none', optim='paged_adamw_8bit', max_new_tokens_eval=512)


<Logger ft_pipeline (INFO)>

In [3]:


# Start the DPO run with the config defined in the previous cell.
# The A/B sanity check is switched off for this run; the original note recommends
# it as an optional control of the JSON format, fed with the same validation file as SFT.
# NOTE(review): in the captured log, rewards are already far from zero and
# rewards/accuracies is 0.0 while the LR is still warming up — presumably the DPO
# reference is not the SFT-initialised policy; confirm how run_dpo builds it.
dpo_artifacts = run_dpo(
    cfg_dpo,
    do_ab_sanity=False,
    ab_val_jsonl="ft_datasets/sft_val.jsonl",  # per the original note: used when do_ab_sanity is on
    # Alternative spread of indices: [0, 1, 2, 10, 25, 50, 100, 150, 200, 250, 300]
    ab_indices=[*range(15)],
    dataset_limits=(None, None),
)


08:49:15 | INFO    | === DPO RUN START ===
08:49:15 | INFO    | CUDA available=True
08:49:15 | INFO    | CUDA device=NVIDIA A100-SXM4-40GB
08:49:15 | INFO    | Loading tokenizer: /home/jovyan/ai-models/MamayLM-Gemma-3-12B
08:49:16 | INFO    | Tokenizer loaded
08:49:16 | INFO    | Loading DPO datasets
08:49:16 | INFO    |   train: ft_datasets/dpo_train.jsonl
08:49:16 | INFO    |   val:   ft_datasets/dpo_val.jsonl
08:49:41 | INFO    | DPO dataset ready | train=3094 | val=344
08:49:41 | INFO    | Loading base model (QLoRA)
08:49:41 | INFO    |   model_id: /home/jovyan/ai-models/MamayLM-Gemma-3-12B
08:49:41 | INFO    |   dtype: torch.bfloat16
08:49:41 | INFO    |   4bit: True
08:49:41 | INFO    |   attn_implementation: sdpa


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

08:51:06 | INFO    | Base model loaded
08:51:06 | INFO    | Enabling gradient checkpointing
08:51:06 | INFO    | Loading trainable LoRA adapter from: MamayLM-Gemma-3-12b_QLoRA_SFT/lora_adapter
08:51:08 | INFO    | Trainable adapter loaded
08:51:08 | INFO    | Enabled input require grads for gradient checkpointing
08:51:08 | INFO    | Trainable parameters:
trainable params: 68,456,448 || all params: 12,255,781,488 || trainable%: 0.5586
08:51:08 | INFO    | Building DPOConfig
08:51:08 | INFO    |   max_seq_len=5000
08:51:08 | INFO    |   beta=0.03
08:51:08 | INFO    |   lr=5e-07
08:51:08 | INFO    | Building DPOTrainer
08:51:08 | INFO    |   train_samples=3094
08:51:08 | INFO    |   val_samples=344


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Using auto half precision backend
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


08:51:09 | INFO    | Starting DPO training‚Ä¶


Cannot get num_tokens from dataloader
skipped Embedding(4096, 1152): 4.5M params
skipped Gemma3TextScaledWordEmbedding(262208, 3840, padding_idx=0): 964.734375M params
skipped: 964.734375M params
***** Running training *****
  Num examples = 3,094
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 774
  Number of trainable parameters = 68,456,448


08:51:10 | INFO    | GPUMetricsCallback enabled
08:51:10 | INFO    | DPOMetricsCallback enabled | csv=MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/dpo_metrics.csv | every_n_steps=5




Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
50,3.4962,3.469268,-0.057585,3.376739,0.0,-3.434324,-267.320862,-67.623283,-2.042471,-3.212824
100,2.327,2.257684,1.033092,3.169737,0.0,-2.136645,-230.964935,-74.523338,-1.982634,-2.661982
150,1.4082,1.290876,1.844689,2.784632,0.0,-0.939943,-203.911697,-87.360168,-1.971564,-2.398803
200,0.6016,0.564815,2.313618,1.96399,0.735465,0.349628,-188.280746,-114.71492,-2.00007,-2.30538
250,0.1851,0.162448,2.760528,0.881279,1.0,1.879249,-173.383743,-150.805283,-2.046856,-2.270607
300,0.0484,0.040745,3.244718,-0.08317,1.0,3.327888,-157.24408,-182.953583,-2.094893,-2.238698
350,0.0156,0.014001,3.533666,-0.85869,1.0,4.392356,-147.612488,-208.804276,-2.161052,-2.233387
400,0.0082,0.007533,3.687239,-1.333563,1.0,5.020802,-142.493362,-224.633331,-2.226531,-2.247115
450,0.0055,0.00533,3.749401,-1.622471,1.0,5.371871,-140.42131,-234.263626,-2.277489,-2.263341


08:52:44 | INFO    | [step 1] train_loss=4.3627 | lr=0 | grad_norm=40.3988 | gpu_mem(GB)=alloc:9.49 res:23.14 max_alloc:36.02 max_res:38.98 | elapsed=1.6m
08:58:53 | INFO    | [step 5] train_loss=4.4773 | lr=5.12821e-08 | grad_norm=37.3110 | gpu_mem(GB)=alloc:9.49 res:27.82 max_alloc:36.24 max_res:38.98 | elapsed=7.7m
08:58:53 | INFO    | DPO step=5 | loss=4.4773 | rewards/accuracies=0.0000 | rewards/margins=-4.4647 | rewards/chosen=-1.0012 | rewards/rejected=3.4635
09:06:33 | INFO    | [step 10] train_loss=4.4379 | lr=1.15385e-07 | grad_norm=36.2712 | gpu_mem(GB)=alloc:9.49 res:27.02 max_alloc:36.28 max_res:38.98 | elapsed=15.4m
09:06:33 | INFO    | DPO step=10 | loss=4.4379 | rewards/accuracies=0.0000 | rewards/margins=-4.4248 | rewards/chosen=-0.9306 | rewards/rejected=3.4942
09:14:20 | INFO    | [step 15] train_loss=4.4845 | lr=1.79487e-07 | grad_norm=33.4385 | gpu_mem(GB)=alloc:9.49 res:29.11 max_alloc:36.28 max_res:38.98 | elapsed=23.2m
09:14:20 | INFO    | DPO step=15 | loss=4.4


***** Running Evaluation *****
  Num examples = 344
  Batch size = 1


10:32:16 | INFO    | [step 50] eval_loss=3.4693 | gpu_mem(GB)=alloc:9.49 res:35.14 max_alloc:36.53 max_res:38.98 | elapsed=101.1m
10:32:16 | INFO    | DPO[EVAL] step=50
10:32:16 | INFO    | EarlyStop(metric=eval_loss): improved from None to 3.469268
10:40:01 | INFO    | [step 55] train_loss=3.4836 | lr=4.99486e-07 | grad_norm=26.5273 | gpu_mem(GB)=alloc:9.49 res:19.97 max_alloc:36.53 max_res:38.98 | elapsed=108.8m
10:40:01 | INFO    | DPO step=55 | loss=3.4836 | rewards/accuracies=0.0000 | rewards/margins=-3.4477 | rewards/chosen=-0.0047 | rewards/rejected=3.4429
10:47:40 | INFO    | [step 60] train_loss=3.2114 | lr=4.99087e-07 | grad_norm=24.8024 | gpu_mem(GB)=alloc:9.49 res:22.39 max_alloc:36.53 max_res:38.98 | elapsed=116.5m
10:47:40 | INFO    | DPO step=60 | loss=3.2114 | rewards/accuracies=0.0000 | rewards/margins=-3.1669 | rewards/chosen=0.1114 | rewards/rejected=3.2783
10:55:22 | INFO    | [step 65] train_loss=3.2018 | lr=4.98574e-07 | grad_norm=23.6171 | gpu_mem(GB)=alloc:9.49 


***** Running Evaluation *****
  Num examples = 344
  Batch size = 1


12:13:08 | INFO    | [step 100] eval_loss=2.2577 | gpu_mem(GB)=alloc:9.49 res:35.14 max_alloc:36.53 max_res:38.98 | elapsed=202.0m
12:13:08 | INFO    | DPO[EVAL] step=100
12:13:08 | INFO    | EarlyStop(metric=eval_loss): improved from 3.469268 to 2.257684
12:20:43 | INFO    | [step 105] train_loss=2.2072 | lr=4.90413e-07 | grad_norm=20.6044 | gpu_mem(GB)=alloc:9.49 res:30.32 max_alloc:36.53 max_res:38.98 | elapsed=209.6m
12:20:43 | INFO    | DPO step=105 | loss=2.2072 | rewards/accuracies=0.0000 | rewards/margins=-2.0822 | rewards/chosen=1.0467 | rewards/rejected=3.1289
12:28:23 | INFO    | [step 110] train_loss=2.1258 | lr=4.88893e-07 | grad_norm=19.1288 | gpu_mem(GB)=alloc:9.49 res:21.49 max_alloc:36.53 max_res:38.98 | elapsed=217.2m
12:28:23 | INFO    | DPO step=110 | loss=2.1258 | rewards/accuracies=0.0000 | rewards/margins=-1.9860 | rewards/chosen=1.1575 | rewards/rejected=3.1435
12:35:57 | INFO    | [step 115] train_loss=2.0704 | lr=4.87264e-07 | grad_norm=19.0702 | gpu_mem(GB)=a


***** Running Evaluation *****
  Num examples = 344
  Batch size = 1


13:53:57 | INFO    | [step 150] eval_loss=1.2909 | gpu_mem(GB)=alloc:9.49 res:35.14 max_alloc:36.53 max_res:38.98 | elapsed=302.8m
13:53:57 | INFO    | DPO[EVAL] step=150
13:53:57 | INFO    | EarlyStop(metric=eval_loss): improved from 2.257684 to 1.290876
14:01:46 | INFO    | [step 155] train_loss=1.2359 | lr=4.70402e-07 | grad_norm=14.8002 | gpu_mem(GB)=alloc:9.49 res:23.31 max_alloc:36.53 max_res:38.98 | elapsed=310.6m
14:01:46 | INFO    | DPO step=155 | loss=1.2359 | rewards/accuracies=0.0000 | rewards/margins=-0.8620 | rewards/chosen=1.8655 | rewards/rejected=2.7275
14:09:24 | INFO    | [step 160] train_loss=1.2100 | lr=4.6783e-07 | grad_norm=16.7697 | gpu_mem(GB)=alloc:9.49 res:26.65 max_alloc:36.53 max_res:38.98 | elapsed=318.2m
14:09:24 | INFO    | DPO step=160 | loss=1.2100 | rewards/accuracies=0.0000 | rewards/margins=-0.8260 | rewards/chosen=1.9254 | rewards/rejected=2.7514
14:17:00 | INFO    | [step 165] train_loss=1.1466 | lr=4.65158e-07 | grad_norm=14.2159 | gpu_mem(GB)=al


***** Running Evaluation *****
  Num examples = 344
  Batch size = 1


15:34:24 | INFO    | [step 200] eval_loss=0.5648 | gpu_mem(GB)=alloc:9.49 res:35.14 max_alloc:36.53 max_res:38.98 | elapsed=403.2m
15:34:24 | INFO    | DPO[EVAL] step=200
15:34:24 | INFO    | EarlyStop(metric=eval_loss): improved from 1.290876 to 0.564815


Saving model checkpoint to MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/checkpoint-200
chat template saved in MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/checkpoint-200/chat_template.jinja
tokenizer config file saved in MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/checkpoint-200/tokenizer_config.json
Special tokens file saved in MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/checkpoint-200/special_tokens_map.json


15:42:04 | INFO    | [step 205] train_loss=0.5298 | lr=4.40361e-07 | grad_norm=8.6214 | gpu_mem(GB)=alloc:9.49 res:26.03 max_alloc:36.53 max_res:38.98 | elapsed=410.9m
15:42:04 | INFO    | DPO step=205 | loss=0.5298 | rewards/accuracies=0.8750 | rewards/margins=0.4180 | rewards/chosen=2.3074 | rewards/rejected=1.8894
15:49:44 | INFO    | [step 210] train_loss=0.4096 | lr=4.36855e-07 | grad_norm=6.3665 | gpu_mem(GB)=alloc:9.49 res:34.04 max_alloc:36.53 max_res:38.98 | elapsed=418.6m
15:49:44 | INFO    | DPO step=210 | loss=0.4096 | rewards/accuracies=0.9250 | rewards/margins=0.7402 | rewards/chosen=2.3274 | rewards/rejected=1.5872
15:57:25 | INFO    | [step 215] train_loss=0.4846 | lr=4.33263e-07 | grad_norm=9.4812 | gpu_mem(GB)=alloc:9.49 res:35.27 max_alloc:36.53 max_res:38.98 | elapsed=426.2m
15:57:25 | INFO    | DPO step=215 | loss=0.4846 | rewards/accuracies=0.8750 | rewards/margins=0.5695 | rewards/chosen=2.3454 | rewards/rejected=1.7759
16:05:03 | INFO    | [step 220] train_loss=


***** Running Evaluation *****
  Num examples = 344
  Batch size = 1


17:18:03 | INFO    | [step 250] eval_loss=0.1624 | gpu_mem(GB)=alloc:9.49 res:35.14 max_alloc:36.53 max_res:38.98 | elapsed=506.9m
17:18:03 | INFO    | DPO[EVAL] step=250
17:18:03 | INFO    | EarlyStop(metric=eval_loss): improved from 0.564815 to 0.162448
17:25:52 | INFO    | [step 255] train_loss=0.1576 | lr=4.0166e-07 | grad_norm=3.4395 | gpu_mem(GB)=alloc:9.49 res:25.79 max_alloc:36.58 max_res:38.98 | elapsed=514.7m
17:25:52 | INFO    | DPO step=255 | loss=0.1576 | rewards/accuracies=1.0000 | rewards/margins=1.8776 | rewards/chosen=2.7297 | rewards/rejected=0.8521
17:33:34 | INFO    | [step 260] train_loss=0.1399 | lr=3.97378e-07 | grad_norm=4.2901 | gpu_mem(GB)=alloc:9.49 res:25.60 max_alloc:36.58 max_res:38.98 | elapsed=522.4m
17:33:34 | INFO    | DPO step=260 | loss=0.1399 | rewards/accuracies=1.0000 | rewards/margins=2.0035 | rewards/chosen=2.8217 | rewards/rejected=0.8182
17:41:09 | INFO    | [step 265] train_loss=0.1342 | lr=3.93029e-07 | grad_norm=3.7729 | gpu_mem(GB)=alloc:9


***** Running Evaluation *****
  Num examples = 344
  Batch size = 1


18:58:47 | INFO    | [step 300] eval_loss=0.0407 | gpu_mem(GB)=alloc:9.49 res:35.14 max_alloc:36.58 max_res:38.98 | elapsed=607.6m
18:58:47 | INFO    | DPO[EVAL] step=300
18:58:47 | INFO    | EarlyStop(metric=eval_loss): improved from 0.162448 to 0.040745
19:06:27 | INFO    | [step 305] train_loss=0.0389 | lr=3.56058e-07 | grad_norm=1.4419 | gpu_mem(GB)=alloc:9.49 res:27.41 max_alloc:36.58 max_res:38.98 | elapsed=615.3m
19:06:27 | INFO    | DPO step=305 | loss=0.0389 | rewards/accuracies=1.0000 | rewards/margins=3.3518 | rewards/chosen=3.2306 | rewards/rejected=-0.1212
19:14:01 | INFO    | [step 310] train_loss=0.0305 | lr=3.51196e-07 | grad_norm=0.7915 | gpu_mem(GB)=alloc:9.49 res:24.73 max_alloc:36.58 max_res:38.98 | elapsed=622.8m
19:14:01 | INFO    | DPO step=310 | loss=0.0305 | rewards/accuracies=1.0000 | rewards/margins=3.6150 | rewards/chosen=3.3113 | rewards/rejected=-0.3038
19:21:44 | INFO    | [step 315] train_loss=0.0312 | lr=3.46288e-07 | grad_norm=0.8074 | gpu_mem(GB)=allo


***** Running Evaluation *****
  Num examples = 344
  Batch size = 1


20:39:17 | INFO    | [step 350] eval_loss=0.0140 | gpu_mem(GB)=alloc:9.49 res:35.14 max_alloc:36.58 max_res:38.98 | elapsed=708.1m
20:39:17 | INFO    | DPO[EVAL] step=350
20:39:17 | INFO    | EarlyStop(metric=eval_loss): improved from 0.040745 to 0.014001
20:47:01 | INFO    | [step 355] train_loss=0.0135 | lr=3.0563e-07 | grad_norm=0.4110 | gpu_mem(GB)=alloc:9.49 res:19.50 max_alloc:36.58 max_res:38.98 | elapsed=715.8m
20:47:01 | INFO    | DPO step=355 | loss=0.0135 | rewards/accuracies=1.0000 | rewards/margins=4.3893 | rewards/chosen=3.5226 | rewards/rejected=-0.8667
20:54:38 | INFO    | [step 360] train_loss=0.0131 | lr=3.00409e-07 | grad_norm=0.2999 | gpu_mem(GB)=alloc:9.49 res:32.63 max_alloc:36.58 max_res:38.98 | elapsed=723.5m
20:54:38 | INFO    | DPO step=360 | loss=0.0131 | rewards/accuracies=1.0000 | rewards/margins=4.4300 | rewards/chosen=3.5277 | rewards/rejected=-0.9023
21:02:11 | INFO    | [step 365] train_loss=0.0146 | lr=2.95165e-07 | grad_norm=0.3899 | gpu_mem(GB)=alloc


***** Running Evaluation *****
  Num examples = 344
  Batch size = 1


22:19:22 | INFO    | [step 400] eval_loss=0.0075 | gpu_mem(GB)=alloc:9.49 res:35.14 max_alloc:36.58 max_res:38.98 | elapsed=808.2m
22:19:22 | INFO    | DPO[EVAL] step=400
22:19:22 | INFO    | EarlyStop(metric=eval_loss): no improvement (current=0.007533 best=0.014001) | bad=1/2


Saving model checkpoint to MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/checkpoint-400
chat template saved in MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/checkpoint-400/chat_template.jinja
tokenizer config file saved in MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/checkpoint-400/tokenizer_config.json
Special tokens file saved in MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/checkpoint-400/special_tokens_map.json


22:27:18 | INFO    | [step 405] train_loss=0.0084 | lr=2.52671e-07 | grad_norm=0.4605 | gpu_mem(GB)=alloc:9.49 res:20.13 max_alloc:36.58 max_res:38.98 | elapsed=816.1m
22:27:18 | INFO    | DPO step=405 | loss=0.0084 | rewards/accuracies=1.0000 | rewards/margins=4.9592 | rewards/chosen=3.6756 | rewards/rejected=-1.2837
22:34:57 | INFO    | [step 410] train_loss=0.0065 | lr=2.47329e-07 | grad_norm=0.2045 | gpu_mem(GB)=alloc:9.49 res:29.38 max_alloc:36.58 max_res:38.98 | elapsed=823.8m
22:34:57 | INFO    | DPO step=410 | loss=0.0065 | rewards/accuracies=1.0000 | rewards/margins=5.1425 | rewards/chosen=3.6872 | rewards/rejected=-1.4553
22:42:39 | INFO    | [step 415] train_loss=0.0083 | lr=2.41987e-07 | grad_norm=0.2696 | gpu_mem(GB)=alloc:9.49 res:22.86 max_alloc:36.58 max_res:38.98 | elapsed=831.5m
22:42:39 | INFO    | DPO step=415 | loss=0.0083 | rewards/accuracies=1.0000 | rewards/margins=4.8818 | rewards/chosen=3.6779 | rewards/rejected=-1.2039
22:50:18 | INFO    | [step 420] train_lo


***** Running Evaluation *****
  Num examples = 344
  Batch size = 1


00:00:09 | INFO    | [step 450] eval_loss=0.0053 | gpu_mem(GB)=alloc:9.49 res:35.14 max_alloc:36.58 max_res:38.98 | elapsed=909.0m
00:00:09 | INFO    | DPO[EVAL] step=450
00:00:09 | INFO    | EarlyStop(metric=eval_loss): no improvement (current=0.005330 best=0.014001) | bad=2/2




Training completed. Do not forget to share your model on huggingface.co/models =)




00:00:09 | INFO    | [step 450] tokens/sec=0.0 | gpu_mem(GB)=alloc:9.49 res:35.14 max_alloc:36.58 max_res:38.98 | elapsed=909.0m
00:00:09 | INFO    | DPO step=450
00:00:09 | INFO    | DPOMetricsCallback finished | csv=MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/dpo_metrics.csv


chat template saved in MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/tokenizer/chat_template.jinja
tokenizer config file saved in MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/tokenizer/tokenizer_config.json
Special tokens file saved in MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/tokenizer/special_tokens_map.json


00:00:17 | INFO    | DPO Artifacts: {'out_dir': 'MamayLM-Gemma-3-12b_QLoRA_SFT_DPO', 'lora_adapter_dir': 'MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/lora_adapter', 'tokenizer_dir': 'MamayLM-Gemma-3-12b_QLoRA_SFT_DPO/tokenizer'}
00:00:17 | INFO    | === DPO RUN END ===


## 2️⃣ DPO — Direct Preference Optimization

### 🎯 Goal

Teach the model to **prefer the correct tariff**,  
when the format and style are already stable (after SFT).

---

### 📊 Key metrics (must-watch)

#### 🔹 `rewards/accuracies`
- normal range: **0.55–0.75**
- ❌ bad: **1.0 already at step 10–20**  
  (a sign of truncation or style mismatch)

#### 🔹 `rewards/margins`
- should be **> 0**
- good: grows slowly
- ❌ bad: quickly exceeds `10`

#### 🔹 `rewards/chosen > rewards/rejected`
- should be **always true**
- if not → DPO is not working

#### 🔹 `loss`
- does not have to go to 0
- ❌ `loss ≈ 0` + `accuracy = 1.0` → over-optimization

---

### 🎛️ Main knobs (DPO)

| Parameter | What it controls | When to change |
|---------|------------------|---------------|
| `beta` | preference strength | ↓ if accuracy quickly reaches 1.0 |
| `learning_rate` | adaptation speed | ↓ if margins “blow up” |
| `max_seq_len` | full context | ↑ if truncation occurs |
| `max_prompt_length` | prompt size | ↑ if `facts` are large |
| `max_steps` | overfitting control | ↓ if the model “wins” too fast |

---

### 🧠 Typical symptoms and fixes (DPO)

| Symptom | Likely cause | Fix |
|------|--------------|-----|
| accuracy = 1.0 from step 20 | truncated prompt | ↑ `max_seq_len` (e.g. `4096`) |
| margins > 10 | style mismatch | align chosen/rejected |
| broken JSON | beta too high | `beta = 0.03–0.05` |
| degradation on new cases | overfitting | ↓ LR, ↓ max_steps |

---

## 3️⃣ Recommended production order

1. **SFT**
   - stable JSON
   - correct language and structure
2. **DPO**
   - correct tariff selection
   - control via rewards/accuracies
3. **A/B sanity + business metrics**
   - tariff accuracy
   - JSON validity
   - `tariffId ∈ avail`

---

## ✅ Minimal checklist before “OK for prod”

- [ ] JSON parse rate = 100%
- [ ] `tariffId` is always valid
- [ ] DPO accuracy does not jump to 1.0 within 10–20 steps
- [ ] rewards/margins < ~8–10
- [ ] stability on unseen validation data

---

> 💡 Tip:  
> If DPO still feels “too easy” after fixing truncation —  
> the problem is almost always **style mismatch between chosen and rejected**,  
> not the hyperparameters.


## DPO metrics: what they mean & how to tune them (Tariff Recommender)

This notebook logs **DPOTrainer** metrics (pairwise preference learning).  
For each training row we have the same `prompt` and two answers:
- **chosen** = desired recommendation (e.g., matches real customer migration / label)
- **rejected** = plausible but undesired recommendation

DPO trains the model to make **chosen more likely than rejected** for the same prompt.

---

### Key logged metrics (what they mean)

#### 1) `loss`
- The DPO objective value. Lower is usually better *during* training.
- Use it mainly to detect instability (spikes/divergence).
- **Do not optimize loss alone** — you can overfit even when loss keeps decreasing.

**If loss is unstable / spikes:**
- Reduce `learning_rate`
- Increase `beta` **only if** accuracy is stuck ~0.5 (rare); otherwise keep beta small
- Consider gradient clipping (if available) and/or reduce batch noise (increase grad_accum)

---

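#### 2) `rewards/accuracies`
- Fraction of pairs where the chosen answer gets a higher implicit reward than the rejected one:

  $$
  r(x, y_{\text{chosen}}) > r(x, y_{\text{rejected}}), \qquad
  r(x, y) = \beta \left[ \log \pi_\theta(y \mid x) - \log \pi_{\text{ref}}(y \mid x) \right]
  $$

  The comparison is relative to the reference model, not on raw log-probabilities: in the run above, eval `Logps/chosen` is still below `Logps/rejected` at step 250 while accuracy is already 1.0.
- Interpretation:
  - `0.50` ≈ random preference
  - `0.70–0.85` = good learning
  - `>0.90` = strong preference alignment
  - `1.00` = model always prefers chosen (watch for over-optimization)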

**If accuracy is low (≤0.6):**
- Check data quality (chosen/rejected consistency)
- Increase training steps slightly
- Increase `beta` a bit (e.g., `0.05 → 0.07`) **carefully**
- Ensure `max_prompt_length/max_length` prevents truncation (truncation kills preference signal)

**If accuracy hits 1.0 too fast:**
- You are likely in an easy regime → risk of overfitting the preference pairs
- Use fewer steps and/or smaller `learning_rate`
- Consider harder negatives (better rejected answers)

---

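#### 3) `rewards/margins`
- Average preference margin, i.e. the gap between the two implicit rewards (`rewards/chosen` − `rewards/rejected`):

  $$
  \text{margin} = r(x, y_{\text{chosen}}) - r(x, y_{\text{rejected}}), \qquad
  r(x, y) = \beta \left[ \log \pi_\theta(y \mid x) - \log \pi_{\text{ref}}(y \mid x) \right]
  $$
- Interpretation:
  - `~0` = model is unsure
  - `1–5` = healthy confidence
  - `>10` = very confident
  - `>>10` (e.g., 15–20+) = can indicate **over-optimization** (model becomes too “certain”)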

**If margins grow very large early:**
- Reduce `max_steps` / epochs
- Reduce `learning_rate` (e.g., `1e-5 → 5e-6`)
- Reduce `beta` (makes updates less aggressive)

**If margins stay near 0:**
- Increase steps slightly
- Slightly increase `beta` (e.g., `0.03 → 0.05`)
- Verify your rejected answers are meaningfully different (hard negatives help)

---

#### 4) `rewards/chosen` and `rewards/rejected`
- These are model “scores” for each side (often derived from log-probs).
- What you want to see:
  - `chosen` trending up
  - `rejected` trending down
  - Their difference aligns with `rewards/margins`

**If both go up together:**
- You may be optimizing style rather than preference separation
- Consider stronger negatives / improve rejected construction
- Ensure loss is computed on the full response tokens (no masking bug)

---

#### 5) `learning_rate`
- The current LR. If logs show `0.0000`, your logger may be reading the wrong key.
- Make sure logging normalizes:
  - `learning_rate` OR `lr` → log as `learning_rate`

---

### What to tune (control knobs)

#### A) `learning_rate` (most important)
- **DPO should use smaller LR than SFT.**
- Safe defaults:
  - `1e-5` (`0.00001`) for DPO
  - `5e-6` (`0.000005`) if margins explode or accuracy hits 1.0 too fast

Symptoms → Fix:
- margins jump fast / accuracy → 1.0 quickly → **lower LR**
- loss noisy / spikes → **lower LR**
- no learning (accuracy ~0.5) → slightly higher LR or more steps (but first check data)

---

#### B) `beta` (aggressiveness of preference push)
- Think of `beta` as how strongly we force chosen > rejected.
- Safe starting point: `beta = 0.05`

Symptoms → Fix:
- accuracy stuck low → increase `beta` a bit (e.g., `0.05 → 0.07`)
- margins explode / overconfident → reduce `beta` (e.g., `0.05 → 0.03`)

---

#### C) `max_steps` / epochs (how long you train)
- DPO often converges fast.
- Watch for:
  - accuracy near 1.0 + margins rising fast → stop early

Practical guidance:
- Start with **200–400 steps** total (for ~3k rows with grad_accum=8 this is often enough)
- Use early stopping on `eval_loss` and/or a custom rule on margins/accuracy if available.

---

#### D) `max_length` / `max_prompt_length` (avoid truncation)
If prompt p95 ~3000 and answers ~300–400 tokens:
- Use `max_length = 4096`
- Use `max_prompt_length ~ 3500–3600` (leave room for response)

Symptoms of truncation:
- accuracy fails to improve
- margins stay near 0
- training becomes noisy

---

### What to watch for in practice (quick checklist)

✅ Healthy DPO run:
- `rewards/accuracies` rises from ~0.5 → 0.8–0.95
- `rewards/margins` increases to a moderate range (often 2–10)
- `chosen` up, `rejected` down
- eval metrics stabilize without degrading output format (JSON validity)

⚠️ Over-optimization signs:
- `rewards/accuracies` → 1.0 very early
- `rewards/margins` keeps climbing to very high values (e.g., >10–15) while eval stops improving
- output quality issues (e.g., worse JSON compliance or less robust behavior)

**Fix over-optimization:**
- fewer steps
- lower `learning_rate`
- lower `beta`

---

### Optional: add a format metric (recommended for production)
Besides DPO metrics, track a **task-format KPI** on a fixed subset (A/B sanity):
- `valid_json_rate`
- tariff field presence: `tariffId`, `templateId`, etc.

If `valid_json_rate` drops while DPO metrics “improve”:
- prefer stopping early / reduce aggressiveness
- keep generation deterministic during evaluation (`do_sample=False`)
