In [1]:
# =========================
# Cell 1 — ENV (MUST be first, before torch/transformers)
# =========================
# apply_env() has to run before torch is imported (see the header above),
# which is why the remaining imports come after this call.
from ft_pipeline.env import apply_env
apply_env()

import os
import gc
import torch

# Permit TF32 for float32 matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")

print("CUDA available:", torch.cuda.is_available())
print("CUDA device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)


def _print_cuda_memory():
    """Print the CUDA memory currently allocated and reserved, in MB."""
    bytes_per_mb = 1024**2
    print("allocated:", torch.cuda.memory_allocated()/bytes_per_mb, "MB")
    print("reserved:", torch.cuda.memory_reserved()/bytes_per_mb, "MB")


# Report memory, release what can be released, then report again so the
# two readings can be compared.
_print_cuda_memory()

gc.collect()
torch.cuda.empty_cache()

_print_cuda_memory()


import logging
from ft_pipeline.logger import setup_logger
from ft_pipeline.config import DPOCfg, FTConfig
from ft_pipeline.run_sft import run_finetune
from ft_pipeline.run_dpo import run_dpo




CUDA available: True
CUDA device: NVIDIA A100-SXM4-40GB
allocated: 0.0 MB
reserved: 0.0 MB
allocated: 0.0 MB
reserved: 0.0 MB


In [2]:
# =========================
# Cell — DPO Config
# =========================
# DPO (Direct Preference Optimization) stage that follows SFT (QLoRA LoRA adapter)
# Base model: MamayLM-Gemma-3-12B (local path)
# Hardware / precision: A100 40GB, BF16, QLoRA 4-bit

# DPOCfg is already imported in the environment cell; the import is repeated here.
from ft_pipeline.config import DPOCfg

cfg_dpo = DPOCfg(
    # ----------------------------------------------------------
    # Base model + continuation from SFT
    # ----------------------------------------------------------
    model_id="/home/jovyan/ai-models/MamayLM-Gemma-3-12B",  # base model: local path or HF repo id

    sft_adapter_dir="MamayLM-Gemma-3-12b_QLoRA_SFT/lora_adapter",  # LoRA adapter produced by SFT
    # sft_adapter_dir=None,  # alternative: run DPO directly from the base model

    dpo_train_jsonl="ft_datasets/dpo_train.jsonl",  # training pairs (JSONL)
    dpo_val_jsonl="ft_datasets/dpo_val.jsonl",      # validation pairs (JSONL)

    out_dir="outputs_mamay12b_qlora_dpo",
    # Alternative output directories:
    # out_dir="MamayLM-Gemma-3-12b_QLoRA_SFT_DPO",
    # out_dir="MamayLM-Gemma-3-12b_QLoRA_DPO",

    # ----------------------------------------------------------
    # Sequence length / batching
    # ----------------------------------------------------------
    max_seq_len=4096,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # effective batch = batch_size * grad_accum = 1 * 8 = 8

    # ----------------------------------------------------------
    # Training schedule / optimiser
    # ----------------------------------------------------------
    learning_rate=5e-7,
    weight_decay=0.05,           # L2 regularization
    num_train_epochs=1.0,        # ignored when max_steps is provided
    max_steps=400,               # takes precedence over num_train_epochs
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",  # scheduler name: "cosine", "linear", ...
    logging_steps=5,
    eval_steps=50,
    save_steps=200,
    save_total_limit=2,

    # ----------------------------------------------------------
    # DPO core
    # ----------------------------------------------------------
    # Conservative beta chosen to limit overfitting; revisit it if
    # rewards/accuracies reaches 1.0 too quickly.
    # NOTE(review): the original note suggested 0.03 as the fallback, which is
    # already the value in use here — confirm the intended next step down.
    beta=0.03,

    use_bf16=True,
    use_fp16=False,
    load_in_4bit=True,          # QLoRA (bitsandbytes 4-bit)
    attn_implementation="sdpa",
    optim="paged_adamw_8bit",   # bitsandbytes optimiser, selected to reduce memory use
    report_to="none",
    max_new_tokens_eval=512,    # used by the A/B sanity check (before/after)
)

print(cfg_dpo)

# Keep the run log next to the other artefacts in the output directory.
# setup_logger(...) stays the last expression so its result is shown as the cell output.
dpo_log_file = f"{cfg_dpo.out_dir}/ft_run_dpo.log"
setup_logger(level=logging.INFO, log_file=dpo_log_file)

DPOCfg(model_id='/home/jovyan/ai-models/MamayLM-Gemma-3-12B', sft_adapter_dir='MamayLM-Gemma-3-12b_QLoRA_SFT/lora_adapter', dpo_train_jsonl='ft_datasets/dpo_train.jsonl', dpo_val_jsonl='ft_datasets/dpo_val.jsonl', out_dir='outputs_mamay12b_qlora_dpo', max_seq_len=4096, per_device_train_batch_size=1, per_device_eval_batch_size=1, gradient_accumulation_steps=8, learning_rate=5e-07, weight_decay=0.05, num_train_epochs=1.0, max_steps=400, warmup_ratio=0.05, lr_scheduler_type='cosine', logging_steps=5, eval_steps=50, save_steps=200, save_total_limit=2, beta=0.03, use_bf16=True, use_fp16=False, load_in_4bit=True, attn_implementation='sdpa', report_to='none', optim='paged_adamw_8bit', max_new_tokens_eval=512)


<Logger ft_pipeline (INFO)>

In [None]:


# Launch the DPO run with the configuration built in the previous cell.
#
# NOTE(review): in the output recorded for this cell, rewards/accuracies stays
# at 0.0 and rewards/margins is negative from step 1 onwards (rewards/rejected
# is well above rewards/chosen). TODO confirm that chosen/rejected are not
# swapped in the dataset and which reference model run_dpo uses when
# sft_adapter_dir is set.

# Rows of the A/B validation file to inspect when the sanity check is enabled.
ab_sanity_indices = list(range(5))
# ab_sanity_indices = [0, 1, 2, 10, 25, 50, 100, 150, 200, 250, 300]

dpo_artifacts = run_dpo(
    cfg_dpo,
    do_ab_sanity=False,                        # optional A/B check of the JSON output format (recommended by the author; off here)
    ab_val_jsonl="ft_datasets/sft_val.jsonl",  # used when do_ab_sanity is enabled; same validation file as SFT
    ab_indices=ab_sanity_indices,
    dataset_limits=(None, None),               # presumably (train_limit, val_limit), None = no limit — TODO confirm
)


08:35:19 | INFO    | === DPO RUN START ===
08:35:19 | INFO    | CUDA available=True
08:35:19 | INFO    | CUDA device=NVIDIA A100-SXM4-40GB
08:35:19 | INFO    | Loading tokenizer: /home/jovyan/ai-models/MamayLM-Gemma-3-12B
08:35:21 | INFO    | Tokenizer loaded
08:35:21 | INFO    | Loading DPO datasets
08:35:21 | INFO    |   train: ft_datasets/dpo_train.jsonl
08:35:21 | INFO    |   val:   ft_datasets/dpo_val.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3094 [00:00<?, ? examples/s]

Map:   0%|          | 0/344 [00:00<?, ? examples/s]

08:35:46 | INFO    | DPO dataset ready | train=3094 | val=344
08:35:46 | INFO    | Loading base model (QLoRA)
08:35:46 | INFO    |   model_id: /home/jovyan/ai-models/MamayLM-Gemma-3-12B
08:35:46 | INFO    |   dtype: torch.bfloat16
08:35:46 | INFO    |   4bit: True
08:35:46 | INFO    |   attn_implementation: sdpa


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

08:36:52 | INFO    | Base model loaded
08:36:52 | INFO    | Enabling gradient checkpointing
08:36:52 | INFO    | Loading trainable LoRA adapter from: MamayLM-Gemma-3-12b_QLoRA_SFT/lora_adapter
08:36:54 | INFO    | Trainable adapter loaded
08:36:54 | INFO    | Trainable parameters:
trainable params: 68,456,448 || all params: 12,255,781,488 || trainable%: 0.5586
08:36:54 | INFO    | Building DPOConfig
08:36:54 | INFO    |   max_seq_len=4096
08:36:54 | INFO    |   beta=0.03
08:36:54 | INFO    |   lr=5e-07
08:36:54 | INFO    | Building DPOTrainer
08:36:54 | INFO    |   train_samples=3094
08:36:54 | INFO    |   val_samples=344


Extracting prompt in train dataset:   0%|          | 0/3094 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/3094 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3094 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/344 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/344 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/344 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


08:37:07 | INFO    | Starting DPO training‚Ä¶


Cannot get num_tokens from dataloader
skipped Embedding(4096, 1152): 4.5M params
skipped Gemma3TextScaledWordEmbedding(262208, 3840, padding_idx=0): 964.734375M params
skipped: 964.734375M params
***** Running training *****
  Num examples = 3,094
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 400
  Number of trainable parameters = 68,456,448


08:37:08 | INFO    | GPUMetricsCallback enabled
08:37:08 | INFO    | DPOMetricsCallback enabled | csv=outputs_mamay12b_qlora_dpo/dpo_metrics.csv | every_n_steps=5




Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
50,3.2441,3.214541,0.188638,3.357976,0.0,-3.169337,-259.113403,-68.248726,-2.019982,-3.067348


08:38:42 | INFO    | [step 1] train_loss=4.3627 | lr=0 | grad_norm=40.3302 | gpu_mem(GB)=alloc:9.49 res:23.14 max_alloc:36.02 max_res:38.98 | elapsed=1.6m
08:44:50 | INFO    | [step 5] train_loss=4.4817 | lr=1e-07 | grad_norm=46.3626 | gpu_mem(GB)=alloc:9.49 res:27.82 max_alloc:36.24 max_res:38.98 | elapsed=7.7m
08:44:50 | INFO    | DPO step=5 | loss=4.4817 | rewards/accuracies=0.0000 | rewards/margins=-4.4692 | rewards/chosen=-1.0060 | rewards/rejected=3.4632
08:52:30 | INFO    | [step 10] train_loss=4.4164 | lr=2.25e-07 | grad_norm=33.7282 | gpu_mem(GB)=alloc:9.49 res:27.02 max_alloc:36.28 max_res:38.98 | elapsed=15.4m
08:52:30 | INFO    | DPO step=10 | loss=4.4164 | rewards/accuracies=0.0000 | rewards/margins=-4.4031 | rewards/chosen=-0.9108 | rewards/rejected=3.4923
09:00:15 | INFO    | [step 15] train_loss=4.4242 | lr=3.5e-07 | grad_norm=32.3134 | gpu_mem(GB)=alloc:9.49 res:29.11 max_alloc:36.28 max_res:38.98 | elapsed=23.1m
09:00:15 | INFO    | DPO step=15 | loss=4.4242 | rewards


***** Running Evaluation *****
  Num examples = 344
  Batch size = 1


10:18:03 | INFO    | [step 50] eval_loss=3.2145 | gpu_mem(GB)=alloc:9.49 res:35.14 max_alloc:36.53 max_res:38.98 | elapsed=100.9m
10:18:03 | INFO    | DPO[EVAL] step=50
10:18:03 | INFO    | EarlyStop(metric=eval_loss): improved from None to 3.214541
10:25:48 | INFO    | [step 55] train_loss=3.2341 | lr=4.90188e-07 | grad_norm=25.2550 | gpu_mem(GB)=alloc:9.49 res:19.97 max_alloc:36.53 max_res:38.98 | elapsed=108.7m
10:25:48 | INFO    | DPO step=55 | loss=3.2341 | rewards/accuracies=0.0000 | rewards/margins=-3.1878 | rewards/chosen=0.2325 | rewards/rejected=3.4203
10:33:28 | INFO    | [step 60] train_loss=2.9724 | lr=4.87117e-07 | grad_norm=23.5792 | gpu_mem(GB)=alloc:9.49 res:22.39 max_alloc:36.53 max_res:38.98 | elapsed=116.3m
10:33:28 | INFO    | DPO step=60 | loss=2.9724 | rewards/accuracies=0.0000 | rewards/margins=-2.9157 | rewards/chosen=0.3311 | rewards/rejected=3.2467
10:41:09 | INFO    | [step 65] train_loss=2.9635 | lr=4.83641e-07 | grad_norm=22.3052 | gpu_mem(GB)=alloc:9.49 r


***** Running Evaluation *****
  Num examples = 344
  Batch size = 1


## 2️⃣ DPO — Direct Preference Optimization

### 🎯 Goal

Teach the model to **prefer the correct tariff**,  
when the format and style are already stable (after SFT).

---

### 📊 Key metrics (must-watch)

#### 🔹 `rewards/accuracies`
- normal range: **0.55–0.75**
- ❌ bad: **1.0 already at step 10–20**  
  (a sign of truncation or style mismatch)

#### 🔹 `rewards/margins`
- should be **> 0**
- good: grows slowly
- ❌ bad: quickly exceeds `10`

#### 🔹 `rewards/chosen > rewards/rejected`
- should be **always true**
- if not → DPO is not working

#### 🔹 `loss`
- does not have to go to 0
- ❌ `loss ≈ 0` + `accuracy = 1.0` → over-optimization

---

### 🎛️ Main knobs (DPO)

| Parameter | What it controls | When to change |
|---------|------------------|---------------|
| `beta` | preference strength | ↓ if accuracy quickly reaches 1.0 |
| `learning_rate` | adaptation speed | ↓ if margins “blow up” |
| `max_seq_len` | full context | ↑ if truncation occurs |
| `max_prompt_length` | prompt size | ↑ if `facts` are large |
| `max_steps` | overfitting control | ↓ if the model “wins” too fast |

---

### 🧠 Typical symptoms and fixes (DPO)

| Symptom | Likely cause | Fix |
|------|--------------|-----|
| accuracy = 1.0 from step 20 | truncated prompt | `max_seq_len = 4096` |
| margins > 10 | style mismatch | align chosen/rejected |
| broken JSON | beta too high | `beta = 0.03–0.05` |
| degradation on new cases | overfitting | ↓ LR, ↓ max_steps |

---

## 3️⃣ Recommended production order

1. **SFT**
   - stable JSON
   - correct language and structure
2. **DPO**
   - correct tariff selection
   - control via rewards/accuracies
3. **A/B sanity + business metrics**
   - tariff accuracy
   - JSON validity
   - `tariffId ∈ avail`

---

## ✅ Minimal checklist before “OK for prod”

- [ ] JSON parse rate = 100%
- [ ] `tariffId` is always valid
- [ ] DPO accuracy does not jump to 1.0 within 10–20 steps
- [ ] rewards/margins < ~8–10
- [ ] stability on unseen validation data

---

> 💡 Tip:  
> If DPO still feels “too easy” after fixing truncation —  
> the problem is almost always **style mismatch between chosen and rejected**,  
> not the hyperparameters.


## DPO metrics: what they mean & how to tune them (Tariff Recommender)

This notebook logs **DPOTrainer** metrics (pairwise preference learning).  
For each training row we have the same `prompt` and two answers:
- **chosen** = desired recommendation (e.g., matches real customer migration / label)
- **rejected** = plausible but undesired recommendation

DPO trains the model to make **chosen more likely than rejected** for the same prompt.

---

### Key logged metrics (what they mean)

#### 1) `loss`
- The DPO objective value. Lower is usually better *during* training.
- Use it mainly to detect instability (spikes/divergence).
- **Do not optimize loss alone** — you can overfit even when loss keeps decreasing.

**If loss is unstable / spikes:**
- Reduce `learning_rate`
- Increase `beta` **only if** accuracy is stuck ~0.5 (rare); otherwise keep beta small
- Consider gradient clipping (if available) and/or reduce batch noise (increase grad_accum)

---

#### 2) `rewards/accuracies`
- Fraction of pairs where the implicit reward of the chosen answer exceeds that of the rejected answer:

  $$
  r(y_{\text{chosen}}) > r(y_{\text{rejected}}), \qquad
  r(y) = \beta \left[ \log \pi_\theta(y \mid \text{prompt}) - \log \pi_{\text{ref}}(y \mid \text{prompt}) \right]
  $$
- Interpretation:
  - `0.50` ≈ random preference
  - `0.70–0.85` = good learning
  - `>0.90` = strong preference alignment
  - `1.00` = model always prefers chosen (watch for over-optimization)

**If accuracy is low (≤0.6):**
- Check data quality (chosen/rejected consistency)
- Increase training steps slightly
- Increase `beta` a bit (e.g., `0.05 → 0.07`) **carefully**
- Ensure `max_prompt_length/max_length` prevents truncation (truncation kills preference signal)

**If accuracy hits 1.0 too fast:**
- You are likely in an easy regime → risk of overfitting the preference pairs
- Use fewer steps and/or smaller `learning_rate`
- Consider harder negatives (better rejected answers)

---

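#### 3) `rewards/margins`
- Average preference margin, i.e. the mean difference between the chosen and rejected implicit rewards:

  $$
  \text{margin} = r(y_{\text{chosen}}) - r(y_{\text{rejected}}), \qquad
  r(y) = \beta \left[ \log \pi_\theta(y \mid \text{prompt}) - \log \pi_{\text{ref}}(y \mid \text{prompt}) \right]
  $$
- Interpretation:
  - `~0` = model is unsure
  - `1–5` = healthy confidence
  - `>10` = very confident
  - `>>10` (e.g., 15–20+) = can indicate **over-optimization** (model becomes too “certain”)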

**If margins grow very large early:**
- Reduce `max_steps` / epochs
- Reduce `learning_rate` (e.g., `1e-5 → 5e-6`)
- Reduce `beta` (makes updates less aggressive)

**If margins stay near 0:**
- Increase steps slightly
- Slightly increase `beta` (e.g., `0.03 → 0.05`)
- Verify your rejected answers are meaningfully different (hard negatives help)

---

#### 4) `rewards/chosen` and `rewards/rejected`
- These are model “scores” for each side (often derived from log-probs).
- What you want to see:
  - `chosen` trending up
  - `rejected` trending down
  - Their difference aligns with `rewards/margins`

**If both go up together:**
- You may be optimizing style rather than preference separation
- Consider stronger negatives / improve rejected construction
- Ensure loss is computed on the full response tokens (no masking bug)

---

#### 5) `learning_rate`
- The current LR. If logs show `0.0000`, your logger may be reading the wrong key.
- Make sure logging normalizes:
  - `learning_rate` OR `lr` → log as `learning_rate`

---

### What to tune (control knobs)

#### A) `learning_rate` (most important)
- **DPO should use smaller LR than SFT.**
- Safe defaults:
  - `1e-5` (`0.00001`) for DPO
  - `5e-6` (`0.000005`) if margins explode or accuracy hits 1.0 too fast

Symptoms → Fix:
- margins jump fast / accuracy → 1.0 quickly → **lower LR**
- loss noisy / spikes → **lower LR**
- no learning (accuracy ~0.5) → slightly higher LR or more steps (but first check data)

---

#### B) `beta` (aggressiveness of preference push)
- Think of `beta` as how strongly we force chosen > rejected.
- Safe starting point: `beta = 0.05`

Symptoms → Fix:
- accuracy stuck low → increase `beta` a bit (e.g., `0.05 → 0.07`)
- margins explode / overconfident → reduce `beta` (e.g., `0.05 → 0.03`)

---

#### C) `max_steps` / epochs (how long you train)
- DPO often converges fast.
- Watch for:
  - accuracy near 1.0 + margins rising fast → stop early

Practical guidance:
- Start with **200–400 steps** total (for ~3k rows with grad_accum=8 this is often enough)
- Use early stopping on `eval_loss` and/or a custom rule on margins/accuracy if available.

---

#### D) `max_length` / `max_prompt_length` (avoid truncation)
If prompt p95 ~3000 and answers ~300–400 tokens:
- Use `max_length = 4096`
- Use `max_prompt_length ~ 3500–3600` (leave room for response)

Symptoms of truncation:
- accuracy fails to improve
- margins stay near 0
- training becomes noisy

---

### What to watch for in practice (quick checklist)

✅ Healthy DPO run:
- `rewards/accuracies` rises from ~0.5 → 0.8–0.95
- `rewards/margins` increases to a moderate range (often 2–10)
- `chosen` up, `rejected` down
- eval metrics stabilize without degrading output format (JSON validity)

⚠️ Over-optimization signs:
- `rewards/accuracies` → 1.0 very early
- `rewards/margins` keeps climbing to very high values (e.g., >10–15) while eval stops improving
- output quality issues (e.g., worse JSON compliance or less robust behavior)

**Fix over-optimization:**
- fewer steps
- lower `learning_rate`
- lower `beta`

---

### Optional: add a format metric (recommended for production)
Besides DPO metrics, track a **task-format KPI** on a fixed subset (A/B sanity):
- `valid_json_rate`
- tariff field presence: `tariffId`, `templateId`, etc.

If `valid_json_rate` drops while DPO metrics “improve”:
- prefer stopping early / reduce aggressiveness
- keep generation deterministic during evaluation (`do_sample=False`)
