In [1]:
# =========================
# Cell 1 — ENV (MUST be first, before torch/transformers)
# =========================
# apply_env() is called before `import torch` below so that whatever environment
# settings it applies are already in place when torch/transformers are imported.
# NOTE(review): the concrete settings live in ft_pipeline.env and are not visible here.
from ft_pipeline.env import apply_env
apply_env()

import os
import gc
import torch

# Opt in to TF32 matmuls: first through the CUDA backend flag, then through the
# global float32 matmul precision setting.
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")

# Report whether a GPU is visible and, if it is, the name of device 0.
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
print("CUDA device:", torch.cuda.get_device_name(0) if cuda_ok else None)



def report_cuda_memory() -> None:
    """Print the CUDA memory currently allocated and reserved by PyTorch.

    Values are byte counts divided by 1024**2. The output format is the same as
    the inline prints this helper replaces:
    ``allocated: <float> MB`` followed by ``reserved: <float> MB``.
    """
    mib = 1024**2
    print("allocated:", torch.cuda.memory_allocated()/mib, "MB")
    print("reserved:",  torch.cuda.memory_reserved()/mib, "MB")


# `gc` and `torch` are already imported at the top of this cell, so they are not
# re-imported here.

# Snapshot before cleanup.
report_cuda_memory()

# Collect unreachable Python objects first so the tensors they hold are freed,
# then ask PyTorch's caching allocator to release its unused cached memory.
gc.collect()
torch.cuda.empty_cache()

# Snapshot after cleanup.
report_cuda_memory()

import logging
from ft_pipeline.logger import setup_logger
from ft_pipeline.config import FTConfig
from ft_pipeline.run_sft import run_finetune

CUDA available: True
CUDA device: NVIDIA A100-SXM4-40GB
allocated: 0.0 MB
reserved: 0.0 MB
allocated: 0.0 MB
reserved: 0.0 MB


In [2]:
# =========================
# Cell 2 — FTConfig
# =========================
# Single place for every tunable of the SFT run. The values below are the ones the
# recorded outputs of this notebook were produced with.
cfg = FTConfig(
    # -----------------------
    # PATHS / INPUT-OUTPUT
    # -----------------------
    model_id="/home/jovyan/ai-models/MamayLM-Gemma-3-12B",  # base model: local path or HF repo id
    train_jsonl="ft_datasets/sft_train.jsonl",             # training dataset (JSONL)
    val_jsonl="ft_datasets/sft_val.jsonl",                 # validation dataset (JSONL)
    out_dir="MamayLM-Gemma-3-12b_QLoRA_SFT",               # output directory (checkpoints, run log)

    # -----------------------
    # SEQUENCE / BATCHING
    # -----------------------
    max_seq_len=5000,                 # max context length (prompt + completion)
    per_device_train_batch_size=1,    # training batch size per GPU
    per_device_eval_batch_size=1,     # evaluation batch size per GPU
    gradient_accumulation_steps=8,    # effective batch = batch_size * grad_accum

    # -----------------------
    # TRAINING SCHEDULE / OPTIM
    # -----------------------
    learning_rate=0.0000008,          # = 8e-7
    num_train_epochs=2,               # ignored when max_steps is set
    max_steps=None,                   # e.g. 400; None → train for num_train_epochs instead
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",       # scheduler: "cosine", "linear", ...
    logging_steps=5,                  # how often to log (steps)
    eval_steps=50,                    # how often to evaluate (steps)
    save_steps=200,                   # how often to save a checkpoint (steps)
    save_total_limit=2,               # how many checkpoints to keep
    weight_decay=0.01,                # L2 regularization

    use_bf16=True,                    # BF16 (best choice on A100)
    use_fp16=False,                   # FP16 fallback (if bf16 is unavailable)

    # -----------------------
    # QLORA / BNB (4-bit quant)
    # -----------------------
    load_in_4bit=True,                # QLoRA 4-bit loading
    bnb_4bit_quant_type="nf4",        # quantization type (nf4 is the standard choice)
    bnb_4bit_use_double_quant=True,   # double quantization (often good for quality)
    attn_implementation="sdpa",       # "sdpa" — stable

    # -----------------------
    # LORA (adapter)
    # -----------------------
    lora_r=16,                        # rank
    lora_alpha=32,                    # scaling (often 2*r or 4*r for SFT)
    lora_dropout=0.05,                # dropout in LoRA
    target_modules=None,              # None → default from resolved_target_modules()

    # -----------------------
    # TRAINER BEHAVIOR
    # -----------------------
    packing=False,                    # pack several samples into one sequence
    optim="paged_adamw_8bit",         # optimizer (8-bit AdamW from bitsandbytes)
    report_to="none",                 # "none", "wandb", ...

    # -----------------------
    # INFERENCE SANITY CHECKS
    # -----------------------
    max_new_tokens_eval=512,          # generation budget in the A/B sanity check (before/after)
)

print(cfg)

# The log file lives inside out_dir, which does not exist yet on a fresh checkout.
# Create it up front so attaching a file log cannot fail on a missing directory.
# NOTE(review): setup_logger may already do this (not visible here); exist_ok=True
# makes the call a no-op in that case. `os` is imported in Cell 1.
os.makedirs(cfg.out_dir, exist_ok=True)
setup_logger(level=logging.INFO, log_file=f"{cfg.out_dir}/ft_run_sft.log")

FTConfig(model_id='/home/jovyan/ai-models/MamayLM-Gemma-3-12B', train_jsonl='ft_datasets/sft_train.jsonl', val_jsonl='ft_datasets/sft_val.jsonl', out_dir='MamayLM-Gemma-3-12b_QLoRA_SFT', max_seq_len=5000, per_device_train_batch_size=1, per_device_eval_batch_size=1, gradient_accumulation_steps=8, learning_rate=8e-07, weight_decay=0.01, num_train_epochs=2, max_steps=None, warmup_ratio=0.05, lr_scheduler_type='cosine', logging_steps=5, eval_steps=50, save_steps=200, save_total_limit=2, use_bf16=True, use_fp16=False, load_in_4bit=True, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, attn_implementation='sdpa', lora_r=16, lora_alpha=32, lora_dropout=0.05, target_modules=None, packing=False, optim='paged_adamw_8bit', report_to='none', max_new_tokens_eval=512)


<Logger ft_pipeline (INFO)>

In [None]:
# =========================
# Cell 3 — Run fine-tune (all run_finetune parameters + comments)
# =========================

# run_finetune(cfg, ...) is the entrypoint. Per the author's outline it:
#   1. loads the datasets from the prepared JSONL files
#   2. loads tokenizer + model (QLoRA 4-bit) and applies LoRA
#   3. runs A/B BEFORE (optional)
#   4. builds the trainer (completion-only loss through a masking collator)
#   5. adds callbacks
#   6. trains
#   7. saves lora_adapter + tokenizer
#   8. runs A/B AFTER + ab_report (optional)

sft_artifacts = run_finetune(
    cfg,
    # --- data ---
    dataset_mode="prompt_completion",  # the only supported mode for now
    dataset_limits=(None, None),       # (train_limit, val_limit), e.g. (500, 100); None → full dataset
    # --- housekeeping ---
    clean_cuda_cache_before=True,      # True → gc.collect() + torch.cuda.empty_cache() before the start
    # --- A/B "before/after" sanity check (strict JSON parse rate) ---
    # ab_indices: indexes from val_jsonl to use for the check.
    # Alternatives kept for reference: list(range(1)),
    # or [0, 1, 10, 25, 50, 100, 150, 200, 250, 300].
    ab_indices=None,
    do_ab_before=False,                # True → generates ab_before.json
    do_ab_after=True,                  # True → generates ab_after.json and builds ab_report.md
)


12:58:30 | INFO    | === FT RUN START ===
12:58:30 | INFO    | CUDA available=True
12:58:30 | INFO    | CUDA device=NVIDIA A100-SXM4-40GB
12:58:30 | INFO    | Loading datasets
12:58:30 | INFO    |   train: ft_datasets/sft_train.jsonl
12:58:30 | INFO    |   val:   ft_datasets/sft_val.jsonl
12:58:55 | INFO    | Converting to prompt/completion format
12:58:55 | INFO    | Dataset ready | train=3059 | val=340
12:58:55 | INFO    | Loading tokenizer: /home/jovyan/ai-models/MamayLM-Gemma-3-12B
12:58:56 | INFO    | Tokenizer loaded
12:58:56 | INFO    | Loading base model (QLoRA)
12:58:56 | INFO    |   model_id: /home/jovyan/ai-models/MamayLM-Gemma-3-12B
12:58:56 | INFO    |   dtype: torch.bfloat16
12:58:56 | INFO    |   4bit: True
12:58:56 | INFO    |   attn_implementation: sdpa


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

13:00:05 | INFO    | Base model loaded
13:00:05 | INFO    | Enabling gradient checkpointing
13:00:05 | INFO    | Applying LoRA
13:00:05 | INFO    |   r=16, alpha=32, dropout=0.05
13:00:05 | INFO    |   target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']
13:00:06 | INFO    | LoRA applied successfully
13:00:06 | INFO    | Trainable parameters:
trainable params: 68,456,448 || all params: 12,255,781,488 || trainable%: 0.5586
13:00:06 | INFO    | Building SFTConfig
13:00:06 | INFO    |   max_seq_len=5000
13:00:06 | INFO    |   batch_size=1
13:00:06 | INFO    |   grad_accum=8
13:00:06 | INFO    |   lr=8e-07
13:00:06 | INFO    | Building SFTTrainer
13:00:06 | INFO    |   train_samples=3059
13:00:06 | INFO    |   val_samples=340
13:00:06 | INFO    |   dataset_mode=prompt_completion
13:00:06 | INFO    |   max_seq_length=5000


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Using auto half precision backend
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


13:00:07 | INFO    | labels shape: (1, 3136)
13:00:07 | INFO    | non -100 labels: 261
13:00:07 | INFO    | Starting training‚Ä¶


skipped Embedding(4096, 1152): 4.5M params
skipped Gemma3TextScaledWordEmbedding(262208, 3840, padding_idx=0): 964.734375M params
skipped: 964.734375M params
***** Running training *****
  Num examples = 3,059
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 766
  Number of trainable parameters = 68,456,448


13:00:36 | INFO    | GPUMetricsCallback enabled


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
50,4.3145,0.536637
100,3.5139,0.440426
150,2.7355,0.339168
200,2.0088,0.251918
250,1.493,0.183378
300,1.2045,0.132083
350,0.8524,0.107348
400,0.6466,0.093778
450,0.8021,0.085384
500,0.4915,0.079573


13:01:22 | INFO    | [step 1] train_loss=4.9221 | lr=0 | grad_norm=8.3960 | train_tok=seq_mean:3075 prompt_mean:2831 loss_mean:249 loss_max:249 | gpu_mem(GB)=alloc:9.49 res:23.73 max_alloc:23.21 max_res:38.98 | elapsed=0.8m
13:04:23 | INFO    | [step 5] train_loss=4.6771 | lr=8.20513e-08 | grad_norm=10.5648 | train_tok=seq_mean:3137 prompt_mean:2850 loss_mean:294 loss_max:294 | gpu_mem(GB)=alloc:9.49 res:19.05 max_alloc:23.38 max_res:38.98 | elapsed=3.8m
13:08:18 | INFO    | [step 10] train_loss=4.5414 | lr=1.84615e-07 | grad_norm=15.6103 | train_tok=seq_mean:2795 prompt_mean:2485 loss_mean:315 loss_max:315 | gpu_mem(GB)=alloc:9.49 res:36.32 max_alloc:23.38 max_res:38.98 | elapsed=7.7m
13:12:08 | INFO    | [step 15] train_loss=4.5811 | lr=2.87179e-07 | grad_norm=8.5295 | train_tok=seq_mean:3113 prompt_mean:2836 loss_mean:284 loss_max:284 | gpu_mem(GB)=alloc:9.49 res:32.06 max_alloc:23.38 max_res:38.98 | elapsed=11.5m
13:16:02 | INFO    | [step 20] train_loss=4.7831 | lr=3.89744e-07 | g


***** Running Evaluation *****
  Num examples = 340
  Batch size = 1


13:46:44 | INFO    | [step 50] eval_loss=0.5366 | train_tok=seq_mean:2576 prompt_mean:2284 loss_mean:292 loss_max:292 | gpu_mem(GB)=alloc:9.49 res:37.84 max_alloc:23.45 max_res:38.98 | elapsed=46.1m
13:46:44 | INFO    | EarlyStop(metric=eval_loss): improved from None to 0.536637
13:50:29 | INFO    | [step 55] train_loss=4.1769 | lr=7.9916e-07 | grad_norm=8.9928 | train_tok=seq_mean:2798 prompt_mean:2475 loss_mean:325 loss_max:325 | gpu_mem(GB)=alloc:9.49 res:27.12 max_alloc:23.45 max_res:38.98 | elapsed=49.9m
13:54:28 | INFO    | [step 60] train_loss=4.1384 | lr=7.98507e-07 | grad_norm=10.1909 | train_tok=seq_mean:2998 prompt_mean:2710 loss_mean:290 loss_max:290 | gpu_mem(GB)=alloc:9.49 res:21.26 max_alloc:23.45 max_res:38.98 | elapsed=53.9m
13:58:30 | INFO    | [step 65] train_loss=4.0595 | lr=7.97668e-07 | grad_norm=6.9578 | train_tok=seq_mean:3118 prompt_mean:2875 loss_mean:245 loss_max:245 | gpu_mem(GB)=alloc:9.49 res:11.76 max_alloc:23.45 max_res:38.98 | elapsed=57.9m
14:02:21 | I


***** Running Evaluation *****
  Num examples = 340
  Batch size = 1


14:33:40 | INFO    | [step 100] eval_loss=0.4404 | train_tok=seq_mean:2576 prompt_mean:2284 loss_mean:292 loss_max:292 | gpu_mem(GB)=alloc:9.49 res:37.84 max_alloc:23.45 max_res:38.98 | elapsed=93.1m
14:33:40 | INFO    | EarlyStop(metric=eval_loss): improved from 0.536637 to 0.440426
14:37:41 | INFO    | [step 105] train_loss=3.3891 | lr=7.84324e-07 | grad_norm=7.9614 | train_tok=seq_mean:3124 prompt_mean:2875 loss_mean:253 loss_max:253 | gpu_mem(GB)=alloc:9.49 res:30.59 max_alloc:23.45 max_res:38.98 | elapsed=97.1m
14:41:36 | INFO    | [step 110] train_loss=3.3781 | lr=7.81839e-07 | grad_norm=6.9335 | train_tok=seq_mean:3132 prompt_mean:2880 loss_mean:256 loss_max:256 | gpu_mem(GB)=alloc:9.49 res:30.01 max_alloc:23.45 max_res:38.98 | elapsed=101.0m
14:45:27 | INFO    | [step 115] train_loss=3.2720 | lr=7.79175e-07 | grad_norm=7.4506 | train_tok=seq_mean:2913 prompt_mean:2606 loss_mean:314 loss_max:314 | gpu_mem(GB)=alloc:9.49 res:14.91 max_alloc:23.45 max_res:38.98 | elapsed=104.8m
14


***** Running Evaluation *****
  Num examples = 340
  Batch size = 1


15:20:18 | INFO    | [step 150] eval_loss=0.3392 | train_tok=seq_mean:2576 prompt_mean:2284 loss_mean:292 loss_max:292 | gpu_mem(GB)=alloc:9.49 res:37.84 max_alloc:23.45 max_res:38.98 | elapsed=139.7m
15:20:18 | INFO    | EarlyStop(metric=eval_loss): improved from 0.440426 to 0.339168
15:24:07 | INFO    | [step 155] train_loss=2.6827 | lr=7.51616e-07 | grad_norm=7.3670 | train_tok=seq_mean:3125 prompt_mean:2846 loss_mean:282 loss_max:282 | gpu_mem(GB)=alloc:9.49 res:19.89 max_alloc:23.45 max_res:38.98 | elapsed=143.5m
15:27:54 | INFO    | [step 160] train_loss=2.6633 | lr=7.47414e-07 | grad_norm=7.2646 | train_tok=seq_mean:3160 prompt_mean:2853 loss_mean:307 loss_max:307 | gpu_mem(GB)=alloc:9.49 res:20.24 max_alloc:23.50 max_res:38.98 | elapsed=147.3m
15:31:50 | INFO    | [step 165] train_loss=2.6261 | lr=7.4305e-07 | grad_norm=8.2649 | train_tok=seq_mean:2768 prompt_mean:2464 loss_mean:304 loss_max:304 | gpu_mem(GB)=alloc:9.49 res:19.09 max_alloc:23.50 max_res:38.98 | elapsed=151.2m
1


***** Running Evaluation *****
  Num examples = 340
  Batch size = 1


16:07:04 | INFO    | [step 200] eval_loss=0.2519 | train_tok=seq_mean:2576 prompt_mean:2284 loss_mean:292 loss_max:292 | gpu_mem(GB)=alloc:9.49 res:37.84 max_alloc:23.50 max_res:38.98 | elapsed=186.5m
16:07:04 | INFO    | EarlyStop(metric=eval_loss): improved from 0.339168 to 0.251918


Saving model checkpoint to MamayLM-Gemma-3-12b_QLoRA_SFT/checkpoint-200
chat template saved in MamayLM-Gemma-3-12b_QLoRA_SFT/checkpoint-200/chat_template.jinja
tokenizer config file saved in MamayLM-Gemma-3-12b_QLoRA_SFT/checkpoint-200/tokenizer_config.json
Special tokens file saved in MamayLM-Gemma-3-12b_QLoRA_SFT/checkpoint-200/special_tokens_map.json


16:10:59 | INFO    | [step 205] train_loss=1.9281 | lr=7.02557e-07 | grad_norm=7.7062 | train_tok=seq_mean:2786 prompt_mean:2478 loss_mean:314 loss_max:314 | gpu_mem(GB)=alloc:9.49 res:28.50 max_alloc:23.50 max_res:38.98 | elapsed=190.4m
16:14:46 | INFO    | [step 210] train_loss=2.0150 | lr=6.96833e-07 | grad_norm=6.4973 | train_tok=seq_mean:3161 prompt_mean:2874 loss_mean:294 loss_max:294 | gpu_mem(GB)=alloc:9.49 res:21.75 max_alloc:23.50 max_res:38.98 | elapsed=194.2m
16:18:36 | INFO    | [step 215] train_loss=1.8412 | lr=6.90971e-07 | grad_norm=9.2631 | train_tok=seq_mean:3087 prompt_mean:2846 loss_mean:242 loss_max:242 | gpu_mem(GB)=alloc:9.49 res:25.07 max_alloc:23.50 max_res:38.98 | elapsed=198.0m
16:22:27 | INFO    | [step 220] train_loss=1.9752 | lr=6.84974e-07 | grad_norm=7.2543 | train_tok=seq_mean:3107 prompt_mean:2850 loss_mean:262 loss_max:262 | gpu_mem(GB)=alloc:9.49 res:29.52 max_alloc:23.50 max_res:38.98 | elapsed=201.8m
16:26:20 | INFO    | [step 225] train_loss=1.722


***** Running Evaluation *****
  Num examples = 340
  Batch size = 1


16:53:43 | INFO    | [step 250] eval_loss=0.1834 | train_tok=seq_mean:2576 prompt_mean:2284 loss_mean:292 loss_max:292 | gpu_mem(GB)=alloc:9.49 res:37.84 max_alloc:23.50 max_res:38.98 | elapsed=233.1m
16:53:43 | INFO    | EarlyStop(metric=eval_loss): improved from 0.251918 to 0.183378
16:57:41 | INFO    | [step 255] train_loss=1.4068 | lr=6.39428e-07 | grad_norm=7.3153 | train_tok=seq_mean:3089 prompt_mean:2843 loss_mean:253 loss_max:253 | gpu_mem(GB)=alloc:9.49 res:29.68 max_alloc:23.50 max_res:38.98 | elapsed=237.1m
17:01:27 | INFO    | [step 260] train_loss=1.4764 | lr=6.32449e-07 | grad_norm=7.4088 | train_tok=seq_mean:3071 prompt_mean:2831 loss_mean:241 loss_max:241 | gpu_mem(GB)=alloc:9.49 res:23.64 max_alloc:23.50 max_res:38.98 | elapsed=240.8m
17:05:25 | INFO    | [step 265] train_loss=1.3997 | lr=6.25362e-07 | grad_norm=7.2752 | train_tok=seq_mean:3120 prompt_mean:2871 loss_mean:249 loss_max:249 | gpu_mem(GB)=alloc:9.49 res:16.59 max_alloc:23.50 max_res:38.98 | elapsed=244.8m



***** Running Evaluation *****
  Num examples = 340
  Batch size = 1


17:40:22 | INFO    | [step 300] eval_loss=0.1321 | train_tok=seq_mean:2576 prompt_mean:2284 loss_mean:292 loss_max:292 | gpu_mem(GB)=alloc:9.49 res:37.84 max_alloc:23.54 max_res:38.98 | elapsed=279.8m
17:40:22 | INFO    | EarlyStop(metric=eval_loss): improved from 0.183378 to 0.132083
17:44:13 | INFO    | [step 305] train_loss=1.0120 | lr=5.65165e-07 | grad_norm=13.3096 | train_tok=seq_mean:2969 prompt_mean:2696 loss_mean:280 loss_max:280 | gpu_mem(GB)=alloc:9.49 res:23.74 max_alloc:23.54 max_res:38.98 | elapsed=283.6m
17:48:06 | INFO    | [step 310] train_loss=0.9280 | lr=5.57255e-07 | grad_norm=9.5696 | train_tok=seq_mean:2571 prompt_mean:2279 loss_mean:297 loss_max:297 | gpu_mem(GB)=alloc:9.49 res:20.07 max_alloc:23.60 max_res:38.98 | elapsed=287.5m
17:52:01 | INFO    | [step 315] train_loss=0.8929 | lr=5.49273e-07 | grad_norm=5.2648 | train_tok=seq_mean:3102 prompt_mean:2859 loss_mean:245 loss_max:245 | gpu_mem(GB)=alloc:9.49 res:36.16 max_alloc:23.60 max_res:38.98 | elapsed=291.4m


***** Running Evaluation *****
  Num examples = 340
  Batch size = 1


18:26:33 | INFO    | [step 350] eval_loss=0.1073 | train_tok=seq_mean:2576 prompt_mean:2284 loss_mean:292 loss_max:292 | gpu_mem(GB)=alloc:9.49 res:37.84 max_alloc:23.60 max_res:38.98 | elapsed=325.9m
18:26:33 | INFO    | EarlyStop(metric=eval_loss): improved from 0.132083 to 0.107348
18:30:24 | INFO    | [step 355] train_loss=0.8125 | lr=4.83221e-07 | grad_norm=8.8585 | train_tok=seq_mean:3097 prompt_mean:2848 loss_mean:256 loss_max:256 | gpu_mem(GB)=alloc:9.49 res:18.27 max_alloc:23.60 max_res:38.98 | elapsed=329.8m
18:34:17 | INFO    | [step 360] train_loss=0.8278 | lr=4.74749e-07 | grad_norm=7.9376 | train_tok=seq_mean:3124 prompt_mean:2873 loss_mean:255 loss_max:255 | gpu_mem(GB)=alloc:9.49 res:23.31 max_alloc:23.60 max_res:38.98 | elapsed=333.7m
18:38:11 | INFO    | [step 365] train_loss=0.8669 | lr=4.66242e-07 | grad_norm=5.9211 | train_tok=seq_mean:3192 prompt_mean:2880 loss_mean:312 loss_max:312 | gpu_mem(GB)=alloc:9.49 res:19.95 max_alloc:23.60 max_res:38.98 | elapsed=337.6m



***** Running Evaluation *****
  Num examples = 340
  Batch size = 1


19:12:30 | INFO    | [step 400] eval_loss=0.0938 | train_tok=seq_mean:2576 prompt_mean:2284 loss_mean:292 loss_max:292 | gpu_mem(GB)=alloc:9.49 res:37.84 max_alloc:23.60 max_res:38.98 | elapsed=371.9m
19:12:30 | INFO    | EarlyStop(metric=eval_loss): improved from 0.107348 to 0.093778


Saving model checkpoint to MamayLM-Gemma-3-12b_QLoRA_SFT/checkpoint-400
chat template saved in MamayLM-Gemma-3-12b_QLoRA_SFT/checkpoint-400/chat_template.jinja
tokenizer config file saved in MamayLM-Gemma-3-12b_QLoRA_SFT/checkpoint-400/tokenizer_config.json
Special tokens file saved in MamayLM-Gemma-3-12b_QLoRA_SFT/checkpoint-400/special_tokens_map.json


19:16:42 | INFO    | [step 405] train_loss=0.7380 | lr=3.97407e-07 | grad_norm=4.5856 | train_tok=seq_mean:3087 prompt_mean:2840 loss_mean:248 loss_max:248 | gpu_mem(GB)=alloc:9.49 res:18.13 max_alloc:23.60 max_res:38.98 | elapsed=376.1m
19:20:41 | INFO    | [step 410] train_loss=0.6181 | lr=3.88766e-07 | grad_norm=6.8264 | train_tok=seq_mean:3123 prompt_mean:2845 loss_mean:283 loss_max:283 | gpu_mem(GB)=alloc:9.49 res:21.88 max_alloc:23.60 max_res:38.98 | elapsed=380.1m
19:24:35 | INFO    | [step 415] train_loss=0.7249 | lr=3.8013e-07 | grad_norm=5.8120 | train_tok=seq_mean:2689 prompt_mean:2451 loss_mean:245 loss_max:245 | gpu_mem(GB)=alloc:9.49 res:24.46 max_alloc:23.60 max_res:38.98 | elapsed=384.0m
19:28:22 | INFO    | [step 420] train_loss=0.6141 | lr=3.71504e-07 | grad_norm=8.7959 | train_tok=seq_mean:3165 prompt_mean:2874 loss_mean:294 loss_max:294 | gpu_mem(GB)=alloc:9.49 res:38.07 max_alloc:23.60 max_res:38.98 | elapsed=387.8m
19:32:19 | INFO    | [step 425] train_loss=0.6183


***** Running Evaluation *****
  Num examples = 340
  Batch size = 1


19:59:40 | INFO    | [step 450] eval_loss=0.0854 | train_tok=seq_mean:2576 prompt_mean:2284 loss_mean:292 loss_max:292 | gpu_mem(GB)=alloc:9.49 res:37.84 max_alloc:23.60 max_res:38.98 | elapsed=419.1m
19:59:40 | INFO    | EarlyStop(metric=eval_loss): improved from 0.093778 to 0.085384
20:03:34 | INFO    | [step 455] train_loss=0.6146 | lr=3.11714e-07 | grad_norm=5.7288 | train_tok=seq_mean:2914 prompt_mean:2610 loss_mean:310 loss_max:310 | gpu_mem(GB)=alloc:9.49 res:25.59 max_alloc:23.60 max_res:38.98 | elapsed=423.0m
20:07:27 | INFO    | [step 460] train_loss=0.8082 | lr=3.03306e-07 | grad_norm=6.7957 | train_tok=seq_mean:3116 prompt_mean:2874 loss_mean:246 loss_max:246 | gpu_mem(GB)=alloc:9.49 res:15.71 max_alloc:23.60 max_res:38.98 | elapsed=426.9m
20:11:16 | INFO    | [step 465] train_loss=0.6033 | lr=2.94943e-07 | grad_norm=6.2068 | train_tok=seq_mean:3123 prompt_mean:2837 loss_mean:291 loss_max:291 | gpu_mem(GB)=alloc:9.49 res:11.69 max_alloc:23.60 max_res:38.98 | elapsed=430.7m



***** Running Evaluation *****
  Num examples = 340
  Batch size = 1


20:46:25 | INFO    | [step 500] eval_loss=0.0796 | train_tok=seq_mean:2576 prompt_mean:2284 loss_mean:292 loss_max:292 | gpu_mem(GB)=alloc:9.49 res:37.84 max_alloc:23.60 max_res:38.98 | elapsed=465.8m
20:46:25 | INFO    | EarlyStop(metric=eval_loss): improved from 0.085384 to 0.079573
20:50:04 | INFO    | [step 505] train_loss=0.5752 | lr=2.30126e-07 | grad_norm=7.0757 | train_tok=seq_mean:3165 prompt_mean:2885 loss_mean:283 loss_max:283 | gpu_mem(GB)=alloc:9.49 res:28.74 max_alloc:23.60 max_res:38.98 | elapsed=469.5m
20:53:54 | INFO    | [step 510] train_loss=0.6461 | lr=2.22342e-07 | grad_norm=5.0928 | train_tok=seq_mean:3092 prompt_mean:2846 loss_mean:250 loss_max:250 | gpu_mem(GB)=alloc:9.49 res:18.21 max_alloc:23.60 max_res:38.98 | elapsed=473.3m
20:57:45 | INFO    | [step 515] train_loss=0.5249 | lr=2.14641e-07 | grad_norm=5.5925 | train_tok=seq_mean:2771 prompt_mean:2462 loss_mean:314 loss_max:314 | gpu_mem(GB)=alloc:9.49 res:38.21 max_alloc:23.60 max_res:38.98 | elapsed=477.1m



***** Running Evaluation *****
  Num examples = 340
  Batch size = 1


In [1]:
# Display the object returned by run_finetune() in the training cell.
# NOTE: the name exists only after that cell has finished in the current kernel
# session; evaluated on a fresh kernel this line raises NameError (as in the
# recorded output below).
sft_artifacts

NameError: name 'sft_artifacts' is not defined

# 🧠 Fine-tuning Mamay12B: SFT + DPO — Practical Guide
#### QLoRA — Quantized Low-Rank Adaptation

This guide describes **two training stages** (SFT → DPO) for a tariff recommendation assistant  
and **what to monitor in metrics + which parameters to control**.

---

## 1️⃣ SFT — Supervised Fine-Tuning

### 🎯 Goal
- teach the model a **stable JSON format**
- correct usage analysis
- proper language and response structure  
> **SFT does not optimize tariff selection**, only behavior and formatting.

---

### 📊 Key metrics (monitor in logs)

#### 🔹 `train_loss`
- expected: **smoothly decreases**
- typical range: `~1.5 → 0.5–0.8`
- ❌ bad: sharp drop to `~0.0` → overfitting

#### 🔹 `eval_loss`
- should **correlate** with `train_loss`
- ❌ if `train ↓` while `eval ↑` → overfitting

#### 🔹 A/B sanity (before / after)
(via `ABSanityCallback`)
- JSON parses in **100% of cases**
- all required fields are present
- `tariffId ∈ avail_tp_with_desc`
- text is in Ukrainian

---

### 🎛️ Main knobs (SFT)

| Symptom | What to change |
|------|----------|
| loss does not decrease | ↑ `learning_rate` (5e-5 → 1e-4) |
| fast overfitting | ↓ `learning_rate`, ↓ `num_train_epochs` |
| brittle JSON | ↑ dataset size, ↓ LR |
| slow training | ↓ `max_seq_len`, ↑ `grad_accum` |

---


## SFT metrics: what they mean & how to tune them (Tariff Recommender)

This notebook runs **SFT (Supervised Fine-Tuning)** with **completion-only loss**:
- We feed a long `prompt` (facts + question + formatting rules)
- We train the model to generate the `completion` (assistant JSON)
- **Loss is computed only on completion tokens** (prompt tokens are masked out)

SFT teaches:
- stable output format (strict JSON)
- the correct content structure (fields, Ukrainian text sections)
- general mapping from usage → recommendation patterns (but not pairwise ranking like DPO)

---

### Key logged metrics (what they mean)

#### 1) `train_loss`
- Cross-entropy loss on **completion tokens only**.
- Lower is better, but:
  - very low train_loss can mean **overfitting** (especially on small datasets)
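  - always compare with `eval_loss`
  - compare **trends**, not absolute values: in this notebook's run the logged `train_loss` is roughly 8× the `eval_loss` at the same step (e.g. 4.31 vs 0.54 at step 50), which matches `gradient_accumulation_steps=8` — presumably the logged training loss is not averaged over the accumulation steps (verify for your library version)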

**If train_loss drops fast but eval_loss stalls or rises:**
- overfitting → reduce steps/epochs, reduce LR, add regularization (weight_decay, dropout), or stop early

---

#### 2) `eval_loss`
- Cross-entropy loss on validation set completion tokens.
- This is your primary “generalization” signal.

Interpretation:
- **decreasing** eval_loss → model generalizes better
- **flat** eval_loss → you’re near the best point
- **increasing** eval_loss → overfitting (stop / revert to best checkpoint)

✅ Typical workflow:
- Use early stopping on `eval_loss`
- Keep the checkpoint with the **lowest** eval_loss

---

#### 3) `learning_rate` (LR schedule)
- LR warms up and then follows your scheduler (cosine/linear).
- LR that is too high can:
  - make training unstable
  - harm format stability (JSON breaks)
- LR too low can:
  - learn very slowly / plateau early

**Recommended starting LR for your setup (12B + QLoRA + long prompts):**
- `learning_rate = 5e-5` (`0.00005`)
- avoid `1e-4` unless you have a large, diverse dataset and see stable format metrics

---

#### 4) `grad_norm`
- How “big” the update is at each step (stability indicator).
- Useful for catching too-aggressive training before quality degrades.

Rules of thumb:
- stable regime: `~1–10` (depends on setup)
- frequent spikes `>15–20` → LR too high or batch noise → reduce LR / clip gradients

---

#### 5) Token stats: `train_tok=...`
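Example (from this run's step-1 log line): `train_tok=seq_mean:3075 prompt_mean:2831 loss_mean:249 loss_max:249`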
- `seq_mean`: average total tokens fed to the model
- `prompt_mean`: average prompt tokens (masked from loss)
- `loss_mean`: average tokens contributing to loss (completion length)
- `loss_max`: max completion length seen in the batch

Why it matters:
- if `loss_mean` becomes tiny (e.g., 20–50) you’re barely training on the answer
- if `seq_mean` is close to `max_seq_len`, you risk truncation

✅ In your case:
- prompt ~2800–2900, completion ~250–350
- ensure `max_seq_len` is at least `4096` (the config in this notebook uses `5000`) to avoid cutting prompt or answer

---

#### 6) Format/quality sanity metric: `valid_json_rate` (from A/B sanity)
- % of sanity examples where the generated output is valid JSON (and parseable).
- This is **production-critical** for your tariff assistant.

Interpretation:
- `100%` → stable format
- dips (e.g., `80%`) → likely decoding randomness or format instability

✅ For reliable tracking, run sanity with deterministic generation:
- `do_sample=False`, `num_beams=1`
- explicitly disable sampling params (temperature/top_p/top_k)

---

### What to tune (control knobs)

#### A) `learning_rate` (most important)
Safe defaults for your setup:
- `learning_rate = 5e-5` (`0.00005`)
If you see instability / JSON breaks:
- reduce to `3e-5` (`0.00003`)
If you learn too slowly:
- increase slightly, but prefer more steps over large LR

Symptoms → Fix:
- `grad_norm` spikes, `train_loss` noisy → lower LR
- `valid_json_rate` drops → lower LR + deterministic eval + maybe reduce steps

---

#### B) `max_steps` / `num_train_epochs`
SFT often converges quickly on structured outputs.
Use either:
- fixed `max_steps` (best for reproducibility)
- or `num_train_epochs` (less predictable if dataset changes)

For ~3k rows, `batch=1`, `grad_accum=8`:
- 1 epoch ≈ `3059/8 ≈ 383` optimizer steps (this run: 3,059 training examples, 766 total optimization steps for 2 epochs)

Practical:
- start with `max_steps = 400–800` (≈ 1–2 epochs)
- rely on early stopping to stop before overfitting

---

#### C) `eval_steps` (how often to evaluate)
Trade-off:
- frequent eval = better early stopping decisions but slower runs

Practical:
- `eval_steps = 100` (good default)
- if eval is expensive, `eval_steps = 200` is ok (the config in this notebook currently uses `eval_steps = 50` with `save_steps = 200`)

---

#### D) `save_steps` and `save_total_limit`
Keep enough checkpoints to recover best eval point:
- `save_steps = eval_steps` (common)
- `save_total_limit = 2–3`

---

#### E) Regularization knobs
Useful when eval_loss stops improving while train_loss keeps falling:
- `weight_decay = 0.01` (you already added — good)
- `lora_dropout = 0.05` (good)
- reduce `lora_r` if needed (e.g., 16 ‚Üí 8) for smaller capacity (rare)

---

#### F) Sequence length and truncation protection
Given your distribution (prompt p95 ~2880, answers ~300–350):
- use `max_seq_len = 4096` or higher (the config in this notebook uses `5000`)
- keep `max_new_tokens_eval` around `256–512`

Symptoms of truncation:
- worse eval_loss
- inconsistent content fields
- model ignores the tail of `avail_tp_with_desc`

---

### What to watch for in practice (quick checklist)

✅ Healthy SFT run:
- `train_loss` decreases steadily
- `eval_loss` decreases then plateaus
- `valid_json_rate` stays high (ideally 100% on deterministic sanity)
- generations remain consistent before/after checkpoints

⚠️ Overfitting signs:
- `train_loss` keeps dropping but `eval_loss` stops improving or rises
- outputs become overly rigid, repetitive, or “memorized”
- downstream DPO becomes too easy (margins explode fast)

**Fix overfitting:**
- fewer steps / fewer epochs
- lower `learning_rate`
- stronger regularization (weight_decay / dropout)
- early stopping on `eval_loss`

---

### Recommended production evaluation pattern
- **Full eval**: every `eval_steps` via `eval_loss`
- **Sanity subset**: run A/B sanity on a fixed set of indices
  - keep generation deterministic
  - track:
    - `valid_json_rate`
    - presence/validity of required fields (`tariffId`, `templateId`, etc.)
    - (optional) exact tariffId match rate on sanity subset

This combination prevents “loss looks good but output is broken” failures.
