In [None]:
!pip install trl --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/564.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m501.8/564.7 kB[0m [31m15.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, torch, random, numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
from trl import SFTTrainer, SFTConfig, DPOTrainer, DPOConfig
from peft import LoraConfig

In [None]:
import pandas as pd

In [None]:
# --- Experiment tracking: W&B project name picked up via the environment.
os.environ["WANDB_PROJECT"] = "qwen05b-sft"

# --- Matmul numerics: allow TF32 and request "high" float32 matmul precision.
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")

# --- Model constants shared by the cells below.
BASE_ID = "Qwen/Qwen2.5-0.5B-Instruct"
DTYPE = torch.bfloat16
MAXLEN = 1024

# --- Attention backend: use FlashAttention-2 only when the package imports.
try:
    import flash_attn  # noqa
except ImportError:
    ATTN_IMPL = "eager"
else:
    ATTN_IMPL = "flash_attention_2"

# --- Reproducibility: seed every RNG source with the same value.
SEED = 42
for _seed_fn in (set_seed, random.seed, np.random.seed):
    _seed_fn(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fea14458d50>

In [None]:
# Fast tokenizer belonging to the base checkpoint.
tok = AutoTokenizer.from_pretrained(BASE_ID, use_fast=True)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Pad on the right-hand side of each sequence.
tok.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# First 100k rows of the UltraChat SFT split, with 2% held out as the "test" split.
ultrachat_subset = load_dataset(
    "HuggingFaceH4/ultrachat_200k",
    split="train_sft[:100000]",
)
raw = ultrachat_subset.train_test_split(test_size=0.02, seed=SEED)

In [None]:
def to_text(ex):
    """Render one conversation into a single chat-templated string.

    Args:
        ex: Mapping with a ``"messages"`` entry holding a list of chat
            messages (dicts with at least a ``"role"`` key).

    Returns:
        ``{"text": None}`` when the conversation is empty or its final
        message is not from the assistant; otherwise ``{"text": <str>}`` with
        the conversation rendered through ``tok``'s chat template.
    """
    conversation = ex["messages"]

    # Guard: only conversations that END with an assistant message are rendered.
    ends_with_assistant = bool(conversation) and conversation[-1]["role"] == "assistant"
    if not ends_with_assistant:
        return {"text": None}

    rendered = tok.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,  # render the full assistant turn(s), no open prompt
    )
    return {"text": rendered}

def _has_text(ex):
    """Return True for rows whose rendered ``text`` is present."""
    return ex["text"] is not None


# to_text() marks conversations it cannot use by returning text=None. Those rows
# must be dropped explicitly, otherwise null texts are handed to the trainer.
train = raw["train"].map(
    to_text, remove_columns=raw["train"].column_names, num_proc=4
).filter(_has_text)
evald = raw["test"].map(
    to_text, remove_columns=raw["test"].column_names, num_proc=4
).filter(_has_text)

Map (num_proc=4):   0%|          | 0/98000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Load the base causal LM with the dtype / attention backend chosen in the config cell.
policy_load_kwargs = dict(
    device_map="auto",
    torch_dtype=DTYPE,
    attn_implementation=ATTN_IMPL,
)
policy = AutoModelForCausalLM.from_pretrained(BASE_ID, **policy_load_kwargs)

# Training-time settings: turn off the KV cache and enable gradient checkpointing.
policy.config.use_cache = False
policy.gradient_checkpointing_enable()

In [None]:
# --- LoRA for SFT: adapt the attention projections and the MLP projections.
SFT_LORA_TARGETS = [
    "q_proj", "k_proj", "v_proj", "o_proj",   # attention
    "gate_proj", "up_proj", "down_proj",      # MLP
]

peft_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=SFT_LORA_TARGETS,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# --- SFT training configuration (keyword arguments grouped by concern)
sft_cfg = SFTConfig(
    # Output location and reproducibility
    output_dir="ckpt_sft_merged_qwen05b",
    seed=SEED,

    # Data handling: train on the pre-rendered "text" column
    dataset_text_field="text",
    max_length=MAXLEN,
    packing=False,  # packing disabled (author's note: safer for dialogue structure)
    group_by_length=True,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,

    # Batching: 8 sequences x 8 accumulation steps = 64 sequences per device per optimizer step
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,

    # Optimisation schedule
    num_train_epochs=1,
    optim="adamw_torch",
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=1.0,

    # Precision / memory
    bf16=True,
    tf32=True,
    gradient_checkpointing=True,

    # Logging, evaluation and checkpointing
    report_to=["wandb"],
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
)

# Assemble the SFT trainer from the model, its config, the LoRA settings,
# the train/eval datasets and the tokenizer.
trainer = SFTTrainer(
    model=policy,
    processing_class=tok,
    args=sft_cfg,
    peft_config=peft_cfg,
    train_dataset=train,
    eval_dataset=evald,
)



In [None]:
trainer.train()

# merge_and_unload() folds the LoRA weights into the base model and RETURNS the
# merged model. Save that returned model: calling save_pretrained() on the PEFT
# wrapper instead writes an adapter-style checkpoint, which is why reloading this
# directory reported missing lora_A / lora_B keys.
SFT_FINAL_DIR = "ckpt_sft_merged_qwen05b/final_model"
trainer.model.merge_and_unload().save_pretrained(SFT_FINAL_DIR)
tok.save_pretrained(SFT_FINAL_DIR)

Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
50,1.7164,1.565918,1.547914,2835480.0,0.625602
100,1.5284,1.502819,1.518847,5696939.0,0.635611
150,1.5599,1.496899,1.517358,8556468.0,0.636512
200,1.4952,1.493621,1.510967,11392372.0,0.637194
250,1.5112,1.49124,1.508888,14238384.0,0.637431
300,1.526,1.489147,1.519102,17085628.0,0.637891
350,1.5492,1.487926,1.508119,19938023.0,0.638
400,1.5239,1.486344,1.513233,22794001.0,0.638308
450,1.5064,1.485401,1.507754,25650593.0,0.638309
500,1.5346,1.484261,1.506646,28518372.0,0.638703


('ckpt_sft_merged_qwen05b/final_model/tokenizer_config.json',
 'ckpt_sft_merged_qwen05b/final_model/special_tokens_map.json',
 'ckpt_sft_merged_qwen05b/final_model/chat_template.jinja',
 'ckpt_sft_merged_qwen05b/final_model/vocab.json',
 'ckpt_sft_merged_qwen05b/final_model/merges.txt',
 'ckpt_sft_merged_qwen05b/final_model/added_tokens.json',
 'ckpt_sft_merged_qwen05b/final_model/tokenizer.json')

In [None]:
import gc

# Release the SFT model before loading the DPO policy. `trainer` holds its own
# reference to the model, so deleting `policy` alone would not free anything;
# drop both, then collect garbage and return cached CUDA blocks to the driver.
del policy, trainer
gc.collect()
torch.cuda.empty_cache()

In [None]:
def prep_dpo(ex):
    """Convert one preference row into a conversational DPO example.

    Args:
        ex: Mapping with ``"prompt"``, ``"chosen"`` and ``"rejected"``.
            ``prompt`` may be a plain string or a list of chat messages.
            ``chosen`` / ``rejected`` may be a plain string or a list of chat
            messages, optionally starting with the prompt turns.

    Returns:
        ``None`` when ``chosen`` or ``rejected`` is missing or empty (such rows
        are best filtered out before mapping). Otherwise a dict whose
        ``"prompt"``, ``"chosen"`` and ``"rejected"`` values are all lists of
        ``{"role", "content"}`` messages, with the prompt turns stripped from the
        front of the completions so they are not repeated.

    Note:
        Completions were previously passed through ``str()``. When they are
        message lists, that yields the Python repr of the list rather than the
        reply text, and mixes a chat-style prompt with string completions.
    """
    if not ex.get("chosen") or not ex.get("rejected"):
        return None

    raw = ex["prompt"]
    # keep raw chat if it's a list; else wrap as one user turn
    prompt = raw if isinstance(raw, list) else [{"role": "user", "content": str(raw)}]

    def _key(msg):
        # Compare messages on role/content only, ignoring key order and extras.
        return (msg.get("role"), msg.get("content")) if isinstance(msg, dict) else msg

    def _completion(value):
        # Plain-text reply -> single assistant turn.
        if not isinstance(value, list):
            return [{"role": "assistant", "content": str(value)}]
        # Message list that begins with the prompt -> keep only what follows it.
        n_prompt = len(prompt)
        starts_with_prompt = [_key(m) for m in value[:n_prompt]] == [_key(m) for m in prompt]
        if len(value) > n_prompt and starts_with_prompt:
            return value[n_prompt:]
        # Otherwise the list is taken as the completion as-is.
        return value

    return {
        "prompt": prompt,                 # NOT templated
        "chosen": _completion(ex["chosen"]),
        "rejected": _completion(ex["rejected"]),
    }

In [None]:
# Preference pairs used for DPO.
prefs = load_dataset(
    "HuggingFaceH4/ultrafeedback_binarized",
    split="train_prefs",
)

In [None]:
def _has_both_completions(ex):
    """Return True when the row has non-empty chosen AND rejected entries."""
    return bool(ex["chosen"]) and bool(ex["rejected"])


# prep_dpo() returns None for rows without a chosen/rejected pair. Filter those rows
# out first so the mapping function only ever sees rows it can convert.
prefs = prefs.filter(_has_both_completions, num_proc=4).map(prep_dpo, num_proc=4)

In [None]:
# Hold out 10% of the preference pairs for evaluation (seeded for reproducibility).
# Note: `prefs` is rebound from a dataset to its train/test split here.
prefs = prefs.train_test_split(
    test_size=0.1,
    seed=SEED,
)

In [None]:
# Name the two halves of the preference split.
train_prefs, eval_prefs = prefs["train"], prefs["test"]

In [None]:
# Reload the exported SFT model directory as the starting policy for DPO.
policy_dpo = AutoModelForCausalLM.from_pretrained(
    "ckpt_sft_merged_qwen05b/final_model",
    attn_implementation=ATTN_IMPL,
    torch_dtype=DTYPE,
    device_map="auto",
)

# Training-time settings: no KV cache, non-reentrant gradient checkpointing.
policy_dpo.config.use_cache = False
policy_dpo.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs=dict(use_reentrant=False)
)

# LoRA for DPO: same target modules as SFT, with a smaller rank and no adapter dropout.
DPO_LORA_TARGETS = [
    "q_proj", "k_proj", "v_proj", "o_proj",   # attention
    "gate_proj", "up_proj", "down_proj",      # MLP
]

peft_cfg_dpo = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=DPO_LORA_TARGETS,
    r=8,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
)

Loading adapter weights from ckpt_sft_merged_qwen05b/final_model led to missing keys in the model: model.layers.0.self_attn.q_proj.lora_A.default.weight, model.layers.0.self_attn.q_proj.lora_B.default.weight, model.layers.0.self_attn.k_proj.lora_A.default.weight, model.layers.0.self_attn.k_proj.lora_B.default.weight, model.layers.0.self_attn.v_proj.lora_A.default.weight, model.layers.0.self_attn.v_proj.lora_B.default.weight, model.layers.0.self_attn.o_proj.lora_A.default.weight, model.layers.0.self_attn.o_proj.lora_B.default.weight, model.layers.0.mlp.gate_proj.lora_A.default.weight, model.layers.0.mlp.gate_proj.lora_B.default.weight, model.layers.0.mlp.up_proj.lora_A.default.weight, model.layers.0.mlp.up_proj.lora_B.default.weight, model.layers.0.mlp.down_proj.lora_A.default.weight, model.layers.0.mlp.down_proj.lora_B.default.weight, model.layers.1.self_attn.q_proj.lora_A.default.weight, model.layers.1.self_attn.q_proj.lora_B.default.weight, model.layers.1.self_attn.k_proj.lora_A.defa

In [None]:
# Columns not needed for DPO; only prompt / chosen / rejected are kept.
# Note: this cell is not re-runnable as-is, since the columns are gone after the first run.
DPO_DROP_COLUMNS = ["prompt_id", "messages", "score_chosen", "score_rejected"]

train_prefs = train_prefs.remove_columns(DPO_DROP_COLUMNS)
eval_prefs = eval_prefs.remove_columns(DPO_DROP_COLUMNS)

In [None]:
# DPO training configuration (keyword arguments grouped by concern)
dpo_cfg = DPOConfig(
    # Output location and reproducibility
    output_dir="ckpt_dpo_qwen05b_refrozen",
    seed=SEED,

    # DPO objective
    beta=0.1,
    f_divergence_type="reverse_kl",
    reference_free=False,               # train against a reference policy
    disable_dropout=True,
    label_pad_token_id=-100,

    # Sequence lengths
    max_length=MAXLEN,
    max_prompt_length=512,
    truncation_mode="keep_end",

    # Batching: 16 pairs x 16 accumulation steps = 256 pairs per device per optimizer step
    per_device_train_batch_size=16,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=16,
    average_tokens_across_devices=True,

    # Optimisation schedule
    num_train_epochs=1,
    max_steps=-1,                       # honor num_train_epochs
    optim="adamw_torch",                # if using 4/8-bit, switch to "paged_adamw_torch"
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    weight_decay=0.0,
    max_grad_norm=1.0,

    # Precision / memory
    bf16=True,
    tf32=True,
    gradient_checkpointing=True,

    # Data loading
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    group_by_length=False,
    remove_unused_columns=False,

    # Logging, evaluation and checkpointing
    report_to=["wandb"],
    logging_strategy="steps",
    logging_steps=5,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    save_safetensors=True,
)

# No separate reference model is passed here; DPOTrainer is expected to derive the
# reference policy itself (TODO: confirm for the installed TRL version).
dpo_tr = DPOTrainer(
    model=policy_dpo,
    processing_class=tok,
    args=dpo_cfg,
    peft_config=peft_cfg_dpo,
    train_dataset=train_prefs,
    eval_dataset=eval_prefs,
)



In [None]:
# Run DPO, then write the trained model and the tokenizer into one directory.
DPO_FINAL_DIR = "ckpt_dpo_qwen05b_refrozen"

dpo_tr.train()
dpo_tr.model.save_pretrained(DPO_FINAL_DIR)
tok.save_pretrained(DPO_FINAL_DIR)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
100,0.6295,0.626636,0.45098,0.217187,0.650457,0.233793,-401.133972,-351.652496,-1.683457,-1.664304




Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
100,0.6295,0.626636,0.45098,0.217187,0.650457,0.233793,-401.133972,-351.652496,-1.683457,-1.664304
200,0.6066,0.614159,0.487133,0.20185,0.658616,0.285283,-400.772491,-351.805878,-1.71515,-1.696979




('ckpt_dpo_qwen05b_refrozen/tokenizer_config.json',
 'ckpt_dpo_qwen05b_refrozen/special_tokens_map.json',
 'ckpt_dpo_qwen05b_refrozen/chat_template.jinja',
 'ckpt_dpo_qwen05b_refrozen/vocab.json',
 'ckpt_dpo_qwen05b_refrozen/merges.txt',
 'ckpt_dpo_qwen05b_refrozen/added_tokens.json',
 'ckpt_dpo_qwen05b_refrozen/tokenizer.json')

In [None]:
import os
from getpass import getpass
from pathlib import Path
from huggingface_hub import create_repo, login

# SECURITY: never hardcode an access token in a notebook. Read it from the
# HF_TOKEN environment variable, or prompt for it without echoing.
# The token that was previously embedded in this cell has been exposed and
# should be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")
login(token=hf_token)

USERNAME = "kunjcr2"   # or org
REPO_ID = f"{USERNAME}/qwen2.5-0.5b-sft-dpo"  # single repo

SFT_LOCAL = "ckpt_sft_merged_qwen05b/final_model"
DPO_LOCAL = "ckpt_dpo_qwen05b_refrozen"

# Fail early, with a readable message, if either training stage has not produced output.
assert Path(SFT_LOCAL).exists(), f"missing SFT export: {SFT_LOCAL}"
assert Path(DPO_LOCAL).exists(), f"missing DPO output: {DPO_LOCAL}"

create_repo(REPO_ID, repo_type="model", private=False, exist_ok=True)

RepoUrl('https://huggingface.co/kunjcr2/qwen2.5-0.5b-sft-dpo', endpoint='https://huggingface.co', repo_type='model', repo_id='kunjcr2/qwen2.5-0.5b-sft-dpo')

In [None]:
import shutil, os
from huggingface_hub import HfApi

# Build a clean staging directory so stale files from an earlier run are never uploaded.
STAGING = "hub_upload"
if os.path.exists(STAGING):
    shutil.rmtree(STAGING)

# copy backbone (root)
shutil.copytree(SFT_LOCAL, STAGING)

# add adapters subfolder.
# DPO_LOCAL is also the trainer's output_dir, so it contains checkpoint-* folders
# with optimizer / scheduler / RNG state. Those are training artifacts, not part
# of the model, so they are excluded from the public upload.
ADAPT_SUBFOLDER = os.path.join(STAGING, "dpo_adapters")
shutil.copytree(
    DPO_LOCAL,
    ADAPT_SUBFOLDER,
    ignore=shutil.ignore_patterns("checkpoint-*"),
)

# push to hub
api = HfApi()
api.upload_folder(
    repo_id=REPO_ID,
    folder_path=STAGING,
    commit_message="Upload SFT backbone + DPO adapters in one repo",
)
print("✅ pushed to", REPO_ID)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ckpoint-215/rng_state.pth:  77%|#######7  | 11.3kB / 14.6kB            

  ...ckpoint-200/rng_state.pth:  77%|#######7  | 11.3kB / 14.6kB            

  ...adapter_model.safetensors:   1%|1         |  185kB / 17.6MB            

  ...eckpoint-215/optimizer.pt:   1%|1         |  373kB / 35.5MB            

  ...eckpoint-215/scheduler.pt:   1%|1         |  15.0B / 1.47kB            

  ...adapter_model.safetensors:   1%|1         |  183kB / 17.6MB            

  ...eckpoint-100/optimizer.pt:   1%|1         |  368kB / 35.5MB            

  ...int-100/training_args.bin:   1%|1         |  69.0B / 6.67kB            

  ...int-200/training_args.bin:   1%|1         |  69.0B / 6.67kB            

  ...int-215/training_args.bin:   1%|1         |  69.0B / 6.67kB            

✅ pushed to kunjcr2/qwen2.5-0.5b-sft-dpo
