In [1]:
pip install -U transformers datasets accelerate peft trl bitsandbytes wandb



In [2]:
import gc
import os
import torch
import wandb
from datasets import load_dataset

# Use this only if you're using Google Colab and stored a secret token
from google.colab import userdata

from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import ORPOConfig, ORPOTrainer, setup_chat_format
import wandb
wandb.init(project="Orpo")


[34m[1mwandb[0m: Currently logged in as: [33mlhoa517[0m ([33mlhoa517-phenikaa-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

In [4]:
# Model
base_model = "hoa12356/Llama-3.2-1B-Instruct-Chat-sft-hoa"
new_model = "Orpo_instruct_hoa"

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)


peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)


tokenizer = AutoTokenizer.from_pretrained(base_model)


model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)


tokenizer.chat_template = None
model, tokenizer = setup_chat_format(model, tokenizer)
model = prepare_model_for_kbit_training(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [5]:
from datasets import load_dataset
import os

dataset_name = "thainq107/Vi-Alpaca-Preference"

dataset = load_dataset(dataset_name, split="all")
dataset = dataset.shuffle(seed=42).select(range(10000))


def format_chat_template(row):

    messages_chosen = [{"role": "assistant", "content": row["chosen"]}]
    messages_rejected = [{"role": "assistant", "content": row["rejected"]}]



    row["chosen"] = tokenizer.apply_chat_template(messages_chosen, tokenize=False)
    row["rejected"] = tokenizer.apply_chat_template(messages_rejected, tokenize=False)
    return row


dataset = dataset.map(
    format_chat_template,
    num_proc=os.cpu_count(),
)


dataset = dataset.train_test_split(test_size=0.01)

Map (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
orpo_args = ORPOConfig(
    learning_rate=8e-6,
    beta=0.1,
    lr_scheduler_type="linear",
    max_length=1024,
    max_prompt_length=512,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",

    num_train_epochs=1,
    max_steps=400,              # ← thêm dòng này
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    report_to="wandb",
    output_dir="./results/",
)


trainer = ORPOTrainer(
    model=model,
    args=orpo_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    processing_class=tokenizer,
)
trainer.train()
trainer.save_model(new_model)



Map:   0%|          | 0/9900 [00:00<?, ? examples/s]

Map:   0%|          | 0/9900 [00:00<?, ? examples/s]

Map:   0%|          | 0/9900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss
1,10.6108
2,12.981
3,7.9995
4,11.9563
5,7.1671
6,8.4034
7,11.5254
8,9.9213
9,8.3265
10,14.3695


Step,Training Loss
1,10.6108
2,12.981
3,7.9995
4,11.9563
5,7.1671
6,8.4034
7,11.5254
8,9.9213
9,8.3265
10,14.3695
