In [21]:
! pip install -q datasets trl peft bitsandbytes sentencepiece

In [22]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
import torch
from torch.utils.data import DataLoader
from transformers import Trainer, TrainingArguments
from trl import RewardTrainer, RewardConfig
import os

In [None]:
CACHE_DIR = "./cache"
ROOT_OUTPUT_DIR = "./output"
REWARD_OUTPUT_DIR = f"{ROOT_OUTPUT_DIR}-reward"

os.makedirs(REWARD_OUTPUT_DIR, exist_ok=True)

dataset = load_dataset("thainq107/Vi-Alpaca-Preference", cache_dir=CACHE_DIR)
train_dataset = dataset["train"].select(range(5000))
print(f"Dataset loaded: {train_dataset}")

model_name = "hoa12356/Llama-3.2-1B-Instruct-Chat-sft-hoa"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Dataset loaded: Dataset({
    features: ['id', 'question', 'chosen', 'rejected'],
    num_rows: 5000
})


In [None]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS", 
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
    ]
)


base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
)
model = get_peft_model(base_model, peft_config)
model.config.pad_token_id = tokenizer.pad_token_id

config.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at hoa12356/Llama-3.2-1B-Instruct-Chat-sft-hoa and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_function(examples):
    # Process chosen responses
    chosen_prompts = [{"role": "user", "content": q} for q in examples["question"]]
    chosen_responses = examples["chosen"]

    # Process rejected responses
    rejected_prompts = [{"role": "user", "content": q} for q in examples["question"]]
    rejected_responses = examples["rejected"]

    # Create chosen and rejected pairs with chat templates
    batch_size = len(examples["id"])
    chosen_data = []
    rejected_data = []

    for i in range(batch_size):

        chosen_conv = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": examples["question"][i]},
            {"role": "assistant", "content": examples["chosen"][i]}
        ]
        chosen_text = tokenizer.apply_chat_template(chosen_conv, tokenize=False)
        chosen_data.append(chosen_text)


        rejected_conv = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": examples["question"][i]},
            {"role": "assistant", "content": examples["rejected"][i]}
        ]
        rejected_text = tokenizer.apply_chat_template(rejected_conv, tokenize=False)
        rejected_data.append(rejected_text)

    return {
        "chosen": chosen_data,
        "rejected": rejected_data
    }


processed_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:

training_args = RewardConfig(
    output_dir=REWARD_OUTPUT_DIR,

    num_train_epochs=1.5,  
    per_device_train_batch_size=4,  
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    weight_decay=0.001,
    max_length=512,
    logging_steps=50,
    save_strategy="epoch",
    adam_beta1=0.9,
    adam_beta2=0.999,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    optim="paged_adamw_8bit" if torch.cuda.is_available() else "adamw_torch",
    bf16=True if torch.cuda.is_available() else False,
    report_to="tensorboard"
)


trainer = RewardTrainer(
    model=model,
    args=training_args,

    processing_class=tokenizer,
    train_dataset=processed_dataset
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [32]:
if __name__ == "__main__":
    trainer.train()

    # 10. Save the model
    trainer.save_model(REWARD_OUTPUT_DIR)
    print(f"Reward model saved to {REWARD_OUTPUT_DIR}")



Step,Training Loss
50,0.8062
100,0.7523
150,0.6982
200,0.6907




Reward model saved to ./output-reward


In [None]:
from huggingface_hub import login
import os
os.environ["HUGGINGFACE_TOKEN"] = "*********************" 
login()


def push_model_to_hub():

    model_path = REWARD_OUTPUT_DIR

    repo_name = "hoa12356/vietnamese-reward-model"  
    model.push_to_hub(repo_name)
    tokenizer.push_to_hub(repo_name)

    print(f"Model uploaded to https://huggingface.co/{repo_name}")
push_model_to_hub()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Model uploaded to https://huggingface.co/hoa12356/vietnamese-reward-model


In [37]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel
import torch

# Định nghĩa tên mô hình
model_name = "hoa12356/vietnamese-reward-model"
base_model_name = "hoa12356/Llama-3.2-1B-Instruct-Chat-sft-hoa"  # Mô hình gốc

# Load tokenizer
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded successfully")
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    exit(1)

# Load mô hình gốc và adapter
try:
    # Load mô hình gốc
    base_model = AutoModelForSequenceClassification.from_pretrained(
        base_model_name,
        num_labels=1  # Reward model thường có 1 đầu ra
    )
    # Load PEFT adapter
    model = PeftModel.from_pretrained(base_model, model_name)
    model.eval()
    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model: {e}")
    exit(1)

Tokenizer loaded successfully


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at hoa12356/Llama-3.2-1B-Instruct-Chat-sft-hoa and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully


In [1]:
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer

# 1. Tải mô hình ngôn ngữ để sinh câu trả lời
text_model_name = "hoa12356/Llama-3.2-1B-Instruct-Chat-sft-hoa"
text_tokenizer = AutoTokenizer.from_pretrained(text_model_name)
text_model = AutoModelForCausalLM.from_pretrained(text_model_name)

# 2. Tải mô hình thưởng để đánh giá
reward_model_name = "hoa12356/vietnamese-reward-model"
reward_tokenizer = AutoTokenizer.from_pretrained(reward_model_name)
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_model_name, num_labels=1)

# 3. Tạo câu trả lời từ mô hình ngôn ngữ
prompt = "viết cho tôi câu thơ về tình anh em."
inputs = text_tokenizer(prompt, return_tensors="pt")
outputs = text_model.generate(**inputs, max_length=200)
response = text_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Câu trả lời: {response}")

# 4. Đánh giá câu trả lời bằng mô hình thưởng
reward_inputs = reward_tokenizer(response, return_tensors="pt")
reward_outputs = reward_model(**reward_inputs)
score = reward_outputs.logits.item()
print(f"Điểm thưởng: {score}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at hoa12356/Llama-3.2-1B-Instruct-Chat-sft-hoa and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Câu trả lời: viết cho tôi câu thơ về tình anh em.
Điểm thưởng: -2.2883410453796387
