In [None]:
# Mount Google Drive at /gdrive so data and weights persist across Colab sessions.
from google.colab import drive
drive.mount('/gdrive')

# Work from the project folder; later cells use paths relative to it (./data, ./weight).
%cd /gdrive/MyDrive/성윤/

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/MyDrive/성윤


In [None]:
!pip install -q transformers datasets accelerate peft trl bitsandbytes

# 하나의 코드


In [None]:
# ============================================================
# 🧠 한국어 LLaMA 기반 RLHF 전체 파이프라인 (Colab용)
# - 단계: SFT → RM → PPO
# - 기능: LoRA 학습 + 병합 + 최종 모델 저장
# - 데이터 샘플링 비율 적용 가능
# ============================================================

import os
import json
import random
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    TrainingArguments,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer, RewardTrainer, PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from datasets import Dataset
from transformers import pipeline

# ============================================================
# 0. Basic settings
# ============================================================
# Checkpoint loaded as the starting point for SFT and reward-model training.
BASE_MODEL = "beomi/llama-2-ko-7b"
device_map = "auto"
# Adapters and the final merged model are written beneath this directory.
output_dir = "./weight/output_rlhf/test"
os.makedirs(output_dir, exist_ok=True)

# ============================================================
# 1. Data sampling ratio
# ============================================================
DATA_SAMPLE_RATIO = 0.0001  # keeps 0.01% of each dataset (sample_data's own default is 5%); adjust as needed

def sample_data(data_list, ratio=0.05, seed=42):
    """Return a reproducible random subset of ``data_list``.

    Args:
        data_list: Sequence of records to sample from.
        ratio: Fraction of records to keep. The resulting count is clamped to
            the range ``[1, len(data_list)]`` so tiny or oversized ratios do
            not raise.
        seed: Seed for the sampler; the same seed yields the same subset.

    Returns:
        A new list of sampled records, or an empty list when ``data_list``
        is empty.
    """
    if not data_list:
        # random.sample cannot draw one item from an empty population.
        return []
    n_sample = min(len(data_list), max(1, int(len(data_list) * ratio)))
    # A private generator gives the same draw as seeding the module-level RNG
    # but leaves the global random state untouched for other code.
    return random.Random(seed).sample(data_list, n_sample)

# ============================================================
# 2. JSON 파일 로드 함수
# ============================================================
def load_custom_json(path):
    """Parse the UTF-8 JSON file at ``path`` and return its ``"data_info"`` entry.

    Raises:
        KeyError: if the parsed document has no ``"data_info"`` key.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        payload = json.loads(handle.read())
    return payload["data_info"]

# ============================================================
# 3. Load and subsample the data
# ============================================================
# One JSON file per RLHF stage; each is reduced to DATA_SAMPLE_RATIO of its records.
sft_data = sample_data(load_custom_json("./data/train/RLHF_train/SFT.json"), DATA_SAMPLE_RATIO)
rm_data = sample_data(load_custom_json("./data/train/RLHF_train/RM.json"), DATA_SAMPLE_RATIO)
ppo_data = sample_data(load_custom_json("./data/train/RLHF_train/PPO.json"), DATA_SAMPLE_RATIO)

# ============================================================
# 4. Load tokenizer and model
# ============================================================
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantisation with double quantisation and a float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Quantised base model that the SFT LoRA adapter is attached to below.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map=device_map
)

# ============================================================
# 5. LoRA configuration
# ============================================================
# Rank-16 adapters on the attention query/value projections for causal-LM training.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
# NOTE: rebinds base_model to the PEFT-wrapped model, so re-running this cell
# without reloading the base model would wrap it a second time.
base_model = get_peft_model(base_model, lora_config)

# ============================================================
# 6. Step 1: SFT (Supervised Fine-Tuning)
# ============================================================
# Each record becomes a single "text" field in a question/answer layout.
sft_dataset_formatted = Dataset.from_list([
    {"text": f"### 질문:\n{d['question']}\n\n### 답변:\n{d['answer']['contents']}"}
    for d in sft_data
])

# One epoch of training on the LoRA-wrapped model; experiment tracking is disabled.
sft_trainer = SFTTrainer(
    model=base_model,
    train_dataset=sft_dataset_formatted,
    args=TrainingArguments(
        output_dir=f"{output_dir}/sft",
        per_device_train_batch_size=2,
        num_train_epochs=1,
        logging_steps=10,
        save_strategy="epoch",
        learning_rate=2e-4,
        fp16=True,
        report_to="none"
    )
)
sft_trainer.train()
sft_model = sft_trainer.model
# Saved under sft_lora; the PPO cell later reads this directory.
sft_model.save_pretrained(f"{output_dir}/sft_lora")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Adding EOS to train dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss


In [None]:
from peft import LoraConfig, get_peft_model

# ============================================================
# LoRA settings for the reward model
# ============================================================
rm_lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # projection modules that receive LoRA adapters
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)


# Single-logit sequence classifier on the quantised base checkpoint; its
# output is used as the scalar reward.
rm_model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=1,
    quantization_config=bnb_config,
    device_map=device_map
)

rm_model = get_peft_model(rm_model, rm_lora_config)

# Same question/answer layout as the SFT stage, so the reward model scores an
# answer in the context of its prompt instead of the answer text alone.
RM_TEXT_TEMPLATE = "### 질문:\n{question}\n\n### 답변:\n{answer}"


def _ranked_answers(record):
    """Collect ``(contents, ranking)`` tuples from every ``answer*`` dict in ``record``.

    Entries missing either field, or whose ranking is not convertible to
    ``int``, are skipped.
    """
    ranked = []
    for key, value in record.items():
        if not (key.startswith("answer") and isinstance(value, dict)):
            continue
        if "contents" not in value or "ranking" not in value:
            continue
        try:
            ranked.append((value["contents"], int(value["ranking"])))
        except (ValueError, TypeError):
            continue
    return ranked


def _preference_pairs(record):
    """Yield a ``{"chosen", "rejected"}`` dict for every ordered answer pair of ``record``."""
    answers = _ranked_answers(record)
    # Assumes RM records carry the prompt under "question" like the SFT/PPO
    # records do -- TODO confirm against RM.json.
    question = record.get("question")

    def render(answer):
        # Fall back to the bare answer when the record has no usable question.
        if isinstance(question, str) and question:
            return RM_TEXT_TEMPLATE.format(question=question, answer=answer)
        return answer

    for better_text, better_rank in answers:
        for worse_text, worse_rank in answers:
            # A smaller ranking number marks the preferred answer.
            if better_rank < worse_rank:
                yield {"chosen": render(better_text), "rejected": render(worse_text)}


rm_dataset_list = [pair for d in rm_data for pair in _preference_pairs(d)]
if not rm_dataset_list:
    # With a very small sampling ratio it is possible to end up without pairs.
    raise ValueError(
        "No chosen/rejected pairs could be built from rm_data; "
        "increase DATA_SAMPLE_RATIO or check the answer rankings."
    )

rm_dataset = Dataset.from_list(rm_dataset_list)
from trl import RewardTrainer, RewardConfig

# ============================================================
# RewardConfig for reward-model training
# ============================================================
rm_config = RewardConfig(
    learning_rate=1e-4,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=10,
    fp16=False,
    output_dir=f"{output_dir}/rm",
    save_strategy="no",
    report_to="none"
)

# The sequence-classification head uses the pad token id to find the last real
# token of each padded sequence, so it must be set for batch sizes above one.
rm_model.config.pad_token_id = tokenizer.pad_token_id

# ============================================================
# Build the RewardTrainer and train
# ============================================================
reward_trainer = RewardTrainer(
    model=rm_model,              # PEFT-wrapped reward model
    train_dataset=rm_dataset,
    args=rm_config,
    # Pass the tokenizer configured with a pad token explicitly instead of
    # relying on the trainer to create one on its own.
    processing_class=tokenizer,
)

reward_trainer.train()

# Saved under rm_base so the trained reward adapter can be reloaded later.
rm_model.save_pretrained(f"{output_dir}/rm_base")

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at beomi/llama-2-ko-7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Adding EOS to train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Filtering train >1024 tokens:   0%|          | 0/20 [00:00<?, ? examples/s]

Step,Training Loss
10,0.7191


In [None]:
# ============================================================
# Step 3: PPO (Reinforcement Learning)
# ============================================================
from datasets import Dataset
from trl import PPOTrainer, PPOConfig, create_reference_model
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    GenerationConfig
)
from peft import get_peft_model
import torch

# -----------------------------
# 1. Load the SFT policy (base model + SFT LoRA adapter)
# -----------------------------
from peft import PeftModel

sft_model_path = f"{output_dir}/sft_lora"
MODEL_NAME = "beomi/llama-2-ko-7b"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# sft_model_path holds only the LoRA adapter, so load the base checkpoint first
# and attach the adapter on top of it.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
# Resume the SFT adapter in trainable mode. Calling get_peft_model with a new
# LoraConfig here would inject a fresh, untrained adapter, so PPO would not
# start from the SFT weights. The adapter saved after PPO stays relative to
# MODEL_NAME.
ppo_model = PeftModel.from_pretrained(base_model, sft_model_path, is_trainable=True)

# -----------------------------
# 2. Reference model
# -----------------------------
# Copy of the SFT policy used as the reference the PPO policy is compared against.
ref_model = create_reference_model(ppo_model)

# -----------------------------
# 3. Reward model (trained in the RM step)
# -----------------------------
from peft import PeftModel

# Loading the base checkpoint alone gives a randomly initialised score head,
# so the adapter saved to rm_base by the RM step is applied on top and merged,
# leaving a plain sequence-classification model.
reward_base = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
reward_model = PeftModel.from_pretrained(reward_base, f"{output_dir}/rm_base").merge_and_unload()
# The classification head needs the pad token id to score padded batches.
reward_model.config.pad_token_id = tokenizer.pad_token_id
# The reward model only scores samples; it is not trained during PPO.
reward_model.eval()

# -----------------------------
# 4. Value model
# -----------------------------
# NOTE(review): loaded from the base checkpoint, so its score head is newly
# initialised (see the load warning in the cell output); consider initialising
# it from the trained reward model instead -- confirm the intended setup.
value_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# -----------------------------
# 5. Build the PPO prompt dataset
# -----------------------------
# ppo_data is a list of {"question": "..."} records. Prompts use the same
# layout as the SFT stage and stop where the answer is expected to begin.
PPO_PROMPT_TEMPLATE = "### 질문:\n{question}\n\n### 답변:\n"

if not ppo_data:
    raise ValueError("ppo_data is empty; nothing to run PPO on.")

# The trainer pads prompts into batches and generates continuations from them,
# so padding has to go on the left.
tokenizer.padding_side = "left"


def _tokenize_prompts(batch):
    """Turn a batch of prompt strings into unpadded ``input_ids`` lists."""
    return {"input_ids": tokenizer(batch["prompt"], padding=False)["input_ids"]}


ppo_prompts = Dataset.from_list([
    {"prompt": PPO_PROMPT_TEMPLATE.format(question=d["question"])} for d in ppo_data
])
# The trainer consumes token ids, so the text column is dropped after tokenising.
ppo_dataset = ppo_prompts.map(
    _tokenize_prompts,
    batched=True,
    remove_columns=ppo_prompts.column_names,
)

# -----------------------------
# 6. PPO config
# -----------------------------
# Two prompts per rollout, optimised one at a time (the original batch_size=2 /
# mini_batch_size=1). The rollout shrinks to one prompt when only a single
# sample is available so the dataloader (drop_last) is never empty.
rollout_batch_size = min(2, len(ppo_dataset))
ppo_config = PPOConfig(
    output_dir=f"{output_dir}/ppo",
    per_device_train_batch_size=1,
    num_mini_batches=rollout_batch_size,
    num_train_epochs=1,
    learning_rate=1.4e-5,
    response_length=64,
    # Skip periodic sample generation so no separate eval set is required.
    num_sample_generations=0,
    report_to="none",
)

# -----------------------------
# 7. PPO trainer
# -----------------------------
ppo_trainer = PPOTrainer(
    args=ppo_config,
    processing_class=tokenizer,
    model=ppo_model,
    ref_model=ref_model,
    reward_model=reward_model,
    value_model=value_model,
    train_dataset=ppo_dataset,
    # NOTE(review): some TRL releases build an eval dataloader unconditionally;
    # the prompt set is reused here only to satisfy that -- confirm for the
    # pinned version.
    eval_dataset=ppo_dataset,
)

# -----------------------------
# 8. PPO training
# -----------------------------
# A trainer built with reward_model/value_model/processing_class runs rollout
# generation, reward scoring and the PPO updates inside train(); the manual
# generate()/step() loop with a constant reward belongs to the legacy API.
ppo_trainer.train()

# -----------------------------
# 10. Save the PPO-trained model
# -----------------------------
# The tokenizer is stored next to the model so the directory can be loaded on its own.
ppo_model.save_pretrained(f"{output_dir}/ppo_lora")
tokenizer.save_pretrained(f"{output_dir}/ppo_lora")

print(f"✅ PPO 학습 완료 및 모델 저장: {output_dir}/ppo_lora")


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at beomi/llama-2-ko-7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at beomi/llama-2-ko-7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ PPO 학습 완료 및 모델 저장: ./weight/output_rlhf/ppo_lora


In [None]:
# ============================================================
# 9. Merge the LoRA weights and save the final model
# ============================================================
print("🔧 LoRA weight를 병합 중입니다...")
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    GenerationConfig
)
from peft import get_peft_model
import torch

BASE_MODEL = "beomi/llama-2-ko-7b"


# Load the base checkpoint in float16 (no quantisation) for merging.
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto"
)
# PeftModel, output_dir and tokenizer come from earlier cells of this notebook.
merged_model = PeftModel.from_pretrained(base, f"{output_dir}/ppo_lora")
# Fold the adapter into the base weights and drop the PEFT wrapper.
merged_model = merged_model.merge_and_unload()

final_path = f"{output_dir}/final_model"
merged_model.save_pretrained(final_path)
tokenizer.save_pretrained(final_path)

print(f"✅ 최종 모델 저장 완료: {final_path}")



`torch_dtype` is deprecated! Use `dtype` instead!


🔧 LoRA weight를 병합 중입니다...


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]



✅ 최종 모델 저장 완료: ./weight/output_rlhf/final_model


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


💬 모델 응답: 요즘 날씨가 쌀쌀한데, 감기 조심하라는 따뜻한 말을 해줘.​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​


In [None]:
# ============================================================
# 10. Test the final model
# ============================================================
# Text-generation pipeline over the merged model saved at final_path.
chat = pipeline("text-generation", model=final_path, tokenizer=tokenizer, device_map="auto")
# NOTE(review): the prompt is passed as a bare question, while the SFT data
# wraps questions in a "### 질문: / ### 답변:" layout -- confirm which format
# the model should be queried with.
prompt = "접속사가 뭐야?"
# Nucleus sampling (top_p=0.9), up to 100 new tokens.
result = chat(prompt, max_new_tokens=100, do_sample=True, top_p=0.9)
print(f"💬 프롬프트 : {prompt}")
print("💬 모델 응답:", result[0]["generated_text"])

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


프롬프트 : 접속사가 뭐야?
💬 모델 응답: 접속사가 뭐야? 접속사? 접속사.. 접속사..​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​
