<a href="https://colab.research.google.com/github/hwee1017/assignment/blob/main/code_practices/Attention_is_All_You_Need_Tutorial_(German_English).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Set the save path: the model and tokenizer are written to this Drive folder
# at the end of the notebook.
SAVE_DIR = "/content/drive/MyDrive/LoRA-Korean-Chat"
print(f"✅ 모델 저장 경로: {SAVE_DIR}")

In [None]:
# ====================================================
# 1️⃣ 환경 세팅
# ====================================================
!pip install -q transformers accelerate peft bitsandbytes datasets

# torch supplies the dtype (torch.bfloat16) used when the model is loaded.
import torch
# Helpers for downloading the corpora and merging them into one dataset.
from datasets import load_dataset, concatenate_datasets
# Model / tokenizer loaders and the training-loop classes.
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
# LoRA adapter configuration and the function that wraps the model with it.
from peft import LoraConfig, get_peft_model

# Reaching this line means every import above succeeded.
print("✅ 환경 설정 완료")

In [None]:
# ====================================================
# 2️⃣ 데이터 로드 및 전처리
# ====================================================
print("📚 데이터 로드 중...")

# Two Korean instruction / chatbot-QA corpora; only the "train" split of each
# is used. Note that `kobest` holds the korean_chatbot_qa data.
koalpaca = load_dataset("beomi/KoAlpaca-v1.1", split="train")
kobest = load_dataset("jinmang2/korean_chatbot_qa", split="train")

# Keep at most 3,500 shuffled rows per corpus (roughly 7,000 samples overall).
koalpaca_keep = min(3500, len(koalpaca))
koalpaca = koalpaca.shuffle(seed=42)
koalpaca = koalpaca.select(range(koalpaca_keep))

kobest_keep = min(3500, len(kobest))
kobest = kobest.shuffle(seed=42)
kobest = kobest.select(range(kobest_keep))

def format_data(example):
    """Render one dataset row as a "### User / ### Assistant" training string.

    The user turn is the first non-empty value among the "instruction",
    "input" and "question" fields; the assistant turn is the first non-empty
    value among "output" and "answer".

    Args:
        example: Mapping for a single row. Any of the fields above may be
            absent, empty, or ``None``.

    Returns:
        A dict with one key, "text", holding the rendered string. A side with
        no usable field is rendered as an empty string.
    """
    # The trailing `or ""` matters: a `.get(key, "")` default only applies when
    # the key is absent, so a field that is present but None would otherwise
    # reach `.strip()` and raise AttributeError.
    user = (
        example.get("instruction")
        or example.get("input")
        or example.get("question")
        or ""
    )
    assistant = example.get("output") or example.get("answer") or ""
    text = f"### User: {user.strip()}\n### Assistant: {assistant.strip()}"
    return {"text": text}

def _has_prompt_and_reply(row):
    """Return True when both the user and the assistant part of row["text"] are non-empty."""
    user_part, _, assistant_part = row["text"].partition("\n### Assistant:")
    user_body = user_part.replace("### User:", "", 1)
    return bool(user_body.strip()) and bool(assistant_part.strip())


# Merge both corpora and render every row into a single "text" field.
dataset = concatenate_datasets([koalpaca, kobest]).map(format_data)
# format_data always emits the "### User:" / "### Assistant:" markers, so the
# rendered string is never blank and testing its overall length keeps every
# row. Inspect the two parts instead so rows without a question or without an
# answer are actually dropped.
dataset = dataset.filter(_has_prompt_and_reply)

print(f"✅ 데이터 전처리 완료. 샘플 수: {len(dataset)}")

In [None]:
# ====================================================
# 3️⃣ 토크나이저 & 모델 불러오기
# ====================================================
model_name = "MLP-KTLim/llama-3-Korean-Bllossom-8B"

# Options for loading the weights: bfloat16 parameters, device placement
# delegated to the library via device_map="auto".
model_load_options = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "trust_remote_code": True,
}

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)

# Tokenization later pads every sequence, so a pad token has to exist. If the
# tokenizer ships without one, register "[PAD]" and resize the embedding
# table to the new vocabulary size.
needs_pad_token = tokenizer.pad_token is None
if needs_pad_token:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

print("✅ 모델 및 토크나이저 로드 완료")

In [None]:
# ====================================================
# 4️⃣ LoRA 구성
# ====================================================
# LoRA hyper-parameters gathered in one mapping so they are easy to scan.
lora_settings = dict(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],  # only these projection layers get adapters
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
lora_config = LoraConfig(**lora_settings)

# Wrap the loaded model with the adapters described by lora_config.
model = get_peft_model(model, lora_config)
print("✅ LoRA 구성 완료")

In [None]:
# ====================================================
# 5️⃣ 토크나이징
# ====================================================
def tokenize(example):
    """Tokenize the "text" field of *example* with the module-level tokenizer.

    Sequences longer than 512 tokens are truncated and shorter ones are padded
    up to 512 (padding="max_length").

    Args:
        example: Mapping whose "text" entry is passed straight to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(example["text"], **encode_options)

# Run tokenize over the whole dataset in batched mode.
dataset = dataset.map(tokenize, batched=True)
# Expose only the two listed columns, formatted as torch tensors.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
print("✅ 데이터 토크나이징 완료")

In [None]:
# ====================================================
# 6️⃣ 학습 설정
# ====================================================
# Training hyper-parameters for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./lora-korean-chat",
    per_device_train_batch_size=1,
    # 1 sequence per step x 4 accumulated steps before each optimizer update.
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    # The model is loaded with torch_dtype=torch.bfloat16, so mixed precision
    # has to be bf16 too. fp16=True would autocast to a different 16-bit dtype
    # and enable fp16 gradient scaling, which does not match bfloat16 weights.
    bf16=True,
    optim="paged_adamw_8bit",
    report_to="none",
)

# Imported in this cell so the fix is self-contained; the class ships with the
# transformers package the notebook already installs and imports.
from transformers import DataCollatorForLanguageModeling

# The tokenized dataset only exposes input_ids and attention_mask, and the
# Trainer's default collator does not create labels. Without labels the model
# returns no loss and trainer.train() fails. With mlm=False this collator
# copies input_ids into labels and sets pad-token positions to -100 so the
# loss ignores padding.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
# ====================================================
# 7️⃣ 학습 시작
# ====================================================
print("🚀 학습 시작!")
# Run the training loop configured on `trainer`.
trainer.train()

In [None]:
# ====================================================
# 9️⃣ 가중치 저장 (Drive에)
# ====================================================
# Write the model (the PEFT-wrapped object returned by get_peft_model) and the
# tokenizer to SAVE_DIR on the mounted Drive.
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print(f"✅ 학습 완료 및 LoRA 가중치 저장됨 → {SAVE_DIR}")