<a href="https://colab.research.google.com/github/hwee1017/assignment/blob/main/code_practices/Attention_is_All_You_Need_Tutorial_(German_English).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ====================================================
# 1️⃣ Mount Google Drive
# ====================================================
from google.colab import drive

# Folder under the mounted Drive that later cells use as the training
# output directory and as the target of save_pretrained().
SAVE_DIR = "/content/drive/MyDrive/llama3_korean_lora"

# Attach Drive at /content/drive so that SAVE_DIR resolves inside it.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# ====================================================
# 2️⃣ 필수 패키지 설치
# ====================================================
!pip install -q datasets transformers peft accelerate bitsandbytes torch pandas tqdm

In [None]:
# ====================================================
# 3️⃣ Load and preprocess the data
# ====================================================
from datasets import load_dataset, concatenate_datasets
import pandas as pd

print("📚 데이터 로드 중...")

# Three Korean QA / conversational datasets, each read from its "train" split.
koalpaca = load_dataset("beomi/KoAlpaca-v1.1", split="train")
kobest = load_dataset("jinmang2/korean_chatbot_qa", split="train")
koconv = load_dataset("Ammad1Ali/Korean-conversational-dataset", split="train")

# Subsample each dataset: shuffle with a fixed seed, then keep the first N
# rows (or every row when the dataset is smaller than N).
# Caps are 2500 + 2500 + 2000, i.e. at most 7000 rows in total.
koalpaca = koalpaca.shuffle(seed=42)
koalpaca = koalpaca.select(range(min(len(koalpaca), 2500)))

kobest = kobest.shuffle(seed=42)
kobest = kobest.select(range(min(len(kobest), 2500)))

koconv = koconv.shuffle(seed=42)
koconv = koconv.select(range(min(len(koconv), 2000)))

# Preprocessing helpers
def _first_nonblank(example, keys):
    """Return the first non-blank value found under *keys*, stripped.

    Args:
        example: Mapping-like row supporting ``.get(key)``.
        keys: Candidate field names, tried in order.

    Returns:
        The stripped string form of the first value that is not ``None`` and
        not blank after stripping, or ``""`` when no candidate qualifies.
    """
    for key in keys:
        value = example.get(key)
        if value is None:
            continue
        # str() keeps non-string values from crashing on .strip().
        text = str(value).strip()
        if text:
            return text
    return ""


def format_data(example):
    """Render one row as a ``### User`` / ``### Assistant`` training sample.

    The user turn is taken from the first non-blank field among
    ``instruction``, ``input``, ``question`` and ``context``; the assistant
    turn from ``output``, ``answer`` and ``response``.

    Args:
        example: Mapping-like row supporting ``.get(key)``.

    Returns:
        ``{"text": <formatted sample>}``. The text is the empty string when
        either the user turn or the assistant turn is missing, so that a
        downstream "non-empty text" filter can actually discard such rows
        (a bare template with empty slots would always look non-empty).
    """
    user = _first_nonblank(
        example, ("instruction", "input", "question", "context")
    )
    assistant = _first_nonblank(example, ("output", "answer", "response"))

    if not user or not assistant:
        return {"text": ""}

    return {"text": f"### User: {user}\n### Assistant: {assistant}"}

# Stack the three sampled datasets and add the formatted "text" column.
merged = concatenate_datasets([koalpaca, kobest, koconv])
dataset = merged.map(format_data)

# Keep only rows whose text is non-empty once surrounding whitespace is removed.
dataset = dataset.filter(lambda row: bool(row["text"].strip()))

# Move to pandas to drop rows with duplicate text, then reshuffle with a
# fixed seed and renumber the index from zero.
df = pd.DataFrame(dataset)
df = df.drop_duplicates(subset=["text"])
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)

print(f"✅ 데이터 전처리 완료. 샘플 수: {len(df)}")

In [None]:
# ====================================================
# 4️⃣ Model and LoRA configuration
# ====================================================
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Guarded import: when the installed transformers has no EncoderDecoderCache,
# bind the name to None here instead of failing the cell.
# NOTE(review): this only affects the name in this notebook's namespace —
# confirm it is still needed.
try:
    from transformers import EncoderDecoderCache
except ImportError:
    EncoderDecoderCache = None

from peft import LoraConfig, get_peft_model, TaskType

MODEL_NAME = "MLP-KTLim/llama-3-Korean-Bllossom-8B"

# LoRA hyper-parameters: rank 8, alpha 32, 5% dropout, no bias training,
# applied to the modules named "q_proj" and "v_proj".
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load the base causal LM in bfloat16 and let device_map="auto" place it.
# NOTE(review): trust_remote_code=True allows code from the model repo to
# run locally — confirm it is required for this checkpoint.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Wrap the base model with the LoRA adapters described by lora_config.
model = get_peft_model(base_model, lora_config)
print("✅ 모델 & LoRA 설정 완료")

In [None]:
# ====================================================
# 5️⃣ Tokenization and data preparation
# ====================================================
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling

raw_dataset = Dataset.from_pandas(df)


def tokenize_function(example):
    """Tokenize a batch of formatted samples, truncating to 512 tokens.

    Args:
        example: A batch handed over by ``Dataset.map(..., batched=True)``;
            its ``"text"`` entry is passed to the tokenizer.

    Returns:
        The tokenizer's encoding of the batch (unpadded, no tensor conversion).
    """
    # No padding here: the data collator below pads each batch to its own
    # longest sequence, which avoids padding every sample out to 512 tokens.
    # No return_tensors either: the dataset is switched to torch format by
    # set_format() after mapping, so converting inside map() gains nothing.
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=512,
    )


tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
# Expose only the model inputs, as torch tensors.
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# mlm=False selects causal language modeling; the collator pads each batch
# and builds the labels from input_ids.
# NOTE(review): the collator ignores pad-token positions in the loss, and
# pad_token is set to eos_token earlier in this notebook — confirm that EOS
# positions being excluded from the loss is acceptable.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# ====================================================
# 6️⃣ Training configuration
# ====================================================
from transformers import Trainer, TrainingArguments

# Checkpoints go to SAVE_DIR once per epoch, keeping at most two of them.
# Effective batch size per device is 2 * 4 = 8 via gradient accumulation.
# Training runs in bf16 (fp16 explicitly off) with no external logging backend.
#
# No evaluation strategy is passed: the default is already "no", and the
# `evaluation_strategy` keyword was renamed to `eval_strategy`, so newer
# transformers releases (the install cell does not pin a version) reject
# the old name with a TypeError.
training_args = TrainingArguments(
    output_dir=SAVE_DIR,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    num_train_epochs=2,
    learning_rate=2e-4,
    fp16=False,
    bf16=True,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
)

In [None]:
# ====================================================
# 7️⃣ Train with Trainer
# ====================================================
from transformers import Trainer

# Bring together the LoRA-wrapped model, the training arguments, the
# tokenized dataset and the causal-LM collator.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

print("🚀 학습 시작...")
trainer.train()
print("✅ 학습 완료!")

In [None]:
# ====================================================
# 8️⃣ Save the model
# ====================================================
# Write the model first, then the tokenizer, into the same SAVE_DIR folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(SAVE_DIR)

print(f"💾 LoRA 모델 저장 완료 → {SAVE_DIR}")