In [None]:
from datasets import load_dataset
ds = load_dataset("json", data_files={"train":"splits/train.jsonl",
                                    "validation":"splits/dev.jsonl"})

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 87290 examples [00:00, 1745868.26 examples/s]
Generating validation split: 10911 examples [00:00, 1363973.86 examples/s]


In [6]:
import os
import numpy as np
import torch
from datasets import load_dataset
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed,
)

# -------------------------- 설정 --------------------------
MODEL_NAME   = "skt/kobert-base-v1"
MAX_LEN      = 128
BATCH_SIZE   = 16
LR           = 2e-5
EPOCHS       = 5
WORK_DIR     = "./kobert-emotion"
SEED         = 42

set_seed(SEED)
os.makedirs(WORK_DIR, exist_ok=True)

# -------------------- 1) JSONL 데이터 로드 ----------------------
# 아래 경로를 실제 파일명으로 바꿔주세요.
data_files = {
    "train":      "emotion_train_dataset.jsonl",
    "validation": "emotion_valid_dataset.jsonl",
    # "test":    "emotion_test_dataset.jsonl",  # 테스트셋이 있으면 주석 해제
}
ds = load_dataset("json", data_files=data_files)

# -------------------- 2) "label_id" → "labels" rename ----------------------
# Trainer는 "labels" 컬럼을 기대하기 때문에 이름을 바꿔줍니다.
ds = ds.rename_column("label_id", "labels")

# -------------------- 3) 토크나이저 & 토큰화 ----------------------------
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

def tokenize_fn(batch):
    return tok(
        batch["text"],
        truncation=True,
        max_length=MAX_LEN,
        padding=False,            # DataCollatorWithPadding가 패딩을 처리
        return_token_type_ids=False
    )

# "text" 컬럼을 없애고 input_ids·attention_mask만 남기기
ds_enc = ds.map(tokenize_fn, batched=True, remove_columns=["text"])

# 마지막으로, Trainer에게 넘길 형식(텐서) 지정
ds_enc.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# -------------------- 4) DataCollator, 모델, 지표 -------------------------
data_collator = DataCollatorWithPadding(tokenizer=tok, pad_to_multiple_of=8)

# 레이블 수는 데이터셋에 있는 고유 label_id 개수와 일치하도록 설정하세요.
# 예를 들어 0~6 범위라면 num_labels=7
num_labels = len(set(ds_enc["train"]["labels"].numpy().tolist()))

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)

metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc   = metric_acc.compute(predictions=preds, references=labels)["accuracy"]
    f1    = metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"]
    return {"accuracy": acc, "macro_f1": f1}

# -------------------- 5) Trainer & 학습 인자 ------------------------------
training_args = TrainingArguments(
    output_dir               = WORK_DIR,
    evaluation_strategy      = "epoch",
    save_strategy            = "epoch",
    load_best_model_at_end   = True,
    metric_for_best_model    = "macro_f1",
    greater_is_better        = True,

    num_train_epochs         = EPOCHS,
    per_device_train_batch_size  = BATCH_SIZE,
    per_device_eval_batch_size   = BATCH_SIZE * 2,
    learning_rate            = LR,
    weight_decay             = 0.01,
    fp16                     = torch.cuda.is_available(),
    report_to                = "none",
    seed                     = SEED,
    save_total_limit         = 2,
)

trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = ds_enc["train"],
    eval_dataset    = ds_enc["validation"],
    tokenizer       = tok,
    data_collator   = data_collator,
    compute_metrics = compute_metrics,
)

# -------------------- 6) 학습 및 저장 --------------------------
if __name__ == "__main__":
    print(f"▶️ 학습 샘플 수: {len(ds_enc['train'])}")
    print(f"▶️ 검증 샘플 수: {len(ds_enc['validation'])}")
    if "test" in ds_enc:
        print(f"▶️ 테스트 샘플 수: {len(ds_enc['test'])}")

    print("🚀 학습 시작...")
    trainer.train()

    print("💾 최종 모델 저장 중...")
    trainer.save_model(WORK_DIR)
    tok.save_pretrained(WORK_DIR)

    print("✅ 저장 완료! 모델 및 토크나이저가 다음 경로에 저장되었습니다:")
    print(os.path.abspath(WORK_DIR))

    # -------------------- 7) (Optional) 테스트셋 평가 --------------------------
    if "test" in ds_enc:
        print("\n📊 테스트셋 평가 결과:")
        test_metrics = trainer.evaluate(ds_enc["test"])
        for k, v in test_metrics.items():
            print(f"   {k}: {v:.4f}")


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
