## 1-1. 제목 (Markdown)

In [1]:
# Assignment 5 — Model Training (KoBART Summarization)
#201903774 언어인지과학과 한형준

#이 노트북은 `daekeun-ml/naver-news-summarization-ko` 데이터셋을 사용하여
#KoBART(`gogamza/kobart-base-v2`) 한국어 뉴스 요약 모델을 미세조정(fine-tuning)하는 코드입니다.

## 1-2. 환경 설정

In [2]:
!pip install -q transformers datasets sentencepiece accelerate evaluate rouge-score

import os
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
import evaluate

from google.colab import drive
drive.mount("/content/drive")

# ✅ 여기다가 넣기
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))
else:
    print("⚠ GPU가 안 잡혔습니다. 런타임 유형을 GPU로 바꿔주세요.")

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
CUDA available: True
GPU name: Tesla T4


## 1-3. 설정값(config) 정의

In [3]:
# 데이터/모델/학습 설정
MODEL_NAME = "gogamza/kobart-base-v2"   # KoBART base
OUTPUT_DIR = "/content/drive/MyDrive/boncahier/models/kobart_ko_news"
os.makedirs(OUTPUT_DIR, exist_ok=True)
print("OUTPUT_DIR:", OUTPUT_DIR)  # 학습 결과(가중치, 로그)가 저장될 경로

MAX_SOURCE_LENGTH = 512   # 입력 최대 토큰 길이
MAX_TARGET_LENGTH = 128   # 요약 최대 토큰 길이

NUM_TRAIN_EPOCHS = 4      # Colab에서 부담되지 않도록 소규모 에폭
BATCH_SIZE = 4            # GPU VRAM에 맞게 조절
LEARNING_RATE = 3e-5

SEED = 42

# 빠른 실험을 위해 샘플 수 제한 (None이면 전체 사용)
MAX_TRAIN_SAMPLES = 5000
MAX_EVAL_SAMPLES  = 1000
MAX_TEST_SAMPLES  = 1000

os.makedirs(OUTPUT_DIR, exist_ok=True)

OUTPUT_DIR: /content/drive/MyDrive/boncahier/models/kobart_ko_news


## 1-4. 데이터 로드 및 분할

In [4]:
# ============================================================
# 1. CSV에서 한국어 요약 데이터 로드 (Assignment 4 결과 재사용)
#    - 파일: data/naver_news_summarization_ko.csv
#    - 컬럼: date, category, press, title, document, link, summary
# ============================================================

csv_path = "data/naver_news_summarization_ko.csv"

ko_df = pd.read_csv(csv_path)

# 우리가 필요한 건 요약 학습용 컬럼: document(본문), summary(요약)
ko_df = ko_df[["document", "summary"]].dropna().reset_index(drop=True)
print("Loaded from CSV:", ko_df.shape)
print(ko_df.head(3))

def filter_examples(df):
    df = df.copy()
    df["doc_len"] = df["document"].astype(str).str.len()
    df["sum_len"] = df["summary"].astype(str).str.len()
    df["ratio"] = df["sum_len"] / df["doc_len"]

    df = df[(df.doc_len >= 200) & (df.sum_len >= 30)]
    df = df[(df.ratio >= 0.05) & (df.ratio <= 0.7)]

    return df.reset_index(drop=True)

ko_df = filter_examples(ko_df)
print("After filtering:", ko_df.shape)

# HuggingFace Dataset으로 변환
raw_dataset = Dataset.from_pandas(ko_df, preserve_index=False)
print(raw_dataset)

# 셔플 후 train/valid/test로 8:1:1 분할
raw_dataset = raw_dataset.shuffle(seed=SEED)

train_valid_test = raw_dataset.train_test_split(test_size=0.2, seed=SEED)
temp = train_valid_test["test"]
valid_test = temp.train_test_split(test_size=0.5, seed=SEED)

dataset_dict = DatasetDict({
    "train": train_valid_test["train"],
    "validation": valid_test["train"],
    "test": valid_test["test"],
})

dataset_dict

Loaded from CSV: (22194, 2)
                                            document  \
0  앵커 정부가 올해 하반기 우리 경제의 버팀목인 수출 확대를 위해 총력을 기울이기로 ...   
1  문어 랍스터 대게 갑오징어 새우 소라 등 해산물 활용 미국식 해물찜 시푸드 보일 준...   
2  한탄바이러스 발견 노벨상 유력 후보로 자주 거론 한국을 대표하는 의학자이자 미생물학...   

                                             summary  
0  올해 상반기 우리나라 무역수지는 역대 최악인 103억 달러 적자를 기록한 가운데, ...  
1  인터엑스 1층 뷔페 레스토랑 브래서리는 오는 6일부터 8월31일까지 쿨 섬머 페스타...  
2  이 이호왕 고려대 명예교수는 바이러스의 병원체와 진단법 백신까지 모두 개발한 한국을...  
After filtering: (19060, 5)
Dataset({
    features: ['document', 'summary', 'doc_len', 'sum_len', 'ratio'],
    num_rows: 19060
})


DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'doc_len', 'sum_len', 'ratio'],
        num_rows: 15248
    })
    validation: Dataset({
        features: ['document', 'summary', 'doc_len', 'sum_len', 'ratio'],
        num_rows: 1906
    })
    test: Dataset({
        features: ['document', 'summary', 'doc_len', 'sum_len', 'ratio'],
        num_rows: 1906
    })
})

## 1-5. 샘플 수 제한(선택)

In [5]:
def maybe_subsample(ds, max_samples, seed=SEED):
    if max_samples is None or len(ds) <= max_samples:
        return ds
    return ds.shuffle(seed=seed).select(range(max_samples))

train_dataset = maybe_subsample(dataset_dict["train"], MAX_TRAIN_SAMPLES)
eval_dataset  = maybe_subsample(dataset_dict["validation"], MAX_EVAL_SAMPLES)
test_dataset  = maybe_subsample(dataset_dict["test"], MAX_TEST_SAMPLES)

print("Train size:", len(train_dataset))
print("Valid size:", len(eval_dataset))
print("Test size :", len(test_dataset))

Train size: 5000
Valid size: 1000
Test size : 1000


## 1-6. 토크나이저 & 데이터 전처리

In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

# KoBART의 경우 pad_token이 없으면 eos_token으로 설정
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    # 입력: document, 타깃: summary
    inputs = examples["document"]
    targets = examples["summary"]

    model_inputs = tokenizer(
        inputs,
        max_length=MAX_SOURCE_LENGTH,
        padding="max_length",
        truncation=True,
    )

    labels = tokenizer(
        targets,
        max_length=MAX_TARGET_LENGTH,
        padding="max_length",
        truncation=True,
    )["input_ids"]

    # padding 토큰은 -100으로 바꿔서 loss에서 무시되도록 처리
    labels = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_seq]
        for label_seq in labels
    ]
    model_inputs["labels"] = labels
    return model_inputs

tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)

tokenized_eval = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=eval_dataset.column_names,
)

tokenized_test = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=test_dataset.column_names,
)

tokenized_train[0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

You passed `num_labels=3` which is incompatible to the `id2label` map of length `2`.


tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

You passed `num_labels=3` which is incompatible to the `id2label` map of length `2`.


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'input_ids': [14802,
  14040,
  12074,
  20169,
  14082,
  27914,
  29457,
  15906,
  25003,
  245,
  365,
  15126,
  248,
  14802,
  15126,
  248,
  26920,
  11911,
  14680,
  14141,
  15328,
  16356,
  19749,
  16099,
  14622,
  27914,
  249,
  13699,
  9264,
  14039,
  11712,
  9085,
  16347,
  13679,
  19949,
  10500,
  306,
  286,
  9242,
  14039,
  16476,
  14670,
  19446,
  17148,
  16067,
  14040,
  13607,
  15100,
  17468,
  14175,
  10314,
  19610,
  14364,
  21716,
  15615,
  14188,
  11372,
  10314,
  29457,
  27914,
  29457,
  22986,
  27914,
  249,
  13699,
  9264,
  28733,
  14145,
  18288,
  14152,
  16832,
  14280,
  14045,
  15541,
  18354,
  14058,
  15260,
  16969,
  14862,
  18531,
  25756,
  17167,
  19749,
  19042,
  13590,
  14289,
  15210,
  14098,
  12037,
  16476,
  14670,
  19446,
  17148,
  10952,
  15100,
  12790,
  10314,
  14854,
  17316,
  21716,
  16982,
  14068,
  14253,
  14130,
  27914,
  249,
  13699,
  9264,
  16099,
  26466,
  14085,
  16239,
  

## 1-7. 모델/데이터콜레이터/평가지표 설정

In [7]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# 디코더 시작 토큰 설정 (필요 시)
if model.config.decoder_start_token_id is None:
    model.config.decoder_start_token_id = tokenizer.bos_token_id

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

rouge = evaluate.load("rouge")

def postprocess_text(preds, labels):
    preds = [p.strip() for p in preds]
    labels = [l.strip() for l in labels]
    return preds, labels

def compute_metrics(eval_pred):
    preds, labels = eval_pred

    # 생성 결과 디코딩
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # 레이블 디코딩(-100 → pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    # ROUGE 점수(0~1)를 0~100 스케일로 변환
    result = {k: round(v * 100, 2) for k, v in result.items()}
    return result

You passed `num_labels=3` which is incompatible to the `id2label` map of length `2`.


model.safetensors:   0%|          | 0.00/495M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

## 1-8. 학습 설정 & Trainer 생성

In [8]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    predict_with_generate=False,
    fp16=torch.cuda.is_available(),  # GPU 있으면 mixed precision
    seed=SEED,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


## 1-9. 학습 실행

In [9]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhhj2000[0m ([33mhhj2000-hanguk-university-of-foreign-studies[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.6567
1000,0.3932




TrainOutput(global_step=1252, training_loss=0.48349771103539024, metrics={'train_runtime': 597.6506, 'train_samples_per_second': 33.464, 'train_steps_per_second': 2.095, 'total_flos': 6097364582400000.0, 'train_loss': 0.48349771103539024, 'epoch': 4.0})

## 1-10. 최종 모델/토크나이저 저장

In [10]:
# 최종 모델 및 토크나이저 저장
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"✅ 모델과 토크나이저가 {OUTPUT_DIR} 에 저장되었습니다.")

✅ 모델과 토크나이저가 /content/drive/MyDrive/boncahier/models/kobart_ko_news 에 저장되었습니다.
