In [1]:
!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install evaluate
!pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=f638bfb1166359e2ade8a3ef6617b478264fd7d2fb4e74914ead1fa6aa7bc625
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [2]:
!pip install msal msal-extensions

Collecting msal
  Downloading msal-1.32.3-py3-none-any.whl.metadata (11 kB)
Collecting msal-extensions
  Downloading msal_extensions-1.3.1-py3-none-any.whl.metadata (7.8 kB)
Downloading msal-1.32.3-py3-none-any.whl (115 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.4/115.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading msal_extensions-1.3.1-py3-none-any.whl (20 kB)
Installing collected packages: msal, msal-extensions
Successfully installed msal-1.32.3 msal-extensions-1.3.1


In [3]:
from datasets import Dataset, DatasetDict
from google.colab import drive, userdata
import huggingface_hub
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback # PretrainedModel


In [4]:
# KoBART 토크나이저 및 모델 로드
# https://huggingface.co/gogamza/kobart-base-v2

# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments

model_name = 'gogamza/kobart-base-v2'

tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json:   0%|          | 0.00/682k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels will be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels will be overwritten to 2.


model.safetensors:   0%|          | 0.00/495M [00:00<?, ?B/s]

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

from google.colab import drive
drive.mount('/content/drive')  # Google Drive를 마운트하여 데이터셋에 접근

# 데이터 로드
df = pd.read_csv('/content/drive/MyDrive/combined_dataset.csv')
df = df[['original_text', 'simple_text']].dropna()

# 학습/검증/테스트 분할 8:1:1
# Step 1: 전체에서 test 먼저 고정
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Step 2: 나머지에서 train / val 분할 (예: 8:1이 되도록)
train_df, val_df = train_test_split(train_val_df, test_size=0.1111, random_state=42)  # ≒ 0.1111 → 전체에서 10%


Mounted at /content/drive


In [6]:
# HuggingFace Dataset 객체로 변환

from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)


In [7]:
# 토크나이징 함수 정의

max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    model_inputs = tokenizer(examples['original_text'], max_length=max_input_length, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['simple_text'], max_length=max_target_length, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_tokenized = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
val_tokenized = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)


Map:   0%|          | 0/392 [00:00<?, ? examples/s]



Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [8]:
import evaluate
import numpy as np

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {k: round(v, 4) for k, v in result.items()}


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [16]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import EarlyStoppingCallback

# DataCollator 설정 (padding 처리 자동)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    run_name="kobart_base_v2_finetuned",
    output_dir="./outputs", # 체크포인트 저장 경로
    overwrite_output_dir=True, # 기존 결과 덮어쓰기

    do_train=True,
    do_eval=True,

    eval_strategy="epoch", # 각 epoch 종료 후 평가
    save_strategy="epoch", # 각 epoch마다 모델 저장
    save_total_limit=1, # 가장 좋은 모델만 유지

    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    num_train_epochs=10, # 🔧학습 반복 횟수 5, 7, 10
    learning_rate=2e-5, # 🔧 초기 학습률 3e-5, 1e-4, 2e-5

    logging_strategy="steps",  # 로그 출력
    logging_steps=10,

    report_to="none", # wandb 등의 리포팅 끔
    predict_with_generate=True, # ROUGE 계산 위해
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)], # Early Stopping 설정
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [None]:
# ------------------------------
# 실험 결과 비교 요약
# ------------------------------

# 실험 1 (learning_rate=2e-5)
# - Train Loss: 0.0117
# - Best Validation Loss: 0.355894
# - ROUGE sum 최대값: 0.12 (1epoch)

# 실험 2 (learning_rate=1e-4)
# - Train Loss: 0.4086 (너무 큼 → 과적합 발생)
# - Validation Loss: 0.327072, 단 Validation Loss가 줄지 않고 증가하는 모습 보임 (327→370)
# - ROUGE sum 최대값: 0.14 (3 epoch)

# 실험 3 (learning_rate=3e-5)
# - Train Loss: 0.0159
# - Best Validation Loss: 0.347933, 단 Validation Loss가 줄지 않고 증가하는 모습 보임 (347→369)
# - ROUGE sum 최대값: 0.12 (1, 4, 6 epoch)

# ------------------------------
# 종합 판단
# ------------------------------
# - 실험 2는 Train Loss가 가장 낮지만, Validation Loss가 지속적으로 증가하여 과적합 가능성이 큼
# - 실험 3은 ROUGE 점수가 일시적으로 높았으나 Validation Loss가 증가하고 점수도 일관되지 않음
# - 실험 1은 Validation Loss가 안정적으로 유지되며, ROUGE 점수도 전체적으로 가장 균형 있게 높음
# → 따라서 일반화 성능과 안정성을 고려해 실험 1(learning_rate=2e-5)을 최종 모델로 선정

# ------------------------------
# Early Stopping 설정 이유
# ------------------------------
# - 초기에는 early_stopping_patience=3으로 설정했으나
# - 학습 중 개선 폭이 작아 쉽게 종료되는 문제가 발생
# - 더 충분한 학습 기회를 주기 위해 patience 값을 5로 늘려 설정

# ------------------------------
# 하이퍼 파라미터 설정:
# ------------------------------
# - learning_rate = 2e-5
# - num_train_epochs = 10
# - early_stopping_patience = 5


In [17]:
# 학습
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.0099,0.355894,0.1033,0.02,0.1,0.1
2,0.0169,0.359558,0.12,0.04,0.12,0.12
3,0.008,0.357807,0.09,0.02,0.09,0.08
4,0.0067,0.367187,0.09,0.02,0.09,0.08
5,0.0118,0.36299,0.09,0.02,0.08,0.08
6,0.0105,0.370073,0.11,0.02,0.11,0.1


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=294, training_loss=0.011775994800081869, metrics={'train_runtime': 111.3558, 'train_samples_per_second': 35.202, 'train_steps_per_second': 4.4, 'total_flos': 179262518722560.0, 'train_loss': 0.011775994800081869, 'epoch': 6.0})

In [15]:
!pip install matplotlib scikit-learn
!pip install bert-score
!pip install evaluate
!pip install sacremoses sacrebleu

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected pa

In [18]:
def generate_rewrite(input_text: str):
    model.eval()
    model.to("cuda")

    # tokenizer가 반환하는 dict 중 필요한 키만 추출
    tokenized = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding=True
    ).to(model.device)

    # token_type_ids 제거
    inputs = {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"]
    }

    # generate 호출
    outputs = model.generate(**inputs, max_new_tokens=50)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# 테스트셋 사용
originals_sample = test_df["original_text"].tolist()
references_sample = test_df["simple_text"].tolist()

# 모델 생성 결과
rewrites = [generate_rewrite(sentence) for sentence in originals_sample]

import sys
sys.path.append('/content/drive/MyDrive')
from evaluate_utils import evaluate_model

evaluate_model(
    originals=originals_sample,
    rewrites=rewrites,
    references=references_sample,
    model_name="KoBART Seq2Seq Fine-tuned"
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Downloading builder script:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

평가 결과 - KoBART Seq2Seq Fine-tuned
──────────────────────────────
KoBERTScore:     0.7721
SARI Score:      60.73
평균 문장 길이:   9.40 단어
──────────────────────────────
[1] Original : 맨손체조, 가벼운 산책이나 1주일에 2-3회, 30분 정도의 걷기운동은 가능합니다.
[1] Rewrite  : 맨손체조, 산책같은 운동은 한 달 동안 하지 마세요.
----------------------------------------
[2] Original : 복대는 수술 부위를 보호하고 지지하는 효과가 있으므로 1개월 정도까지 산책하거나 활동할 때 착용하십시오.
[2] Rewrite  : 배에 두르는 복대는 아픈 데를 덜 아프게 해주는 거예요. 한 달 정도 계속 착용하면 몸에 힘이 생겨요.
----------------------------------------
[3] Original : 직장에 복귀한 후에는 서서히 활동 시간을 늘려 나가고, 업무량을 조절하여 처음부터 무리하지 않도록 합니다.
[3] Rewrite  : 직장에하고 나서도 조금씩 활동 시간을 늘려가고, 너무 무리하지 마세요.
----------------------------------------
[4] Original : 이 기간에는 수술로 인한 체력과 상처회복을 위해단백질(고기, 생선, 콩/두부 등)과 비타민 C(과일, 익힌 야채)를 많이 드십시오.
[4] Rewrite  : 수술로 몸이 많이 힘들었을 수 있으니 기운을 차리려면 비타민 C가 많은 음식을 먹어요.
----------------------------------------
[5] Original : 수술 후 장관이 부분적으로 또는 완전히 막혀 음식물, 소화액, 가스 등 장 내용물이 통과하지 못하는 상태를 말합니다.
[5] Rewrite  : 수술한 데가 갑자기 아프거나 열이 나면 병원에 다시 가야 해요.
----

