# 1. 라이브러리 설치

In [None]:
%pip install -q peft transformers datasets evaluate seqeval rouge_score

In [None]:
import torch
import numpy as np
import evaluate
from datasets import load_dataset
from tqdm import tqdm

# Transformers 관련
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# PEFT(LoRA) 관련
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    PeftModel
)

# 2. 전처리

In [None]:
# Base checkpoint; the model loaded here is the one wrapped with LoRA later on.
model_name_or_path = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)

# Dataset whose "dialogue" / "summary" columns are used for fine-tuning.
dataset = load_dataset("knkarthick/samsum")
print(f"전체 구조: {dataset}")
# Use single quotes inside the replacement field: reusing the enclosing
# double quotes is a SyntaxError on Python versions before 3.12.
print(f"train: {dataset['train'][0]}")

In [None]:
# Task prefix prepended to every dialogue before tokenization.
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of dialogues together with their reference summaries.

    Args:
        examples: Batch mapping that provides the "dialogue" and "summary"
            columns (the function is applied with ``batched=True``).

    Returns:
        The tokenizer output for the prefixed dialogues, with the token ids
        of the summaries stored under the "labels" key.
    """
    prompts = [prefix + dialogue for dialogue in examples["dialogue"]]
    model_inputs = tokenizer(prompts, max_length=1024, truncation=True)

    # Summaries are tokenized as targets and truncated to 128 tokens.
    target_encodings = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )
    model_inputs["labels"] = target_encodings["input_ids"]
    return model_inputs


# Apply the preprocessing function to the whole dataset via map().
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 3. LoRA 설정

In [None]:
# LoRA adapter settings for a sequence-to-sequence language model.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,  # the adapter is going to be trained
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Wrap the loaded model with the adapter, then report how many parameters
# remain trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 344,064 || all params: 77,305,216 || trainable%: 0.4451


# 4. 학습 설정

In [None]:
# Collator handed to the trainer for batching the tokenized examples.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# ROUGE metric, used both during training evaluation and for the final test.
metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays as
            passed by the trainer during evaluation.

    Returns:
        The dictionary returned by ``metric.compute`` for the decoded texts.
    """
    predictions, labels = eval_pred
    # Predictions may arrive as a tuple whose first element holds the
    # generated token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks padded / ignored positions and cannot be decoded, so map it
    # back to the pad token. Predictions need this as well as labels because
    # batches of different lengths are padded with -100 when gathered.
    pad_token_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return metric.compute(predictions=decoded_preds, references=decoded_labels)

In [None]:
# Hyperparameters and evaluation / checkpoint schedule for fine-tuning.
# NOTE(review): no generation length is configured here, while the final test
# loop generates with max_length=50 — confirm the per-epoch ROUGE scores are
# meant to be comparable with the final ones.
training_args = Seq2SeqTrainingArguments(
    output_dir="lora_t5_result",
    num_train_epochs=2,
    learning_rate=1e-3,
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy="epoch",  # evaluate at every epoch
    save_strategy="epoch",
    predict_with_generate=True,  # generation task
    report_to="none",  # disable wandb
)

# Trainer that ties together the LoRA-wrapped model, the tokenized splits,
# the collator and the ROUGE-based metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`
    # (available from transformers 4.46, old name scheduled for removal in 5.0).
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# 5. 학습, 모델 저장

In [None]:
# Run fine-tuning with the trainer configured earlier (training_args asks for
# 2 epochs, with evaluation and a checkpoint at every epoch).
trainer.train()

In [27]:
# Write the trained model to "lora_t5_final", the directory that the
# evaluation section later loads with PeftModel.from_pretrained.
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this
# presumably stores only the LoRA adapter files, not the base model — confirm.
model.save_pretrained("lora_t5_final")

# 6. 성능 평가

In [None]:
# Reload a fresh copy of the base checkpoint and its tokenizer for evaluation.
base_model_name_or_path = "google/flan-t5-small"
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path)

# Attach the adapter that was saved to "lora_t5_final" after training.
model = PeftModel.from_pretrained(base_model, "lora_t5_final")
# Use the GPU when one is present; otherwise fall back to CPU instead of
# failing on the hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

In [None]:
test_dataset = tokenized_datasets["test"]
# features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'], num_rows: 819

# Generate on whichever device the model currently lives on rather than
# assuming "cuda".
device = next(model.parameters()).device

predictions = []
# Iterate row by row: indexing the column first (test_dataset["input_ids"][i])
# can materialize the whole column again on every iteration.
for example in tqdm(test_dataset):
    input_ids = torch.tensor([example["input_ids"]], device=device)
    attention_mask = torch.tensor([example["attention_mask"]], device=device)

    # Beam search with 4 beams, output capped at 50 tokens.
    generated = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=50,
        num_beams=4,
    )

    predictions.append(tokenizer.decode(generated[0], skip_special_tokens=True))

# Score the generated summaries against the reference summaries with ROUGE.
result = metric.compute(predictions=predictions, references=test_dataset["summary"])
result