In [1]:
import pandas as pd
import os
import re
import json
import yaml
import random
import numpy as np
import torch
from glob import glob
from tqdm import tqdm
from pprint import pprint
from rouge import Rouge

# Transformers & Torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    BartForConditionalGeneration, 
    BartConfig,
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
import wandb

# -----------------------------------------------------------------------------
# 1. Configuration & Seed Setting (설정 및 시드 고정)
# -----------------------------------------------------------------------------

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# 고득점을 위한 하이퍼파라미터 튜닝
CONF = {
    "general": {
        "data_path": "./data/", # 데이터 경로 (수정 필요 시 변경)
        "model_name": "digit82/kobart-summarization",
        "output_dir": "./results",
        "seed": 42
    },
    "tokenizer": {
        "encoder_max_len": 512, # 대화문은 길 수 있으므로 512 유지
        "decoder_max_len": 128, # 요약문 길이를 약간 여유있게 설정
        # DialogueSum 데이터셋에 등장하는 특수 토큰들
        "special_tokens": ['#Person1#', '#Person2#', '#Person3#', '#Person4#', '#Person5#', '#Person6#', '#Person7#', '#PhoneNumber#', '#Address#', '#PassportNumber#']
    },
    "training": {
        "overwrite_output_dir": True,
        "num_train_epochs": 15, # Early Stopping이 있으므로 넉넉하게
        "learning_rate": 2e-5,  # Fine-tuning에 적합한 LR
        "per_device_train_batch_size": 32, # GPU 메모리에 맞춰 조절 (VRAM 부족시 16으로 감소)
        "per_device_eval_batch_size": 32,
        "warmup_ratio": 0.1,
        "weight_decay": 0.01,
        "lr_scheduler_type": 'cosine',
        "optim": 'adamw_torch',
        "gradient_accumulation_steps": 1,
        "evaluation_strategy": 'epoch',
        "save_strategy": 'epoch',
        "save_total_limit": 3,
        "fp16": True, # 학습 속도 향상
        "load_best_model_at_end": True,
        "metric_for_best_model": "combined_score", # 커스텀 메트릭 기준
        "greater_is_better": True,
        "logging_dir": "./logs",
        "logging_strategy": "steps",
        "logging_steps": 100,
        "predict_with_generate": True, # 중요: Eval 시 생성 모드 활성화
        "generation_max_length": 128,
        "early_stopping_patience": 3,
        "report_to": "none" # wandb 사용시 "wandb"로 변경
    },
    "inference": {
        "ckpt_path": "./results/checkpoint-best", # 학습 후 자동 설정됨
        "result_path": "./prediction/",
        "no_repeat_ngram_size": 3, # 반복되는 문구 억제 (2 또는 3 추천)
        "early_stopping": True,
        "generate_max_length": 128,
        "num_beams": 5, # Beam Search 크기 증가 (정확도 향상)
        "length_penalty": 1.0, # 길이에 대한 페널티 (0.6~1.2 실험 권장)
        "batch_size": 32,
        "remove_tokens": [] # 추후 토크나이저 로드 후 설정
    }
}

seed_everything(CONF['general']['seed'])
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# -----------------------------------------------------------------------------
# 2. Data Preprocessing (데이터 전처리)
# -----------------------------------------------------------------------------

# Tokenizer 로드 및 Special Token 추가
tokenizer = AutoTokenizer.from_pretrained(CONF['general']['model_name'])
special_tokens_dict = {'additional_special_tokens': CONF['tokenizer']['special_tokens']}
tokenizer.add_special_tokens(special_tokens_dict)

CONF['tokenizer']['bos_token'] = tokenizer.bos_token
CONF['tokenizer']['eos_token'] = tokenizer.eos_token
CONF['inference']['remove_tokens'] = [tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token]

class Preprocess:
    def __init__(self, bos_token, eos_token):
        self.bos_token = bos_token
        self.eos_token = eos_token

    def make_set_as_df(self, file_path, is_train=True):
        df = pd.read_csv(file_path)
        if is_train:
            return df[['fname', 'dialogue', 'summary']]
        else:
            return df[['fname', 'dialogue']]

    def make_input(self, dataset, is_test=False):
        if is_test:
            encoder_input = dataset['dialogue']
            # Test 시에는 Decoder 입력 시작 토큰만 줌
            decoder_input = [self.bos_token] * len(dataset['dialogue'])
            return encoder_input.tolist(), list(decoder_input)
        else:
            encoder_input = dataset['dialogue']
            # Train 시에는 Summary를 Decoder 입력으로 사용 (Teacher Forcing)
            decoder_input = dataset['summary'].apply(lambda x: self.bos_token + str(x))
            decoder_output = dataset['summary'].apply(lambda x: str(x) + self.eos_token)
            return encoder_input.tolist(), decoder_input.tolist(), decoder_output.tolist()

class CustomDataset(Dataset):
    def __init__(self, encoder_input, decoder_input, labels=None, is_test=False, ids=None):
        self.encoder_input = encoder_input
        self.decoder_input = decoder_input
        self.labels = labels
        self.is_test = is_test
        self.ids = ids

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encoder_input.items()}
        
        # Decoder Input
        if 'input_ids' in self.decoder_input:
            item['decoder_input_ids'] = self.decoder_input['input_ids'][idx].clone().detach()
            item['decoder_attention_mask'] = self.decoder_input['attention_mask'][idx].clone().detach()
        else:
            # Inference 시점 (List 형태일 수 있음)
            # 여기서는 로직상 토크나이징된 결과가 들어오므로 위 조건문이 실행됨
            pass

        if not self.is_test:
            # Labels (pad token은 loss 계산에서 제외하기 위해 -100 처리하는 것이 좋으나, 
            # 여기서는 tokenizer가 처리한 그대로 사용하고 pad masking은 DataCollator가 보통 처리함.
            # 베이스라인 방식을 따르되, labels가 존재하면 추가.
            item['labels'] = self.labels['input_ids'][idx].clone().detach()
            
        if self.ids is not None:
            item['ID'] = self.ids[idx]
            
        return item

    def __len__(self):
        return len(self.encoder_input['input_ids'])

def prepare_data(conf, tokenizer, is_train=True):
    preprocessor = Preprocess(conf['tokenizer']['bos_token'], conf['tokenizer']['eos_token'])
    data_path = conf['general']['data_path']
    
    if is_train:
        train_df = preprocessor.make_set_as_df(os.path.join(data_path, 'train.csv'))
        val_df = preprocessor.make_set_as_df(os.path.join(data_path, 'dev.csv'))
        
        # Train Data
        enc_train, dec_in_train, dec_out_train = preprocessor.make_input(train_df)
        tokenized_enc_train = tokenizer(enc_train, return_tensors="pt", padding=True, truncation=True, max_length=conf['tokenizer']['encoder_max_len'])
        tokenized_dec_in_train = tokenizer(dec_in_train, return_tensors="pt", padding=True, truncation=True, max_length=conf['tokenizer']['decoder_max_len'])
        tokenized_dec_out_train = tokenizer(dec_out_train, return_tensors="pt", padding=True, truncation=True, max_length=conf['tokenizer']['decoder_max_len'])
        
        train_dataset = CustomDataset(tokenized_enc_train, tokenized_dec_in_train, tokenized_dec_out_train)
        
        # Val Data
        enc_val, dec_in_val, dec_out_val = preprocessor.make_input(val_df)
        tokenized_enc_val = tokenizer(enc_val, return_tensors="pt", padding=True, truncation=True, max_length=conf['tokenizer']['encoder_max_len'])
        tokenized_dec_in_val = tokenizer(dec_in_val, return_tensors="pt", padding=True, truncation=True, max_length=conf['tokenizer']['decoder_max_len'])
        tokenized_dec_out_val = tokenizer(dec_out_val, return_tensors="pt", padding=True, truncation=True, max_length=conf['tokenizer']['decoder_max_len'])
        
        val_dataset = CustomDataset(tokenized_enc_val, tokenized_dec_in_val, tokenized_dec_out_val)
        
        return train_dataset, val_dataset
    
    else: # Test
        test_df = preprocessor.make_set_as_df(os.path.join(data_path, 'test.csv'), is_train=False)
        enc_test, dec_in_test = preprocessor.make_input(test_df, is_test=True)
        
        tokenized_enc_test = tokenizer(enc_test, return_tensors="pt", padding=True, truncation=True, max_length=conf['tokenizer']['encoder_max_len'])
        # Test시 Decoder input은 start token만 있으면 됨 (배치 처리를 위해 tokenize)
        tokenized_dec_in_test = tokenizer(dec_in_test, return_tensors="pt", padding=True, truncation=True, max_length=conf['tokenizer']['decoder_max_len'])
        
        test_dataset = CustomDataset(tokenized_enc_test, tokenized_dec_in_test, is_test=True, ids=test_df['fname'].tolist())
        return test_df, test_dataset

# -----------------------------------------------------------------------------
# 3. Model Training (모델 학습)
# -----------------------------------------------------------------------------
def compute_metrics(eval_pred):
    rouge = Rouge()
    predictions, labels = eval_pred
    
    # [수정 1] predictions 처리
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    
    # -100을 pad_token_id로 치환
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # [핵심 수정] skip_special_tokens=False로 설정하여 #Person# 토큰 살리기
    decoded_preds = tokenizer.batch_decode(predictions.tolist(), skip_special_tokens=False)
    decoded_labels = tokenizer.batch_decode(labels.tolist(), skip_special_tokens=False)
    
    # [추가] 시스템 토큰(BOS, EOS, PAD)만 수동으로 제거
    # 모델이 학습한 Special Token인 #Person1# 등은 남겨야 함
    remove_tokens = [tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token]
    
    def clean_text(text_list):
        cleaned = []
        for text in text_list:
            for token in remove_tokens:
                if token is not None:
                    text = text.replace(token, "")
            cleaned.append(text.strip())
        return cleaned

    decoded_preds = clean_text(decoded_preds)
    decoded_labels = clean_text(decoded_labels)
    
    # ROUGE 점수 계산
    try:
        results = rouge.get_scores(decoded_preds, decoded_labels, avg=True)
        r1 = results["rouge-1"]["f"]
        r2 = results["rouge-2"]["f"]
        rl = results["rouge-l"]["f"]
        combined_score = (r1 + r2 + rl) / 3
        
        return {
            "rouge-1": r1,
            "rouge-2": r2,
            "rouge-l": rl,
            "combined_score": combined_score
        }
    except Exception as e:
        print(f"Error in metrics: {e}")
        return {"combined_score": 0.0}


def train():
    # 데이터 로드
    train_dataset, val_dataset = prepare_data(CONF, tokenizer, is_train=True)
    
    # 모델 로드
    model = BartForConditionalGeneration.from_pretrained(CONF['general']['model_name'])
    model.resize_token_embeddings(len(tokenizer)) # Special token 추가 반영
    model.to(device)
    
    # Training Arguments
    args = Seq2SeqTrainingArguments(
        output_dir=CONF['general']['output_dir'],
        overwrite_output_dir=CONF['training']['overwrite_output_dir'],
        num_train_epochs=CONF['training']['num_train_epochs'],
        learning_rate=CONF['training']['learning_rate'],
        per_device_train_batch_size=CONF['training']['per_device_train_batch_size'],
        per_device_eval_batch_size=CONF['training']['per_device_eval_batch_size'],
        warmup_ratio=CONF['training']['warmup_ratio'],
        weight_decay=CONF['training']['weight_decay'],
        lr_scheduler_type=CONF['training']['lr_scheduler_type'],
        optim=CONF['training']['optim'],
        evaluation_strategy=CONF['training']['evaluation_strategy'],
        save_strategy=CONF['training']['save_strategy'],
        save_total_limit=CONF['training']['save_total_limit'],
        fp16=CONF['training']['fp16'],
        load_best_model_at_end=CONF['training']['load_best_model_at_end'],
        metric_for_best_model=CONF['training']['metric_for_best_model'],
        greater_is_better=CONF['training']['greater_is_better'],
        logging_dir=CONF['training']['logging_dir'],
        logging_steps=CONF['training']['logging_steps'],
        predict_with_generate=CONF['training']['predict_with_generate'],
        generation_max_length=CONF['training']['generation_max_length'],
        report_to=CONF['training']['report_to'],
        seed=CONF['general']['seed']
    )
    
    # Early Stopping
    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=CONF['training']['early_stopping_patience']
    )
    
    # Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping]
    )
    
    print(">>> Training Start...")
    trainer.train()
    
    # Best Model 저장
    best_model_path = os.path.join(CONF['general']['output_dir'], "best_model")
    trainer.save_model(best_model_path)
    print(f">>> Training Finished. Best Model Saved at {best_model_path}")
    
    return best_model_path

# -----------------------------------------------------------------------------
# 4. Inference (추론)
# -----------------------------------------------------------------------------

def inference(model_path=None):
    if model_path is None:
        model_path = os.path.join(CONF['general']['output_dir'], "best_model")
        
    print(f">>> Load Model from {model_path}")
    
    # 모델 로드
    model = BartForConditionalGeneration.from_pretrained(model_path)
    model.to(device)
    model.eval()
    
    # 데이터 로드
    test_df, test_dataset = prepare_data(CONF, tokenizer, is_train=False)
    dataloader = DataLoader(test_dataset, batch_size=CONF['inference']['batch_size'], shuffle=False)
    
    summary_list = []
    fname_list = []
    
    # 시스템 토큰 정의 (제거 대상)
    system_tokens = [tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token]

    print(">>> Inference Start...")
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            summary_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                num_beams=CONF['inference']['num_beams'],
                max_length=CONF['inference']['generate_max_length'],
                no_repeat_ngram_size=CONF['inference']['no_repeat_ngram_size'],
                early_stopping=CONF['inference']['early_stopping'],
                length_penalty=CONF['inference']['length_penalty']
            )
            
            # [핵심 수정] skip_special_tokens=False로 변경
            decoded = tokenizer.batch_decode(summary_ids, skip_special_tokens=False)
            
            # [추가] 시스템 토큰만 제거하고 #Person# 등은 유지
            cleaned_batch = []
            for text in decoded:
                for token in system_tokens:
                    if token is not None:
                        text = text.replace(token, "")
                cleaned_batch.append(text.strip())
            
            summary_list.extend(cleaned_batch)
            fname_list.extend(batch['ID'])
            
    # 결과 저장
    result_path = CONF['inference']['result_path']
    os.makedirs(result_path, exist_ok=True)
    
    output_df = pd.DataFrame({
        "fname": fname_list,
        "summary": summary_list
    })
    
    save_file = os.path.join(result_path, "output.csv")
    output_df.to_csv(save_file, index=False)
    print(f">>> Inference Finished. Saved at {save_file}")
    
    return output_df

# -----------------------------------------------------------------------------
# 5. Main Execution
# -----------------------------------------------------------------------------

if __name__ == "__main__":
    # 1. 학습 실행
    best_ckpt = train()
    
    # 2. 추론 실행
    inference(best_ckpt)

Device: cuda:0


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


>>> Training Start...


Epoch,Training Loss,Validation Loss,Rouge-1,Rouge-2,Rouge-l,Combined Score
1,1.6138,0.60124,0.221584,0.06756,0.206402,0.165182
2,0.446,0.529328,0.275691,0.101218,0.255237,0.210715
3,0.3909,0.516457,0.279172,0.107771,0.261388,0.21611
4,0.351,0.515866,0.283462,0.111927,0.261767,0.219052
5,0.3187,0.520485,0.288545,0.113943,0.266357,0.222948
6,0.2915,0.526989,0.276672,0.106311,0.255669,0.212884
7,0.2684,0.533678,0.284768,0.112749,0.267821,0.221779
8,0.2508,0.543689,0.284402,0.111761,0.264236,0.220133


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


>>> Training Finished. Best Model Saved at ./results/best_model


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


>>> Load Model from ./results/best_model
>>> Inference Start...


100%|██████████| 16/16 [00:31<00:00,  1.98s/it]

>>> Inference Finished. Saved at ./prediction/output.csv



