In [1]:
# 1. 환경 설정 및 라이브러리 로드
!pip install -q -U torch transformers peft bitsandbytes accelerate pandas datasets evaluate rouge_score

[0m

In [2]:
from evaluate import load
r = load("rouge")
print(r.compute(predictions=["이것은 테스트 요약입니다."], references=["이것은 테스트 요약입니다."]))

  warn(


{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}


In [3]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# 설정
CONF = {
    "base_model": "upstage/SOLAR-10.7B-Instruct-v1.0",
    "adapter_path": "./results_solar",  # 학습된 어댑터 경로
    "data_path": "./data/",
    "seed": 42
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## 2. 모델 및 토크나이저 로드 (Base + Adapter)

In [4]:
# 4-bit Quantization Config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load Base Model
base_model = AutoModelForCausalLM.from_pretrained(
    CONF['base_model'],
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Load Adapter
model = PeftModel.from_pretrained(base_model, CONF['adapter_path'])
model.eval()

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(CONF['base_model'], trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

## 3. 데이터 로드 및 평가 함수 정의

In [5]:
# Load Dev Data for Validation
dev_df = pd.read_csv(os.path.join(CONF['data_path'], 'dev.csv'))
print(f"Dev set size: {len(dev_df)}")

# ROUGE Metric (use `evaluate` instead of deprecated `datasets.load_metric`)
from evaluate import load as load_metric
rouge = load_metric("rouge")


def compute_rouge(predictions, references):
    # evaluate returns Rouge object with attributes, not nested dicts
    results = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    # Extract fmeasure from the Rouge score objects
    return {
        "rouge1": results["rouge1"].mid.fmeasure if hasattr(results["rouge1"], "mid") else results["rouge1"],
        "rouge2": results["rouge2"].mid.fmeasure if hasattr(results["rouge2"], "mid") else results["rouge2"],
        "rougeL": results["rougeL"].mid.fmeasure if hasattr(results["rougeL"], "mid") else results["rougeL"],
    }

Dev set size: 499


## 4. 실험 1: Prompt Engineering
다양한 프롬프트 스타일을 테스트하여 모델이 요약을 더 잘 생성하도록 유도합니다.

In [6]:
prompts = {
    "basic": "### User:\nSummarize the following dialogue:\n\n{dialogue}\n\n### Assistant:\n",
    "korean_explicit": "### User:\n다음 대화를 한국어로 요약해줘:\n\n{dialogue}\n\n### Assistant:\n",
    "concise": "### User:\nSummarize the dialogue briefly in 3 sentences:\n\n{dialogue}\n\n### Assistant:\n",
    "detailed": "### User:\nProvide a detailed summary of the conversation:\n\n{dialogue}\n\n### Assistant:\n"
}

def run_inference(model, tokenizer, dialogue, prompt_template, params):
    prompt = prompt_template.format(dialogue=dialogue)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            **params,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    try:
        summary = generated_text.split("### Assistant:\n")[1].strip()
    except:
        summary = generated_text
    return summary

# 테스트용 샘플 (속도를 위해 10개만)
sample_dev = dev_df.head(10)

print("=== Prompt Engineering Test ===")
default_params = {"max_new_tokens": 128, "do_sample": False, "repetition_penalty": 1.2}

for name, template in prompts.items():
    preds = []
    for dialogue in sample_dev['dialogue']:
        preds.append(run_inference(model, tokenizer, dialogue, template, default_params))
    
    scores = compute_rouge(preds, sample_dev['summary'].tolist())
    print(f"Prompt [{name}]: R1={scores['rouge1']:.4f}, R2={scores['rouge2']:.4f}, RL={scores['rougeL']:.4f}")

=== Prompt Engineering Test ===
Prompt [basic]: R1=0.8033, R2=0.4756, RL=0.7467
Prompt [korean_explicit]: R1=0.7400, R2=0.4100, RL=0.6900
Prompt [concise]: R1=0.8200, R2=0.4756, RL=0.7633
Prompt [detailed]: R1=0.8400, R2=0.4590, RL=0.7500


## 5. 실험 2: Decoding Strategies (Grid Search)
Beam Search, Repetition Penalty, No Repeat N-gram Size 등을 조합하여 최적의 파라미터를 찾습니다.

In [7]:
# 실험할 파라미터 조합
search_space = [
    {"name": "Greedy (Baseline)", "params": {"max_new_tokens": 150, "do_sample": False, "repetition_penalty": 1.2}},
    {"name": "Beam-3", "params": {"max_new_tokens": 150, "num_beams": 3, "early_stopping": True, "repetition_penalty": 1.2}},
    {"name": "Beam-5 + NoRepeat", "params": {"max_new_tokens": 150, "num_beams": 5, "no_repeat_ngram_size": 3, "repetition_penalty": 1.2}},
    {"name": "Length Penalty", "params": {"max_new_tokens": 150, "num_beams": 3, "length_penalty": 1.2, "repetition_penalty": 1.2}},
]

# 최적의 프롬프트 선택 (위 실험 결과에 따라 수정 가능, 여기선 기본값 사용)
best_prompt = prompts["basic"] 

print("=== Decoding Strategy Search ===")
for config in search_space:
    preds = []
    # 전체 Dev 셋으로 검증하면 좋지만 시간상 20개로 테스트
    eval_subset = dev_df.head(20)
    
    for dialogue in tqdm(eval_subset['dialogue'], desc=config['name']):
        preds.append(run_inference(model, tokenizer, dialogue, best_prompt, config['params']))
    
    scores = compute_rouge(preds, eval_subset['summary'].tolist())
    print(f"Config [{config['name']}]: R1={scores['rouge1']:.4f}, R2={scores['rouge2']:.4f}, RL={scores['rougeL']:.4f}")

=== Decoding Strategy Search ===


Greedy (Baseline): 100%|██████████| 20/20 [06:13<00:00, 18.68s/it]


Config [Greedy (Baseline)]: R1=0.7751, R2=0.4634, RL=0.7221


Beam-3: 100%|██████████| 20/20 [20:12<00:00, 60.63s/it] 


Config [Beam-3]: R1=0.7764, R2=0.4370, RL=0.6833


Beam-5 + NoRepeat: 100%|██████████| 20/20 [35:57<00:00, 107.87s/it]


Config [Beam-5 + NoRepeat]: R1=0.3439, R2=0.1525, RL=0.3099


Length Penalty: 100%|██████████| 20/20 [20:59<00:00, 62.98s/it] 

Config [Length Penalty]: R1=0.7964, R2=0.4791, RL=0.7063





## 6. 최종 추론 및 제출 파일 생성
위 실험에서 얻은 최적의 파라미터와 프롬프트를 적용하여 Test 셋에 대한 추론을 수행합니다.

In [8]:
# TODO: 위 실험 결과를 바탕으로 최적의 파라미터로 수정하세요.
FINAL_PROMPT = prompts["basic"] 
FINAL_PARAMS = {
    "max_new_tokens": 150,
    "num_beams": 3,
    "no_repeat_ngram_size": 0,
    "repetition_penalty": 1.2,
    "length_penalty": 1.2
}

def post_process(text):
    # 간단한 후처리: 문장이 중간에 끊긴 경우 마지막 마침표까지만 자르기
    if '.' in text:
        last_period_index = text.rfind('.')
        return text[:last_period_index+1]
    return text

# Load Test Data
test_df = pd.read_csv(os.path.join(CONF['data_path'], 'test.csv'))

print("Starting Final Inference...")
final_summaries = []

for dialogue in tqdm(test_df['dialogue']):
    raw_summary = run_inference(model, tokenizer, dialogue, FINAL_PROMPT, FINAL_PARAMS)
    clean_summary = post_process(raw_summary)
    final_summaries.append(clean_summary)

# Save Results
submission = pd.DataFrame({
    'fname': test_df['fname'],
    'summary': final_summaries
})

submission.to_csv("./prediction/submit_solar_tuned.csv", index=False)
print("Saved to ./prediction/submit_solar_tuned.csv")

Starting Final Inference...


100%|██████████| 499/499 [9:08:05<00:00, 65.90s/it]    

Saved to ./prediction/submit_solar_tuned.csv



