# 05. Evaluation

QA 성능, Write 품질, Efficiency 종합 평가

In [None]:
import sys
from pathlib import Path

PROJECT_ROOT = Path(".").resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

import torch
import time
import json
from tqdm.notebook import tqdm
import pandas as pd
import matplotlib.pyplot as plt

## 1. Metrics 모듈

In [None]:
from evaluation.metrics import (
    compute_em,
    compute_f1,
    compute_mrr,
    compute_recall_at_k,
    compute_rouge_l,
)

# Smoke-test the answer-level metrics on one prediction/reference pair.
prediction, reference = "Paris is the capital", "Paris"

print(f"EM: {compute_em(prediction, reference)}")
print(f"F1: {compute_f1(prediction, reference):.4f}")
print(f"ROUGE-L: {compute_rouge_l(prediction, reference):.4f}")

In [None]:
# Smoke-test the retrieval metrics: a ranked list of retrieved document ids
# is scored against the set of gold document ids.
retrieved, gold = [3, 1, 5, 2, 0], [1, 5]

print(f"Recall@5: {compute_recall_at_k(retrieved, gold, k=5):.4f}")
print(f"MRR: {compute_mrr(retrieved, gold):.4f}")

## 2. QA 평가

In [None]:
from evaluation.evaluate_qa import QAEvaluator

# Sample data: each record holds a question, its reference answer and the
# ids of the documents that contain the answer.
sample_qa_pairs = [
    {"question": question, "answer": answer, "gold_doc_ids": gold_doc_ids}
    for question, answer, gold_doc_ids in (
        ("What is the capital of France?", "Paris", [0]),
        ("Who wrote Romeo and Juliet?", "William Shakespeare", [1]),
        ("What did Einstein develop?", "theory of relativity", [3]),
    )
]

print(f"Sample QA pairs: {len(sample_qa_pairs)}")

In [None]:
# QA 평가 실행 (모델이 로드된 경우)
# evaluator = QAEvaluator(model, tokenizer, device="cuda")
# results = evaluator.evaluate(sample_qa_pairs, top_k=5)
# print(results)

## 3. Write Phase 평가

In [None]:
from evaluation.evaluate_write import WriteEvaluator

# Metrics reported for the write phase:
# - Reconstruction loss (perplexity)
# - BLEU / ROUGE based reconstruction quality
# - Clustering quality of the z vectors (optional)

print(
    "Write evaluation metrics:",
    "  - Reconstruction loss (perplexity)",
    "  - BLEU score",
    "  - ROUGE-L score",
    sep="\n",
)

In [None]:
# Write 평가 실행
# write_evaluator = WriteEvaluator(model, tokenizer, device="cuda")
# write_results = write_evaluator.evaluate(corpus, num_samples=100)
# print(write_results)

## 4. Efficiency 평가

In [None]:
from evaluation.evaluate_efficiency import EfficiencyEvaluator

# Metrics reported for efficiency:
# - Selection latency
# - Generation latency
# - Total latency
# - Storage (z vectors)
# - Peak memory

print(
    "Efficiency metrics:",
    "  - Selection latency (ms)",
    "  - Generation latency (ms)",
    "  - Total latency (ms)",
    "  - Storage per doc (bytes)",
    "  - Peak GPU memory (GB)",
    sep="\n",
)

In [None]:
def measure_latency(
    model,
    tokenizer,
    question,
    device="cuda",
    warmup=3,
    trials=10,
    *,
    k=5,
    max_new_tokens=32,
    max_length=128,
):
    """Measure mean document-selection and generation latency for one query.

    The question is tokenized once, ``warmup`` untimed select+generate passes
    are run, and then ``trials`` timed passes are averaged. Selection and
    generation are timed separately; on a CUDA device the GPU is synchronized
    before each timer is read so that asynchronous kernels are included.

    Args:
        model: Object exposing ``eval()``,
            ``select_documents(query_ids, query_mask, k=...)`` (returning a
            2-tuple whose first item is forwarded as ``doc_indices``) and
            ``generate(query_ids=..., doc_indices=...,
            query_attention_mask=..., max_new_tokens=...)``.
        tokenizer: Callable returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors when called with
            ``return_tensors="pt"``.
        question: Query text to tokenize.
        device: Target device as a string (``"cuda"``, ``"cuda:0"``,
            ``"cpu"``) or a ``torch.device``.
        warmup: Number of untimed passes run before measuring.
        trials: Number of timed passes; must be at least 1.
        k: Number of documents requested from ``select_documents``.
        max_new_tokens: Generation budget passed to ``generate``.
        max_length: Length the question is truncated/padded to.

    Returns:
        dict with ``selection_latency_ms``, ``generation_latency_ms`` and
        ``total_latency_ms`` (sum of the two means), all in milliseconds.

    Raises:
        ValueError: If ``trials`` is smaller than 1.

    Note:
        ``model.eval()`` is called and the model is left in eval mode.
    """
    if trials < 1:
        raise ValueError(f"trials must be >= 1, got {trials}")

    model.eval()

    # Normalising through torch.device makes "cuda", "cuda:0" and
    # torch.device objects all trigger synchronisation; a plain string
    # comparison against "cuda" would silently skip it for the latter two.
    target = torch.device(device)
    on_cuda = target.type == "cuda"

    def _sync():
        """Block until queued GPU work has finished (no-op off CUDA)."""
        if on_cuda:
            torch.cuda.synchronize(target)

    encoded = tokenizer(
        question, max_length=max_length, truncation=True,
        padding="max_length", return_tensors="pt"
    )
    query_ids = encoded["input_ids"].to(device)
    query_mask = encoded["attention_mask"].to(device)

    def _select():
        """Run document selection and return the selected indices."""
        selected, _ = model.select_documents(query_ids, query_mask, k=k)
        return selected

    def _generate(doc_indices):
        """Run generation conditioned on the selected documents."""
        return model.generate(
            query_ids=query_ids,
            doc_indices=doc_indices,
            query_attention_mask=query_mask,
            max_new_tokens=max_new_tokens,
        )

    def _timed_ms(fn, *args):
        """Call ``fn`` and return ``(result, elapsed milliseconds)``."""
        start = time.perf_counter()
        result = fn(*args)
        _sync()
        return result, (time.perf_counter() - start) * 1000

    selection_times = []
    generation_times = []

    with torch.no_grad():
        for _ in range(warmup):
            _generate(_select())

        # Make sure warmup work is finished before the first timer starts.
        _sync()

        for _ in range(trials):
            selected_ids, elapsed = _timed_ms(_select)
            selection_times.append(elapsed)

            _, elapsed = _timed_ms(_generate, selected_ids)
            generation_times.append(elapsed)

    selection_ms = sum(selection_times) / len(selection_times)
    generation_ms = sum(generation_times) / len(generation_times)

    return {
        "selection_latency_ms": selection_ms,
        "generation_latency_ms": generation_ms,
        "total_latency_ms": selection_ms + generation_ms,
    }

# 실행
# latency = measure_latency(model, tokenizer, "What is the capital of France?")
# print(latency)

In [None]:
def measure_storage(model, num_docs):
    """Estimate the storage needed for ``num_docs`` documents' z vectors.

    The per-document size is derived from ``model.doc_vectors``: axis 1 is
    read as the number of memory tokens and axis 2 as the vector width
    (axis 0 is presumably the document axis - it is not used here).

    Args:
        model: Object exposing a ``doc_vectors`` array/tensor with at least
            three dimensions.
        num_docs: Number of documents to extrapolate the total to.

    Returns:
        dict with ``bytes_per_doc``, ``total_storage_mb`` (MiB, i.e. divided
        by 1024 * 1024), ``z_dim`` and ``m_tokens``.
    """
    vectors = model.doc_vectors
    m_tokens = vectors.shape[1]
    z_dim = vectors.shape[2]

    # Use the actual element width so fp16/bf16 vectors are not over-counted.
    # torch tensors expose element_size(); NumPy arrays expose itemsize.
    # Fall back to 4 bytes (float32) when neither is available.
    if hasattr(vectors, "element_size"):
        bytes_per_element = vectors.element_size()
    else:
        bytes_per_element = getattr(vectors, "itemsize", 4)

    bytes_per_doc = m_tokens * z_dim * bytes_per_element
    total_bytes = num_docs * bytes_per_doc

    return {
        "bytes_per_doc": bytes_per_doc,
        "total_storage_mb": total_bytes / (1024 * 1024),
        "z_dim": z_dim,
        "m_tokens": m_tokens,
    }

# Back-of-the-envelope storage estimate, assuming float32 (4 bytes/element).
z_dim, m_tokens, num_docs = 256, 4, 10000
bytes_per_doc = m_tokens * z_dim * 4
print(f"Storage per doc: {bytes_per_doc} bytes")
print(f"Total for {num_docs:,} docs: {num_docs * bytes_per_doc / (1024*1024):.2f} MB")

## 5. RAGAS 평가

In [None]:
# RAGAS metrics
# - Faithfulness: is the generated answer faithful to the context?
# - Answer Relevancy: is the answer relevant to the question?
# - Context Relevancy: is the retrieved context relevant to the question?

print(
    "RAGAS metrics:",
    "  - Faithfulness",
    "  - Answer Relevancy",
    "  - Context Relevancy",
    sep="\n",
)

In [None]:
# RAGAS 평가는 LLM-as-judge 방식이므로 추가 API 필요
# from evaluation.evaluate_ragas import RagasEvaluator

# ragas_evaluator = RagasEvaluator(judge_model="gpt-4")
# ragas_results = ragas_evaluator.evaluate(predictions)
# print(ragas_results)

## 6. 결과 시각화

In [None]:
# Illustrative (placeholder) scores per system, keyed by metric name.
example_results = {
    "Parametric-QA": {
        "EM": 45.2,
        "F1": 52.3,
        "Recall@5": 78.5,
    },
    "No Retrieval": {
        "EM": 22.1,
        "F1": 28.4,
        "Recall@5": 0,
    },
    "BM25-RAG": {
        "EM": 35.6,
        "F1": 42.1,
        "Recall@5": 65.2,
    },
    "Dense-RAG": {
        "EM": 40.3,
        "F1": 48.7,
        "Recall@5": 72.4,
    },
}

# Outer keys become columns on construction; transpose so that each system
# is a row and each metric a column.
df = pd.DataFrame(example_results).transpose()
print(df)

In [None]:
# Bar charts: one panel per metric, one bar per system.
metrics = ["EM", "F1", "Recall@5"]
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for i, (ax, metric) in enumerate(zip(axes, metrics)):
    df[metric].plot(
        kind="bar",
        ax=ax,
        color=["#4CAF50", "#2196F3", "#FF9800", "#9C27B0"],
    )
    ax.set_title(metric)
    ax.set_ylabel("Score (%)")
    ax.grid(axis="y", alpha=0.3)
    # Slant the system names so they do not overlap.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")

plt.tight_layout()
plt.show()

In [None]:
# Latency comparison (illustrative numbers, in milliseconds).
latency_data = {
    "Parametric-QA": {
        "Selection": 5.2,
        "Generation": 120.5,
    },
    "BM25-RAG": {
        "Selection": 15.3,
        "Generation": 145.2,
    },
    "Dense-RAG": {
        "Selection": 45.8,
        "Generation": 150.3,
    },
}

# Rows are systems, columns are latency components.
df_latency = pd.DataFrame(latency_data).transpose()
df_latency["Total"] = df_latency["Selection"] + df_latency["Generation"]

# Stacked bars: selection at the bottom, generation on top.
fig, ax = plt.subplots(figsize=(10, 6))
stacked_columns = df_latency[["Selection", "Generation"]]
stacked_columns.plot(
    kind="bar",
    stacked=True,
    ax=ax,
    color=["#3498db", "#e74c3c"],
)
ax.set_title("Latency Comparison")
ax.set_ylabel("Latency (ms)")
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
ax.grid(axis="y", alpha=0.3)
ax.legend(loc="upper right")

plt.tight_layout()
plt.show()

## 7. 결과 저장

In [None]:
def save_results(results: dict, save_path: "str | Path") -> None:
    """Write ``results`` to ``save_path`` as indented, UTF-8 encoded JSON.

    Missing parent directories are created. An existing file at the same
    location is overwritten.

    Args:
        results: JSON-serializable mapping of evaluation results.
        save_path: Destination file, given as a string or ``Path``.

    Raises:
        TypeError: If ``results`` contains values ``json`` cannot serialize.
        OSError: If the directory or file cannot be created or written.
    """
    save_path = Path(save_path)
    save_path.parent.mkdir(parents=True, exist_ok=True)

    # Pin the encoding instead of relying on the platform default, and keep
    # non-ASCII text (e.g. Korean labels) readable rather than \u-escaped.
    with save_path.open("w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"Results saved to {save_path}")

# 저장 예시
# all_results = {
#     "qa_metrics": {"EM": 45.2, "F1": 52.3},
#     "efficiency": {"latency_ms": 125.7, "storage_mb": 39.1},
#     "ragas": {"faithfulness": 0.85, "answer_relevancy": 0.78},
# }
# save_results(all_results, PROJECT_ROOT / "results" / "phase1_results.json")