# LLaDA Benchmark Playground

이 노트북은 **정량적 평가**를 위한 벤치마크 실험을 수행합니다.

## 목적
- Academic Benchmarks (GSM8K, MMLU)를 사용한 A/B 테스트
- Baseline vs Experimental 샘플링 전략 비교
- 메트릭: Accuracy, Perplexity, Stability, Survival Rate, Correction Efficacy

In [None]:
import os
import sys
import torch
import pandas as pd
from transformers import AutoTokenizer

# Make the working directory importable (only if it is not already listed)
# so the project-local modules imported below can be resolved.
current_dir = os.getcwd()
if sys.path.count(current_dir) == 0:
    sys.path.append(current_dir)

# Import local modules
from modeling_llada import LLaDAModelLM
from configuration_llada import LLaDAConfig
import experiment_utils
import decoding

print("Modules loaded successfully.")

## 1. Load Model

In [None]:
# Two candidate checkpoint locations: a path on the local filesystem and a
# HuggingFace model identifier used when the local path is absent.
LOCAL_MODEL_PATH = "../Grok-1-LLaDA-8B"
HF_MODEL_ID = "GSAI-ML/LLaDA-8B-Base"

# Choose the source once; everything below loads from `model_path`.
if os.path.exists(LOCAL_MODEL_PATH):
    model_path = LOCAL_MODEL_PATH
    print(f"Using local model: {model_path}")
else:
    model_path = HF_MODEL_ID
    print(f"Using HuggingFace model: {model_path}")

# Config and weights both come from the same location.
config = LLaDAConfig.from_pretrained(model_path)
model = LLaDAModelLM.from_pretrained(
    model_path,
    config=config,
    torch_dtype="auto",
)

# Move to the GPU only when CUDA is available; otherwise the model stays
# where from_pretrained placed it.
if torch.cuda.is_available():
    model.cuda()
model.eval()

# The tokenizer is loaded from the same location as the model.
tokenizer = AutoTokenizer.from_pretrained(model_path)
print("Model loaded successfully.")

## 2. Run Academic Benchmark

이 셀은 GSM8K와 MMLU 데이터셋을 사용하여 다양한 `alpha_decay` 값에 대한 벤치마크를 실행합니다. 테스트할 값 목록(`ALPHA_DECAY_VALUES`)은 `run_academic_benchmark`의 `thresholds` 인자로 전달됩니다.

In [None]:
# ---- Benchmark settings ----
# Temporal decay rates to sweep; passed to the benchmark as `thresholds`.
ALPHA_DECAY_VALUES = [0.03, 0.05, 0.07, 0.10]
# Number of samples evaluated per task.
N_SAMPLES = 50
STEPS = 64
GEN_LENGTH = 64
BLOCK_LENGTH = 64
REMASK_BUDGET = 0.05

print(f"Starting benchmark with {len(ALPHA_DECAY_VALUES)} configurations...")
print(f"Alpha Decay Values: {ALPHA_DECAY_VALUES}")
print(f"Samples per task: {N_SAMPLES}")

# Gather the sweep/generation options in one mapping so the call below
# stays short; the keys are the keyword names the benchmark function receives.
benchmark_kwargs = {
    "thresholds": ALPHA_DECAY_VALUES,
    "samples": N_SAMPLES,
    "steps": STEPS,
    "gen_length": GEN_LENGTH,
    "block_length": BLOCK_LENGTH,
    "remask_budget": REMASK_BUDGET,
    # NOTE(review): the notebook author describes this as a default that is
    # overridden by the `thresholds` values -- confirm in experiment_utils.
    "alpha_decay": 0.05,
}

results_df = experiment_utils.run_academic_benchmark(
    model=model,
    tokenizer=tokenizer,
    **benchmark_kwargs,
)

print("\nBenchmark completed!")
print(f"Total results: {len(results_df)} rows")

## 3. Analyze Results

In [None]:
# Pass the benchmark results to the project's analysis helper.
# The call is deliberately left as the cell's final bare expression, so a
# non-None return value (if any) is still rendered by the notebook.
# NOTE(review): what the helper prints or returns is defined in
# experiment_utils and is not visible from this notebook.
experiment_utils.analyze_icml_results(results_df)

## 4. Save Results

In [None]:
# Persist the results table as CSV (row index omitted) so it can be
# analysed outside this notebook. The path is relative to the working
# directory.
output_file = "benchmark_results.csv"
results_df.to_csv(path_or_buf=output_file, index=False)
print(f"Results saved to {output_file}")

## 5. Detailed Inspection (Optional)

특정 케이스를 자세히 살펴보고 싶다면 아래 셀을 사용하세요.

In [None]:
def _summarize_category(header, category):
    """Print *header*, then the per-threshold metric means for one category.

    Selects the rows of ``results_df`` whose 'Category' column equals
    *category*, prints the mean of 'Acc_Exp', 'PPL_Delta' and
    'Stability_Delta' grouped by 'Threshold', and returns the selected rows.
    """
    print(header)
    subset = results_df[results_df['Category'] == category]
    metric_means = subset.groupby('Threshold')[['Acc_Exp', 'PPL_Delta', 'Stability_Delta']].mean()
    print(metric_means)
    return subset


# Per-category views of the results; the filtered frames are kept as
# notebook-level variables for further inspection.
math_results = _summarize_category("\n=== Math Task Results ===", 'math')
logic_results = _summarize_category("\n=== Logic Task Results ===", 'logic')