# Benchmark Results Visualization

This notebook allows you to load and visualize the results of a benchmark run performed by `notebooks/run_benchmarks.py`.

## 1. Setup

Set `RUN_DIR` to the directory that contains the results of your benchmark run (including `results.json` and `trace.jsonl`). A relative path is resolved against the notebook's working directory.

In [None]:
from pathlib import Path

# Default run directory. A relative value is resolved against the kernel's
# working directory (see `Path(RUN_DIR).resolve()` in the setup cell below).
# Can be overridden by Papermill.
# NOTE(review): Papermill injects its overrides right after the cell tagged
# `parameters`; confirm this cell carries that tag, otherwise this default
# would be assigned after (and win over) the injected value.
RUN_DIR = "benchmark_runs/LATEST"

In [None]:
import sys
import os
import json
import pandas as pd
import pydantic
from typing import List

# Make the project-local `benchmarks` package importable. The "project root"
# is taken to be the kernel's current working directory, so the notebook has
# to be started (or executed) from the repository root for the imports below
# to resolve. The entry is appended only once, which keeps re-running this
# cell from growing sys.path.
project_root = Path.cwd().resolve()
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from benchmarks.data_models import BenchmarkRunResult
import benchmarks.analysis as analysis

# Lift pandas' display limits (None = unlimited) so long cell values and
# every row are shown in full by the analysis output below.
pd.set_option(
    "display.max_colwidth", None,
    "display.max_rows", None,
)

# Absolute location of the run directory; echoed so the executed notebook
# records which run it was rendered from.
run_path = Path(RUN_DIR).resolve()
print(f"Loading results from: {run_path}")

## 2. Load Data

In [None]:
# Read results.json from the run directory and validate it into a list of
# BenchmarkRunResult models. Fails fast with FileNotFoundError when the run
# directory has no results file; pydantic raises a ValidationError if the
# JSON does not match the model.
results_file = run_path / "results.json"
if not results_file.exists():
    raise FileNotFoundError(f"No results.json found in {run_path}")

with open(results_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# The adapter instance gets its own snake_case name so it cannot be confused
# with (or collide with an import of) the pydantic.TypeAdapter class itself.
results_adapter = pydantic.TypeAdapter(List[BenchmarkRunResult])
results = results_adapter.validate_python(data)

print(f"Loaded {len(results)} benchmark results.")

## 3. Analysis

In [None]:
# Turn the validated results into `df`, the single object that every analysis
# cell below receives.
# NOTE(review): presumably a pandas DataFrame — confirm against
# benchmarks.analysis.process_results.
df = analysis.process_results(results)

### Summary

In [None]:
# Delegates to the project-local benchmarks.analysis module; what is shown is
# defined there, this cell only passes in the processed results.
analysis.print_summary(df)

### Metrics (Latency, Tokens, Cost)

In [None]:
# Delegates to the project-local benchmarks.analysis module; what is shown is
# defined there, this cell only passes in the processed results.
analysis.print_metrics(df)

### Time Profiling

In [None]:
# Delegates to the project-local benchmarks.analysis module; what is shown is
# defined there, this cell only passes in the processed results.
analysis.print_time_profiling(df)

### Error Breakdown

In [None]:
# Delegates to the project-local benchmarks.analysis module; what is shown is
# defined there, this cell only passes in the processed results.
analysis.print_detailed_breakdown(df)

### Timeline & API Key Analysis

In [None]:
import plotly.express as px
import pandas as pd

# Flatten every recorded generation attempt into one row per attempt, in the
# order the results (and their attempts) appear in `results`. Results whose
# `generation_attempts` is empty or None contribute no rows, so they are
# absent from both the chart and the per-key table.
# NOTE(review): the colour map and the success/failure counts below compare
# Status against the plain strings "success" / "failure" — confirm that
# `att.status` holds exactly those values.
timeline_data = [
    {
        "Benchmark": r.benchmark_name,
        "Generator": r.answer_generator,
        "Attempt": att.attempt_number,
        "Status": att.status,
        "Duration": att.duration,
        "API Key ID": att.api_key_id or "Unknown",
        "Error": att.error_message or "",
    }
    for r in results
    for att in (r.generation_attempts or [])
]

if timeline_data:
    df_timeline = pd.DataFrame(timeline_data)

    # 1. Timeline scatter plot: x is the row position (i.e. the order in
    #    which attempts were collected above), y is the attempt duration.
    fig = px.scatter(
        df_timeline,
        x=df_timeline.index,  # Sequence index
        y="Duration",
        color="Status",
        symbol="Status",
        hover_data=["Benchmark", "Generator", "API Key ID", "Error"],
        title="Generation Attempts Timeline (Sequence)",
        color_discrete_map={"success": "green", "failure": "red"},
    )
    # Name the x axis explicitly so the figure can be read on its own.
    fig.update_layout(xaxis_title="Attempt sequence")
    fig.show()

    # 2. API key breakdown: attempts, successes and failures per key.
    #    The boolean helper columns let groupby use the built-in `sum`
    #    instead of calling a Python lambda once per group.
    print("\n--- API Key Performance ---")
    attempt_status = df_timeline["Status"]
    key_stats = (
        df_timeline.assign(
            _is_success=attempt_status.eq("success"),
            _is_failure=attempt_status.eq("failure"),
        )
        .groupby("API Key ID")
        .agg(
            total=("Status", "count"),
            successes=("_is_success", "sum"),
            failures=("_is_failure", "sum"),
        )
    )
    # Statuses other than "failure" count toward `total` only, so the rate is
    # failures / all attempts for the key. Every group has at least one row,
    # so `total` is zero only if all of a key's statuses are missing.
    key_stats["Failure Rate"] = (key_stats["failures"] / key_stats["total"] * 100).map("{:.1f}%".format)
    print(key_stats)
else:
    print("No detailed generation attempt history found to plot.")