# Benchmark Results Visualization

This notebook allows you to load and visualize the results of a benchmark run performed by `notebooks/run_benchmarks.py`.

## 1. Setup

Specify the `RUN_DIR` where your benchmark results (including `results.json` and `trace.jsonl`) are stored.

In [None]:
from pathlib import Path

# Default path (can be overridden by Papermill).
# Directory expected to contain results.json; a relative value is resolved
# against the kernel's current working directory when `run_path` is built.
RUN_DIR = "benchmark_runs/LATEST"

In [None]:
import sys
import os
import json
import pandas as pd
import pydantic
from typing import List

# Put the current working directory on the import path (once) so the
# `benchmarks` imports that follow can resolve.
# NOTE(review): this treats the working directory as the project root —
# confirm the notebook is always executed from there.
project_root = Path.cwd().resolve()
if sys.path.count(str(project_root)) == 0:
    sys.path.append(str(project_root))

from benchmarks.data_models import BenchmarkRunResult
import benchmarks.analysis as analysis

# Lift pandas' truncation limits so printed frames show full cell contents
# and every row.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

# Absolute location of the benchmark run being inspected.
run_path = Path(RUN_DIR).resolve()
print(f"Loading results from: {run_path}")

## 2. Load Data

In [None]:
# Fail early, with the resolved run directory in the message, when the run
# has no results file.
results_file = run_path.joinpath("results.json")
if not results_file.exists():
    raise FileNotFoundError(f"No results.json found in {run_path}")

# Read the raw JSON payload first ...
with results_file.open("r", encoding="utf-8") as f:
    data = json.load(f)

# ... then validate it as a list of BenchmarkRunResult models.
TypeAdapter = pydantic.TypeAdapter(List[BenchmarkRunResult])
results = TypeAdapter.validate_python(data)

print(f"Loaded {len(results)} benchmark results.")

## 3. Analysis

In [None]:
# Build `df`, the object every analysis.print_* cell below reads from.
# NOTE(review): the return type is defined in benchmarks.analysis — presumably a pandas DataFrame; confirm.
df = analysis.process_results(results)

### Summary

In [None]:
# NOTE(review): the report content is defined in benchmarks.analysis.print_summary (not visible in this notebook); it reads `df`.
analysis.print_summary(df)

### Metrics (Latency, Tokens, Cost)

In [None]:
# NOTE(review): the report content is defined in benchmarks.analysis.print_metrics (not visible in this notebook); it reads `df`.
analysis.print_metrics(df)

### Time Profiling

In [None]:
# NOTE(review): the report content is defined in benchmarks.analysis.print_time_profiling (not visible in this notebook); it reads `df`.
analysis.print_time_profiling(df)

### Error Breakdown

In [None]:
# NOTE(review): the report content is defined in benchmarks.analysis.print_detailed_breakdown (not visible in this notebook); it reads `df`.
analysis.print_detailed_breakdown(df)

### Timeline & API Key Analysis

In [None]:
import plotly.express as px
import pandas as pd

# Flatten every recorded generation attempt into one row per attempt.
timeline_data = []

for r in results:
    # Results with no recorded attempts (None or empty) contribute no rows,
    # which keeps the chart limited to real attempts.
    for att in r.generation_attempts or []:
        # att is a GenerationAttempt model.
        # Status may be a plain string or an Enum member; use the underlying
        # value so the colour map and the == "success"/"failure" checks below
        # match it (same unwrapping the trace-log cells apply to event types).
        status = getattr(att.status, "value", att.status)
        timeline_data.append({
            "Benchmark": r.benchmark_name,
            "Generator": r.answer_generator,
            "Attempt": att.attempt_number,
            "Status": status,
            "Duration": att.duration,
            "API Key ID": att.api_key_id or "Unknown",
            "Error": att.error_message or ""
        })

if timeline_data:
    df_timeline = pd.DataFrame(timeline_data)

    # 1. Timeline Scatter Plot
    # The x axis is the row index, i.e. the order in which attempts were
    # collected above — a sequence number, not a wall-clock time.
    fig = px.scatter(
        df_timeline,
        x=df_timeline.index,
        y="Duration",
        color="Status",
        symbol="Status",
        hover_data=["Benchmark", "Generator", "API Key ID", "Error"],
        title="Generation Attempts Timeline (Sequence)",
        color_discrete_map={"success": "green", "failure": "red"}
    )
    fig.show()

    # 2. API Key Breakdown
    # `total` counts every attempt with a status; any status other than
    # "success"/"failure" is included in `total` but in neither other column.
    print("\n--- API Key Performance ---")
    key_stats = df_timeline.groupby("API Key ID").agg(
        total=("Status", "count"),
        successes=("Status", lambda x: (x == "success").sum()),
        failures=("Status", lambda x: (x == "failure").sum())
    )
    key_stats["Failure Rate"] = (key_stats["failures"] / key_stats["total"] * 100).map("{:.1f}%".format)
    print(key_stats)
else:
    print("No detailed generation attempt history found to plot.")

In [None]:


def analyze_token_usage(results):
    """Print token-usage breakdowns built from each result's ``trace_logs``.

    Trace events carrying ``details["usage_metadata"]`` are grouped into
    "generation turns" keyed by ``(timestamp, author, total_token_count)`` so
    that several log events emitted for one API call (e.g. text plus a tool
    call) are counted once. Each turn is labelled with the tools it used,
    with "Text Generation" when it only contains a model message, or with
    "Other".

    Events, their ``details`` and the usage metadata may each be a dict or an
    attribute-style object (e.g. a Pydantic model). Missing or ``None`` token
    counters are treated as 0, events whose total token count is 0 are
    ignored, and an event with neither ``author`` nor ``source`` is
    attributed to "unknown".

    Args:
        results: Iterable of benchmark results exposing ``trace_logs``,
            ``benchmark_name`` and ``answer_generator``.

    Returns:
        None. The report is printed to stdout; a short notice is printed
        instead when no usage data is found.
    """
    import pandas as pd

    def _field(obj, name, default=None):
        """Read ``name`` from a dict or from an attribute-style object."""
        if isinstance(obj, dict):
            return obj.get(name, default)
        return getattr(obj, name, default)

    def _token_count(usage, name):
        """Return a token counter, mapping a missing or None value to 0."""
        return _field(usage, name, 0) or 0

    token_entries = []

    for r in results:
        if not r.trace_logs:
            continue

        # Key: (timestamp, author, total_tokens) -> aggregated info for one
        # generation turn. Insertion order is preserved for the report.
        unique_generations = {}

        for event in r.trace_logs:
            details = _field(event, "details")
            if not details:
                continue

            usage = _field(details, "usage_metadata")
            if not usage:
                continue

            total = _token_count(usage, "total_token_count")
            if not total:
                # No usable token data for this event.
                continue

            # Identify the event; fall back from author to source so that
            # tokens are never grouped under a None agent (groupby would
            # silently drop those rows).
            timestamp = _field(event, "timestamp")
            author = _field(event, "author") or _field(event, "source") or "unknown"

            key = (timestamp, author, total)

            if key not in unique_generations:
                # Token counts are taken from the first event seen for a turn.
                unique_generations[key] = {
                    "benchmark": r.benchmark_name,
                    "generator": r.answer_generator,
                    "author": author,
                    "prompt_tokens": _token_count(usage, "prompt_token_count"),
                    "completion_tokens": _token_count(usage, "candidates_token_count"),
                    "total_tokens": total,
                    "tool_names": set(),
                    "has_text": False
                }

            # Update the group info based on this specific log event type.
            e_type = _field(event, "type")
            # Unwrap Enum members to their underlying value.
            e_type = getattr(e_type, "value", e_type)

            if e_type == "tool_use":
                t_name = _field(event, "tool_name")
                if t_name:
                    unique_generations[key]["tool_names"].add(t_name)
            elif e_type == "message":
                if _field(event, "role") == "model":
                    unique_generations[key]["has_text"] = True

        # Convert this result's generation turns into report rows.
        for gen in unique_generations.values():
            tools = sorted(gen["tool_names"])
            if tools:
                label = f"Tool: {', '.join(tools)}"
            elif gen["has_text"]:
                label = "Text Generation"
            else:
                label = "Other"

            token_entries.append({
                "Benchmark": gen["benchmark"],
                "Generator": gen["generator"],
                "Agent": gen["author"],
                "Action": label,
                "Prompt Tokens": gen["prompt_tokens"],
                "Completion Tokens": gen["completion_tokens"],
                "Total Tokens": gen["total_tokens"]
            })

    if not token_entries:
        print("No token usage data found.")
        return

    df = pd.DataFrame(token_entries)

    print("\n=== Token Usage by Agent ===")
    print(df.groupby("Agent")[["Total Tokens"]].sum().sort_values("Total Tokens", ascending=False))

    print("\n=== Token Usage by Action/Tool ===")
    print(df.groupby("Action")[["Total Tokens"]].sum().sort_values("Total Tokens", ascending=False))

    print("\n=== Detailed Breakdown (Agent + Action) ===")
    print(df.groupby(["Agent", "Action"])[["Total Tokens"]].sum().sort_values("Total Tokens", ascending=False))

    print("\n=== Top 10 Most Expensive Single Steps ===")
    print(df.sort_values("Total Tokens", ascending=False).head(10)[["Benchmark", "Agent", "Action", "Total Tokens"]])

# Run the token report on the loaded results; the function prints its tables and returns None.
analyze_token_usage(results)



### Tool Success Rate Analysis (P(success | tool_X_used))

In [None]:
import pandas as pd

def analyze_tool_success_rate(results):
    """Print, per tool, how often runs that used the tool succeeded.

    A run counts as using a tool when at least one of its ``trace_logs``
    entries has type "tool_use" and a non-empty ``tool_name``; each tool is
    counted at most once per run. A run is a success when ``result == 1``.
    "Lift" is the tool's success rate minus the pass rate over all runs.

    Args:
        results: Sequence of benchmark results exposing ``trace_logs`` and
            ``result``. Log entries may be dicts or attribute-style objects.

    Returns:
        None. The table is printed to stdout; a short notice is printed
        instead when no tool usage is found.
    """
    def _read(entry, field):
        # Attribute access when the entry has the field, dict-style lookup otherwise.
        return getattr(entry, field) if hasattr(entry, field) else entry.get(field)

    rows = []
    for run in results:
        # Distinct tools seen in this run's trace.
        distinct_tools = set()
        for entry in (run.trace_logs or []):
            kind = _read(entry, "type")
            kind = kind.value if hasattr(kind, "value") else kind
            if kind != "tool_use":
                continue
            name = _read(entry, "tool_name")
            if name:
                distinct_tools.add(name)

        passed = run.result == 1
        rows.extend({"tool": name, "success": passed} for name in distinct_tools)

    if not rows:
        print("No tool usage data found for success rate analysis.")
        return

    # One row per tool: number of runs that used it and how many of those passed.
    per_tool = (
        pd.DataFrame(rows)
        .groupby("tool")
        .agg(times_used=("success", "count"), successes=("success", "sum"))
        .reset_index()
    )
    per_tool["P(Success | Used)"] = per_tool["successes"] / per_tool["times_used"]

    # Baseline: share of all runs that passed, regardless of tool usage.
    if results:
        baseline = len([run for run in results if run.result == 1]) / len(results)
    else:
        baseline = 0
    per_tool["Lift"] = per_tool["P(Success | Used)"] - baseline

    # Percent-formatted copy for display; the numeric frame is left untouched.
    report = per_tool.copy()
    for column, template in (("P(Success | Used)", "{:.1f}%"), ("Lift", "{:+.1f}%")):
        report[column] = (report[column] * 100).map(template.format)

    print(f"Overall Baseline Pass Rate: {baseline*100:.1f}%")
    print("\n--- Tool Success Impact Analysis ---")
    print(report.sort_values("times_used", ascending=False).to_string(index=False))

# Run the tool success report on the loaded results; the function prints its table and returns None.
analyze_tool_success_rate(results)