In [None]:
!uv pip install --upgrade kaleido

In [11]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from datetime import datetime
import json

# Set plotly to use simple static rendering
import plotly.io as pio
pio.renderers.default = "png"

In [None]:
# Load and process the CSV data
def load_benchmark_data(csv_path):
    """Load the benchmark summary CSV and normalise its column types.

    Args:
        csv_path: Path (or any other source accepted by ``pd.read_csv``) of
            the benchmark summary CSV. The file must contain
            ``run_timestamp`` and ``run_date`` columns.

    Returns:
        pandas.DataFrame: The CSV contents with ``run_timestamp`` and
        ``run_date`` parsed as datetimes, a ``metric_name`` column (copied
        from ``name`` when only ``name`` is present), and every statistic
        column listed below that exists coerced to a numeric dtype.

    Raises:
        KeyError: If ``run_timestamp`` or ``run_date`` is missing.
    """
    df = pd.read_csv(csv_path)

    # Convert timestamp columns to datetime
    df['run_timestamp'] = pd.to_datetime(df['run_timestamp'])
    df['run_date'] = pd.to_datetime(df['run_date'])

    # Map 'name' to 'metric_name' if needed
    if 'name' in df.columns and 'metric_name' not in df.columns:
        df['metric_name'] = df['name']

    # Statistic columns that must be numeric. 'stddev' is the column the chart
    # functions in this notebook aggregate, so it is coerced alongside the
    # 'std' spelling; cells that cannot be parsed become NaN.
    numeric_columns = ['count', 'sum', 'mean', 'min', 'max', 'std', 'stddev',
                       'variance', 'p25', 'p50', 'p75', 'p90', 'p95', 'p99']
    for col in numeric_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df

# Load the benchmark summary into `df`, which the chart cells below read
# Replace the path with the location of your own CSV
df = load_benchmark_data('results/benchmark_summary.csv')
# Print the column index, then show the first rows as the cell output
print(df.columns)
df.head()

In [None]:
# Function to create the overview chart
def create_overview_chart(df, selected_metric=None, selected_split=None):
    """Build a line chart of each model's average score per run timestamp.

    Args:
        df: Benchmark DataFrame with 'model', 'run_timestamp', 'mean',
            'stddev' and 'scenario_class' columns, plus 'metric_name' and
            'split' when the matching filter is used.
        selected_metric: If truthy, keep only rows whose 'metric_name' equals
            this value. Also used in the title and as the y-axis label.
        selected_split: If truthy, keep only rows whose 'split' equals this
            value.

    Returns:
        plotly.graph_objects.Figure: One 'lines+markers' trace per model.
    """
    # Boolean filtering builds new frames, so the caller's df is never mutated
    filtered_df = df
    if selected_metric:
        filtered_df = filtered_df[filtered_df['metric_name'] == selected_metric]
    if selected_split:
        filtered_df = filtered_df[filtered_df['split'] == selected_split]

    # One point per (model, timestamp). The hover text reports how many
    # scenarios were averaged, so count distinct scenario classes rather than
    # rows (a scenario can occupy several rows when more than one metric or
    # split is kept).
    grouped = filtered_df.groupby(['model', 'run_timestamp']).agg({
        'mean': 'mean',
        'stddev': 'mean',
        'scenario_class': 'nunique'
    }).reset_index()

    # Create the figure
    fig = go.Figure()

    # Palette is cycled when there are more models than colours
    colors = ['#667eea', '#48bb78', '#ed8936', '#e53e3e', '#9f7aea', '#38b2ac', '#d69e2e', '#805ad5', '#dd6b20']

    for i, (model, model_data) in enumerate(grouped.groupby('model')):
        color = colors[i % len(colors)]

        fig.add_trace(go.Scatter(
            x=model_data['run_timestamp'],
            y=model_data['mean'],
            mode='lines+markers',
            name=model,
            line=dict(color=color, width=3),
            marker=dict(color=color, size=8),
            hovertemplate=(
                f"{model}<br>"
                "Avg across %{customdata[0]} scenarios<br>"
                "Mean: %{y:.4f}<br>"
                "Std Dev: %{customdata[1]:.4f}"
            ),
            # Columns: [distinct scenario count, mean of stddev]
            customdata=np.column_stack((model_data['scenario_class'], model_data['stddev']))
        ))

    # Update layout
    metric_name = selected_metric or 'Performance'
    fig.update_layout(
        title=dict(
            text=f"{metric_name} - All Models Comparison",
            x=0.5,
            font=dict(size=16)
        ),
        xaxis_title="Date",
        yaxis_title=metric_name,
        margin=dict(t=50, r=50, b=80, l=80),
        plot_bgcolor='#f8f9fa',
        paper_bgcolor='white',
        showlegend=True,
        legend=dict(
            orientation='h',
            x=0.5,
            xanchor='center',
            y=-0.2
        )
    )

    return fig

# Create and display the overview chart for the 'exact_match' metric;
# selected_split is left at its default (None), so no split filter is applied
fig = create_overview_chart(df, selected_metric='exact_match')
fig.show()

In [None]:
# Function to create the time series chart for a specific model
def create_time_series_chart(df, selected_model, selected_metric=None, selected_split=None, is_averaging=False):
    """Build a time series chart of one model's mean score.

    Args:
        df: Benchmark DataFrame with 'model', 'run_timestamp', 'mean',
            'stddev' and 'scenario_class' columns, plus 'metric_name' and
            'split' when the matching filter is used.
        selected_model: Value of 'model' to plot.
        selected_metric: If truthy, keep only rows whose 'metric_name' equals
            this value. Also used in the title and as the y-axis label.
        selected_split: If truthy, keep only rows whose 'split' equals this
            value.
        is_averaging: When True, draw a single trace averaged over all rows of
            each timestamp; otherwise draw one trace per scenario class.

    Returns:
        plotly.graph_objects.Figure: The assembled chart.
    """
    # Boolean filtering builds new frames, so the caller's df is never mutated
    filtered_df = df[df['model'] == selected_model]
    if selected_metric:
        filtered_df = filtered_df[filtered_df['metric_name'] == selected_metric]
    if selected_split:
        filtered_df = filtered_df[filtered_df['split'] == selected_split]

    # Create the figure
    fig = go.Figure()

    if is_averaging:
        # The hover text reports both the number of rows and the number of
        # scenarios behind each point, so aggregate them separately instead of
        # showing one row count under both labels.
        grouped = filtered_df.groupby('run_timestamp').agg(
            mean=('mean', 'mean'),
            stddev=('stddev', 'mean'),
            data_points=('scenario_class', 'count'),
            scenarios=('scenario_class', 'nunique'),
        ).reset_index()
        title_suffix = " (across scenarios)"

        fig.add_trace(go.Scatter(
            x=grouped['run_timestamp'],
            y=grouped['mean'],
            mode='lines+markers',
            name=selected_model,
            line=dict(color='#667eea', width=3),
            marker=dict(color='#667eea', size=8),
            hovertemplate=(
                "%{customdata[0]} data points<br>"
                "Avg across %{customdata[1]} scenarios<br>"
                "Mean: %{y:.4f}<br>"
                "Std Dev: %{customdata[2]:.4f}"
            ),
            # Columns: [row count, distinct scenario count, mean of stddev]
            customdata=np.column_stack(
                (grouped['data_points'], grouped['scenarios'], grouped['stddev'])
            )
        ))
    else:
        grouped = filtered_df.groupby(['run_timestamp', 'scenario_class']).agg({
            'mean': 'mean',
            'stddev': 'mean'
        }).reset_index()
        title_suffix = ""

        for scenario in grouped['scenario_class'].unique():
            scenario_data = grouped[grouped['scenario_class'] == scenario]
            fig.add_trace(go.Scatter(
                x=scenario_data['run_timestamp'],
                y=scenario_data['mean'],
                mode='lines+markers',
                name=scenario,
                line=dict(width=2),
                marker=dict(size=6),
                hovertemplate=(
                    f"Scenario: {scenario}<br>"
                    "Mean: %{y:.4f}<br>"
                    "Std Dev: %{customdata[0]:.4f}"
                ),
                # Must be 2-D (one row per point) because the template indexes
                # customdata[0]; a 1-D array would hand each point a bare
                # scalar that cannot be indexed.
                customdata=scenario_data[['stddev']].to_numpy()
            ))

    # Update layout
    metric_name = selected_metric or 'Metric'
    title_prefix = "Average " if is_averaging else ""

    fig.update_layout(
        title=dict(
            text=f"{title_prefix}{metric_name} - {selected_model}{title_suffix}",
            x=0.5,
            font=dict(size=16)
        ),
        xaxis_title="Date",
        yaxis_title=metric_name,
        margin=dict(t=50, r=50, b=100, l=80),
        plot_bgcolor='#f8f9fa',
        paper_bgcolor='white',
        showlegend=True,
        legend=dict(
            orientation='h',
            x=0.5,
            xanchor='center',
            y=-0.25
        )
    )

    return fig

# Create and display the time series chart for a single model; with
# is_averaging=True the chart holds one trace averaged over each timestamp's rows
fig = create_time_series_chart(df, 
                              selected_model='anthropic/claude-3-5-sonnet-20240620',
                              selected_metric='exact_match',
                              is_averaging=True)
fig.show()

In [None]:
# Function to create the comparison chart
def create_comparison_chart(df, selected_model, selected_metric=None, selected_split=None, is_averaging=False):
    """Build a daily-average scatter chart with ±1 std dev error bars.

    Args:
        df: Benchmark DataFrame with 'model', 'run_date' (datetime), 'mean',
            'stddev' and 'scenario_class' columns, plus 'metric_name' and
            'split' when the matching filter is used. A 'run' column is
            optional and, when present, drives the "Runs" hover value.
        selected_model: Value of 'model' to plot.
        selected_metric: If truthy, keep only rows whose 'metric_name' equals
            this value. Also used as the y-axis label.
        selected_split: If truthy, keep only rows whose 'split' equals this
            value.
        is_averaging: Only affects the title, which gains
            " (across scenarios)" when True.

    Returns:
        plotly.graph_objects.Figure: One marker per date for the most recent
        seven dates that have data.
    """
    filtered_df = df[df['model'] == selected_model]
    if selected_metric:
        filtered_df = filtered_df[filtered_df['metric_name'] == selected_metric]
    if selected_split:
        filtered_df = filtered_df[filtered_df['split'] == selected_split]

    # assign() returns a new frame, which avoids the SettingWithCopyWarning
    # raised by adding a column to a boolean-filtered slice
    daily = filtered_df.assign(date=filtered_df['run_date'].dt.date)

    # "Runs" and "Data points" are different quantities in the hover text:
    # distinct runs per day versus rows per day. Fall back to the row count
    # for runs when the frame carries no 'run' column.
    runs_spec = ('run', 'nunique') if 'run' in daily.columns else ('scenario_class', 'count')
    grouped = daily.groupby('date').agg(
        mean=('mean', 'mean'),
        stddev=('stddev', 'mean'),
        runs=runs_spec,
        data_points=('scenario_class', 'count'),
    ).reset_index()

    # Keep the seven most recent dates that have data (not necessarily seven
    # consecutive calendar days)
    grouped = grouped.sort_values('date').tail(7)

    # Create the figure
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=grouped['date'],
        y=grouped['mean'],
        mode='markers',
        name='Daily Average',
        marker=dict(
            color='#48bb78',
            size=10,
            line=dict(color='#38a169', width=2)
        ),
        error_y=dict(
            type='data',
            array=grouped['stddev'],
            visible=True,
            color='#38a169',
            thickness=2,
            width=6
        ),
        hovertemplate=(
            "Date: %{x}<br>"
            "Mean: %{y:.4f}<br>"
            "Std Dev: %{customdata[0]:.4f}<br>"
            "Runs: %{customdata[1]}<br>"
            "Data points: %{customdata[2]}"
        ),
        # Columns: [mean of stddev, run count, row count]
        customdata=np.column_stack((grouped['stddev'], grouped['runs'], grouped['data_points']))
    ))

    # Update layout
    title_suffix = " (across scenarios)" if is_averaging else ""
    metric_name = selected_metric or 'Value'

    fig.update_layout(
        title=dict(
            text=f"Daily Performance Comparison (±1 Std Dev{title_suffix})",
            x=0.5,
            font=dict(size=16)
        ),
        xaxis_title="Date",
        yaxis_title=metric_name,
        margin=dict(t=60, r=50, b=100, l=80),
        plot_bgcolor='#f8f9fa',
        paper_bgcolor='white',
        showlegend=False
    )

    return fig

# Create and display the daily comparison chart for a single model;
# is_averaging=True only adds " (across scenarios)" to the chart title
fig = create_comparison_chart(df, 
                             selected_model='anthropic/claude-3-5-sonnet-20240620',
                             selected_metric='exact_match',
                             is_averaging=True)
fig.show()

In [None]:
# Function to display summary statistics
def display_summary_stats(df, selected_model, selected_metric=None, selected_split=None, is_averaging=False):
    """Summarise one model's results as a two-column statistics table.

    Args:
        df: Benchmark DataFrame with 'model', 'mean', 'run', 'run_timestamp'
            (datetime) and 'scenario_class' columns, plus 'metric_name' and
            'split' when the matching filter is used.
        selected_model: Value of 'model' to summarise.
        selected_metric: If truthy, keep only rows whose 'metric_name' equals
            this value.
        selected_split: If truthy, keep only rows whose 'split' equals this
            value.
        is_averaging: When True, append a 'Scenarios Averaged' row holding the
            number of distinct scenario classes.

    Returns:
        pandas.DataFrame: Columns 'Statistic' and 'Value' (all values are
        strings). When the selection has no non-null 'mean' values the four
        numeric statistics are reported as 'N/A' instead of raising.
    """
    filtered_df = df[df['model'] == selected_model]
    if selected_metric:
        filtered_df = filtered_df[filtered_df['metric_name'] == selected_metric]
    if selected_split:
        filtered_df = filtered_df[filtered_df['split'] == selected_split]

    # Order chronologically so "Latest Value" really is the newest result
    # instead of whatever row happens to be last in the file. The stable sort
    # keeps file order among equal timestamps; rows without a timestamp are
    # placed first so they can never be picked as the latest.
    scored = filtered_df.dropna(subset=['mean']).sort_values(
        'run_timestamp', kind='mergesort', na_position='first'
    )
    values = scored['mean']
    unique_runs = filtered_df['run'].nunique()
    first_seen = filtered_df['run_timestamp'].min()
    last_seen = filtered_df['run_timestamp'].max()

    def _fmt(value):
        # Empty selections yield NaN aggregates; show them as 'N/A'
        return "N/A" if pd.isna(value) else f"{value:.4f}"

    latest_value = values.iloc[-1] if len(values) else float('nan')
    if pd.isna(first_seen) or pd.isna(last_seen):
        # No timestamps at all (e.g. empty selection): nothing was tracked
        days_tracked = 0
    else:
        # Inclusive span: first and last run on the same day count as one day
        days_tracked = (last_seen - first_seen).days + 1

    # Create a DataFrame for display
    stats_df = pd.DataFrame({
        'Statistic': [
            'Latest Value',
            'Overall Average',
            'Minimum',
            'Maximum',
            'Total Runs',
            'Data Points',
            'Days Tracked'
        ],
        'Value': [
            _fmt(latest_value),
            _fmt(values.mean()),
            _fmt(values.min()),
            _fmt(values.max()),
            str(unique_runs),
            str(len(filtered_df)),
            str(days_tracked)
        ]
    })

    if is_averaging:
        scenario_count = filtered_df['scenario_class'].nunique()
        stats_df = pd.concat([stats_df, pd.DataFrame({
            'Statistic': ['Scenarios Averaged'],
            'Value': [str(scenario_count)]
        })], ignore_index=True)

    return stats_df

# Display summary statistics for one model's 'exact_match' results;
# is_averaging=True appends the 'Scenarios Averaged' row to the table
stats_df = display_summary_stats(df, 
                                selected_model='anthropic/claude-3-5-sonnet-20240620',
                                selected_metric='exact_match',
                                is_averaging=True)
stats_df

In [17]:
# Change these parameters to view different data
fig = create_overview_chart(df, 
                          selected_metric='exact_match',  # Change metric
                          selected_split='test')         # Change split
# Render the figure; without this the cell builds the chart but shows nothing
fig.show()

In [None]:
# Show the loaded benchmark DataFrame as this cell's output
df