In [48]:
import pandas as pd

In [49]:
import matplotlib.pyplot as plt
import numpy as np


In [50]:

def _annotate_trend(group: pd.DataFrame,
                    stat_column: str,
                    threshold: float = 0.001) -> pd.DataFrame:
    """
    Add per-run change columns and an overall trend label to one combo's runs.

    Args:
        group: Rows of a single model-dataset-metric combo, already ordered
            by time.
        stat_column: Column holding the metric value.
        threshold: Absolute first-to-last change above which the trend is
            labelled 'improving' (positive) or 'declining' (negative).

    Returns:
        A copy of ``group`` with 'change_from_previous', 'change_from_first'
        and 'trend_direction' columns added.
    """
    group = group.copy()

    # A single run has no trend; report zero change and flag it explicitly.
    if len(group) < 2:
        group['change_from_previous'] = 0
        group['change_from_first'] = 0
        group['trend_direction'] = 'insufficient_data'
        return group

    values = group[stat_column]
    first_value = values.iloc[0]
    overall_change = values.iloc[-1] - first_value

    group['change_from_previous'] = values.diff()
    group['change_from_first'] = values - first_value

    # The small threshold keeps floating-point noise from registering as a trend.
    if overall_change > threshold:
        group['trend_direction'] = 'improving'
    elif overall_change < -threshold:
        group['trend_direction'] = 'declining'
    else:
        group['trend_direction'] = 'stable'

    return group


def analyze_metric_trends_from_csv(df: pd.DataFrame, 
                                  model: str = None, 
                                  dataset: str = None,
                                  metric_name: str = None,
                                  stat_column: str = 'mean',
                                  min_runs: int = 2) -> pd.DataFrame:
    """
    Analyze how metric values change over time for model-dataset combinations.

    The input must contain the columns 'model', 'scenario_class', 'name',
    'run_timestamp', 'run_date', 'count', 'run' and ``stat_column``.

    Args:
        df: DataFrame loaded from the saved CSV (not modified).
        model: Specific model to filter for (None for all models)
        dataset: Specific dataset/scenario_class to filter for (None for all)
        metric_name: Specific metric to analyze (None for all metrics)
        stat_column: Which statistical column to use ('mean', 'max', 'min', etc.)
        min_runs: Minimum number of runs required to include a combo

    Returns:
        DataFrame with one row per (model, scenario_class, name,
        run_timestamp, run_date), ordered by combo and then time, with the
        added columns 'run_sequence' (1-based position in time),
        'change_from_previous', 'change_from_first' and 'trend_direction'
        ('improving', 'declining', 'stable' or 'insufficient_data').
        The added columns are present even when no rows survive filtering.
    """
    combo_keys = ['model', 'scenario_class', 'name']

    # Work on a copy so the caller's frame is never modified.
    analysis_df = df.copy()

    # Parse textual timestamps. Checking only for the 'object' dtype would
    # miss columns that pandas stores with a dedicated string dtype.
    timestamps = analysis_df['run_timestamp']
    if (pd.api.types.is_object_dtype(timestamps)
            or pd.api.types.is_string_dtype(timestamps)):
        analysis_df['run_timestamp'] = pd.to_datetime(timestamps)

    # Apply the optional filters.
    if model:
        analysis_df = analysis_df[analysis_df['model'] == model]
    if dataset:
        analysis_df = analysis_df[analysis_df['scenario_class'] == dataset]
    if metric_name:
        analysis_df = analysis_df[analysis_df['name'] == metric_name]

    # Collapse to one row per combo and run; the sorted group keys leave each
    # combo's rows in chronological order.
    time_series = (analysis_df
                  .groupby(combo_keys + ['run_timestamp', 'run_date'])
                  .agg({
                      stat_column: 'first',  # Metric value
                      'count': 'first',      # Number of instances
                      'run': 'first'         # Run identifier
                  })
                  .reset_index())

    # Number the runs of each combo in time order.
    time_series['run_sequence'] = time_series.groupby(combo_keys).cumcount() + 1

    # Keep only combos with enough runs.
    runs_per_combo = (time_series
                      .groupby(combo_keys)['run_timestamp']
                      .transform('size'))
    time_series = time_series[runs_per_combo >= min_runs]

    # Sort by time for proper trend analysis.
    time_series = time_series.sort_values(combo_keys + ['run_timestamp'])

    # Iterate the groups explicitly rather than using groupby.apply, so the
    # key columns are always part of each group and survive into the result.
    annotated = [
        _annotate_trend(group, stat_column)
        for _, group in time_series.groupby(combo_keys)
    ]

    if not annotated:
        # Nothing survived filtering: still expose the trend columns so that
        # downstream consumers find the schema they expect.
        return (time_series
                .assign(change_from_previous=pd.Series(dtype='float64'),
                        change_from_first=pd.Series(dtype='float64'),
                        trend_direction=pd.Series(dtype='object'))
                .reset_index(drop=True))

    return pd.concat(annotated, ignore_index=True)


def get_metric_summary_table(df: pd.DataFrame, 
                           stat_column: str = 'mean',
                           top_n_recent: int = 5) -> pd.DataFrame:
    """
    Create a summary table showing the latest performance for each model-dataset-metric combo.

    Args:
        df: DataFrame from analyze_metric_trends_from_csv; must contain
            'model', 'scenario_class', 'name', 'run_timestamp',
            'trend_direction', 'change_from_first' and ``stat_column``.
        stat_column: Which column to summarize
        top_n_recent: Number of most recent runs to average for "current" performance

    Returns:
        Summary DataFrame with one row per combo and the columns
        ``current_<stat_column>`` (mean over the recent runs),
        ``<stat_column>_std``, 'num_recent_runs', 'earliest_recent_run',
        'latest_run', 'trend' and 'total_change' (the 'change_from_first'
        value of the most recent run). Rows are sorted by model, dataset,
        then current performance descending.
    """
    combo_keys = ['model', 'scenario_class', 'name']

    # Keep the N most recent runs of each combo in chronological order, so
    # that 'last' in the aggregation below refers to the newest run.
    # Missing timestamps sort first and are therefore treated as oldest.
    latest_performance = (df
                         .sort_values('run_timestamp', kind='stable',
                                      na_position='first')
                         .groupby(combo_keys)
                         .tail(top_n_recent))

    # Calculate summary statistics over the recent window.
    summary = (latest_performance
              .groupby(combo_keys)
              .agg({
                  stat_column: ['mean', 'std', 'count'],
                  'run_timestamp': ['min', 'max'],
                  'trend_direction': 'first',
                  'change_from_first': 'last'
              })
              .round(4))

    # Flatten the two-level column index into '<column>_<aggregation>' names.
    summary.columns = ['_'.join(col).strip() for col in summary.columns]
    summary = summary.rename(columns={
        f'{stat_column}_mean': f'current_{stat_column}',
        f'{stat_column}_count': 'num_recent_runs',
        'run_timestamp_min': 'earliest_recent_run',
        'run_timestamp_max': 'latest_run',
        'trend_direction_first': 'trend',
        'change_from_first_last': 'total_change'
    })

    summary = summary.reset_index()

    # Sort by model, dataset, and current performance (best first).
    summary = summary.sort_values(['model', 'scenario_class', f'current_{stat_column}'], 
                                 ascending=[True, True, False])

    return summary


def plot_metric_trends(df: pd.DataFrame, 
                      model: str = None,
                      dataset: str = None, 
                      metric_name: str = None,
                      stat_column: str = 'mean',
                      save_path: str = None) -> None:
    """
    Plot metric trends over time, one subplot per metric. Requires matplotlib.

    Prints a message and returns without plotting when matplotlib cannot be
    imported or when the filters leave no rows.

    Args:
        df: DataFrame from analyze_metric_trends_from_csv  
        model: Model to plot (None for all)
        dataset: Dataset to plot (None for all)
        metric_name: Metric to plot (None for all)
        stat_column: Which column to plot
        save_path: Path to save plot (None to display). When given, the
            figure is written to this path and then closed.
    """
    try:
        import matplotlib.pyplot as plt
        import matplotlib.dates as mdates
    except ImportError:
        print("matplotlib not available. Install with: pip install matplotlib")
        return

    # Filter data
    plot_df = df.copy()
    if model:
        plot_df = plot_df[plot_df['model'] == model]
    if dataset:
        plot_df = plot_df[plot_df['scenario_class'] == dataset]
    if metric_name:
        plot_df = plot_df[plot_df['name'] == metric_name]

    if plot_df.empty:
        print("No data to plot with the given filters")
        return

    # One subplot row per metric; squeeze=False keeps `axes` two-dimensional
    # even for a single metric, so it can always be flattened and indexed.
    metrics = plot_df['name'].unique()
    n_metrics = len(metrics)

    fig, axes = plt.subplots(n_metrics, 1, figsize=(12, 4*n_metrics), squeeze=False)
    axes = axes.flatten()

    for ax, metric in zip(axes, metrics):
        metric_data = plot_df[plot_df['name'] == metric]

        # One line per model-dataset combo, drawn in chronological order.
        for (model_name, dataset_name), group in metric_data.groupby(['model', 'scenario_class']):
            group = group.sort_values('run_timestamp')
            ax.plot(group['run_timestamp'], group[stat_column], 
                   marker='o', label=f"{model_name} - {dataset_name}")

        ax.set_title(f"{metric} - {stat_column}")
        ax.set_xlabel("Time")
        ax.set_ylabel(stat_column)
        ax.legend()
        ax.grid(True, alpha=0.3)

        # Format x-axis dates
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

    fig.tight_layout()

    if save_path:
        # Save this specific figure (not whatever pyplot considers current),
        # then close it so repeated calls do not accumulate open figures.
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close(fig)
        print(f"Plot saved to {save_path}")
    else:
        plt.show()


# Example usage function
def analyze_saved_benchmark_data(csv_path: str = "benchmark_summary.csv") -> dict[str, pd.DataFrame]:
    """
    Load and analyze the saved benchmark data CSV, printing a short report.

    Args:
        csv_path: Path to the saved CSV file

    Returns:
        Dictionary with the keys 'raw_data' (the loaded CSV), 'trends'
        (output of analyze_metric_trends_from_csv) and 'summary' (output of
        get_metric_summary_table).
    """
    # Load the data
    df = pd.read_csv(csv_path)
    print(f"Loaded {len(df)} rows from {csv_path}")

    # Get overall trends
    trends = analyze_metric_trends_from_csv(df)
    print(f"Found {len(trends)} metric trend points across {trends.groupby(['model', 'scenario_class', 'name']).ngroups} model-dataset-metric combinations")

    # Get summary table
    summary = get_metric_summary_table(trends)
    print(f"Generated summary for {len(summary)} combinations")

    # The summary is ordered by model and dataset first, so its leading rows
    # are not the best performers; rank by the current mean explicitly.
    print("\nTop performing model-dataset combinations (by mean):")
    top_performing = summary.nlargest(10, 'current_mean')
    print(top_performing[['model', 'scenario_class', 'name', 'current_mean', 'trend', 'total_change']].to_string())

    print("\nMost improved combinations:")
    improved = summary[summary['total_change'] > 0].nlargest(5, 'total_change')
    print(improved[['model', 'scenario_class', 'name', 'current_mean', 'total_change']].to_string())

    return {
        'raw_data': df,
        'trends': trends,
        'summary': summary
    }

In [None]:
# Run the full analysis on the saved CSV; prints a short report and returns
# the raw data, the trend table and the summary table.
results = analyze_saved_benchmark_data(csv_path="benchmark_summary.csv")

# Trends restricted to a single model-dataset combination
gpt4_math_trends = analyze_metric_trends_from_csv(
    df=results['raw_data'],
    model="gpt-4",
    dataset="math_scenario",
)

# Trends for one metric across every model-dataset combination
exact_match_trends = analyze_metric_trends_from_csv(
    df=results['raw_data'],
    metric_name="exact_match",
)

# Save a plot of that metric's trends (needs matplotlib)
plot_metric_trends(
    df=results['trends'],
    metric_name="exact_match",
    save_path="exact_match_trends.png",
)