In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Path to your results file
results_file = "/Users/serenapei/llmination-reasoning/results/math_results_20250513_121754.pkl"

def inspect_results(file_path):
    """Load and inspect the results data"""
    
    # Check if file exists
    if not os.path.exists(file_path):
        print(f"Error: File not found - {file_path}")
        return
    
    # Load the pickle file
    print(f"Loading data from {file_path}...")
    try:
        df = pd.read_pickle(file_path)
    except Exception as e:
        print(f"Error loading file: {e}")
        return
    
    # Basic info
    print(f"\n===== BASIC INFORMATION =====")
    print(f"Total records: {len(df)}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / (1024*1024):.2f} MB")
    print(f"Columns: {', '.join(df.columns)}")
    
    # Model distribution
    print(f"\n===== MODEL DISTRIBUTION =====")
    model_counts = df.groupby(['api', 'model']).size()
    print(model_counts)
    
    # Reasoning type distribution 
    print(f"\n===== REASONING TYPE DISTRIBUTION =====")
    reasoning_counts = df['reasoning_type'].value_counts()
    print(reasoning_counts)
    print("\nPercentage:")
    print(reasoning_counts / len(df) * 100)
    
    # Confidence stats
    if 'confidence' in df.columns and df['confidence'].notna().any():
        print(f"\n===== CONFIDENCE STATISTICS =====")
        print(f"Mean confidence: {df['confidence'].mean():.2f}%")
        print(f"Median confidence: {df['confidence'].median():.2f}%")
        print(f"Confidence by reasoning type:")
        print(df.groupby('reasoning_type')['confidence'].mean().sort_values(ascending=False))
    
    # Response time
    print(f"\n===== RESPONSE TIME STATISTICS (seconds) =====")
    print(f"Mean response time: {df['response_time'].mean():.2f}s")
    print(f"Median response time: {df['response_time'].median():.2f}s")
    print(f"Response time by model:")
    print(df.groupby(['api', 'model'])['response_time'].mean().sort_values())
    
    # Accuracy analysis (if applicable)
    if 'correct' in df.columns:
        print(f"\n===== ACCURACY ANALYSIS =====")
        overall_acc = df['correct'].mean() * 100
        print(f"Overall accuracy: {overall_acc:.2f}%")
        
        print("\nAccuracy by model:")
        model_acc = df.groupby(['api', 'model'])['correct'].agg(['mean', 'count'])
        model_acc['mean'] = model_acc['mean'] * 100
        print(model_acc.sort_values('mean', ascending=False))
        
        if 'problem_type' in df.columns:
            print("\nAccuracy by problem type:")
            type_acc = df.groupby('problem_type')['correct'].agg(['mean', 'count'])
            type_acc['mean'] = type_acc['mean'] * 100
            print(type_acc.sort_values('mean', ascending=False))
        
        print("\nAccuracy by reasoning type:")
        reason_acc = df.groupby('reasoning_type')['correct'].agg(['mean', 'count'])
        reason_acc['mean'] = reason_acc['mean'] * 100
        print(reason_acc.sort_values('mean', ascending=False))
        
        # Check relationship between confidence and accuracy
        if 'confidence' in df.columns and df['confidence'].notna().any():
            print("\nCorrelation between confidence and accuracy:")
            correlation = df['confidence'].corr(df['correct'])
            print(f"Correlation coefficient: {correlation:.4f}")
            
            # Group by confidence bands
            df['confidence_band'] = pd.cut(df['confidence'], 
                                          bins=[0, 25, 50, 75, 90, 100], 
                                          labels=['0-25%', '26-50%', '51-75%', '76-90%', '91-100%'])
            conf_acc = df.groupby('confidence_band')['correct'].agg(['mean', 'count'])
            conf_acc['mean'] = conf_acc['mean'] * 100
            print("\nAccuracy by confidence band:")
            print(conf_acc)
    
    # Sample responses
    print(f"\n===== SAMPLE RESPONSES =====")
    # Show a few example responses from different models
    for (api, model), group in df.groupby(['api', 'model']):
        if len(group) > 0:
            print(f"\nSample response from {api}/{model}:")
            sample = group.iloc[0]
            print(f"Problem: {sample['prompt_text'][:150]}...")
            print(f"Answer: {sample['answer'][:150]}...")
            print(f"Reasoning: {sample['reasoning_type']}")
            if 'confidence' in df.columns and pd.notna(sample['confidence']):
                print(f"Confidence: {sample['confidence']}%")
            if 'correct' in df.columns:
                print(f"Correct: {sample['correct']}")
            print("-" * 50)
    
    return df

# Execute the inspection
df = inspect_results(results_file)

# Optionally save summarized results to a CSV for easier viewing
if df is not None:
    summary_file = os.path.splitext(results_file)[0] + "_summary.csv"
    df_summary = df[['api', 'model', 'prompt_id', 'problem_type', 'reasoning_type', 
                     'confidence', 'response_time']]
    if 'correct' in df.columns:
        df_summary = df_summary.join(df[['correct']])
    df_summary.to_csv(summary_file, index=False)
    print(f"\nSummary saved to {summary_file}")

print("\nInspection complete!")

Loading data from /Users/serenapei/llmination-reasoning/results/math_results_20250513_121754.pkl...

===== BASIC INFORMATION =====
Total records: 50
Memory usage: 0.05 MB
Columns: prompt_id, prompt_text, api, model, answer, reasoning_type, confidence, response_time, timestamp, problem_type, subject, level, original_answer

===== MODEL DISTRIBUTION =====
api   model         
groq  llama3-8b-8192    50
dtype: int64

===== REASONING TYPE DISTRIBUTION =====
reasoning_type
Reasoning        30
Uncertain        13
Hallucination     7
Name: count, dtype: int64

Percentage:
reasoning_type
Reasoning        60.0
Uncertain        26.0
Hallucination    14.0
Name: count, dtype: float64

===== CONFIDENCE STATISTICS =====
Mean confidence: 97.70%
Median confidence: 100.00%
Confidence by reasoning type:
reasoning_type
Hallucination        100.0
Reasoning             98.4
Uncertain        94.846154
Name: confidence, dtype: object

===== RESPONSE TIME STATISTICS (seconds) =====
Mean response time: 1.27s
M