# Success vs. Total Average Episodes

For each model across ALL tasks and trials:

1. TOTAL AVERAGE EPISODES:
   - Total episodes = sum of all episode counts across all trials for that model
   - Total trials = count of all trials for that model  
   - Total avg episodes = total episodes / total trials
   
   This gives us the average number of episodes a model uses per trial
   (whether or not the trial succeeded), aggregated across all tasks it attempted.

2. SUCCESS RATE:
   - Successful trials = count of trials where reward = 1
   - Success rate = successful trials / total trials
   
   This gives us the proportion of trials where the model succeeded.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from pathlib import Path
from scipy import stats


In [None]:
def load_episode_data(episode_file=None):
    """Load per-trial episode counts from JSON and keep only usable rows.

    Args:
        episode_file: Path (``str`` or ``Path``) to the ``episode_counts.json``
            file. When omitted, the notebook's default relative location is
            used.

    Returns:
        A ``pandas.DataFrame`` containing only the trials where every
        required column is non-null and ``episode_count`` is non-negative.
        The original row index is preserved.

    Raises:
        KeyError: If any required column is absent from the loaded data.
    """
    if episode_file is None:
        # TODO: Set path to your terminus2 data directory (output from get_terminus2_runs.py)
        episode_file = Path("../../../terminus2_9-17_essential_files/episode_counts.json")
    episode_file = Path(episode_file)

    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(episode_file, 'r', encoding='utf-8') as f:
        episode_data = json.load(f)

    df = pd.DataFrame(episode_data)

    required_columns = ['trial_id', 'episode_count', 'model_name', 'task_name', 'reward']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"{episode_file} is missing required columns: {missing_columns}"
        )

    # Drop null rows first so the numeric comparison below never sees None/NaN.
    df = df[df[required_columns].notna().all(axis=1)]
    df = df[df['episode_count'] >= 0].copy()

    return df

# Load the filtered trials once at notebook scope; later cells read `df`.
df = load_episode_data()
print(f"Loaded {len(df)} valid trials")


In [None]:
def calculate_model_statistics(df):
    """Aggregate trial-level rows into one summary row per model.

    Args:
        df: DataFrame with at least ``model_name``, ``episode_count`` and
            ``reward`` columns, one row per trial.

    Returns:
        A DataFrame with columns ``model_name``, ``total_episodes``,
        ``total_trials``, ``successful_trials`` and ``success_rate``, sorted
        by ``success_rate`` in descending order. A trial counts as successful
        when ``reward == 1``. An empty input yields an empty frame that still
        carries these columns.
    """
    columns = [
        'model_name',
        'total_episodes',
        'total_trials',
        'successful_trials',
        'success_rate',
    ]
    model_stats = []

    # One pass over the data; sort=False keeps models in first-appearance
    # order. Rows whose model_name is null are skipped by groupby.
    for model_name, model_data in df.groupby('model_name', sort=False):
        total_trials = len(model_data)
        successful_trials = int((model_data['reward'] == 1).sum())

        model_stats.append({
            'model_name': model_name,
            'total_episodes': model_data['episode_count'].sum(),
            'total_trials': total_trials,
            'successful_trials': successful_trials,
            # Every group has at least one row, so the division is safe.
            'success_rate': successful_trials / total_trials,
        })

    # Explicit columns keep the schema intact when there are no models,
    # so the sort below does not fail on a missing 'success_rate' column.
    stats_df = pd.DataFrame(model_stats, columns=columns)
    # A stable sort makes the order of tied models deterministic.
    stats_df = stats_df.sort_values('success_rate', ascending=False, kind='mergesort')

    return stats_df

# Build the per-model summary (sorted by success rate, highest first) and
# preview its first ten rows.
# NOTE(review): `display` is not imported in this file — presumably the
# IPython/Jupyter builtin; confirm before running outside a notebook.
stats_df = calculate_model_statistics(df)
display(stats_df.head(10))


In [None]:
def simplify_model_name(model_name):
    """Return the human-readable label for a raw model identifier.

    The lookup is exact and case-sensitive. Identifiers without a known
    label are returned unchanged.
    """
    label_pairs = (
        ("claude-sonnet-4-20250514", "Claude Sonnet 4"),
        ("claude-opus-4-1-20250805", "Claude Opus 4.1"),
        ("gpt-5", "GPT-5"),
        ("gpt-5-mini", "GPT-5-Mini"),
        ("gpt-5-nano", "GPT-5-Nano"),
        ("grok-4-0709", "Grok 4"),
        ("grok-code-fast-1", "Grok Code Fast 1"),
        ("gemini-2.5-pro", "Gemini 2.5 Pro"),
        ("gemini-2.5-flash", "Gemini 2.5 Flash"),
        ("Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8", "Qwen 3 Coder 480B"),
        ("openai/gpt-oss-120b", "GPT-OSS 120B"),
        ("OpenAI/gpt-oss-20B", "GPT-OSS 20B"),
        ("moonshotai/Kimi-K2-Instruct-0905", "Kimi K2"),
        ("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama 4 Maverick 17B"),
        ("zai-org/GLM-4.5-Air-FP8", "GLM 4.5 Air"),
        ("deepseek-ai/DeepSeek-V3.1", "DeepSeek V3.1"),
    )
    labels_by_id = dict(label_pairs)

    try:
        return labels_by_id[model_name]
    except KeyError:
        # Unknown identifiers pass through untouched.
        return model_name

def extract_provider_from_model(model_name):
    """Map a model identifier to its provider by case-insensitive substring match.

    Rules are tried in order and the first rule with a matching keyword
    wins. Identifiers matching no rule are labelled ``'Other'``.
    """
    lowered = model_name.lower()

    # (keywords, provider) pairs; the order defines precedence.
    provider_rules = (
        (('claude',), 'Anthropic'),
        (('gpt', 'openai'), 'OpenAI'),
        (('gemini',), 'Google'),
        (('deepseek',), 'DeepSeek'),
        (('kimi', 'moonshot'), 'Moonshot'),
        (('qwen',), 'Alibaba'),
        (('grok',), 'xAI'),
        (('glm', 'zai-org'), 'Zhipu'),
        (('llama', 'meta'), 'Meta'),
    )

    for keywords, provider in provider_rules:
        if any(keyword in lowered for keyword in keywords):
            return provider

    return 'Other'


In [None]:
def create_scatter_plot(stats_df):
    """Plot per-model success rate against average episodes per trial.

    Points are coloured by provider, each point is labelled with the model's
    display name, and a least-squares trend line is drawn when at least two
    models are available. The figure is written to
    ``total_avg_episodes_vs_success.png`` and ``.pdf`` in the current
    directory and then shown.

    Args:
        stats_df: DataFrame with ``model_name``, ``total_episodes``,
            ``total_trials`` and ``success_rate`` columns, one row per model.
            The caller's frame is not modified; the function works on a copy.

    Returns:
        Tuple ``(correlation, p_value)`` from ``scipy.stats.pearsonr`` between
        average episodes and success rate. Both are NaN when fewer than two
        models are present, since no correlation can be computed.
    """
    fig, ax = plt.subplots(figsize=(14, 10))

    stats_df = stats_df.copy()
    stats_df['provider'] = stats_df['model_name'].apply(extract_provider_from_model)

    provider_colors = {
        'OpenAI': '#d62728',
        'Anthropic': '#2ca02c',
        'Google': '#ff7f0e',
        'xAI': '#bcbd22',
        'DeepSeek': '#9467bd',
        'Meta': '#8c564b',
        'Moonshot': '#e377c2',
        'Alibaba': '#7f7f7f',
        'Zhipu': '#17becf',
        'Other': '#1f77b4'
    }

    stats_df['total_avg_episodes'] = stats_df['total_episodes'] / stats_df['total_trials']

    # One scatter call per provider so each gets its own legend entry.
    for provider in stats_df['provider'].unique():
        provider_data = stats_df[stats_df['provider'] == provider]
        ax.scatter(provider_data['total_avg_episodes'], provider_data['success_rate'],
                  c=provider_colors.get(provider, '#95A5A6'),
                  label=provider, s=100, alpha=0.7, edgecolors='black', linewidth=1)

    x = stats_df['total_avg_episodes']
    y = stats_df['success_rate']

    if len(stats_df) >= 2:
        correlation, p_value = stats.pearsonr(x, y)

        trend = np.poly1d(np.polyfit(x, y, 1))
        # Draw the line between the two x extremes only. Plotting through x in
        # row order would make the path double back on itself, so the dashes
        # overlap and render unevenly.
        x_line = np.array([x.min(), x.max()])
        ax.plot(x_line, trend(x_line), "r--", alpha=0.8, linewidth=2,
                label=f'Trend Line (r={correlation:.3f}, p={p_value:.3f})')
    else:
        # Correlation and a fitted line are undefined for fewer than two points.
        correlation, p_value = float('nan'), float('nan')

    for _, row in stats_df.iterrows():
        display_name = simplify_model_name(row['model_name'])
        x_pos = row['total_avg_episodes']
        y_pos = row['success_rate']

        # Labels sit to the left of their point by default; 'Grok 4' is
        # moved up and to the right instead.
        base_offset = (-8, 0)
        if display_name == 'Grok 4':
            base_offset = (8, 8)

        ha = 'right' if base_offset[0] < 0 else 'left'

        ax.annotate(display_name,
                   (x_pos, y_pos),
                   xytext=base_offset, textcoords='offset points',
                   fontsize=10, ha=ha, va='center')

    ax.set_xlabel('Total Average Episodes', fontsize=12, fontweight='bold')
    ax.set_ylabel('Success Rate', fontsize=12, fontweight='bold')
    ax.set_title('Total Average Episodes vs Success Rate', fontsize=14, fontweight='bold')

    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, _: f'{value:.2f}'))
    ax.grid(True, alpha=0.3)
    ax.set_xlim(left=0)

    # Leave 10% headroom above the best model. With no data or all-zero
    # success rates the computed top is NaN or 0, so only the lower bound
    # is pinned in that case.
    y_top = stats_df['success_rate'].max() * 1.1
    if np.isfinite(y_top) and y_top > 0:
        ax.set_ylim(bottom=0, top=y_top)
    else:
        ax.set_ylim(bottom=0)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.savefig('total_avg_episodes_vs_success.png', dpi=300, bbox_inches='tight')
    plt.savefig('total_avg_episodes_vs_success.pdf', dpi=300, bbox_inches='tight')
    plt.show()

    return correlation, p_value

# Render and save the figure, then report the Pearson correlation it returns.
correlation, p_value = create_scatter_plot(stats_df)
print(f"Correlation: {correlation:.3f}, p-value: {p_value:.3f}")


In [None]:
# Write the per-model summary table to CSV without the index column.
# create_scatter_plot adds its extra columns to a copy, so only the columns
# from calculate_model_statistics are exported here.
stats_df.to_csv('model_total_statistics.csv', index=False)
print(f"Saved statistics for {len(stats_df)} models")
