# Model Success vs. Avg Output Tokens

This notebook:

1. **Extracts token usage** from `result.json` files in trial directories
2. **Combines with performance data** from the difficulty analysis pipeline
3. **Analyzes token efficiency** — tokens per success, and output tokens vs. success rates
4. **Creates visualizations** showing the relationship between token usage and performance


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from pathlib import Path
from datetime import datetime
import ast
from scipy import stats


In [None]:
def _as_dict(value):
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _elapsed_seconds(started_at, finished_at):
    """Return seconds between two ISO-8601 timestamps, or None if unavailable."""
    if not (started_at and finished_at):
        return None
    try:
        # Older Python versions of fromisoformat() reject a trailing 'Z'.
        start_dt = datetime.fromisoformat(started_at.replace('Z', '+00:00'))
        end_dt = datetime.fromisoformat(finished_at.replace('Z', '+00:00'))
        return (end_dt - start_dt).total_seconds()
    except (AttributeError, TypeError, ValueError):
        # Non-string values, unparsable text, or a naive/aware mismatch:
        # timing is optional, so do not discard the trial because of it.
        return None


def extract_token_usage(base_dir=None):
    """Collect per-trial token usage and outcome data from ``result.json`` files.

    Every immediate sub-directory of *base_dir* that contains a
    ``result.json`` is treated as one trial.

    Args:
        base_dir: Directory holding the trial sub-directories. Defaults to
            the notebook's original hard-coded data location.

    Returns:
        A ``pandas.DataFrame`` with one row per usable trial and the columns
        ``trial_id``, ``model_name``, ``task_name``, ``reward``, ``success``,
        ``n_input_tokens``, ``n_output_tokens``, ``total_tokens`` and
        ``execution_time_sec``. The columns are present even when no trial
        was found, so downstream ``groupby`` calls do not fail on an empty
        frame.

    Raises:
        FileNotFoundError: If *base_dir* does not exist.

    Trials are skipped when ``agent_result`` is missing/empty, when the JSON
    cannot be read or parsed, or when the token counts or reward are not
    numeric. A missing or null reward counts as 0 (a failure), matching how a
    missing ``verifier_result`` is treated. Missing or unparsable timestamps
    yield ``execution_time_sec = None`` rather than dropping the trial.
    """
    # TODO: Set path to your terminus2 data directory (output from get_terminus2_runs.py)
    if base_dir is None:
        base_dir = Path("../../../../terminus2_9-17_essential_files")
    base_dir = Path(base_dir)

    columns = [
        'trial_id', 'model_name', 'task_name', 'reward', 'success',
        'n_input_tokens', 'n_output_tokens', 'total_tokens',
        'execution_time_sec',
    ]
    token_data = []
    skipped = 0

    # Sorted so the row order is reproducible across file systems.
    for trial_dir in sorted(base_dir.iterdir()):
        result_file = trial_dir / "result.json"
        if not trial_dir.is_dir() or not result_file.is_file():
            continue

        try:
            with open(result_file, 'r', encoding='utf-8') as f:
                result = json.load(f)
        except (OSError, ValueError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            skipped += 1
            continue

        if not isinstance(result, dict):
            skipped += 1
            continue

        # Token data is in agent_result, not agent_execution
        agent_result = _as_dict(result.get('agent_result'))
        if not agent_result:
            continue

        input_tokens = agent_result.get('n_input_tokens', 0)
        output_tokens = agent_result.get('n_output_tokens', 0)

        reward = _as_dict(result.get('verifier_result')).get('reward', 0)
        if reward is None:
            reward = 0

        numeric_fields = (input_tokens, output_tokens, reward)
        if not all(isinstance(v, (int, float)) for v in numeric_fields):
            # Null or non-numeric counts would corrupt the per-model averages.
            skipped += 1
            continue

        model_info = _as_dict(_as_dict(result.get('agent_info')).get('model_info'))

        # Execution time is in agent_execution
        agent_execution = _as_dict(result.get('agent_execution'))
        execution_time = _elapsed_seconds(
            agent_execution.get('started_at'),
            agent_execution.get('finished_at'),
        )

        token_data.append({
            'trial_id': result.get('id'),
            'model_name': model_info.get('name'),
            'task_name': result.get('task_name'),
            'reward': reward,
            'success': reward > 0,
            'n_input_tokens': input_tokens,
            'n_output_tokens': output_tokens,
            'total_tokens': input_tokens + output_tokens,
            'execution_time_sec': execution_time,
        })

    if skipped:
        print(f"Skipped {skipped} trial(s) with unreadable or malformed result data")

    return pd.DataFrame(token_data, columns=columns)

# Run the extraction once; the resulting DataFrame is reused by the later cells.
token_df = extract_token_usage()
print(f"Extracted token data from {len(token_df)} trials")


In [None]:
def extract_provider_from_model(model_name):
    """Map a model identifier to a provider label.

    Matching is a case-insensitive substring test against a fixed keyword
    list; the first rule that matches wins.

    Args:
        model_name: Model identifier string, e.g. ``"openai/gpt-oss-120b"``.

    Returns:
        The provider label, or ``'Other'`` when no keyword matches or when
        *model_name* is not a string (e.g. ``None`` or NaN).
    """
    if not isinstance(model_name, str):
        return 'Other'

    lowered = model_name.lower()

    # Order is significant: earlier rules take precedence when a name
    # contains keywords from more than one rule.
    provider_rules = (
        (('claude',), 'Anthropic'),
        (('gpt', 'openai'), 'OpenAI'),
        (('gemini',), 'Google'),
        (('deepseek',), 'DeepSeek'),
        (('kimi', 'moonshot'), 'Moonshot'),
        (('qwen',), 'Alibaba'),
        (('grok',), 'xAI'),
        (('glm', 'zai-org'), 'Zhipu AI'),
        (('llama', 'meta'), 'Meta'),
    )
    for keywords, provider in provider_rules:
        if any(keyword in lowered for keyword in keywords):
            return provider
    return 'Other'

def process_token_data(token_df, min_trials=10):
    """Aggregate per-trial token data into one summary row per model.

    Args:
        token_df: Per-trial DataFrame with at least the columns
            ``model_name``, ``n_output_tokens``, ``success`` and
            ``execution_time_sec``.
        min_trials: Minimum number of trials a model needs to be kept in
            the summary. Defaults to 10.

    Returns:
        A DataFrame indexed by ``model_name`` with the columns
        ``avg_output_tokens``, ``std_output_tokens``,
        ``total_output_tokens``, ``success_rate``, ``trial_count``,
        ``avg_execution_time`` and ``provider``. Aggregates are rounded to
        three decimals.
    """
    # NOTE: pandas groupby drops rows whose key is missing by default, so
    # trials without a model name do not appear in the summary.
    model_summary = token_df.groupby('model_name').agg({
        'n_output_tokens': ['mean', 'std', 'sum'],
        'success': ['mean', 'count'],
        'execution_time_sec': 'mean'
    }).round(3)

    # Flatten the (column, aggregation) MultiIndex; the order below follows
    # the order of the aggregation spec above.
    model_summary.columns = [
        'avg_output_tokens', 'std_output_tokens', 'total_output_tokens',
        'success_rate', 'trial_count', 'avg_execution_time'
    ]

    # Models with too few trials give unreliable success rates.
    model_summary = model_summary[model_summary['trial_count'] >= min_trials].copy()
    model_summary['provider'] = model_summary.index.map(extract_provider_from_model)

    return model_summary

# Aggregate the per-trial rows into one summary row per model for plotting and export.
model_summary = process_token_data(token_df)
print(f"Processed data for {len(model_summary)} models with >= 10 trials")


In [None]:
def simplify_model_name(model_name):
    """Return a short display label for a known model identifier.

    Identifiers that have no entry in the lookup table are returned
    unchanged.
    """
    display_labels = dict((
        ("claude-sonnet-4-20250514", "Claude Sonnet 4"),
        ("claude-opus-4-1-20250805", "Claude Opus 4.1"),
        ("gpt-5", "GPT-5"),
        ("gpt-5-mini", "GPT-5-Mini"),
        ("gpt-5-nano", "GPT-5-Nano"),
        ("grok-4-0709", "Grok 4"),
        ("grok-code-fast-1", "Grok Code Fast 1"),
        ("gemini-2.5-pro", "Gemini 2.5 Pro"),
        ("gemini-2.5-flash", "Gemini 2.5 Flash"),
        ("Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8", "Qwen 3 Coder 480B"),
        ("openai/gpt-oss-120b", "GPT-OSS 120B"),
        ("OpenAI/gpt-oss-20B", "GPT-OSS 20B"),
        ("moonshotai/Kimi-K2-Instruct-0905", "Kimi K2"),
        ("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama 4 Maverick 17B"),
        ("zai-org/GLM-4.5-Air-FP8", "GLM 4.5 Air"),
        ("deepseek-ai/DeepSeek-V3.1", "DeepSeek V3.1"),
    ))
    try:
        return display_labels[model_name]
    except KeyError:
        # Unknown identifier: fall back to the raw name.
        return model_name

def create_token_success_plot(model_summary):
    """Scatter-plot success rate against average output tokens per model.

    Points are coloured by provider, each point is annotated with a display
    name, and a linear trend line is drawn when a Pearson correlation can be
    computed. The figure is saved to ``results/`` as PNG and PDF and then
    shown.

    Args:
        model_summary: DataFrame with the columns ``avg_output_tokens``,
            ``success_rate`` and ``provider``. The model identifier is read
            from a ``model_name`` column if present, otherwise from the
            index.

    Returns:
        ``(correlation, p_value)`` from ``scipy.stats.pearsonr``, or
        ``(None, None)`` when there are fewer than two models, when either
        axis has no variation, or when the correlation could not be computed.
    """
    print("Data summary:")
    print(f"Models: {len(model_summary)}")
    print(f"Output tokens range: {model_summary['avg_output_tokens'].min():.0f} - {model_summary['avg_output_tokens'].max():.0f}")
    print(f"Success rate range: {model_summary['success_rate'].min():.3f} - {model_summary['success_rate'].max():.3f}")

    # Check if we have enough variation in the data
    if len(model_summary) < 2:
        print("Not enough models for correlation analysis")
        return None, None

    if model_summary['avg_output_tokens'].std() == 0:
        print("All models have same output tokens - no correlation possible")
        return None, None

    if model_summary['success_rate'].std() == 0:
        print("All models have same success rate - no correlation possible")
        return None, None

    fig, ax = plt.subplots(figsize=(12, 8))

    provider_colors = {
        'OpenAI': '#d62728',
        'Anthropic': '#2ca02c',
        'Google': '#ff7f0e',
        'xAI': '#bcbd22',
        'DeepSeek': '#9467bd',
        'Meta': '#8c564b',
        'Moonshot': '#e377c2',
        'Alibaba': '#7f7f7f',
        'Zhipu AI': '#17becf',
        'Other': '#1f77b4'
    }

    for provider in model_summary['provider'].unique():
        provider_data = model_summary[model_summary['provider'] == provider]
        ax.scatter(provider_data['avg_output_tokens'], provider_data['success_rate'],
                   c=provider_colors.get(provider, '#95A5A6'),
                   label=provider, s=100, alpha=0.7, edgecolors='black', linewidth=1)

    x = model_summary['avg_output_tokens']
    y = model_summary['success_rate']

    # Try to calculate correlation and trend line with error handling
    correlation, p_value = None, None
    try:
        correlation, p_value = stats.pearsonr(x, y)

        # Only add trend line if correlation is valid
        if not np.isnan(correlation):
            trend = np.poly1d(np.polyfit(x, y, 1))
            # A straight line only needs its two end points.
            x_line = np.array([x.min(), x.max()])
            ax.plot(x_line, trend(x_line), "r--", alpha=0.8, linewidth=2,
                    label=f'Trend (r={correlation:.3f}, p={p_value:.3f})')
    except (ValueError, TypeError, np.linalg.LinAlgError):
        print("Could not calculate correlation - data may be constant")

    # Add model name labels. A summary produced by groupby('model_name')
    # carries the identifier in the index rather than in a column, so fall
    # back to the index label when no 'model_name' column exists.
    for index_label, row in model_summary.iterrows():
        raw_name = row['model_name'] if 'model_name' in row.index else index_label
        display_name = simplify_model_name(raw_name)

        ax.annotate(display_name,
                    (row['avg_output_tokens'], row['success_rate']),
                    xytext=(8, 0), textcoords='offset points',
                    fontsize=10, ha='left', va='center')

    ax.set_xlabel('Average Output Tokens', fontsize=12, fontweight='bold')
    ax.set_ylabel('Success Rate', fontsize=12, fontweight='bold')
    ax.set_title('Success Rate vs Output Tokens', fontsize=14, fontweight='bold')

    ax.grid(True, alpha=0.3)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    # savefig does not create missing directories.
    Path('results').mkdir(exist_ok=True)
    plt.savefig('results/success_vs_output_tokens.png', dpi=300, bbox_inches='tight')
    plt.savefig('results/success_vs_output_tokens.pdf', dpi=300, bbox_inches='tight')
    plt.show()

    return correlation, p_value

# The plot function writes its PNG/PDF output into this directory.
Path("results").mkdir(exist_ok=True)
correlation, p_value = create_token_success_plot(model_summary)

# Report the statistics only when both values were actually computed.
if correlation is None or p_value is None:
    print("Could not calculate correlation statistics")
else:
    print(f"Correlation: {correlation:.3f}, p-value: {p_value:.3f}")


In [None]:
# Persist the per-model table first.
model_summary.to_csv('results/model_token_summary.csv')

# Roll the per-model rows up to one row per provider: averages of the
# per-model means, and the total number of trials.
provider_aggregations = {
    'avg_output_tokens': 'mean',
    'success_rate': 'mean',
    'trial_count': 'sum',
    'avg_execution_time': 'mean'
}
provider_groups = model_summary.groupby('provider')
provider_summary = provider_groups.agg(provider_aggregations).round(3)

provider_summary.to_csv('results/provider_token_summary.csv')

# One line of output per artefact produced by this notebook.
print(
    "Saved files:",
    "- model_token_summary.csv",
    "- provider_token_summary.csv",
    "- success_vs_output_tokens.png",
    "- success_vs_output_tokens.pdf",
    sep="\n",
)
