# Task Difficulty Analysis Pipeline


We compare:

1. MODEL DIFFICULTY (Empirical):
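- **Model resolve rate** = (models that solve the task) / (total models tested), where a model counts as solving a task when it succeeds in more than half of its trials on that task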
- **Classification**:
  - Easy: >= 66.7% of models solve it
  - Medium: >= 33.3% and < 66.7% of models solve it
  - Hard: < 33.3% of models solve it

This gives us empirical difficulty based on actual model performance.

2. HUMAN DIFFICULTY (Predicted):
- **Human labels** from terminal-bench dataset task definitions
- Categories: medium, hard
- Based on human assessment of task complexity

3. DIFFICULTY MATRIX:
Creates a confusion matrix comparing human predictions vs empirical model performance.


In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
import matplotlib.colors as mcolors
import toml
import yaml

# TODO: Set path to your terminus2 data directory (output from get_terminus2_runs.py)
# This directory is scanned for one sub-directory per trial, each holding a result.json.
TRAJECTORY_DIR = ""  # e.g., "../../../terminus2_9-17_essential_files"
# Directory that receives the CSV outputs, the trial cache, and the heatmap image.
RESULTS_DIR = "difficulty_analysis_results"
# Create the output directory up front so the later to_csv/savefig calls can write into it.
Path(RESULTS_DIR).mkdir(exist_ok=True)


In [None]:
def extract_trial_data(trial_dir):
    """Flatten one trial's ``result.json`` into a dict of analysis fields.

    Args:
        trial_dir: ``pathlib.Path`` of a single trial directory.

    Returns:
        A dict with the keys ``trial_id``, ``trial_name``, ``model_name``,
        ``agent_name``, ``task_name``, ``reward``, ``success``, ``trial_uri``,
        ``created_at``, ``agent_setup_time_sec`` and
        ``agent_execution_time_sec``. Returns ``None`` when ``result.json``
        is missing, unreadable, or not shaped as expected.
    """
    result_path = trial_dir / "result.json"
    if not result_path.exists():
        return None

    try:
        with open(result_path, encoding="utf-8") as f:
            result = json.load(f)

        # ``.get(key, default)`` only covers *missing* keys; a JSON ``null``
        # comes back as None, so every nested lookup also needs ``or {}``.
        verifier_result = result.get('verifier_result') or {}
        reward = verifier_result.get('reward', 0) if isinstance(verifier_result, dict) else 0
        if reward is None:
            # A null reward is treated like a missing one instead of
            # discarding the whole trial.
            reward = 0

        agent_info = result.get('agent_info') or {}
        model_info = agent_info.get('model_info') or {}

        data = {
            'trial_id': result.get('id'),
            'trial_name': result.get('trial_name'),
            'model_name': model_info.get('name'),
            'agent_name': agent_info.get('name'),
            'task_name': result.get('task_name'),
            'reward': reward,
            'success': reward > 0,
            'trial_uri': result.get('trial_uri'),
            'created_at': result.get('created_at'),
        }

        for phase in ('setup', 'execution'):
            phase_data = result.get(f'agent_{phase}') or {}
            start = phase_data.get('started_at')
            end = phase_data.get('finished_at')
            if start and end:
                # Older ``fromisoformat`` versions reject a trailing "Z",
                # so it is rewritten as an explicit UTC offset first.
                start_dt = datetime.fromisoformat(start.replace('Z', '+00:00'))
                end_dt = datetime.fromisoformat(end.replace('Z', '+00:00'))
                data[f'agent_{phase}_time_sec'] = (end_dt - start_dt).total_seconds()
            else:
                data[f'agent_{phase}_time_sec'] = None

        return data
    except (OSError, ValueError, TypeError, AttributeError):
        # Unreadable file, invalid JSON / timestamp, or unexpected value
        # types: skip this trial. KeyboardInterrupt and SystemExit are
        # deliberately no longer swallowed.
        return None

def extract_all_trials(trajectories_dir=TRAJECTORY_DIR, output_dir=RESULTS_DIR):
    """Load every trial under *trajectories_dir*, reusing an on-disk cache.

    Each immediate sub-directory of *trajectories_dir* is treated as one
    trial and keyed by its directory name. Trials already present in
    ``<output_dir>/trials_cache.json`` are not parsed again; the cache file
    is rewritten only when at least one new trial was added.

    Args:
        trajectories_dir: Directory holding one sub-directory per trial.
            NOTE: ``Path("")`` is the current working directory, so an
            empty string scans the working directory.
        output_dir: Directory holding ``trials_cache.json``; created
            (including missing parents) if needed.

    Returns:
        ``(trials_dict, trials_df)``: the mapping of trial directory name to
        extracted trial dict, and a DataFrame with one row per trial. When
        *trajectories_dir* does not exist the result is ``({}, empty
        DataFrame)`` and the cache is not consulted.
    """
    trajectories_path = Path(trajectories_dir)
    if not trajectories_path.exists():
        return {}, pd.DataFrame()

    output_path = Path(output_dir)
    # parents=True so a nested output_dir (e.g. "out/run1") works as well.
    output_path.mkdir(parents=True, exist_ok=True)

    cache_path = output_path / "trials_cache.json"
    trials_dict = {}
    if cache_path.exists():
        with open(cache_path, encoding="utf-8") as f:
            trials_dict = json.load(f)

    new_count = 0
    for trial_dir in trajectories_path.iterdir():
        if not trial_dir.is_dir():
            continue
        trial_id = trial_dir.name
        if trial_id in trials_dict:
            continue  # already cached: skip re-parsing result.json
        trial_data = extract_trial_data(trial_dir)
        if trial_data:
            trials_dict[trial_id] = trial_data
            new_count += 1

    # Avoid rewriting the cache file when nothing changed.
    if new_count > 0:
        with open(cache_path, 'w', encoding="utf-8") as f:
            json.dump(trials_dict, f, indent=2)

    return trials_dict, pd.DataFrame(list(trials_dict.values()))

# Load (or refresh from cache) every trial using the configured default directories,
# then persist the flat per-trial table for inspection.
trials_dict, trials_df = extract_all_trials()
trials_df.to_csv(f"{RESULTS_DIR}/trials_raw.csv", index=False)


In [None]:
def calculate_model_task_performance(trials_df):
    """Summarise trials into one row per (model_name, task_name) pair.

    Rows sharing a ``trial_id`` are collapsed to their first occurrence
    before grouping. For each pair the result carries the trial count, the
    lists of trial ids / names / execution and setup times, the number of
    successful trials, the ``success_rate``, a ``resolves_task`` flag (true
    when strictly more than half of the trials succeeded) and the ids of
    the successful trials.
    """
    unique_trials = trials_df.drop_duplicates('trial_id')

    aggregations = {
        'trial_id': ['count', list],
        'trial_name': list,
        'success': ['sum', list],
        'agent_execution_time_sec': list,
        'agent_setup_time_sec': list,
    }
    per_pair = (
        unique_trials
        .groupby(['model_name', 'task_name'])
        .agg(aggregations)
        .reset_index()
    )

    # Flatten the two-level column index; order follows ``aggregations``.
    per_pair.columns = [
        'model_name', 'task_name',
        'total_trials', 'trial_ids',
        'trial_names',
        'successful_trials', 'success_list',
        'execution_times_sec',
        'setup_times_sec',
    ]

    per_pair['success_rate'] = per_pair['successful_trials'].div(per_pair['total_trials'])
    # Strict majority: exactly half of the trials succeeding does not count.
    per_pair['resolves_task'] = per_pair['successful_trials'].gt(per_pair['total_trials'] / 2)

    def _winning_ids(row):
        paired = zip(row['trial_ids'], row['success_list'])
        return [tid for tid, ok in paired if ok]

    per_pair['successful_trial_ids'] = per_pair.apply(_winning_ids, axis=1)

    # ``success_list`` was only needed to derive ``successful_trial_ids``.
    return per_pair.drop(columns=['success_list'])

# Aggregate trials to one row per (model, task) and save the table.
performance_df = calculate_model_task_performance(trials_df)
performance_df.to_csv(f"{RESULTS_DIR}/model_task_performance.csv", index=False)


In [None]:
def calculate_task_resolution(performance_df):
    """Roll per-(model, task) rows up to one row per task.

    For every ``task_name`` the result records how many models were tested,
    how many of them have ``resolves_task`` set, which models fall on either
    side, and the ``model_resolve_rate`` (resolving models / tested models).
    """
    per_task = (
        performance_df
        .groupby('task_name')
        .agg({
            'model_name': list,
            'resolves_task': ['count', 'sum', list],
        })
        .reset_index()
    )
    # Flatten the two-level column index produced by the multi-function agg.
    per_task.columns = [
        'task_name',
        'all_models',
        'total_models_tested', 'models_that_resolve', 'resolve_list',
    ]

    def _models_with_outcome(row, resolved):
        # Pair each model with its own resolves flag and keep one side.
        paired = zip(row['all_models'], row['resolve_list'])
        return [name for name, flag in paired if bool(flag) == resolved]

    per_task['resolving_models'] = per_task.apply(
        lambda row: _models_with_outcome(row, True), axis=1
    )
    per_task['non_resolving_models'] = per_task.apply(
        lambda row: _models_with_outcome(row, False), axis=1
    )

    per_task['model_resolve_rate'] = per_task['models_that_resolve'].div(
        per_task['total_models_tested']
    )

    # The paired helper lists are intermediate only.
    return per_task.drop(columns=['all_models', 'resolve_list'])

# Aggregate to one row per task and save the table.
resolution_df = calculate_task_resolution(performance_df)
resolution_df.to_csv(f"{RESULTS_DIR}/task_resolution.csv", index=False)


In [None]:
def classify_task_difficulty(resolution_df, easy_threshold=0.667, medium_threshold=0.333):
    """Label each task easy / medium / hard from its model resolve rate.

    Args:
        resolution_df: DataFrame with at least the columns ``task_name``,
            ``model_resolve_rate``, ``total_models_tested``,
            ``models_that_resolve``, ``resolving_models`` and
            ``non_resolving_models``. It is not modified.
        easy_threshold: Minimum resolve rate (inclusive) for ``'easy'``.
        medium_threshold: Minimum resolve rate (inclusive) for ``'medium'``;
            anything below is ``'hard'``.

    Returns:
        A new DataFrame with the columns listed above plus
        ``model_difficulty`` and a ``human_difficulty`` placeholder column
        filled with ``None``.

    Raises:
        ValueError: If ``medium_threshold`` is greater than
            ``easy_threshold``.
    """
    if medium_threshold > easy_threshold:
        raise ValueError(
            "medium_threshold must not exceed easy_threshold "
            f"(got {medium_threshold} > {easy_threshold})"
        )

    def classify_difficulty(resolve_rate):
        if resolve_rate >= easy_threshold:
            return 'easy'
        elif resolve_rate >= medium_threshold:
            return 'medium'
        else:
            # Also reached for NaN, since every comparison with NaN is False.
            return 'hard'

    # Work on a copy so the caller's frame is left untouched.
    difficulty_df = resolution_df.copy()
    difficulty_df['model_difficulty'] = difficulty_df['model_resolve_rate'].apply(classify_difficulty)
    # Placeholder column; the file fills it in later from the human labels.
    difficulty_df['human_difficulty'] = None

    cols = ['task_name', 'model_resolve_rate', 'model_difficulty', 'human_difficulty',
            'total_models_tested', 'models_that_resolve', 'resolving_models', 'non_resolving_models']
    difficulty_df = difficulty_df[cols]

    return difficulty_df

# Classify every task by empirical difficulty and save the table
# (human_difficulty is still an empty placeholder at this point).
difficulty_df = classify_task_difficulty(resolution_df)
difficulty_df.to_csv(f"{RESULTS_DIR}/task_difficulty.csv", index=False)


In [None]:
def _scan_task_difficulties(tasks_dir, config_filename, parse_config, read_difficulty):
    """Collect ``{task directory name: lowercase difficulty}`` under *tasks_dir*.

    Args:
        tasks_dir: ``pathlib.Path`` whose immediate sub-directories are tasks.
        config_filename: Name of the config file expected in each task dir.
        parse_config: Callable taking the open text file and returning the
            parsed config.
        read_difficulty: Callable taking the parsed config dict and
            returning the difficulty value.

    Returns:
        A dict of labels. Tasks without the config file, with an unparsable
        file, or with a missing / non-string / ``'unknown'`` difficulty are
        left out. An empty dict is returned when *tasks_dir* does not exist.
    """
    import warnings  # local import: only this helper's error path needs it

    labels = {}
    if not tasks_dir.exists():
        return labels

    for task_dir in tasks_dir.iterdir():
        if not task_dir.is_dir():
            continue
        config_path = task_dir / config_filename
        if not config_path.exists():
            continue
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                config = parse_config(f)
        except Exception as exc:
            # Best effort: one broken task file must not abort the scan, but
            # it is reported instead of being silently ignored. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt / SystemExit through.
            warnings.warn(f"Skipping {config_path}: {exc}")
            continue

        # Explicit shape checks replace the former reliance on swallowed
        # AttributeErrors for empty or oddly shaped configs.
        difficulty = read_difficulty(config) if isinstance(config, dict) else None
        if isinstance(difficulty, str) and difficulty != 'unknown':
            labels[task_dir.name] = difficulty.lower()

    return labels


def load_human_difficulty_labels(terminal_bench_2_tasks=None, terminal_bench_tasks=None):
    """Load human-assigned difficulty labels keyed by task directory name.

    Two task layouts are read: ``task.toml`` files (difficulty under
    ``metadata.difficulty``) and ``task.yaml`` files (top-level
    ``difficulty``). When a task name appears in both, the TOML label wins.

    Args:
        terminal_bench_2_tasks: Optional path to the ``tasks`` directory
            containing ``task.toml`` files. Defaults to the placeholder path
            below.
        terminal_bench_tasks: Optional path to the ``tasks`` directory
            containing ``task.yaml`` files. Defaults to the placeholder path
            below.

    Returns:
        ``{task_name: difficulty}`` with difficulties lower-cased. Empty
        when neither directory exists.
    """
    if terminal_bench_2_tasks is None:
        # TODO: Set path to your terminal-bench-2.0-dataset repository
        terminal_bench_2_tasks = Path.home() / "path/to/terminal-bench-2.0-dataset" / "tasks"
    if terminal_bench_tasks is None:
        # TODO: Set path to your terminal-bench repository
        terminal_bench_tasks = Path.home() / "path/to/terminal-bench" / "tasks"

    def toml_difficulty(config):
        metadata = config.get('metadata', {})
        if not isinstance(metadata, dict):
            return None
        return metadata.get('difficulty', 'unknown')

    def yaml_difficulty(config):
        return config.get('difficulty', 'unknown')

    # The parsers are wrapped in lambdas so ``toml`` / ``yaml`` are only
    # looked up when a config file is actually parsed.
    human_labels = _scan_task_difficulties(
        Path(terminal_bench_2_tasks), "task.toml",
        lambda f: toml.load(f), toml_difficulty,
    )
    yaml_labels = _scan_task_difficulties(
        Path(terminal_bench_tasks), "task.yaml",
        lambda f: yaml.safe_load(f), yaml_difficulty,
    )

    # TOML labels take precedence; YAML only fills in tasks not seen yet.
    for task_name, difficulty in yaml_labels.items():
        human_labels.setdefault(task_name, difficulty)

    return human_labels

def create_confusion_matrix(difficulty_df):
    """Cross-tabulate human difficulty labels against empirical difficulty.

    Args:
        difficulty_df: DataFrame with ``task_name`` and ``model_difficulty``
            columns. It is not modified; labelling happens on a copy, so the
            caller's ``human_difficulty`` column is no longer overwritten.

    Returns:
        A DataFrame of task counts with rows ``['hard', 'medium']`` (human
        label) and columns ``['easy', 'medium', 'hard']`` (empirical label),
        zero-filled where a combination does not occur. An empty DataFrame
        is returned when no task has a human label.
    """
    human_labels = load_human_difficulty_labels()

    # ``assign`` returns a new frame, leaving the argument untouched.
    labelled = difficulty_df.assign(
        human_difficulty=difficulty_df['task_name'].apply(
            lambda task_name: human_labels.get(task_name, 'unknown')
        )
    )
    labelled = labelled[labelled['human_difficulty'] != 'unknown']

    if labelled.empty:
        return pd.DataFrame()

    confusion = pd.crosstab(
        labelled['human_difficulty'],
        labelled['model_difficulty'],
        margins=False
    )

    human_categories = ['hard', 'medium']
    empirical_categories = ['easy', 'medium', 'hard']

    # Fix row/column order and zero-fill combinations that never occurred.
    # Human labels outside ``human_categories`` are dropped by the reindex.
    return confusion.reindex(
        index=human_categories,
        columns=empirical_categories,
        fill_value=0
    )

def plot_compact_heatmap(confusion_matrix, output_base, normalize=False):
    """Draw the difficulty matrix as an annotated heatmap, save it, and show it.

    Args:
        confusion_matrix: DataFrame of task counts; index labels go on the
            y-axis and column labels on the x-axis.
        output_base: Output path without extension; ``".png"`` is appended.
        normalize: When true, every row is displayed as percentages of its
            own row total instead of raw counts.

    Returns:
        The ``confusion_matrix`` argument, unchanged.
    """
    plt.rcParams['figure.dpi'] = 300
    plt.rcParams['savefig.dpi'] = 300

    fig, ax = plt.subplots(figsize=(4.2, 3.6))

    if normalize:
        # A row with no tasks divides 0 by 0 and yields NaN. Those cells are
        # shown as 0% so they neither print "nan%" nor poison the max() used
        # for the text-colour threshold below.
        row_totals = confusion_matrix.sum(axis=1)
        display_matrix = confusion_matrix.div(row_totals, axis=0).fillna(0) * 100
        colorbar_label = 'Percentage of Tasks'
        value_format = '.1f'
        value_suffix = '%'
    else:
        display_matrix = confusion_matrix
        colorbar_label = 'Number of Tasks'
        value_format = 'd'
        value_suffix = ''
    plot_data = display_matrix.values

    colors = ['#f7f7f7', '#c6dbef', '#6baed6', '#3182bd', '#08519c']
    cmap = mcolors.LinearSegmentedColormap.from_list('custom_blues', colors, N=256)

    im = ax.imshow(plot_data, cmap=cmap, aspect='equal')

    cbar = plt.colorbar(im, ax=ax, shrink=0.5, pad=0.05)
    cbar.set_label(colorbar_label, rotation=90, labelpad=10, fontsize=10)
    cbar.ax.tick_params(labelsize=9)

    ax.set_xticks(range(len(confusion_matrix.columns)))
    ax.set_yticks(range(len(confusion_matrix.index)))
    ax.set_xticklabels([col.capitalize() for col in confusion_matrix.columns], fontsize=11)
    ax.set_yticklabels([idx.capitalize() for idx in confusion_matrix.index], fontsize=11)

    # Cells at or above 60% of the largest value get white text for contrast
    # against the dark end of the colormap. With an all-zero matrix every
    # cell sits on the lightest colour, so the text stays black.
    peak = plot_data.max()
    for i in range(len(confusion_matrix.index)):
        for j in range(len(confusion_matrix.columns)):
            value = display_matrix.iloc[i, j]
            display_value = f"{value:{value_format}}{value_suffix}"
            text_color = 'white' if peak > 0 and value >= peak * 0.6 else 'black'
            ax.text(j, i, display_value,
                    ha="center", va="center",
                    color=text_color,
                    fontsize=12,
                    fontweight='bold')

    ax.set_title('Task Difficulty Matrix', fontsize=12, pad=15, fontweight='bold')
    ax.set_xlabel('Empirical Difficulty', fontsize=10, fontweight='bold', labelpad=8)
    ax.set_ylabel('Human-Predicted Difficulty', fontsize=10, fontweight='bold', labelpad=8)

    # Minor ticks on the cell boundaries let the white grid separate cells.
    ax.set_xticks(np.arange(len(confusion_matrix.columns)) - 0.5, minor=True)
    ax.set_yticks(np.arange(len(confusion_matrix.index)) - 0.5, minor=True)
    ax.grid(which="minor", color="white", linestyle='-', linewidth=2)
    ax.tick_params(which="minor", size=0)

    for spine in ax.spines.values():
        spine.set_visible(False)

    fig.tight_layout(pad=0.8)

    output_file = f"{output_base}.png"
    fig.savefig(output_file,
                dpi=300,
                bbox_inches='tight',
                pad_inches=0.05,
                facecolor='white',
                edgecolor='none')

    plt.show()
    return confusion_matrix

# Update CSV with human difficulty labels
def load_human_labels_for_csv():
    """Return the human difficulty labels used to fill the CSV column.

    This function used to be a line-for-line copy of
    ``load_human_difficulty_labels`` (same directories, same ``task.toml``
    and ``task.yaml`` parsing, same precedence). It now delegates to it, so
    the dataset paths only have to be configured in one place.

    Returns:
        ``{task_name: difficulty}`` as produced by
        ``load_human_difficulty_labels``.
    """
    return load_human_difficulty_labels()

# Update difficulty_df with human labels
human_labels_for_csv = load_human_labels_for_csv()
# Series.map with a dict leaves tasks that have no human label as NaN.
difficulty_df['human_difficulty'] = difficulty_df['task_name'].map(human_labels_for_csv)
# Overwrites the task_difficulty.csv written earlier, now with human labels filled in.
difficulty_df.to_csv(f"{RESULTS_DIR}/task_difficulty.csv", index=False)

# Build the human-vs-empirical matrix; plot the row-normalized heatmap only when it is non-empty.
confusion = create_confusion_matrix(difficulty_df)
if len(confusion) > 0:
    plot_compact_heatmap(confusion, f"{RESULTS_DIR}/difficulty_heatmap_normalized", normalize=True)
