In [11]:
import os

import pandas as pd
from dotenv import load_dotenv

# Load settings from a local .env file into the process environment so the
# SANDBOXES_POSTGRES_* variables read further down are available.
load_dotenv()

True

In [12]:
# Postgres connection settings. Every variable is mandatory: a missing one
# raises KeyError here rather than producing a half-built connection URL later.
db_user, db_password, db_host, db_port, db_name = (
    os.environ["SANDBOXES_POSTGRES_USER"],
    os.environ["SANDBOXES_POSTGRES_PASSWORD"],
    os.environ["SANDBOXES_POSTGRES_HOST"],
    os.environ["SANDBOXES_POSTGRES_PORT"],
    os.environ["SANDBOXES_POSTGRES_NAME"],
)

In [13]:
from matplotlib import pyplot as plt
from matplotlib import rcParams
from matplotlib_inline.backend_inline import set_matplotlib_formats

# Notebook-wide plotting defaults: 8x5 inch figures at 100 dpi, rendered
# through the inline backend's "retina" format, drawn with the ggplot style.
rcParams.update(
    {
        "figure.figsize": (8, 5),
        "figure.dpi": 100,
    }
)
set_matplotlib_formats("retina")
plt.style.use("ggplot")

In [14]:
# Agent identifier -> human-readable label used in tables and figures.
# NOTE(review): the query output further down shows "Terminus 2" as the
# database display name for terminus-2, while this map says "Terminus" —
# confirm which label is intended.
agent_name_map = dict(
    [
        ("gemini-cli", "Gemini CLI"),
        ("claude-code", "Claude Code"),
        ("codex", "Codex CLI"),
        ("openhands", "OpenHands"),
        ("mini-swe-agent", "Mini-SWE-Agent"),
        ("terminus-2", "Terminus"),
    ]
)

# Model identifier (as recorded with each trial) -> human-readable label.
# Keys are matched exactly, so their casing matters.
# NOTE(review): "openai/gpt-oss-120b" and "OpenAI/gpt-oss-20B" use different
# casing conventions — presumably mirroring the stored names; verify.
model_name_map = dict(
    [
        ("claude-sonnet-4-20250514", "Claude Sonnet 4"),
        ("claude-opus-4-1-20250805", "Claude Opus 4.1"),
        ("gpt-5", "GPT-5"),
        ("gpt-5-mini", "GPT-5-Mini"),
        ("gpt-5-nano", "GPT-5-Nano"),
        ("grok-4-0709", "Grok 4"),
        ("grok-code-fast-1", "Grok Code Fast 1"),
        ("gemini-2.5-pro", "Gemini 2.5 Pro"),
        ("gemini-2.5-flash", "Gemini 2.5 Flash"),
        ("Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8", "Qwen 3 Coder 480B"),
        ("openai/gpt-oss-120b", "GPT-OSS 120B"),
        ("OpenAI/gpt-oss-20B", "GPT-OSS 20B"),
        ("moonshotai/Kimi-K2-Instruct-0905", "Kimi K2"),
        ("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama 4 Maverick 17B"),
        ("zai-org/GLM-4.5-Air-FP8", "GLM 4.5 Air"),
        ("deepseek-ai/DeepSeek-V3.1", "DeepSeek V3.1"),
    ]
)

In [15]:
# Load the SQL cell-magic extension, then open a Postgres connection built
# from the db_* settings read from the environment earlier in the notebook.
%load_ext sql
# NOTE(review): the credentials are interpolated into the URL as-is; a password
# containing URL-reserved characters presumably needs percent-encoding — confirm.
%sql postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [24]:
%%sql hero_table <<
with p_hats as (
    -- One row per agent, model and task (see the group by closing this CTE).
    -- Each row aggregates every trial that survives the where clause below.
    select agent_name,
        model_name,
        m.display_name as model_display_name,
        a.display_name as agent_display_name,
        task.name as task_name,
        -- Mean reward for the task, with a null reward counted as 0.
        avg(coalesce(reward, 0)) as p_hat,
        count(*) as n_trials,
        -- Mean length of the api_request_times_msec array in agent_metadata.
        avg(
            jsonb_array_length(agent_metadata->'api_request_times_msec')
        ) as avg_api_calls,
        -- Number of trials in the group that carry any exception_info.
        sum(
            case
                when exception_info is null then 0
                else 1
            end
        ) as n_errors,
        -- p_hat * (1 - p_hat) / (n_trials - 1), left null for a task with a
        -- single trial. The outer query combines these into stderr and stddev.
        case
            when count(*) > 1 then avg(coalesce(reward, 0)) * (1 - avg(coalesce(reward, 0))) / (count(*) - 1)
            else null
        end as partial_var,
        avg(n_input_tokens) as avg_n_input_tokens,
        avg(n_output_tokens) as avg_n_output_tokens,
        -- Token counts scaled to millions and multiplied by the per-million
        -- cent prices stored on the model row, averaged over the trials.
        avg(
            n_input_tokens / 1000000.0 * m.cents_per_million_input_tokens + n_output_tokens / 1000000.0 * m.cents_per_million_output_tokens
        ) as avg_cost_cents,
        -- Mean agent execution duration in seconds.
        avg(
            extract(
                epoch
                from (
                        agent_execution_ended_at - agent_execution_started_at
                    )
            )
        ) as avg_execution_time_seconds
    from trial as t
        inner join dataset_task as dt on dt.task_checksum = t.task_checksum
        inner join task on task.checksum = dt.task_checksum
        inner join trial_model as tm on tm.trial_id = t.id
        inner join model as m on m.name = tm.model_name
        -- The provider predicate on the second line below is written inside the
        -- ON clause of the agent join. Because every join here is an inner join
        -- it filters the same rows as it would if attached to the model join.
        inner join agent as a on a.name = t.agent_name and a.version = t.agent_version
        and m.provider = tm.model_provider
        inner join job as j on j.id = t.job_id
    -- Restrict to terminal-bench 2.0 and to jobs created on or before a fixed
    -- cutoff timestamp.
    where dataset_name = 'terminal-bench'
        and dataset_version = '2.0'
         AND j.created_at <= '2025-11-04 18:58:26.409036+00'::timestamptz
        -- Keep trials without an exception, plus trials whose exception is one
        -- of the two timeout types. As a result n_errors only counts timeouts.
        and (
            exception_info is null
            or exception_info->>'exception_type' in (
                'AgentTimeoutError',
                'VerifierTimeoutError'
            )
        )
    group by agent_name,
        model_name,
        task_name,
        model_display_name,
        agent_display_name
)
-- Roll the per-task rows up to one row per agent and model.
select agent_name,
    model_name,
    model_display_name,
    agent_display_name,
    -- Unweighted mean of the per-task p_hat values.
    avg(p_hat) as accuracy,
    -- Share of all counted trials that carry exception_info.
    sum(n_errors) / sum(n_trials) as error_probability,
    -- The total_* columns are sums across tasks of the per-task averages.
    sum(avg_api_calls) as total_avg_api_calls,
    sum(avg_n_input_tokens) as total_avg_n_input_tokens,
    sum(avg_n_output_tokens) as total_avg_n_output_tokens,
    sum(avg_cost_cents) / 100.0 as total_avg_cost_usd,
    avg(avg_execution_time_seconds) as avg_execution_time_sec,
    -- count(*) > count(partial_var) is true when at least one task row has a
    -- null partial_var. In that case stderr and stddev are reported as null.
    -- Otherwise stderr is sqrt(sum of partial_var) divided by the row count.
    case
        when count(*) > count(partial_var) then null
        else sqrt(sum(partial_var)) / count(*)
    end as stderr,
    -- Same shape as stderr, with each partial_var scaled by its n_trials.
    case
        when count(*) > count(partial_var) then null
        else sqrt(sum(partial_var * n_trials)) / count(*)
    end as stddev,
    count(distinct task_name) as n_tasks
from p_hats
group by agent_name,
    model_name,
    model_display_name,
    agent_display_name
-- Drop agent and model pairs whose accuracy is not above 0.01.
having avg(p_hat) > 0.01
order by accuracy desc;

 * postgresql://postgres:***@db.jccajjvblmajkbwqsmaz.supabase.co:5432/postgres
51 rows affected.
Returning data to local variable hero_table


In [25]:
# Convert the result set captured in `hero_table` by the SQL cell into a
# pandas DataFrame; later cells derive the display table from `df`.
df = hero_table.DataFrame()

# Preview the first rows to sanity-check the query output.
df.head()

Unnamed: 0,agent_name,model_name,model_display_name,agent_display_name,accuracy,error_probability,total_avg_api_calls,total_avg_n_input_tokens,total_avg_n_output_tokens,total_avg_cost_usd,avg_execution_time_sec,stderr,stddev,n_tasks
0,codex,gpt-5,GPT-5,Codex CLI,0.496067415730337,0.1412300683371298,,38620547.18333333,829270.1,56.568384979166666,531.9368644117977,0.0147821504658171,0.032532569083489,89
1,codex,gpt-5-codex,GPT-5-Codex,Codex CLI,0.4432584269662921,0.2729357798165137,,40012749.633333325,766034.6166666666,57.67628320833333,872.0856386005619,0.0138031623306145,0.0303925142002522,89
2,terminus-2,gpt-5-codex,GPT-5-Codex,Terminus 2,0.4342696629213483,0.3226544622425629,1850.5,30400573.45,2274666.75,60.7473843125,929.8529940971911,0.0146606610682021,0.0326616670870755,89
3,openhands,gpt-5,GPT-5,OpenHands,0.4337078651685393,0.3377777777777777,,38057575.55,3644808.7833333327,84.02005727083333,943.0773217284643,0.015222772608915,0.0343264097000437,89
4,terminus-2,claude-sonnet-4-5-20250929,Claude Sonnet 4.5,Terminus 2,0.4280898876404494,0.2136363636363636,2739.95,77046379.85,1094753.65,247.5604443,720.2923860713483,0.0142272808338745,0.0316473860288803,89


In [26]:
# Work on a copy of the query result and append the two label columns of the
# presentation table, taken from the display names returned by the query.
table_df = df.assign(
    **{
        "Model Name": df["model_display_name"],
        "Agent Name": df["agent_display_name"],
    }
)


def format_accuracy_with_stderr(row, z=1.96):
    """Format a row's accuracy as a LaTeX-escaped percentage with an error margin.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing
            ``accuracy`` and ``stderr`` entries as fractions. Any value that
            ``float()`` accepts works, including ``Decimal``.
        z: Multiplier applied to the standard error to obtain the margin.
            Defaults to 1.96, the normal critical value for a two-sided
            95% interval.

    Returns:
        ``"<acc>\\% ± <margin>"`` with both numbers rounded to whole
        percentage points, or just ``"<acc>\\%"`` when ``stderr`` is missing
        (``None``, NaN or ``pd.NA``).

    Raises:
        TypeError, ValueError: If ``accuracy`` or a non-missing ``stderr``
            cannot be converted to ``float``.
    """
    accuracy_pct = float(row["accuracy"]) * 100
    # The percent sign is pre-escaped because the table is exported to LaTeX.
    label = f"{accuracy_pct:.0f}\\%"

    raw_stderr = row["stderr"]
    # The query reports a null stderr when some task has only one trial;
    # show the accuracy on its own instead of failing on the conversion.
    if pd.isna(raw_stderr):
        return label

    margin_pct = float(raw_stderr) * 100 * z
    return f"{label} ± {margin_pct:.0f}"


def _as_millions(token_count):
    """Render a raw token count in millions with one decimal and an ``M`` suffix."""
    return f"{token_count / 1e6:.1f}M"


# Headline metric: accuracy with its error margin, formatted row by row.
table_df["Resolution Rate"] = table_df.apply(format_accuracy_with_stderr, axis=1)

# Token totals are coerced to numeric first (unparseable entries become NaN)
# so that they can be scaled before formatting.
input_tokens = pd.to_numeric(table_df["total_avg_n_input_tokens"], errors="coerce")
output_tokens = pd.to_numeric(table_df["total_avg_n_output_tokens"], errors="coerce")
table_df["Input Tokens"] = input_tokens.map(_as_millions)
table_df["Output Tokens"] = output_tokens.map(_as_millions)

# Disabled: optional run-time column, kept for reference.
# table_df["Agent Run Time (Minutes)"] = (
#     pd.to_numeric(table_df["avg_execution_time_sec"], errors="coerce") / 60
# ).apply(lambda x: f"{x:.2f}")

# Keep only the presentation columns by removing every column that came
# straight from the query result.
table_df = table_df.drop(columns=df.columns)

In [28]:
# Show at most the first 50 rows of the presentation table.
table_df.head(50)

Unnamed: 0,Model Name,Agent Name,Resolution Rate,Input Tokens,Output Tokens
0,GPT-5,Codex CLI,50\% ± 3,38.6M,0.8M
1,GPT-5-Codex,Codex CLI,44\% ± 3,40.0M,0.8M
2,GPT-5-Codex,Terminus 2,43\% ± 3,30.4M,2.3M
3,GPT-5,OpenHands,43\% ± 3,38.1M,3.6M
4,Claude Sonnet 4.5,Terminus 2,43\% ± 3,77.0M,1.1M
5,Claude Sonnet 4.5,Mini-SWE-Agent,43\% ± 3,102.5M,1.4M
6,Claude Sonnet 4.5,OpenHands,42\% ± 3,135.4M,1.4M
7,GPT-5-Codex,Mini-SWE-Agent,41\% ± 3,31.7M,2.7M
8,Claude Sonnet 4.5,Claude Code,40\% ± 3,200.2M,0.1M
9,Claude Opus 4.1,Terminus 2,38\% ± 3,47.7M,0.9M


In [29]:
# Emit the same top-50 slice as a LaTeX tabular, without the index column.
print(table_df.head(50).to_latex(index=False))

\begin{tabular}{lllll}
\toprule
Model Name & Agent Name & Resolution Rate & Input Tokens & Output Tokens \\
\midrule
GPT-5 & Codex CLI & 50\% ± 3 & 38.6M & 0.8M \\
GPT-5-Codex & Codex CLI & 44\% ± 3 & 40.0M & 0.8M \\
GPT-5-Codex & Terminus 2 & 43\% ± 3 & 30.4M & 2.3M \\
GPT-5 & OpenHands & 43\% ± 3 & 38.1M & 3.6M \\
Claude Sonnet 4.5 & Terminus 2 & 43\% ± 3 & 77.0M & 1.1M \\
Claude Sonnet 4.5 & Mini-SWE-Agent & 43\% ± 3 & 102.5M & 1.4M \\
Claude Sonnet 4.5 & OpenHands & 42\% ± 3 & 135.4M & 1.4M \\
GPT-5-Codex & Mini-SWE-Agent & 41\% ± 3 & 31.7M & 2.7M \\
Claude Sonnet 4.5 & Claude Code & 40\% ± 3 & 200.2M & 0.1M \\
Claude Opus 4.1 & Terminus 2 & 38\% ± 3 & 47.7M & 0.9M \\
Claude Opus 4.1 & OpenHands & 36\% ± 3 & 110.1M & 1.3M \\
GPT-5 & Terminus 2 & 35\% ± 3 & 15.4M & 2.1M \\
Claude Opus 4.1 & Mini-SWE-Agent & 35\% ± 3 & 45.6M & 0.9M \\
Claude Opus 4.1 & Claude Code & 35\% ± 3 & 171.1M & 0.3M \\
GPT-5 & Mini-SWE-Agent & 34\% ± 3 & 11.4M & 2.7M \\
Gemini 2.5 Pro & Terminus 2 & 33\% ± 3 