In [None]:
import os

from dotenv import load_dotenv

# Load settings via python-dotenv before any os.environ lookups run.
# Presumably this reads a local .env file holding the SANDBOXES_POSTGRES_*
# values used by the next cell -- confirm where that file is expected to live.
load_dotenv()

True

In [2]:
# Postgres connection settings, read from the process environment.
# Indexing os.environ directly (rather than .get) makes a missing variable
# fail here with a KeyError naming the absent key.
db_user, db_password, db_host, db_port, db_name = (
    os.environ[env_key]
    for env_key in (
        "SANDBOXES_POSTGRES_USER",
        "SANDBOXES_POSTGRES_PASSWORD",
        "SANDBOXES_POSTGRES_HOST",
        "SANDBOXES_POSTGRES_PORT",
        "SANDBOXES_POSTGRES_NAME",
    )
)

In [3]:
from matplotlib import pyplot as plt
from matplotlib import rcParams
from matplotlib_inline.backend_inline import set_matplotlib_formats

# Notebook-wide plotting defaults: 8x5 inch figures at 100 dpi, rendered
# through the inline backend's "retina" format, using the ggplot style sheet.
rcParams.update({"figure.figsize": (8, 5), "figure.dpi": 100})
set_matplotlib_formats("retina")
plt.style.use("ggplot")

In [4]:
# Display names for agent identifiers. The keys are looked up against the
# agent_name column of the query result when the LaTeX table is built, so
# they must match the stored identifiers exactly.
agent_name_map = {
    "gemini-cli": "Gemini CLI",
    "claude-code": "Claude Code",
    "codex": "Codex CLI",
    "openhands": "OpenHands",
    "mini-swe-agent": "Mini-SWE-Agent",
    "terminus-2": "Terminus",
}

# Display names for model identifiers. The keys are looked up against the
# model_name column of the query result when the LaTeX table is built, so
# they must match the stored identifiers exactly (including any provider
# prefix and letter case).
# NOTE(review): key casing is inconsistent between "openai/gpt-oss-120b" and
# "OpenAI/gpt-oss-20B" -- confirm both match the values stored in the database.
model_name_map = {
    "claude-sonnet-4-20250514": "Claude Sonnet 4",
    "claude-opus-4-1-20250805": "Claude Opus 4.1",
    "gpt-5": "GPT-5",
    "gpt-5-mini": "GPT-5-Mini",
    "gpt-5-nano": "GPT-5-Nano",
    "grok-4-0709": "Grok 4",
    "grok-code-fast-1": "Grok Code Fast 1",
    "gemini-2.5-pro": "Gemini 2.5 Pro",
    "gemini-2.5-flash": "Gemini 2.5 Flash",
    "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": "Qwen 3 Coder 480B",
    "openai/gpt-oss-120b": "GPT-OSS 120B",
    "OpenAI/gpt-oss-20B": "GPT-OSS 20B",
    "moonshotai/Kimi-K2-Instruct-0905": "Kimi K2",
    "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": "Llama 4 Maverick 17B",
    "zai-org/GLM-4.5-Air-FP8": "GLM 4.5 Air",
    "deepseek-ai/DeepSeek-V3.1": "DeepSeek V3.1",
}

In [5]:
# Load the `sql` IPython extension, then open a Postgres connection whose URL
# is assembled from the db_* variables read from the environment earlier.
# NOTE(review): the password is placed into the URL as-is; if it can contain
# URL-reserved characters (such as "@" or "/") it presumably needs
# percent-encoding first -- confirm against the actual credential.
%load_ext sql
%sql postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}

In [20]:
%%sql missing_tokens <<

-- For each agent and model, count the tasks where none of the qualifying
-- trials recorded both an input and an output token count.
-- A trial qualifies when it was created on or after the cutoff timestamp and
-- either has no exception_info or failed with one of the two timeout errors.
-- NOTE(review) the outer GROUP BY also splits on agent_version and
-- model_provider, which are not selected, so one agent and model pair could
-- appear on more than one row. Confirm this is intended.
WITH per_task AS (
    SELECT
        t.agent_name,
        t.agent_version,
        tm.model_name,
        tm.model_provider,
        t.task_checksum,
        COUNT(*) AS total_trials_for_task,
        COUNT(*) FILTER (
            WHERE tm.n_input_tokens IS NOT NULL
              AND tm.n_output_tokens IS NOT NULL
        ) AS trials_with_non_null_tokens
    FROM trial AS t
    INNER JOIN trial_model AS tm
        ON tm.trial_id = t.id
    WHERE t.agent_name IN ('openhands', 'gemini-cli', 'mini-swe-agent')
      AND t.created_at >= '2025-09-17 01:13:33.950824+00'::timestamptz
      AND (
          t.exception_info IS NULL
          OR t.exception_info->>'exception_type' IN ('AgentTimeoutError', 'VerifierTimeoutError')
      )
    GROUP BY
        t.agent_name,
        t.agent_version,
        tm.model_name,
        tm.model_provider,
        t.task_checksum
)
SELECT
    agent_name,
    model_name,
    COUNT(*) FILTER (WHERE trials_with_non_null_tokens = 0) AS tasks_with_zero_non_null_token_trials
FROM per_task
GROUP BY
    agent_name,
    agent_version,
    model_name,
    model_provider
HAVING COUNT(*) FILTER (WHERE trials_with_non_null_tokens = 0) > 0
ORDER BY
    tasks_with_zero_non_null_token_trials DESC,
    agent_name,
    model_name,
    model_provider;

 * postgresql://postgres:***@db.jccajjvblmajkbwqsmaz.supabase.co:5432/postgres
19 rows affected.
Returning data to local variable missing_tokens


In [21]:
# Convert the result set that the %%sql cell stored in `missing_tokens`
# into a pandas DataFrame for post-processing.
df = missing_tokens.DataFrame()

In [25]:
# Build the presentation table: human-readable agent/model names plus the
# per-pair count, in that column order, leaving `df` untouched.
#
# Identifiers missing from the display-name maps fall back to the raw
# identifier instead of becoming None, so a newly added agent or model still
# shows up as a recognisable row in the LaTeX output rather than a blank one.
table_df = (
    df.assign(
        Agent=df["agent_name"].map(lambda raw: agent_name_map.get(raw, raw)),
        Model=df["model_name"].map(lambda raw: model_name_map.get(raw, raw)),
    )
    .rename(
        columns={
            "tasks_with_zero_non_null_token_trials": "Tasks Missing Token Counts"
        }
    )
    .loc[:, ["Agent", "Model", "Tasks Missing Token Counts"]]
)

In [28]:
# Emit the table as raw LaTeX source (without the index column); print() is
# used deliberately so the markup can be copied verbatim from the output.
print(table_df.to_latex(index=False))

\begin{tabular}{llr}
\toprule
Agent & Model & Tasks Missing Token Counts \\
\midrule
OpenHands & GPT-5-Nano & 24 \\
OpenHands & Grok Code Fast 1 & 23 \\
OpenHands & GPT-5-Mini & 17 \\
OpenHands & GPT-5 & 15 \\
Mini-SWE-Agent & Grok 4 & 10 \\
OpenHands & Grok 4 & 8 \\
Mini-SWE-Agent & GPT-5 & 6 \\
OpenHands & Kimi K2 & 6 \\
OpenHands & GLM 4.5 Air & 4 \\
Gemini CLI & Gemini 2.5 Pro & 2 \\
Mini-SWE-Agent & Gemini 2.5 Flash & 2 \\
Mini-SWE-Agent & GPT-5-Mini & 2 \\
Mini-SWE-Agent & Grok Code Fast 1 & 2 \\
Gemini CLI & Gemini 2.5 Flash & 1 \\
OpenHands & Claude Opus 4.1 & 1 \\
OpenHands & Claude Sonnet 4 & 1 \\
OpenHands & Gemini 2.5 Flash & 1 \\
OpenHands & Gemini 2.5 Pro & 1 \\
OpenHands & Qwen 3 Coder 480B & 1 \\
\bottomrule
\end{tabular}

