In [None]:
!pip install agent-diff langchain langchain-openai langchain-anthropic pandas matplotlib -q 

In [None]:
import os

#AGENT_DIFF_API_KEY = "" #Not needed for local server.
#AGENT_DIFF_BASE_URL = "" #Only needed if using remote agent-diff server, for local it automatically uses local server at 8000.

# Bearer token used for the OpenRouter chat-completions requests made by this notebook.
# Read it from the environment so the secret is never saved in the notebook itself;
# the default stays an empty string, exactly as before, when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

In [None]:
import json
from pathlib import Path

# Base directory holding the per-service example folders. This is a relative
# path, so it resolves against the kernel's working directory (normally the
# folder containing this notebook).
DOCS_BASE = Path("../../examples")

def load_api_docs(filepath: Path) -> dict:
    """Load an API docs JSON file.

    Args:
        filepath: Location of the JSON documentation file.

    Returns:
        The parsed JSON content, or an empty dict (after printing a notice)
        when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    if not filepath.exists():
        print(f"Docs not found: {filepath}")
        return {}

    # The context manager closes the handle deterministically (the previous
    # json.load(open(...)) left it to the garbage collector). JSON is UTF-8 by
    # specification, so do not depend on the platform's default encoding.
    with open(filepath, encoding="utf-8") as docs_file:
        return json.load(docs_file)

def format_docs_markdown(docs: dict) -> str:
    """Convert an API docs dict to a markdown string.

    Each top-level key becomes a ``## <endpoint>`` section followed by its
    description. If the endpoint declares parameters, they are listed under a
    ``**Parameters:**`` heading, grouped by location.

    Values that are not shaped as expected (an endpoint entry, a parameter
    container, a location group or a single parameter that is not a dict) are
    rendered as plain text instead of raising.

    Args:
        docs: Mapping of endpoint name to its documentation record.

    Returns:
        The markdown text, or an empty string when ``docs`` is empty/None.
    """
    if not docs:
        return ""

    # Collect fragments and join once instead of repeatedly growing a string.
    chunks = []
    for endpoint, info in docs.items():
        chunks.append(f"## {endpoint}\n")

        if not isinstance(info, dict):
            # Unstructured entry (e.g. a bare description string): show it as-is.
            description = "" if info is None else info
            chunks.append(f"{description}\n\n")
            continue

        chunks.append(f"{info.get('description', '')}\n\n")

        parameters = info.get('parameters')
        if not parameters:
            continue

        chunks.append("**Parameters:**\n")
        if not isinstance(parameters, dict):
            # Parameters given as a list/string rather than a location mapping.
            chunks.append(f"  {parameters}\n")
        else:
            for location, params in parameters.items():
                chunks.append(f"  {location}:\n")
                if not isinstance(params, dict):
                    chunks.append(f"    {params}\n")
                    continue
                for param_name, param_info in params.items():
                    if not isinstance(param_info, dict):
                        chunks.append(f"    - `{param_name}`: {param_info}\n")
                        continue
                    required = "**required**" if param_info.get('required') else "optional"
                    param_type = param_info.get('type', 'any')
                    param_desc = param_info.get('description', '')
                    chunks.append(f"    - `{param_name}` ({param_type}, {required}): {param_desc}\n")
        chunks.append("\n")

    return "".join(chunks)

# Read the raw endpoint documentation for every supported service.
# A missing file yields an empty dict, so the cells below still run.
slack_docs = load_api_docs(DOCS_BASE / "slack/testsuites/slack_docs/slack_api_full_docs.json")
box_docs = load_api_docs(DOCS_BASE / "box/testsuites/box_docs/box_api_full_docs.json")
calendar_docs = load_api_docs(DOCS_BASE / "calendar/testsuites/calendar_docs/calendar_api_full_docs.json")
linear_docs = load_api_docs(DOCS_BASE / "linear/testsuites/linear_docs/linear_api_full_docs.json")

# Render each service's docs as markdown for inclusion in the system prompt.
slack_docs_markdown, box_docs_markdown, calendar_docs_markdown, linear_docs_markdown = (
    format_docs_markdown(service_docs)
    for service_docs in (slack_docs, box_docs, calendar_docs, linear_docs)
)

# One status line per service: whether docs were found and how many endpoints they hold.
print("\n".join(
    f"[{'OK' if service_docs else 'MISSING'}] {label} docs: {len(service_docs)} endpoints"
    for label, service_docs in (
        ("Slack", slack_docs),
        ("Box", box_docs),
        ("Calendar", calendar_docs),
        ("Linear", linear_docs),
    )
))


In [None]:
import re
from typing import Optional, Tuple

# Service configurations with base URLs.
# Keys are lowercase service identifiers; build_system_prompt() looks a service
# up here (after lowercasing) and substitutes each entry's "name", "base_url"
# and "description" into the system prompt template.
SERVICE_CONFIG = {
    "slack": {
        "name": "Slack",
        "base_url": "https://slack.com/api",
        "description": "Slack workspace messaging and collaboration API",
    },
    "box": {
        "name": "Box",
        "base_url": "https://api.box.com/2.0",
        "description": "Box cloud storage and file management API",
    },
    "calendar": {
        "name": "Google Calendar",
        "base_url": "https://www.googleapis.com/calendar/v3",
        "description": "Google Calendar scheduling and events API",
    },
    "linear": {
        "name": "Linear",
        "base_url": "https://api.linear.app/graphql",
        "description": "Linear project management and issue tracking API",
    },
}

# ReAct system prompt template that embeds the API documentation.
# Rendered with str.format() in build_system_prompt(), which fills the
# {service_name}, {base_url}, {service_description} and {api_docs} placeholders.
# Because the text goes through str.format(), any literal brace added to it
# must be doubled ({{ or }}).
REACT_SYSTEM_PROMPT_WITH_API_DOCS = """You are an AI assistant that completes tasks by interacting with APIs via bash commands.

## Current Session
- **Service**: {service_name}
- **Base URL**: {base_url}
- **Description**: {service_description}

## Environment
- You are authenticated as a user in the {service_name} workspace/account.
- Authentication is handled automatically via proxy. Use placeholder tokens like `<TOKEN>` where credentials would go.
- You execute bash commands (primarily curl) to interact with the {service_name} API.
- The environment is stateless between commands - you cannot install packages or persist files.

## Response Format
You must respond using XML tags. Think step-by-step, then execute a command OR declare completion.

**To execute a bash command:**
<thinking>
Your reasoning about what needs to be done and why this command will help.
</thinking>

<action>
Your bash command here (e.g., curl request)
</action>

**When the task is complete:**
<thinking>
Your reasoning confirming the task is done based on API responses.
</thinking>

<done>
Brief summary of what was accomplished.
</done>

## Rules
1. Execute ONE command at a time, then wait for the result.
2. Parse API responses carefully - extract IDs and data needed for subsequent calls.
3. If a command fails, analyze the error and try a different approach.
4. Only use <done> when the task is fully completed (not just when you've gathered information).

## API Documentation
{api_docs}
"""

# ReAct system prompt template used when API documentation is NOT included.
# It has no {api_docs} placeholder and no "API Documentation" section; instead
# the Environment list carries an extra bullet telling the model to explore the
# API when unsure. Rendered with str.format() in build_system_prompt(), which
# fills {service_name}, {base_url} and {service_description}; any literal brace
# added to the text must therefore be doubled ({{ or }}).
REACT_SYSTEM_PROMPT = """You are an AI assistant that completes tasks by interacting with APIs via bash commands.

## Current Session
- **Service**: {service_name}
- **Base URL**: {base_url}
- **Description**: {service_description}

## Environment
- You are authenticated as a user in the {service_name} workspace/account.
- Authentication is handled automatically via proxy. Use placeholder tokens like `<TOKEN>` where credentials would go.
- You execute bash commands (primarily curl) to interact with the {service_name} API.
- If you are not sure how to use {service_name} API, explore the endpoint, parameters, and learn how it works.
- The environment is stateless between commands - you cannot install packages or persist files.

## Response Format
You must respond using XML tags. Think step-by-step, then execute a command OR declare completion.

**To execute a bash command:**
<thinking>
Your reasoning about what needs to be done and why this command will help.
</thinking>

<action>
Your bash command here (e.g., curl request)
</action>

**When the task is complete:**
<thinking>
Your reasoning confirming the task is done based on API responses.
</thinking>

<done>
Brief summary of what was accomplished.
</done>

## Rules
1. Execute ONE command at a time, then wait for the result.
2. Parse API responses carefully - extract IDs and data needed for subsequent calls.
3. If a command fails, analyze the error and try a different approach.
4. Only use <done> when the task is fully completed (not just when you've gathered information).

"""

# Function to build the full system prompt
def build_system_prompt(service: str, docs_markdown: str, include_api_docs: bool = True) -> str:
    """Render the ReAct system prompt for one service.

    The service is looked up case-insensitively in SERVICE_CONFIG. A service
    that is not listed there still gets a prompt, using the given name, the
    base URL "unknown" and a generic "<service> API" description.

    Args:
        service: Service name (slack, box, calendar, linear)
        docs_markdown: Formatted API documentation markdown; only used when
            include_api_docs is True
        include_api_docs: Selects the template with (True) or without (False)
            the API documentation section

    Returns:
        str: Complete system prompt
    """
    service_key = service.lower()
    if service_key in SERVICE_CONFIG:
        config = SERVICE_CONFIG[service_key]
    else:
        config = {
            "name": service,
            "base_url": "unknown",
            "description": f"{service} API",
        }

    # Placeholders shared by both templates.
    fields = {
        "service_name": config["name"],
        "base_url": config["base_url"],
        "service_description": config["description"],
    }

    if not include_api_docs:
        return REACT_SYSTEM_PROMPT.format(**fields)
    return REACT_SYSTEM_PROMPT_WITH_API_DOCS.format(api_docs=docs_markdown, **fields)



def parse_react_response(response: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Extract the ReAct XML sections from a model response.

    For each of <thinking>, <action> and <done>, the first occurrence of the
    tag pair is located (content may span multiple lines) and its content is
    returned with surrounding whitespace stripped; a tag that is absent yields
    None.

    Returns: (thinking, action, done)
    """
    patterns = (
        r'<thinking>(.*?)</thinking>',
        r'<action>(.*?)</action>',
        r'<done>(.*?)</done>',
    )

    extracted = []
    for pattern in patterns:
        found = re.search(pattern, response, re.DOTALL)
        extracted.append(None if found is None else found.group(1).strip())

    thinking, action, done = extracted
    return thinking, action, done


In [None]:
import asyncio
import time
import httpx
from pathlib import Path
from datetime import datetime
from typing import Any, List, Dict
from IPython.display import display, HTML, clear_output
import pandas as pd
from tqdm.auto import tqdm

from agent_diff import (
    AgentDiff,
    BashExecutorProxy,
)

# ============ Benchmark Configurations ============

# Per-service benchmark settings, keyed by lowercase service identifier.
# "test_suite_name" is the suite name looked up on the AgentDiff server and
# "docs_markdown" is the markdown rendering of that service's API docs that
# was produced in an earlier cell.
BENCHMARK_CONFIGS = {
    "slack": {
        "test_suite_name": "Slack Bench v2",
        "docs_markdown": slack_docs_markdown,
    },
    "box": {
        "test_suite_name": "Box Bench v2",
        "docs_markdown": box_docs_markdown,
    },
    "calendar": {
        "test_suite_name": "Calendar Bench",
        "docs_markdown": calendar_docs_markdown,
    },
    "linear": {
        "test_suite_name": "Linear Bench",
        "docs_markdown": linear_docs_markdown,
    },
}

def get_benchmark_config(service: str, include_api_docs: bool = True) -> dict:
    """Assemble the benchmark settings for one service.

    Args:
        service: Service name (matched case-insensitively against BENCHMARK_CONFIGS)
        include_api_docs: Whether to include API docs in the system prompt

    Returns:
        dict with the keys service (lowercased), test_suite_name,
        docs_markdown, include_api_docs and system_prompt.

    Raises:
        ValueError: If the service has no entry in BENCHMARK_CONFIGS.
    """
    service_key = service.lower()
    if service_key not in BENCHMARK_CONFIGS:
        raise ValueError(f"Unknown service: {service}. Available: {list(BENCHMARK_CONFIGS.keys())}")

    suite_name = BENCHMARK_CONFIGS[service_key]["test_suite_name"]
    docs_md = BENCHMARK_CONFIGS[service_key]["docs_markdown"]
    prompt = build_system_prompt(service_key, docs_md, include_api_docs)

    return dict(
        service=service_key,
        test_suite_name=suite_name,
        docs_markdown=docs_md,
        include_api_docs=include_api_docs,
        system_prompt=prompt,
    )

# ============ Output Directory ============

# Output directory, relative to the kernel's working directory. It is created
# when this cell runs; exist_ok=True makes re-running the cell harmless.
OUTPUT_DIR = Path("evaluation_outputs")
OUTPUT_DIR.mkdir(exist_ok=True)

# ============  ReAct Agent ============

def call_openrouter(
    model: str,
    messages: List[Dict],
    api_key: str,
) -> Dict:
    """Make a completion request to OpenRouter API (no tool calling).

    Args:
        model: Model identifier sent in the request body.
        messages: Chat messages ({"role": ..., "content": ...}) sent as-is.
        api_key: Bearer token placed in the Authorization header.

    Returns:
        dict: {
            "content": str,           # Model response text ("" if the API returned null)
            "finish_reason": str,     # "stop", "length", etc. (None if absent)
            "usage": {
                "prompt_tokens": int,
                "completion_tokens": int,
                "total_tokens": int,
                "cost": float,        # USD cost
            }
        }

    Raises:
        httpx.HTTPStatusError: If the API answers with a non-success status.
        RuntimeError: If the response body carries no completion choices
            (for example an error payload delivered with a success status).
    """
    with httpx.Client(timeout=120) as client:
        response = client.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": model,
                "messages": messages,
            },
        )
        response.raise_for_status()
        data = response.json()

    choices = data.get("choices") or []
    if not choices:
        # Report the API's own error payload rather than an opaque KeyError.
        raise RuntimeError(f"OpenRouter response contained no choices: {data.get('error', data)}")

    choice = choices[0]
    message = choice.get("message") or {}
    # "usage" and its fields may be missing or null; normalise to zeros so the
    # caller can always add them up.
    usage = data.get("usage") or {}

    return {
        # The message content is nullable in the API schema; callers run regex
        # parsing on it, so always hand back a string.
        "content": message.get("content") or "",
        "finish_reason": choice.get("finish_reason"),
        "usage": {
            "prompt_tokens": usage.get("prompt_tokens") or 0,
            "completion_tokens": usage.get("completion_tokens") or 0,
            "total_tokens": usage.get("total_tokens") or 0,
            "cost": usage.get("cost") or 0.0,
        }
    }


def run_react_agent(
    model_name: str,
    task_prompt: str,
    bash_executor: BashExecutorProxy,
    system_prompt: str,
    max_iterations: int = 25,
    trace_accumulator: Optional[Dict] = None,
) -> Dict:
    """
    Custom ReAct agent loop using XML tags.

    Each iteration asks the model for a response, parses its <thinking>,
    <action> and <done> sections, and then either executes the action through
    ``bash_executor`` and feeds the observation back, finishes (a <done> with
    no <action>), or asks the model to retry when neither tag is present.
    The loop stops on completion, on a model API error, or after
    ``max_iterations`` iterations.

    Args:
        model_name: Model identifier passed to call_openrouter.
        task_prompt: Task description sent as the first user message.
        bash_executor: Object whose ``execute(command)`` runs a bash command.
        system_prompt: System message for the conversation.
        max_iterations: Upper bound on loop iterations; values <= 0 run none.
        trace_accumulator: Optional dict that is updated in real time (steps,
            usage) so a caller can recover a partial trace, e.g. on timeout.

    Returns:
        dict with keys: steps (list), final (dict|None), iterations (int),
        completed (bool) and usage (total prompt_tokens, completion_tokens,
        total_tokens, cost).
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Task: {task_prompt}"},
    ]

    # Track total usage across all iterations
    total_usage = {
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
        "cost": 0.0,
    }

    # Use provided accumulator or create new one
    if trace_accumulator is not None:
        # Share the same list object so steps are visible to the caller immediately.
        steps = trace_accumulator.setdefault("steps", [])
        trace_accumulator["final"] = None
        trace_accumulator["completed"] = False
        trace_accumulator["usage"] = total_usage.copy()
    else:
        steps = []

    final_step = None
    completed = False
    # Initialised up front so the summary below is well-defined even when the
    # loop body never runs (max_iterations <= 0).
    iterations_run = 0

    for iterations_run in range(1, max_iterations + 1):
        # Get model response
        try:
            api_response = call_openrouter(
                model=model_name,
                messages=messages,
                api_key=OPENAI_API_KEY,
            )
            response_text = api_response["content"]
            finish_reason = api_response["finish_reason"]
            iter_usage = api_response["usage"]

            # Accumulate total usage
            for usage_key in total_usage:
                total_usage[usage_key] += iter_usage[usage_key]

            # Update accumulator in real-time
            if trace_accumulator is not None:
                trace_accumulator["usage"] = total_usage.copy()

        except Exception as e:
            steps.append({
                "iteration": iterations_run,
                "error": f"API error: {str(e)}",
            })
            break

        # Parse XML response
        thinking, action, done = parse_react_response(response_text)

        # If model includes both, execute the action and ignore premature done
        if action:
            # Execute bash command
            try:
                result = bash_executor.execute(action)
                # Normalize result to dict for consistent storage
                if isinstance(result, dict):
                    observation = {
                        "stdout": result.get("stdout", ""),
                        "stderr": result.get("stderr", ""),
                        "exit_code": result.get("exit_code", 0),
                    }
                else:
                    observation = {
                        "stdout": str(result) if result else "",
                        "stderr": "",
                        "exit_code": 0,
                    }
            except Exception as e:
                observation = {
                    "stdout": "",
                    "stderr": str(e),
                    "exit_code": 1,
                    "error": str(e),
                }

            # Record this step with nested structure + usage
            steps.append({
                "iteration": iterations_run,
                "thinking": thinking,
                "action": action,
                "observation": observation,
                "raw_response": response_text,
                "finish_reason": finish_reason,
                "usage": iter_usage,
            })

            # Format observation for model (just stdout, or error info)
            if observation.get("exit_code", 0) != 0:
                obs_text = f"{observation['stdout']}\n[stderr]: {observation['stderr']}\n[exit_code]: {observation['exit_code']}".strip()
            else:
                obs_text = observation["stdout"].strip() if observation["stdout"] else "(empty output)"

            # Add to conversation
            messages.append({"role": "assistant", "content": response_text})
            messages.append({"role": "user", "content": f"<observation>\n{obs_text}\n</observation>"})
        elif done is not None:
            # Task completed (only when NO action present). A present-but-empty
            # <done></done> still counts as a completion declaration, so test
            # for presence rather than truthiness.
            final_step = {
                "iteration": iterations_run,
                "thinking": thinking,
                "summary": done,
                "raw_response": response_text,
                "finish_reason": finish_reason,
                "usage": iter_usage,
            }
            completed = True
            break
        else:
            # No action and no done - malformed response
            steps.append({
                "iteration": iterations_run,
                "thinking": thinking,
                "warning": "No <action> or <done> tag found",
                "raw_response": response_text,
                "finish_reason": finish_reason,
                "usage": iter_usage,
            })
            messages.append({"role": "assistant", "content": response_text})
            messages.append({"role": "user", "content": "Please respond with either an <action> to execute or <done> if the task is complete."})

    result = {
        "steps": steps,
        "final": final_step,
        "iterations": iterations_run,
        "completed": completed,
        "usage": total_usage,
    }

    # Update accumulator if provided (for timeout recovery)
    if trace_accumulator is not None:
        trace_accumulator["final"] = final_step
        trace_accumulator["iterations"] = iterations_run
        trace_accumulator["completed"] = completed
        trace_accumulator["usage"] = total_usage

    return result


async def run_single_test(
    client: AgentDiff, 
    model_name: str, 
    test: Any, 
    system_prompt: str,
    test_timeout_seconds: int = 300,
    max_iterations: int = 25,
) -> tuple:
    """Run a single test case using custom ReAct agent.

    An environment and a run are created for the test, the agent loop is
    executed in a worker thread under a timeout, the run is evaluated, and the
    environment is deleted again (best effort) whatever the outcome.
    
    Args:
        client: AgentDiff client instance
        model_name: Model identifier (e.g., 'openai/gpt-5-mini')
        test: Test object with id and prompt attributes
        system_prompt: Full system prompt including API docs
        test_timeout_seconds: Max seconds before timeout
        max_iterations: Max ReAct loop iterations
    
    Returns:
        tuple: (test_id, result_dict) where result_dict contains:
            - prompt (str): Task prompt
            - status (str): 'passed', 'failed', 'timeout', or 'error'
            - passed (bool): Whether assertions passed
            - score (float): Score 0-100
            - time (float): Execution seconds
            - failures (list[str]): Failure messages
            - runId (str): Run UUID
            - error (str|None): Error message if status='error'
            - trace (dict): Execution trace containing:
                - steps (list): Each step has iteration, thinking, action, 
                  observation, raw_response, finish_reason, usage
                - final (dict|None): Completion step with usage
                - iterations (int): Total iterations
                - completed (bool): Whether agent declared done
                - usage (dict): Total {prompt_tokens, completion_tokens, 
                  total_tokens, cost}
            - diff (dict|None): State changes {inserts, updates, deletes}

        When setup or evaluation itself fails, result_dict is reduced to
        prompt, passed=False, score=0, status='error' and error.
    """
    test_id = test.id
    prompt = test.prompt
    response = None
    timed_out = False
    env = None

    try:
        # Initialize environment
        env = client.init_env(testId=test_id)
        run = client.start_run(envId=env.environmentId, testId=test_id)

        # Create bash executor (direct, not LangChain tool)
        bash_executor = BashExecutorProxy(
            env.environmentId,
            base_url=client.base_url,
            api_key=client.api_key,
        )

        # Execution with timeout
        # Use trace_accumulator to capture partial trace on timeout
        trace_accumulator = {
            "steps": [], 
            "final": None, 
            "completed": False,
            "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "cost": 0.0},
        }
        
        start = time.perf_counter()
        try:
            # NOTE: wait_for only stops *waiting*; a thread started through
            # asyncio.to_thread cannot be interrupted, so after a timeout the
            # agent loop may keep running in the background until it finishes.
            response = await asyncio.wait_for(
                asyncio.to_thread(
                    run_react_agent,
                    model_name=model_name,
                    task_prompt=prompt,
                    bash_executor=bash_executor,
                    system_prompt=system_prompt,
                    max_iterations=max_iterations,
                    trace_accumulator=trace_accumulator,
                ),
                timeout=test_timeout_seconds
            )
        except asyncio.TimeoutError:
            timed_out = True
            # Use accumulated trace (partial) instead of losing it
            response = {
                "steps": trace_accumulator.get("steps", []),
                "final": trace_accumulator.get("final"),
                "iterations": len(trace_accumulator.get("steps", [])),
                "completed": False,
                "usage": trace_accumulator.get("usage", {}),
                "timeout_error": f"Test timed out after {test_timeout_seconds} seconds",
            }
        except Exception as e:
            response = {
                "steps": trace_accumulator.get("steps", []),
                "final": trace_accumulator.get("final"),
                "iterations": len(trace_accumulator.get("steps", [])),
                "completed": False,
                "usage": trace_accumulator.get("usage", {}),
                "error": str(e),
            }
        finally:
            execution_time = time.perf_counter() - start

        # Evaluation (the call is needed for its effect; the reported values
        # are read from the run results below)
        client.evaluate_run(runId=run.runId)
        run_result = client.get_results_for_run(runId=run.runId)

        result = {
            "prompt": prompt,
            "status": "timeout" if timed_out else run_result.status,
            "passed": False if timed_out else run_result.passed,
            "score": 0 if timed_out else run_result.score.get("percent", 0),
            "time": round(execution_time, 2),
            "failures": ["Test timed out"] if timed_out else run_result.failures,
            "runId": run.runId,
            "trace": response,
            "diff": getattr(run_result, "diff", None),
        }
        return test_id, result

    except Exception as e:
        return test_id, {
            "prompt": prompt,
            "passed": False,
            "score": 0,
            "status": "error",
            "error": str(e),
        }

    finally:
        # Cleanup runs on every exit path (success, error, cancellation) and is
        # best effort: a failed delete must not replace an already computed
        # result with an error.
        if env is not None:
            try:
                client.delete_env(envId=env.environmentId)
            except Exception as cleanup_error:
                tqdm.write(f"[WARN] Could not delete environment {env.environmentId}: {cleanup_error}")


async def run_benchmark_suite(
    service: str,
    models: list,
    runs_per_test: int = 1,
    max_tests: Optional[int] = None,
    max_concurrent_models: int = 1,
    max_concurrent_tests: int = 10,
    max_calls_per_minute: int = 90,
    test_timeout_seconds: int = 300,
    max_iterations: int = 25,
    include_api_docs: bool = True,
):
    """Run benchmark for a single service.
    
    Args:
        service: Service to benchmark ('slack', 'box', 'calendar', 'linear')
        models: List of model identifiers to evaluate
        runs_per_test: Number of times to run each test
        max_tests: Maximum number of tests to run (None = all)
        max_concurrent_models: Max parallel model evaluations
        max_concurrent_tests: Max parallel test executions; the number of test
            runs in flight is capped at max_concurrent_models * max_concurrent_tests
        max_calls_per_minute: Rate limit, applied once per test run (not per
            individual model/API call): at most this many test runs are
            admitted per 60-second window
        test_timeout_seconds: Timeout per test in seconds
        max_iterations: Max ReAct iterations per test
        include_api_docs: Whether to include API documentation in system prompt
    
    Returns:
        List[dict]: List of result dicts (empty if the suite is not found or
        the AgentDiff server cannot be reached), each containing:
            - prompt (str): The task prompt
            - status (str): 'passed', 'failed', 'timeout', or 'error'
            - passed (bool): Whether the test passed
            - score (float): Score percentage (0-100)
            - time (float): Execution time in seconds
            - failures (list): List of failure messages
            - runId (str): Unique run identifier
            - error (str|None): Error message if status='error'
            - model (str): Model identifier used
            - test_id (str): Test case identifier (constant across runs)
            - service (str): Service name (e.g., 'slack', 'box')
            - test_suite_name (str): Full test suite name (e.g., 'Slack Bench v2')
            - include_api_docs (bool): Whether API docs were included in prompt
            - timestamp (str): ISO format timestamp when test was run
            - trace (dict): Execution trace containing:
                - steps (list): List of ReAct steps, each with:
                    - iteration (int)
                    - thinking (str): Model's reasoning
                    - action (str): Bash command executed
                    - observation (dict): {stdout, stderr, exit_code}
                    - raw_response (str): Full model response
                    - finish_reason (str): "stop", "length" (context overflow), etc.
                    - usage (dict): {prompt_tokens, completion_tokens, total_tokens, cost}
                - final (dict|None): Completion step with thinking, summary, usage
                - iterations (int): Total iterations
                - completed (bool): Whether agent declared done
                - usage (dict): Total tokens/cost for entire run:
                    {prompt_tokens, completion_tokens, total_tokens, cost}
            - diff (dict|None): State diff with inserts, updates, deletes
    """
    # Get benchmark configuration for this service
    config = get_benchmark_config(service, include_api_docs=include_api_docs)
    test_suite_name = config["test_suite_name"]
    system_prompt = config["system_prompt"]
    # One timestamp for the whole suite run; it is stamped on every result.
    run_timestamp = datetime.now().isoformat()
    
    client = AgentDiff(
        #api_key=AGENT_DIFF_API_KEY,
        #base_url=AGENT_DIFF_BASE_URL,
    )
    try:
        suite_list = client.list_test_suites(name=test_suite_name)
        if not suite_list.testSuites:
            print(f"[ERROR] Test suite '{test_suite_name}' not found on AgentDiff server.")
            return []

        suite = client.get_test_suite(suite_list.testSuites[0].id, expand=True)
        tests = suite.tests[:max_tests] if max_tests else suite.tests
    except Exception as e:
        print(f"[ERROR] Error connecting to AgentDiff: {e}")
        return []

    total_runs = len(tests) * len(models) * runs_per_test
    print(f"\n[{config['service'].upper()}] {test_suite_name} | {len(tests)} tests x {len(models)} models x {runs_per_test} runs = {total_runs} total")

    semaphore = asyncio.Semaphore(max_concurrent_models * max_concurrent_tests)

    # rate limiting state (per minute window)
    window_seconds = 60
    window_start = time.monotonic()
    calls_in_window = 0
    rate_lock = asyncio.Lock()

    async def acquire_rate_slot():
        """Wait until the current fixed window has a free slot, then claim it."""
        nonlocal window_start, calls_in_window
        while True:
            async with rate_lock:
                now = time.monotonic()
                # reset window if needed
                if now - window_start >= window_seconds:
                    window_start = now
                    calls_in_window = 0

                if calls_in_window < max_calls_per_minute:
                    calls_in_window += 1
                    return  # allowed to proceed

                # need to wait until current window ends
                sleep_for = window_seconds - (now - window_start)
            # sleep outside the lock
            if sleep_for > 0:
                await asyncio.sleep(sleep_for)

    # Progress tracking state
    completed_results = []
    results_lock = asyncio.Lock()
    
    # Create progress bar
    pbar = tqdm(
        total=total_runs,
        desc=f"{config['service'].upper()} | 0/{total_runs} | 0.0% pass | 0.0% avg",
        unit="test",
        leave=True,
        dynamic_ncols=True,
        mininterval=0.1,
    )
    pbar.refresh()  # Force initial display
    
    async def update_progress():
        """Update progress bar with current stats."""
        async with results_lock:
            n = len(completed_results)
            if n > 0:
                passed = sum(1 for r in completed_results if r.get("passed"))
                avg_score = sum(r.get("score", 0) for r in completed_results) / n
                pass_rate = 100 * passed / n
                pbar.set_description(
                    f"{config['service'].upper()} | {passed}/{n} | {pass_rate:.1f}% pass | {avg_score:.1f}% avg"
                )

    async def worker(model_name, test):
        """Run one (model, test) pair and record its result and progress."""
        # The rate slot is claimed before waiting for a concurrency slot.
        await acquire_rate_slot()
        async with semaphore:
            tid, res = await run_single_test(
                client, model_name, test, system_prompt,
                test_timeout_seconds=test_timeout_seconds,
                max_iterations=max_iterations,
            )
            res["model"] = model_name
            res["test_id"] = tid
            
            # Track result and update progress
            async with results_lock:
                completed_results.append(res)
            
            pbar.update(1)
            await update_progress()
            
            # Log failures to tqdm (won't mess up progress bar)
            if not res.get("passed"):
                if res.get("status") == "timeout":
                    tqdm.write(f"[TIMEOUT] {tid[:8]}... | {res.get('time', 0):.1f}s")
                elif res.get("status") == "error":
                    tqdm.write(f"[ERROR] {tid[:8]}... | {res.get('error', 'unknown')}")
                else:
                    tqdm.write(f"[FAIL] {tid[:8]}... | {res.get('score')}% | {(res.get('failures') or ['unknown'])[:1]}")
            
            return res

    tasks = []
    for model in models:
        for test in tests:
            for _ in range(runs_per_test):
                tasks.append(worker(model, test))

    try:
        results = await asyncio.gather(*tasks)
    finally:
        # Close the bar even when the run is interrupted or cancelled (e.g. a
        # kernel interrupt), so no dangling progress bar is left behind.
        pbar.close()
    
    # Add metadata to each result
    for r in results:
        r["service"] = config["service"]
        r["test_suite_name"] = test_suite_name
        r["include_api_docs"] = include_api_docs
        r["timestamp"] = run_timestamp
    
    # Final summary
    passed = sum(1 for r in results if r.get("passed"))
    avg_score = sum(r.get("score", 0) for r in results) / len(results) if results else 0
    print(f"{config['service'].upper()} Complete: {passed}/{len(results)} passed ({avg_score:.1f}% avg)")
    
    return results


async def run_all_benchmarks(
    models: list,
    services: list = None,
    runs_per_test: int = 1,
    max_tests: int = None,
    max_concurrent_models: int = 1,
    max_concurrent_tests: int = 10,
    max_calls_per_minute: int = 90,
    test_timeout_seconds: int = 300,
    max_iterations: int = 25,
    include_api_docs: bool = True,
):
    """Benchmark each requested service in turn and collect the results.

    Every service is run through ``run_benchmark_suite`` with the same
    settings. If a suite raises, the error is printed and that service is
    recorded with an empty result list, so the remaining services still run.
    A per-service and overall pass summary is printed at the end.

    Args:
        models: Model identifiers to evaluate.
        services: Services to benchmark; None means every key of
            ``BENCHMARK_CONFIGS``. Options: ['slack', 'box', 'calendar', 'linear']
        runs_per_test: How many times each test is run.
        max_tests: Cap on the number of tests per service (None = all).
        max_concurrent_models: Max parallel model evaluations.
        max_concurrent_tests: Max parallel test executions.
        max_calls_per_minute: Rate limit for API calls.
        test_timeout_seconds: Timeout per test, in seconds.
        max_iterations: Max ReAct iterations per test.
        include_api_docs: Whether API documentation goes into the system prompt.

    Returns:
        Dict[str, List[dict]]: Service name -> list of result dicts. Each
        result contains (see run_benchmark_suite for the full schema):
            - prompt, status, passed, score, time, failures, error
            - runId, model, test_id, service, test_suite_name
            - include_api_docs (bool), timestamp (ISO format)
            - trace: {steps, final, iterations, completed, usage}
              - each step includes: finish_reason, usage (per-iteration tokens/cost)
              - usage: total {prompt_tokens, completion_tokens, total_tokens, cost}
            - diff: {inserts, updates, deletes}
    """
    if services is None:
        services = list(BENCHMARK_CONFIGS.keys())

    if include_api_docs:
        docs_status = "with API docs"
    else:
        docs_status = "NO API docs"
    print(f"Benchmarks: {', '.join(s.upper() for s in services)} | {len(models)} models | {docs_status} | {test_timeout_seconds}s timeout")

    # Settings shared by every suite; only the service name varies per call.
    suite_kwargs = dict(
        models=models,
        runs_per_test=runs_per_test,
        max_tests=max_tests,
        max_concurrent_models=max_concurrent_models,
        max_concurrent_tests=max_concurrent_tests,
        max_calls_per_minute=max_calls_per_minute,
        test_timeout_seconds=test_timeout_seconds,
        max_iterations=max_iterations,
        include_api_docs=include_api_docs,
    )

    all_results = {}
    for service in services:
        try:
            all_results[service] = await run_benchmark_suite(service=service, **suite_kwargs)
        except Exception as e:
            # Best effort: report the failure and keep going with the next service.
            print(f"[ERROR] Error running {service} benchmark: {e}")
            all_results[service] = []

    # Overall summary
    print(f"\n{'='*60}")
    print("OVERALL SUMMARY")
    print(f"{'='*60}")
    total_passed = total_tests = 0
    for service, results in all_results.items():
        if not results:
            # Services with no results (failed or empty) are left out of the summary.
            continue
        passed = len([r for r in results if r.get("passed")])
        total = len(results)
        total_passed += passed
        total_tests += total
        print(f"  {service.upper()}: {passed}/{total} passed")

    if total_tests > 0:
        print(f"\n  TOTAL: {total_passed}/{total_tests} passed ({100*total_passed/total_tests:.1f}%)")

    return all_results

In [None]:
# Models to evaluate: uncomment an entry to include it in the run.
# NOTE(review): identifiers look like "<provider>/<model>" IDs -- confirm the
# expected format against the model client configured earlier in the notebook.

MODELS = [
    "openai/gpt-5-mini",
    # "anthropic/claude-haiku-4.5",
    # "anthropic/claude-sonnet-4.5",
    # "anthropic/claude-opus-4.5",
    # "x-ai/grok-4.1-fast",
    # "deepseek/deepseek-v3.2",
    # "moonshotai/kimi-k2-0905",
    # "qwen/qwen3-vl-235b-a22b-instruct",
]

In [None]:
# Runtime settings: every key is forwarded as a keyword argument to the
# benchmark functions via **BENCHMARK_SETTINGS.

BENCHMARK_SETTINGS = {
    "models": MODELS,                    # Models to evaluate (from cell above)
    "runs_per_test": 2,                  # Number of runs per test
    "max_tests": None,                   # None = all tests, or set a limit
    "max_concurrent_models": 1,          # Parallel model evaluations
    "max_concurrent_tests": 10,          # Parallel test executions
    "max_calls_per_minute": 120,         # API rate limit
    "test_timeout_seconds": 300,         # 5 minutes per test
    "max_iterations": 25,                # Max ReAct iterations
    "include_api_docs": True,            # Include API docs in prompt (False = agent explores)
}

# Run benchmark (leave exactly one option uncommented).
# The results cell below reads `all_results` (multi-service dict) first and
# falls back to `results` (single-service list).

# Single service:
# results = await run_benchmark_suite(service="slack", **BENCHMARK_SETTINGS)

# Multiple services:
all_results = await run_all_benchmarks(services=["slack", "box"], **BENCHMARK_SETTINGS)

# All services:
# all_results = await run_all_benchmarks(**BENCHMARK_SETTINGS)

In [None]:
def summarize_usage(results):
    """Sum cost and token usage across result records.

    A record contributes nothing when its ``trace`` or ``trace["usage"]`` is
    missing, None, or not a dict; individual ``cost`` / ``total_tokens``
    values that are missing or None count as zero. This keeps error/timeout
    records without usage data from breaking the summary.

    Args:
        results: Iterable of result dicts.

    Returns:
        tuple: ``(total_cost, total_tokens)``.
    """
    total_cost = 0.0
    total_tokens = 0
    for r in results:
        trace = r.get("trace")
        usage = trace.get("usage") if isinstance(trace, dict) else None
        if not isinstance(usage, dict):
            continue
        total_cost += usage.get("cost") or 0
        total_tokens += usage.get("total_tokens") or 0
    return total_cost, total_tokens


# Handle both single service (results = list) and multi-service (all_results = dict).
# NOTE: `all_results` takes precedence, so a stale `all_results` left in the kernel
# from an earlier run will shadow a newer single-service `results`.
if 'all_results' in dir() and all_results:
    # Flatten all_results dict into a single list
    results_to_save = []
    for service, service_results in all_results.items():
        results_to_save.extend(service_results)
elif 'results' in dir() and results:
    results_to_save = results
else:
    results_to_save = []

if results_to_save:
    df = pd.DataFrame(results_to_save)

    # Group by model to get leaderboard
    leaderboard = df.groupby("model").agg(
        passed=("passed", "sum"),
        total=("passed", "count"),
        avg_score=("score", "mean"),
        avg_time=("time", "mean")
    ).reset_index()

    leaderboard["pass_rate"] = (leaderboard["passed"] / leaderboard["total"] * 100).round(1)
    leaderboard = leaderboard.sort_values("pass_rate", ascending=False)

    display(HTML("<h3>Final Results</h3>"))
    display(leaderboard)

    total_cost, total_tokens = summarize_usage(results_to_save)

    if total_cost > 0 or total_tokens > 0:
        print(f"\nUsage Summary: {total_tokens:,} tokens | ${total_cost:.4f} USD")

    # Save using standard json library to handle encoding errors robustly
    ts = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = OUTPUT_DIR / f"full_results_{ts}.json"

    def safe_serialize(obj):
        """json.dump fallback: decode bytes leniently, stringify anything else."""
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='replace')
        return str(obj)

    try:
        # Make sure the target directory exists so a finished run is not lost
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results_to_save, f, indent=2, default=safe_serialize, ensure_ascii=False)
        print(f"Results saved to {output_path}")
    except Exception as e:
        # Best effort: the leaderboard above is still useful even if saving fails
        print(f"Error saving JSON: {e}")
else:
    print("No results generated.")




In [None]:
# Utility: Merge Multiple Result Files

def list_result_files(output_dir: Path = OUTPUT_DIR) -> list:
    """Print and return the ``full_results_*.json`` files found in output_dir.

    Args:
        output_dir: Directory to search.

    Returns:
        list: Matching paths in sorted order; each is printed with its index
        and size in KB.
    """
    files = list(output_dir.glob("full_results_*.json"))
    files.sort()
    print(f"Found {len(files)} result files in {output_dir}:")
    for i in range(len(files)):
        f = files[i]
        size_kb = f.stat().st_size / 1024
        print(f"  [{i}] {f.name} ({size_kb:.1f} KB)")
    return files


def merge_result_files(
    files: list = None,
    output_dir: Path = OUTPUT_DIR,
    output_name: str = None,
    deduplicate: bool = False,
) -> list:
    """Merge multiple result JSON files into one.

    Args:
        files: File paths (``str`` or ``Path``) to merge. If None, merges every
            ``full_results_*.json`` file in output_dir.
        output_dir: Directory searched for result files when files=None, and
            the directory the merged file is written to (created if missing).
        output_name: Output filename. If None, generates timestamped name.
        deduplicate: If True, drops repeated copies of the same run. A run is
            identified by test_id + model + timestamp + runId. The runId is
            part of the key because every result of one suite run carries the
            same timestamp, so without it repeated runs of a test
            (runs_per_test > 1) would be discarded as duplicates. Records
            without a runId are compared by their full content instead.

    Returns:
        list: Merged results

    Raises:
        OSError: If the merged file cannot be written.
    """
    output_dir = Path(output_dir)
    if files is None:
        files = sorted(output_dir.glob("full_results_*.json"))

    if not files:
        print("No files to merge.")
        return []

    # Accept plain string paths as well as Path objects (f.name is used below)
    files = [Path(f) for f in files]

    print(f"Merging {len(files)} files...")

    def safe_serialize(obj):
        """json.dump fallback: decode bytes leniently, stringify anything else."""
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='replace')
        return str(obj)

    def dedup_key(r):
        """Build a hashable identity for one result record."""
        if isinstance(r, dict):
            run_id = r.get("runId")
            if run_id is not None:
                # NOTE(review): assumes runId is unique per test execution -- confirm
                return (r.get("test_id"), r.get("model"), r.get("timestamp"), str(run_id))
        # No runId (or not a dict): only byte-for-byte identical records match
        return ("content", json.dumps(r, sort_keys=True, default=safe_serialize))

    all_results = []
    for f in files:
        try:
            with open(f, 'r', encoding='utf-8') as fp:
                data = json.load(fp)
        except Exception as e:
            # Best effort: report the unreadable file and keep merging the rest
            print(f"  ! {f.name}: error loading - {e}")
            continue
        if isinstance(data, list):
            all_results.extend(data)
            print(f"  + {f.name}: {len(data)} results")
        else:
            print(f"  ! {f.name}: unexpected format (not a list)")

    print(f"\nTotal: {len(all_results)} results")

    # Deduplicate if requested
    if deduplicate and all_results:
        seen = set()
        unique_results = []
        for r in all_results:
            key = dedup_key(r)
            if key not in seen:
                seen.add(key)
                unique_results.append(r)

        removed = len(all_results) - len(unique_results)
        if removed > 0:
            print(f"Deduplicated: removed {removed} duplicates, {len(unique_results)} unique results")
        all_results = unique_results

    # Save merged results
    if output_name is None:
        ts = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_name = f"merged_results_{ts}.json"

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / output_name

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, default=safe_serialize, ensure_ascii=False)

    print(f"\nSaved to: {output_path}")

    # Summary by model and service. The file is already saved at this point,
    # so a grouping whose columns are missing is skipped rather than raising.
    if all_results:
        df = pd.DataFrame(all_results)
        if {"passed", "score"} <= set(df.columns):
            for column, label in (("model", "Model"), ("service", "Service")):
                if column not in df.columns:
                    continue
                print(f"\n--- Summary by {label} ---")
                summary = df.groupby(column).agg(
                    tests=("passed", "count"),
                    passed=("passed", "sum"),
                    avg_score=("score", "mean"),
                ).reset_index()
                summary["pass_rate"] = (100 * summary["passed"] / summary["tests"]).round(1)
                display(summary)

    return all_results


# Example usage:
# files = list_result_files()
# merged = merge_result_files()  # Merge all files
# merged = merge_result_files(files=[files[0], files[2]])  # Merge specific files
# merged = merge_result_files(deduplicate=True)  # Merge and remove duplicates

In [None]:
# List available result files (prints each file name with its size)
files = list_result_files()

# Merge all full_results_*.json files with deduplication; the merged records are
# returned and also written to a merged_results_<timestamp>.json file in OUTPUT_DIR
merged = merge_result_files(deduplicate=True)

# Alternative options:
# merged = merge_result_files()  # Merge all without deduplication
# merged = merge_result_files(files=[files[0], files[-1]])  # Merge specific files only