# Evaluation Framework Playground

Interactive notebook for evaluating agent performance with the cascade orchestrator.

## Features

- ✅ Test YAML scenario loading
- ✅ Run orchestrator turns with full evaluation
- ✅ Record events and score performance
- ✅ Compare model configurations (GPT-4o vs o1, different verbosity levels)
- ✅ A/B testing capabilities

## Quick Links

- [Evaluation Package](../../../apps/artagent/backend/evaluation/)
- [Documentation](../../../docs/testing/model-evals.md)
- [Test Scenarios](../../../tests/eval_scenarios/)

## Setup

In [None]:
# Import required libraries
import asyncio
import os
import sys
from pathlib import Path
import json
import yaml
from IPython.display import display, HTML, Markdown
import ipywidgets as widgets

# Make the repository importable: the project root is taken to be three
# directories above the notebook's current working directory.
PROJECT_ROOT = Path.cwd().parent.parent.parent
project_root_entry = str(PROJECT_ROOT)
sys.path.insert(0, project_root_entry)

print(f"üìÅ Project Root: {PROJECT_ROOT}")
print("‚úÖ Python path configured")

## Load Environment

In [None]:
# Load environment configuration
import os
from pathlib import Path
from dotenv import load_dotenv

# Load local env files first as the fallback configuration source.
# .env.local takes precedence over .env; only one of the two is loaded.
env_local_path = PROJECT_ROOT / ".env.local"
env_path = PROJECT_ROOT / ".env"

if env_local_path.exists():
    print(f"‚úÖ Loading .env.local")
    load_dotenv(env_local_path, override=True)
    config_source = ".env.local"
elif env_path.exists():
    print(f"‚úÖ Loading .env")
    load_dotenv(env_path, override=True)
    config_source = ".env"
else:
    print("‚ö†Ô∏è  No .env file found. Using system environment variables.")
    config_source = "system environment"

# Try to load Azure App Configuration (preferred). This is best-effort:
# any failure (missing package, no credentials, ...) falls back to the
# source selected above, but the reason is reported instead of being hidden.
try:
    from apps.artagent.backend.config.appconfig_provider import bootstrap_appconfig, get_provider_status

    appconfig_loaded = bootstrap_appconfig()
    if appconfig_loaded:
        status = get_provider_status()
        # Reduce e.g. "https://<name>.<domain>" to just "<name>" for display
        endpoint_name = status.get("endpoint", "").split("//")[-1].split(".")[0] if status.get("endpoint") else "unknown"
        print(f"‚úÖ Loaded configuration from Azure App Config ({endpoint_name})")
        config_source = f"Azure App Config ({endpoint_name})"
except Exception as exc:
    print(f"‚ÑπÔ∏è  App Configuration not available, using {config_source} ({type(exc).__name__}: {exc})")

# Verify Azure OpenAI is configured
endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')
# Keep the raw value separate from the effective one so a missing variable
# can actually be reported (the fallback would otherwise always be truthy).
configured_deployment = os.getenv('AZURE_OPENAI_CHAT_DEPLOYMENT_ID')
deployment = configured_deployment or 'gpt-4o'

print(f"\nüìã Configuration source: {config_source}")
if endpoint:
    print(f"‚úÖ Azure OpenAI endpoint: {endpoint}")
else:
    print("‚ùå AZURE_OPENAI_ENDPOINT not set")

if configured_deployment:
    print(f"‚úÖ Default deployment: {deployment}")
else:
    print(f"‚ö†Ô∏è  AZURE_OPENAI_CHAT_DEPLOYMENT_ID not set, falling back to '{deployment}'")

## Import Components

In [None]:
# Import evaluation framework
from tests.evaluation import (
    EventRecorder,
    EvaluationOrchestratorWrapper,
    MetricsScorer,
    ComparisonRunner,
    MockMemoManager,
    build_context,
)

# Import orchestrator components
from apps.artagent.backend.registries.agentstore.loader import (
    discover_agents,
    get_agent,
    build_handoff_map,
)
from apps.artagent.backend.registries.agentstore.base import ModelConfig
from apps.artagent.backend.voice.speech_cascade.orchestrator import (
    CascadeOrchestratorAdapter,
)
from apps.artagent.backend.voice.shared.base import OrchestratorContext

print("‚úÖ All components imported successfully")

## 1. Discover Available Agents

Load real agents from the agent registry:

In [None]:
# Discover all available agents and the handoff triggers between them
agents = discover_agents()
handoff_map = build_handoff_map(agents)

print(f"üì¶ Discovered {len(agents)} agents:\n")

# Name fragments used to bucket agents by business domain
banking_markers = ('banking', 'fraud', 'investment', 'card', 'auth', 'compliance')
insurance_markers = ('claims', 'policy', 'fnol', 'subro')

banking, insurance, other = [], [], []

for name in sorted(agents):
    agent = agents[name]
    # Truncate long descriptions so each agent fits on one line
    desc = agent.description[:80] if agent.description else "No description"
    lowered = name.lower()

    # Banking markers are checked first, so they win when both sets match
    if any(marker in lowered for marker in banking_markers):
        bucket = banking
    elif any(marker in lowered for marker in insurance_markers):
        bucket = insurance
    else:
        bucket = other
    bucket.append((name, desc))

# Print each non-empty group under its heading, followed by a blank line
for heading, members in (
    ("Banking Agents:", banking),
    ("Insurance Agents:", insurance),
    ("Other Agents:", other),
):
    if not members:
        continue
    print(heading)
    for name, desc in members:
        print(f"  ‚Ä¢ {name}: {desc}")
    print()

print(f"\nüîó Handoff map: {len(handoff_map)} handoff triggers configured")

## 2. Create a Custom Cascade Orchestrator Instance

This creates an actual orchestrator with real agents (not mocks):

In [None]:
def create_orchestrator(
    agent_name: str,
    model_override: dict = None,
    session_id: str = "eval-session",
) -> CascadeOrchestratorAdapter:
    """
    Build a CascadeOrchestratorAdapter backed by the discovered agents.

    When ``model_override`` is given and ``agent_name`` is a discovered agent,
    that agent's ``model`` and ``cascade_model`` are replaced with a
    ModelConfig derived from the override. Parameters are adapted to the
    model family, which is inferred from substrings of the deployment id.

    Args:
        agent_name: Name of the agent the orchestrator starts with
        model_override: Optional dict of ModelConfig fields to override
        session_id: Session ID; also used to derive the call connection id

    Returns:
        Configured CascadeOrchestratorAdapter (RAG and streaming disabled)
    """
    # Substrings that identify each model family in a deployment id
    reasoning_markers = ('o1', 'o3', 'o4')       # Reasoning models
    gpt5_markers = ('gpt-5', 'gpt5')             # GPT-5 family
    gpt41_markers = ('gpt-4.1', 'gpt4.1')        # GPT-4.1 family

    registry = discover_agents()

    if model_override and agent_name in registry:
        target = registry[agent_name]
        deployment_id = model_override.get('deployment_id', target.model.deployment_id)
        name_lc = deployment_id.lower()

        def mentions(markers):
            """Return True if any marker occurs in the lowercased deployment id."""
            return any(marker in name_lc for marker in markers)

        is_reasoning_model = mentions(reasoning_markers)
        # o-series and gpt-5 only accept the default temperature
        no_custom_temperature = is_reasoning_model or mentions(gpt5_markers)
        # Those families, plus gpt-4.1, take max_completion_tokens instead of max_tokens
        uses_max_completion_tokens = no_custom_temperature or mentions(gpt41_markers)

        if uses_max_completion_tokens:
            max_tokens_val = None
            # A max_tokens entry in the override is accepted as a fallback cap
            fallback_cap = model_override.get('max_tokens', 4096)
            max_completion_tokens_val = model_override.get('max_completion_tokens', fallback_cap)
        else:
            max_tokens_val = model_override.get('max_tokens', target.model.max_tokens)
            max_completion_tokens_val = model_override.get('max_completion_tokens')

        if no_custom_temperature:
            temperature_val = None
        else:
            temperature_val = model_override.get('temperature', target.model.temperature)

        # Only o-series models get an implicit 'medium'; others pass through as given
        reasoning_effort = model_override.get(
            'reasoning_effort',
            'medium' if is_reasoning_model else None,
        )

        model_config = ModelConfig(
            deployment_id=deployment_id,
            endpoint_preference=model_override.get('endpoint_preference', target.model.endpoint_preference),
            verbosity=model_override.get('verbosity', target.model.verbosity),
            temperature=temperature_val,
            max_tokens=max_tokens_val,
            max_completion_tokens=max_completion_tokens_val,
            reasoning_effort=reasoning_effort,
        )

        # Apply the same config to both model slots on the agent
        target.model = model_config
        target.cascade_model = model_config

    return CascadeOrchestratorAdapter.create(
        start_agent=agent_name,
        session_id=session_id,
        call_connection_id=f"eval-{session_id}",
        agents=registry,
        handoff_map=build_handoff_map(registry),
        enable_rag=False,  # Disable RAG for faster eval
        streaming=False,   # Non-streaming for eval
    )

print("‚úÖ create_orchestrator() function defined")
print("   - o-series/gpt-5: max_completion_tokens, NO temperature")
print("   - gpt-4.1: max_completion_tokens, with temperature")
print("   - gpt-4o: max_tokens + temperature (legacy)")
print("\nYou can now create orchestrators with:")
print("  orchestrator = create_orchestrator('FraudAgent')")

## 3. Run a Single Turn (Real Agent!)

Test a real agent with a real query:

In [None]:
async def run_single_turn(
    agent_name: str,
    user_query: str,
    model_override: dict = None,
    record_events: bool = True,
) -> dict:
    """
    Run a single turn with the cascade orchestrator.

    A fresh session (mock memo manager + orchestrator) is created per call.

    Args:
        agent_name: Agent to use
        user_query: User's question/request
        model_override: Optional model config override, passed to
            create_orchestrator()
        record_events: Whether to wrap the orchestrator in an event recorder
            writing under ``runs/jupyter_tests``

    Returns:
        Dict with keys: query, agent, response, model, endpoint, verbosity,
        input_tokens, output_tokens, latency_ms, error
    """
    import time
    import uuid
    from tests.evaluation import MockMemoManager

    # A timestamp alone has one-second resolution, so back-to-back runs
    # (e.g. from compare_model_configs) could share a session id and thus a
    # recorder run_id. The random suffix keeps every run distinct.
    session_id = f"eval-{int(time.time())}-{uuid.uuid4().hex[:8]}"

    # Create MemoManager for session state
    memo_manager = MockMemoManager(
        session_id=session_id,
        context={"caller_name": "Test User", "client_id": "test_client"}
    )

    # Create orchestrator
    orchestrator = create_orchestrator(
        agent_name=agent_name,
        model_override=model_override,
        session_id=session_id,
    )

    # DEBUG: Verify model config is correctly applied on the orchestrator's agent
    if model_override:
        # Get the agent from the orchestrator's internal state (not a fresh discover)
        agent = orchestrator.agents.get(agent_name)
        if agent:
            actual_model = agent.get_model_for_mode("cascade")
            print(f"  ‚û°Ô∏è  Model config applied: deployment={actual_model.deployment_id}, "
                  f"verbosity={actual_model.verbosity}, "
                  f"endpoint_pref={actual_model.endpoint_preference}")

    # Optionally wrap with recorder
    if record_events:
        recorder = EventRecorder(
            run_id=f"{agent_name}_{session_id}",
            output_dir=PROJECT_ROOT / "runs" / "jupyter_tests",
        )
        orchestrator = EvaluationOrchestratorWrapper(
            orchestrator=orchestrator,
            recorder=recorder,
        )

    # Create the turn context; the memo manager travels in the metadata
    context = OrchestratorContext(
        session_id=session_id,
        user_text=user_query,
        turn_id="turn_1",
        conversation_history=memo_manager.get_history(agent_name),
        metadata={
            "scenario": "jupyter_test",
            "memo_manager": memo_manager,
        },
    )

    # Time only the turn itself. perf_counter() is monotonic, so the
    # measurement is not affected by system clock adjustments.
    start_time = time.perf_counter()
    result = await orchestrator.process_turn(context)
    elapsed_ms = (time.perf_counter() - start_time) * 1000

    # Update history
    memo_manager.append_to_history(agent_name, "user", user_query)
    memo_manager.append_to_history(agent_name, "assistant", result.response_text)

    return {
        "query": user_query,
        "agent": agent_name,
        "response": result.response_text,
        "model": model_override.get('deployment_id') if model_override else 'default',
        "endpoint": model_override.get('endpoint_preference') if model_override else 'auto',
        "verbosity": model_override.get('verbosity') if model_override else 'default',
        "input_tokens": result.input_tokens,
        "output_tokens": result.output_tokens,
        "latency_ms": elapsed_ms,
        "error": result.error,
    }

print("‚úÖ run_single_turn() function defined")
print("   Evaluates agent with full orchestrator execution")
print("   Now includes verbosity debugging output")

### Test It! Run a Query

In [None]:
# Example: Evaluate FraudAgent with GPT-4o
result = await run_single_turn(
    agent_name="FraudAgent",
    user_query="I see a $500 charge from Amazon that I didn't make",
    model_override={
        "deployment_id": "gpt-4o",
        "endpoint_preference": "responses",  # Use responses API for verbosity support
        "verbosity": 0,  # Minimal verbosity for faster responses
        "temperature": 0.7,
    },
    record_events=True,
)

# Display result
print("\n" + "=" * 70)
print("üéØ EVALUATION RESULT")
print("=" * 70)
print(f"\nüìù Query: {result['query']}")
print(f"\nü§ñ Agent: {result['agent']}")
print(f"üí¨ Response:\n{result['response']}")
print(f"\nüìä Metrics:")
print(f"  ‚Ä¢ Model: {result['model']}")
print(f"  ‚Ä¢ Endpoint: {result['endpoint']}")
print(f"  ‚Ä¢ Verbosity: {result['verbosity']}")
print(f"  ‚Ä¢ Input tokens: {result['input_tokens']}")
print(f"  ‚Ä¢ Output tokens: {result['output_tokens']}")
print(f"  ‚Ä¢ Latency: {result['latency_ms']:.0f}ms")
if result['error']:
    print(f"  ‚Ä¢ ‚ùå Error: {result['error']}")
print("=" * 70)

## 4. Compare Model Configurations (e.g., gpt-5-mini vs. the gpt-4o run above)

Test the same query with different model configurations:

In [None]:
async def compare_model_configs(
    agent_name: str,
    user_query: str,
    configs: list[dict],
) -> list[dict]:
    """
    Send one query to one agent under several model configurations.

    The configurations are run sequentially, each through run_single_turn()
    with event recording enabled.

    Args:
        agent_name: Agent to test
        user_query: Query sent unchanged for every configuration
        configs: Model override dicts, one run per entry

    Returns:
        The run_single_turn() result dicts, in the order of ``configs``
    """
    collected = []

    for override in configs:
        deployment_label = override.get('deployment_id')
        endpoint_label = override.get('endpoint_preference')
        print(f"Testing {deployment_label} with {endpoint_label}...")

        outcome = await run_single_turn(
            agent_name=agent_name,
            user_query=user_query,
            model_override=override,
            record_events=True,
        )
        collected.append(outcome)

    return collected

# Example: Compare GPT-4o Chat vs Responses API
comparison_results = await compare_model_configs(
    agent_name="FraudAgent",
    user_query="I see a suspicious $500 charge",
    configs=[
        {
            "deployment_id": "gpt-5-nano",
            "endpoint_preference": "responses",
            "verbosity": 1,
            "temperature": 0.7,
        },
        {
            "deployment_id": "gpt-5-nano",
            "endpoint_preference": "responses",
            "verbosity": 0,  # Minimal
        },
        {
            "deployment_id": "gpt-5-nano",
            "endpoint_preference": "responses",
            "verbosity": 2,  # Detailed
        },
    ],
)

# Display comparison
print("\n" + "=" * 70)
print("üìä MODEL CONFIGURATION COMPARISON")
print("=" * 70)
for i, result in enumerate(comparison_results, 1):
    print(f"\n{i}. {result['model']} | {result['endpoint']}")
    print(f"   Response length: {len(result['response'])} chars")
    print(f"   Output tokens: {result['output_tokens']}")
    print(f"   Latency: {result['latency_ms']:.0f}ms")
    print(f"   Response: {result['response'][:150]}...")

print("\n" + "=" * 70)

## Diagnostic: Verify Model Parameters

Let's verify what parameters are actually being sent to the API:

In [None]:
# Diagnostic: Check what parameters are being prepared for the API call
from src.aoai.manager import AzureOpenAIManager
from apps.artagent.backend.registries.agentstore.base import ModelConfig

def diagnose_model_params(model_override: dict):
    """
    Print and return the request parameters prepared for a model override.

    A ModelConfig is built from the override, an AzureOpenAIManager decides
    between the chat and responses endpoints, and the matching parameter
    preparation is run against a two-message sample conversation.

    Args:
        model_override: Dict of ModelConfig fields; missing keys fall back to
            'gpt-4o', 'auto', verbosity 0, and None for the rest

    Returns:
        The parameter mapping produced by the manager
    """
    read = model_override.get
    config = ModelConfig(
        deployment_id=read('deployment_id', 'gpt-4o'),
        endpoint_preference=read('endpoint_preference', 'auto'),
        verbosity=read('verbosity', 0),
        temperature=read('temperature'),
        max_tokens=read('max_tokens'),
        max_completion_tokens=read('max_completion_tokens'),
        reasoning_effort=read('reasoning_effort'),
    )

    print(f"üìã ModelConfig created:")
    for field_name in (
        'deployment_id',
        'endpoint_preference',
        'verbosity',
        'temperature',
        'max_tokens',
        'max_completion_tokens',
        'reasoning_effort',
    ):
        print(f"   ‚Ä¢ {field_name}: {getattr(config, field_name)}")

    # Throwaway manager, used only for its parameter-preparation helpers
    manager = AzureOpenAIManager(enable_tracing=False)

    use_responses = manager._should_use_responses_endpoint(config)
    endpoint_label = 'RESPONSES' if use_responses else 'CHAT'
    print(f"\nüîÄ Endpoint decision: {endpoint_label}")

    # Minimal sample conversation to feed the parameter builders
    test_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello"},
    ]

    if use_responses:
        params = manager._prepare_responses_params(config, test_messages)
        print(f"\nüì¶ Responses API params:")
    else:
        params = manager._prepare_chat_params(config, test_messages)
        print(f"\nüì¶ Chat API params:")

    # The conversation payload is omitted for brevity
    payload_keys = ('messages', 'input')
    for key, value in params.items():
        if key not in payload_keys:
            print(f"   ‚Ä¢ {key}: {value}")

    # Report whether verbosity made it into the prepared parameters
    if 'verbosity' not in params:
        print(f"\n‚ö†Ô∏è  verbosity is NOT in the API params!")
    else:
        print(f"\n‚úÖ verbosity IS being sent to API: {params['verbosity']}")

    return params

# Run the diagnostic over a small table of (heading, override) cases
print("=" * 70)
print("üîç DIAGNOSTIC: Testing verbosity parameter passing")
print("=" * 70)

diagnostic_cases = [
    (
        "\n--- Test 1: GPT-4o with Chat endpoint ---",
        {
            "deployment_id": "gpt-4o",
            "endpoint_preference": "chat",
            "verbosity": 0,
        },
    ),
    (
        "\n--- Test 2: GPT-4o with Responses endpoint ---",
        {
            "deployment_id": "gpt-4o",
            "endpoint_preference": "responses",
            "verbosity": 0,
        },
    ),
    (
        "\n--- Test 3: GPT-5-mini with Responses endpoint ---",
        {
            "deployment_id": "gpt-5-mini",
            "endpoint_preference": "responses",
            "verbosity": 0,
            "reasoning_effort": "low",
        },
    ),
]

for heading, override in diagnostic_cases:
    print(heading)
    diagnose_model_params(override)

print("\n" + "=" * 70)

## 5. Score Recorded Events

Load and score events that were recorded:

In [None]:
# Find recorded events
runs_dir = PROJECT_ROOT / "runs" / "jupyter_tests"

if runs_dir.exists():
    # glob() gives no ordering guarantee, so sort by modification time
    # (oldest first) to make the last entry genuinely the most recent run.
    event_files = sorted(
        runs_dir.glob("*_events.jsonl"),
        key=lambda path: path.stat().st_mtime,
    )

    if event_files:
        print(f"üìÇ Found {len(event_files)} event file(s):\n")
        for f in event_files:
            print(f"  ‚Ä¢ {f.name}")

        # Score the most recent one
        latest_events = event_files[-1]
        print(f"\nüìä Scoring: {latest_events.name}\n")

        # Load and score
        scorer = MetricsScorer()
        events = scorer.load_events(latest_events)

        # Generate summary
        summary = scorer.generate_summary(
            events,
            scenario_name="jupyter_test",
        )

        # Display summary
        print("=" * 60)
        print(f"Scenario: {summary.scenario_name}")
        print(f"Agent: {summary.agent_name}")
        print(f"Total Turns: {summary.total_turns}")
        print(f"\nüîß Tool Metrics:")
        print(f"  Precision: {summary.tool_metrics['precision']:.2%}")
        print(f"  Recall: {summary.tool_metrics['recall']:.2%}")
        print(f"  Efficiency: {summary.tool_metrics['efficiency']:.2%}")
        print(f"\n‚è±Ô∏è  Latency:")
        print(f"  P50: {summary.latency_metrics['e2e_p50_ms']:.1f}ms")
        print(f"  P95: {summary.latency_metrics['e2e_p95_ms']:.1f}ms")
        print(f"\nüí∞ Cost:")
        total_tokens = summary.cost_analysis['total_input_tokens'] + summary.cost_analysis['total_output_tokens']
        print(f"  Total tokens: {total_tokens:,}")
        print(f"  Estimated cost: ${summary.cost_analysis['estimated_cost_usd']:.4f}")
        print("=" * 60)
    else:
        print("‚ö†Ô∏è  No event files found. Run some tests first!")
else:
    print("‚ö†Ô∏è  Runs directory not found. Run some tests first!")

## 6. Interactive Widget for Testing

A simple widget to test different agents and queries:

In [None]:
# Build the input widgets. New instances are created on every run of this
# cell so click handlers from earlier runs do not pile up.
label_style = {'description_width': '100px'}

# Deployment names offered in the model dropdown, grouped by family
model_choices = [
    # GPT-4 family
    'gpt-4o',
    'gpt-4o-mini',
    'gpt-4.1',
    'gpt-4.1-mini',
    'gpt-4.1-nano',
    # GPT-5 family
    'gpt-5',
    'gpt-5-chat',
    'gpt-5-mini',
    # o-series (reasoning models)
    'o1',
    'o1-mini',
    'o1-preview',
    'o3',
    'o3-mini',
    'o4-mini',
    'gpt-oss-120b'
]

agent_selector = widgets.Dropdown(
    options=sorted(agents),
    description='Agent:',
    style=label_style,
)

model_selector = widgets.Dropdown(
    options=model_choices,
    value='gpt-4o',
    description='Model:',
    style=label_style,
)

endpoint_selector = widgets.Dropdown(
    options=['auto', 'chat', 'responses'],
    value='auto',
    description='Endpoint:',
    style=label_style,
)

query_input = widgets.Textarea(
    value='I need help with my account',
    description='Query:',
    style=label_style,
    layout=widgets.Layout(width='500px', height='80px')
)

# A new button per run, so only one click handler is ever attached to it
test_button = widgets.Button(
    description='Run Test',
    button_style='success',
    icon='play',
)

output_area = widgets.Output()

# Patch asyncio so the click handler can call run_until_complete()
import nest_asyncio
nest_asyncio.apply()

def on_button_click(b):
    """Run one evaluation turn from the current widget values and show the outcome.

    All output, including error traces, is written into ``output_area``.
    The ``b`` argument (the clicked button) is not used.
    """
    with output_area:
        output_area.clear_output(wait=True)
        print("‚è≥ Running test...\n")

        try:
            # Drive the coroutine to completion on the current event loop
            loop = asyncio.get_event_loop()
            selected_override = {
                "deployment_id": model_selector.value,
                "endpoint_preference": endpoint_selector.value,
            }
            pending_turn = run_single_turn(
                agent_name=agent_selector.value,
                user_query=query_input.value,
                model_override=selected_override,
            )
            result = loop.run_until_complete(pending_turn)

            print("‚úÖ Test Complete\n")
            print(f"Agent: {result['agent']}")
            print(f"Model: {result['model']} ({result['endpoint']})\n")
            print(f"Response:\n{result['response']}\n")
            print(f"Tokens: {result['output_tokens']} | Latency: {result['latency_ms']:.0f}ms")

            if result['error']:
                print(f"\n‚ö†Ô∏è Error: {result['error']}")

        except Exception as e:
            # Show the failure inside the output widget rather than losing it
            print(f"‚ùå Error: {e}")
            import traceback
            traceback.print_exc()

# Attach the handler to the button created for this run
test_button.on_click(on_button_click)

# Stack the controls vertically under a heading and render them
widget_children = [
    widgets.HTML("<h3>üß™ Interactive Agent Tester</h3>"),
    agent_selector,
    model_selector,
    endpoint_selector,
    query_input,
    test_button,
    output_area,
]
widget_ui = widgets.VBox(widget_children)

display(widget_ui)

## 7. Load YAML Comparison

Load the fraud detection comparison YAML:

In [None]:
# Locate the A/B comparison definition for fraud detection
comparison_path = PROJECT_ROOT / "tests" / "eval_scenarios" / "ab_tests" / "fraud_detection_comparison.yaml"

if not comparison_path.exists():
    print("‚ö†Ô∏è  Comparison YAML not found")
else:
    runner = ComparisonRunner(
        comparison_path=comparison_path,
        output_dir=PROJECT_ROOT / "runs" / "jupyter_comparison",
    )
    comparison_spec = runner.comparison

    print(f"‚úÖ Loaded: {comparison_spec['comparison_name']}")
    print("\nVariants:")
    # List each variant with the model and endpoint it overrides (None if absent)
    for variant in comparison_spec['variants']:
        model = variant.get('model_override', {})
        print(f"  ‚Ä¢ {variant['variant_id']}")
        print(f"    Model: {model.get('deployment_id')}")
        print(f"    Endpoint: {model.get('endpoint_preference')}")

    print("\nNote: To run this comparison with real orchestrators,")
    print("      you'd need to implement the full comparison runner.")
    print("      For now, use the compare_model_configs() function above.")

## Next Steps

You now have:

- ✅ **Real agent testing** - Not mocks, actual orchestrator
- ✅ **Event recording** - All turns recorded to JSONL
- ✅ **Metrics scoring** - Performance analysis
- ✅ **Model comparison** - Test different configs
- ✅ **Real-time optimization** - Best practices for minimal latency

### Try These:

1. **Test different agents**:
   ```python
   result = await run_single_turn(
       agent_name="InvestmentAdvisor",
       user_query="How's my 401k doing?",
   )
   ```

2. **Compare endpoints**:
   ```python
   results = await compare_model_configs(
       agent_name="FraudAgent",
       user_query="Suspicious charge",
       configs=[{...}, {...}],
   )
   ```

3. **Analyze recorded events**:
   - Events are saved to `runs/jupyter_tests/`
   - Use MetricsScorer to analyze them

4. **Optimize for real-time**:
   - See the real-time optimization benchmark cell at the end of this notebook
   - Use `verbosity=0`, `reasoning_effort="low"`, and capped tokens
   - Run the benchmark to measure the improvement

### Resources

- [Evaluation Package](../../../apps/artagent/backend/evaluation/)
- [Full Documentation](../../../docs/testing/model-evals.md)
- [YAML Scenarios](../../../tests/eval_scenarios/)

In [None]:
# Real-time optimization comparison
# Compare optimal vs suboptimal configurations

import time

async def benchmark_realtime_configs():
    """
    Benchmark optimal vs suboptimal real-time configurations.
    Shows the impact of proper configuration on latency.
    """
    test_query = "I see a suspicious charge on my account"
    
    # ‚úÖ OPTIMAL Real-Time Configuration
    optimal_config = {
        "deployment_id": "gpt-5-mini",
        "endpoint_preference": "responses",
        "verbosity": 0,  # Minimal - fastest
        "reasoning_effort": "low",  # Fast reasoning
        "max_completion_tokens": 2048,  # Reasonable limit
    }
    
    # ‚ö†Ô∏è SUBOPTIMAL Configuration (for comparison)
    suboptimal_config = {
        "deployment_id": "gpt-5-mini", 
        "endpoint_preference": "responses",
        "verbosity": 2,  # Detailed - slower
        # No reasoning_effort set - will use default "medium"
        "max_completion_tokens": 8192,  # Excessive for real-time
    }
    
    print("üöÄ Benchmarking Real-Time Configurations\n")
    print("=" * 70)
    
    # Test 1: Optimal configuration
    print("\n‚úÖ Testing OPTIMAL configuration (verbosity=0, low reasoning, capped tokens)...")
    start = time.time()
    result_optimal = await run_single_turn(
        agent_name="FraudAgent",
        user_query=test_query,
        model_override=optimal_config,
        record_events=False,  # Skip recording for faster benchmark
    )
    optimal_latency = (time.time() - start) * 1000
    
    # Test 2: Suboptimal configuration  
    print(f"\n‚ö†Ô∏è  Testing SUBOPTIMAL configuration (verbosity=2, higher tokens)...")
    start = time.time()
    result_suboptimal = await run_single_turn(
        agent_name="FraudAgent",
        user_query=test_query,
        model_override=suboptimal_config,
        record_events=False,
    )
    suboptimal_latency = (time.time() - start) * 1000
    
    # Display comparison
    print("\n" + "=" * 70)
    print("üìä REAL-TIME OPTIMIZATION RESULTS")
    print("=" * 70)
    
    print(f"\n‚úÖ OPTIMAL Configuration:")
    print(f"   ‚Ä¢ Verbosity: 0 (minimal)")
    print(f"   ‚Ä¢ Reasoning: low")
    print(f"   ‚Ä¢ Max tokens: 2048")
    print(f"   ‚Ä¢ Response length: {len(result_optimal['response'])} chars")
    print(f"   ‚Ä¢ Output tokens: {result_optimal['output_tokens']}")
    print(f"   ‚Ä¢ Latency: {optimal_latency:.0f}ms")
    
    print(f"\n‚ö†Ô∏è  SUBOPTIMAL Configuration:")
    print(f"   ‚Ä¢ Verbosity: 2 (detailed)")
    print(f"   ‚Ä¢ Reasoning: default")
    print(f"   ‚Ä¢ Max tokens: 8192")
    print(f"   ‚Ä¢ Response length: {len(result_suboptimal['response'])} chars")
    print(f"   ‚Ä¢ Output tokens: {result_suboptimal['output_tokens']}")
    print(f"   ‚Ä¢ Latency: {suboptimal_latency:.0f}ms")
    
    # Calculate improvement
    improvement_pct = ((suboptimal_latency - optimal_latency) / suboptimal_latency) * 100
    latency_diff = suboptimal_latency - optimal_latency
    
    print(f"\nüéØ Performance Impact:")
    print(f"   ‚Ä¢ Latency reduction: {latency_diff:.0f}ms ({improvement_pct:.1f}% faster)")
    print(f"   ‚Ä¢ Token efficiency: {result_optimal['output_tokens']} vs {result_suboptimal['output_tokens']} tokens")
    
    if improvement_pct > 10:
        print(f"\n‚úÖ CONCLUSION: Optimal configuration provides {improvement_pct:.1f}% better latency!")
        print(f"   For real-time voice, this translates to noticeably faster responses.")
    else:
        print(f"\nüí° CONCLUSION: Both configs perform similarly (~{improvement_pct:.1f}% difference)")
    
    print("=" * 70)
    
    return {
        "optimal": result_optimal,
        "suboptimal": result_suboptimal,
        "improvement_pct": improvement_pct,
    }

# Run the benchmark
print("‚è≥ Running real-time optimization benchmark...\n")
benchmark_results = await benchmark_realtime_configs()