# Multi-Agent vs Single Model Comparison

**Experiment:** Compare debate and manager-worker strategies to a single-model baseline

**Date:** 2025-10-26

**Goals:**
- Run the same tasks with multiple strategies
- Compare accuracy, latency, cost
- Identify where a multi-agent approach helps

In [None]:
# 1. Add parent directory to path for code imports
import sys
sys.path.append('../code')

# 2. Import harness functions for running strategies and tracking experiments
from harness import (
    run_strategy,          # Run any multi-agent or single-agent strategy
    ExperimentConfig,      # Configuration for experiment tracking
    ExperimentResult,      # Structure for logging results
    get_tracker,           # Get experiment tracker instance
    compare_experiments    # Compare multiple experiment runs
)

# 3. Import pandas for data manipulation and matplotlib for plotting
import pandas as pd
import matplotlib.pyplot as plt

## 1. Define Task

In [None]:
# 1. Describe the reasoning task fed to every strategy in this notebook.
#    Keys: 'id' (unique task identifier), 'input' (the prompt text, here a
#    trick question), 'type' (task category).
task = dict(
    id='reasoning_001',
    input='A farmer has 17 sheep. All but 9 die. How many are left?',
    type='reasoning',
)

## 2. Run Single Model Baseline

In [None]:
# 1. Collect the baseline experiment settings, then build the config from them
baseline_settings = {
    'experiment_name': 'single_baseline',  # Name for this experiment
    'task_type': 'reasoning',              # Type of task being tested
    'strategy': 'single',                  # Single-model strategy
    'provider': 'ollama',                  # Ollama provider
}
config_single = ExperimentConfig(**baseline_settings)

# 2. Fetch the tracker and register the baseline experiment with it
tracker = get_tracker()
tracker.start_experiment(config_single)

# 3. Run the single-model strategy on the task prompt
result_single = run_strategy('single', task['input'], provider='ollama')

# 4. Show the answer and how long the run took
print(f"Output: {result_single.output}")
print(f"Latency: {result_single.latency_s:.2f}s")

## 3. Run Debate Strategy

In [None]:
# 1. Run the debate strategy with 3 debaters on the same prompt as the baseline
result_debate = run_strategy(
    'debate',              # Debate strategy
    task['input'],         # Same task input as the baseline run
    n_debaters=3,          # Number of agents that will debate
    provider='ollama',     # Use the Ollama provider
)

# 2. Print the final output of the debate
print(f"Output: {result_debate.output}")

# 3. Print latency (expected to exceed the single-model run since several
#    agents are involved -- confirm against the measured baseline)
print(f"Latency: {result_debate.latency_s:.2f}s")

# 4. Print a short preview of each debater's argument.
#    The ellipsis is appended only when an argument is actually cut, so a
#    short argument is not shown as if it had been truncated.
PREVIEW_CHARS = 100
print("\nDebater arguments:")
for position, argument in enumerate(result_debate.metadata['arguments'], start=1):
    if len(argument) > PREVIEW_CHARS:
        preview = f"{argument[:PREVIEW_CHARS]}..."
    else:
        preview = argument
    print(f"{position}. {preview}")

## 4. Compare Results

In [None]:
# 1. Pair each strategy label with the result object it produced
labelled_results = [
    ('single', result_single),  # Single-model baseline
    ('debate', result_debate),  # Multi-agent debate
]

# 2. Build one row per strategy: latency in seconds, total tokens
#    (input + output), and cost in USD
comparison = pd.DataFrame([
    {
        'strategy': label,
        'latency_s': outcome.latency_s,
        'tokens': outcome.tokens_in + outcome.tokens_out,
        'cost': outcome.cost_usd,
    }
    for label, outcome in labelled_results
])

# 3. Print the comparison table
print(comparison)