# ToolBench A2C Experiment
Compare contrastive and reconstruction-based embeddings on LLM tool selection, using an A2C agent with a RewardTransformer.

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt

from src.embeddings import get_extractor
from src.models import A2CTransformerAgent
from src.datasets import ToolBenchDataset

## 1. Load ToolBench Data

In [None]:
# Load the ToolBench dataset (16K+ APIs per the original note), using ../data/toolbench as its cache directory.
# NOTE(review): presumably downloads the data on first use — confirm in ToolBenchDataset.
dataset = ToolBenchDataset(cache_dir='../data/toolbench')
print(f"Loaded {len(dataset)} tools")

# Get the tool descriptions; these are the texts passed to the embedding extractors.
# The preview prints at most the first 200 characters of the first entry.
tool_texts = dataset.get_tool_texts()
print(f"Sample: {tool_texts[0][:200]}...")

## 2. Compute Embeddings

In [None]:
# Contrastive (expected to work well): embed every tool description with the 'simcse' extractor.
# NOTE(review): assumes encode() returns an array-like with a .shape, one row per text — confirm.
simcse = get_extractor('simcse')
simcse_embs = simcse.encode(tool_texts)
print(f"SimCSE embeddings: {simcse_embs.shape}")

# Reconstruction-based (expected to struggle): embed the same texts with the 'bert' extractor,
# so both embedding sets are indexed by the same tool order.
bert = get_extractor('bert')
bert_embs = bert.encode(tool_texts)
print(f"BERT embeddings: {bert_embs.shape}")

## 3. Compute Effective Dimension

In [None]:
from src.analysis.eigenvalues import compute_eigenvalue_spectrum, compute_effective_dimension

# Eigenvalue spectrum of each embedding set; the second return value is discarded.
bert_eigs, _ = compute_eigenvalue_spectrum(bert_embs)
simcse_eigs, _ = compute_eigenvalue_spectrum(simcse_embs)

# Effective dimension derived from each spectrum
# (see compute_effective_dimension for the exact definition).
bert_deff = compute_effective_dimension(bert_eigs)
simcse_deff = compute_effective_dimension(simcse_eigs)

# Reported to one decimal place; these values are reused in later plot labels and summaries.
print(f"BERT d_eff: {bert_deff:.1f}")
print(f"SimCSE d_eff: {simcse_deff:.1f}")

## 4. Create Synthetic I3 Tasks
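I3 tasks require chaining multiple tools across categories. The synthetic tasks below only approximate this: each has a fixed-length ground-truth tool sequence and leaves `required_categories` empty.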

In [None]:
def create_synthetic_tasks(tool_embs: np.ndarray, n_tasks: int = 100, tools_per_task: int = 3, seed: int = 42):
    """
    Create synthetic I3-style tasks with a known ground-truth tool sequence.

    For every task a unit-norm random query direction is drawn. The
    ground-truth tool for step ``s`` is the not-yet-selected tool whose
    embedding has the largest dot product with the query perturbed by
    ``0.3 * s`` times a fresh Gaussian vector and re-normalised (so step 0
    uses the unperturbed query).

    Note:
        Reseeds NumPy's global RNG with ``np.random.seed(seed)``.

    Args:
        tool_embs: 2-D array with one tool embedding per row.
        n_tasks: Number of tasks to generate.
        tools_per_task: Requested length of each ground-truth sequence. It is
            capped at the number of tools so a sequence never contains the
            same tool twice.
        seed: Seed for NumPy's global RNG.

    Returns:
        A list of dicts with keys ``'query_id'`` (task index), ``'query_emb'``
        (unit-norm query vector), ``'ground_truth_tools'`` (row indices into
        ``tool_embs``, in step order) and ``'required_categories'`` (always an
        empty list for synthetic tasks).
    """
    np.random.seed(seed)
    n_tools, emb_dim = tool_embs.shape[0], tool_embs.shape[1]

    # Once every tool is masked to -inf, argmax falls back to index 0 and would
    # re-select an already chosen tool; capping the length prevents that.
    n_select = min(tools_per_task, n_tools)

    tasks = []
    for task_id in range(n_tasks):
        # Random query direction
        query_emb = np.random.randn(emb_dim).astype(np.float32)
        query_emb = query_emb / np.linalg.norm(query_emb)

        # Find tools closest to perturbed versions of the query (simulating multi-step reasoning)
        gt_tools = []
        for step in range(n_select):
            # The perturbation grows with the step index; at step 0 it is zero
            rotation = np.random.randn(emb_dim).astype(np.float32)
            rotated = query_emb + 0.3 * step * rotation
            rotated = rotated / np.linalg.norm(rotated)

            # Best-scoring tool that has not been selected yet
            similarities = tool_embs @ rotated
            if gt_tools:
                similarities[gt_tools] = -np.inf
            gt_tools.append(int(np.argmax(similarities)))

        tasks.append({
            'query_id': task_id,
            'query_emb': query_emb,
            'ground_truth_tools': gt_tools,
            'required_categories': [],  # Not used for synthetic
        })

    return tasks

# Create tasks using SimCSE embeddings (tasks are fixed, embeddings vary).
# 200 tasks are generated with the function's default of 3 tools per task.
# NOTE(review): query vectors and ground-truth indices are derived from simcse_embs, so
# evaluating another embedding set on these tasks requires the same dimensionality and
# row order, and the ground truth may favor SimCSE — confirm this is the intended design.
tasks = create_synthetic_tasks(simcse_embs, n_tasks=200)
print(f"Created {len(tasks)} synthetic I3 tasks")
print(f"Example task ground truth tools: {tasks[0]['ground_truth_tools']}")

## 5. Run A2C Experiment

In [None]:
def _sample_candidates(n_tools, gt_remaining, tools_used, n_candidates, max_gt=3):
    """Build one step's candidate list: up to max_gt unused ground-truth tools plus random distractors, shuffled."""
    gt_part = list(gt_remaining[:max_gt])

    # Distractors are drawn only from tools that are neither already used nor
    # already included as ground truth, so the list never contains duplicates.
    selectable = np.ones(n_tools, dtype=bool)
    selectable[np.asarray(list(tools_used) + gt_part, dtype=np.intp)] = False
    pool = np.flatnonzero(selectable)

    # Clamp at zero so a small n_candidates cannot produce a negative sample size
    n_random = max(0, min(n_candidates - len(gt_part), len(pool)))
    distractors = np.random.choice(pool, n_random, replace=False).tolist() if n_random else []

    candidates = (gt_part + distractors)[:n_candidates]

    # Shuffle so ground-truth tools are not always at the first indices;
    # otherwise the candidate position alone would reveal the answer.
    order = np.random.permutation(len(candidates))
    return [candidates[j] for j in order]


def _selection_reward(selected_tool, position, gt_sequence, gt_tools):
    """Score one pick: 2.0 for the ground-truth tool of this position, 1.0 for another ground-truth tool, -0.2 otherwise."""
    if selected_tool not in gt_tools:
        return -0.2  # Wrong tool
    if position < len(gt_sequence) and selected_tool == gt_sequence[position]:
        return 2.0  # Correct tool in correct position
    return 1.0  # Correct tool, wrong position


def run_a2c_experiment(
    tool_embs: np.ndarray,
    tasks: list,
    n_episodes: int = 100,
    n_candidates: int = 50,
    max_steps: int = 5,
    seed: int = 42
):
    """
    Run an A2C agent on tool selection tasks.

    Each episode takes the next task (cycling through ``tasks``) and lets the
    agent pick one tool per step from a sampled candidate list until every
    ground-truth tool has been picked or ``max_steps`` is reached. Transitions
    are stored on the agent, and ``agent.update()`` is called after an episode
    whenever the agent's trajectory buffer holds at least 10 entries.

    Args:
        tool_embs: Array of tool embeddings, one row per tool; the second
            dimension is used as the agent's ``d_model``.
        tasks: Task dicts providing ``'query_emb'`` and ``'ground_truth_tools'``
            (row indices into ``tool_embs``).
        n_episodes: Number of episodes to run.
        n_candidates: Maximum number of candidate tools offered per step.
        max_steps: Maximum number of tool picks per episode.
        seed: Seed applied to NumPy's and PyTorch's global RNGs.

    Returns:
        Dict with per-episode arrays ``'rewards'``, ``'successes'`` (all
        ground-truth tools were picked) and ``'steps'`` (tools picked), plus
        the scalar means ``'success_rate'`` and ``'avg_reward'``.

    Raises:
        ValueError: If episodes are requested but ``tasks`` is empty.
    """
    if n_episodes > 0 and len(tasks) == 0:
        raise ValueError("tasks must be non-empty when n_episodes > 0")

    np.random.seed(seed)
    torch.manual_seed(seed)

    agent = A2CTransformerAgent(
        d_model=tool_embs.shape[1],
        nhead=8,
        num_layers=2,
        hidden_dim=512,
        lr=1e-4
    )

    n_tools = len(tool_embs)
    episode_rewards = []
    episode_successes = []
    episode_steps = []

    for ep in tqdm(range(n_episodes), desc="Episodes"):
        task = tasks[ep % len(tasks)]
        query_emb = task['query_emb']
        gt_sequence = task['ground_truth_tools']
        gt_tools = set(gt_sequence)

        tools_used = []
        sequence_embs = []
        episode_reward = 0.0

        for step in range(max_steps):
            # Sample candidate tools (include some ground truth for learning)
            gt_remaining = sorted(gt_tools.difference(tools_used))
            candidates = _sample_candidates(n_tools, gt_remaining, tools_used, n_candidates)
            candidate_embs = tool_embs[candidates]

            # Select action
            action_idx, log_prob, value = agent.select_action(
                query_emb, sequence_embs, candidate_embs
            )
            selected_tool = candidates[action_idx]

            # The position is the number of picks made so far in this episode
            reward = _selection_reward(selected_tool, len(tools_used), gt_sequence, gt_tools)

            tools_used.append(selected_tool)
            sequence_embs.append(tool_embs[selected_tool])
            episode_reward += reward

            # Check completion
            done = gt_tools.issubset(tools_used) or step == max_steps - 1

            # Store the transition with the sequence as it was before this pick
            agent.store_transition(
                query_emb, sequence_embs[:-1], candidate_embs,
                action_idx, log_prob, value, reward, done
            )

            if done:
                break

        # Update agent after each episode once enough transitions are buffered
        if len(agent.trajectory_buffer) >= 10:
            agent.update()

        episode_rewards.append(episode_reward)
        episode_successes.append(gt_tools.issubset(tools_used))
        episode_steps.append(len(tools_used))

    return {
        'rewards': np.array(episode_rewards),
        'successes': np.array(episode_successes),
        'steps': np.array(episode_steps),
        'success_rate': np.mean(episode_successes),
        'avg_reward': np.mean(episode_rewards)
    }

In [None]:
# Train and evaluate one A2C agent per embedding set on the same task list.
# Both runs use 100 episodes and the function's default seed.
print("Running BERT A2C...")
bert_results = run_a2c_experiment(tool_embs=bert_embs, tasks=tasks, n_episodes=100)

# Same tasks, SimCSE embeddings
print("\nRunning SimCSE A2C...")
simcse_results = run_a2c_experiment(tool_embs=simcse_embs, tasks=tasks, n_episodes=100)

## 6. Plot Results

In [None]:
from pathlib import Path

# One row of three panels: rolling success rate, cumulative reward, rolling step count
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Rolling-mean window (in episodes), shared by the smoothed curves and their axis labels
window = 20

# Success rate over time (rolling window).
# mode='valid' keeps only fully overlapping windows, so each smoothed curve has
# len - window + 1 points when there are at least `window` episodes.
ax1 = axes[0]
bert_rolling_success = np.convolve(bert_results['successes'].astype(float), np.ones(window)/window, mode='valid')
simcse_rolling_success = np.convolve(simcse_results['successes'].astype(float), np.ones(window)/window, mode='valid')
ax1.plot(bert_rolling_success, label=f'BERT (d_eff={bert_deff:.0f})', color='red')
ax1.plot(simcse_rolling_success, label=f'SimCSE (d_eff={simcse_deff:.0f})', color='blue')
ax1.set_xlabel('Episode')
ax1.set_ylabel(f'Success Rate ({window}-ep window)')
ax1.set_title('Task Success Rate')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Cumulative reward
ax2 = axes[1]
ax2.plot(np.cumsum(bert_results['rewards']), label='BERT', color='red')
ax2.plot(np.cumsum(simcse_results['rewards']), label='SimCSE', color='blue')
ax2.set_xlabel('Episode')
ax2.set_ylabel('Cumulative Reward')
ax2.set_title('Cumulative Reward')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Steps to completion (rolling mean of the number of tools picked per episode)
ax3 = axes[2]
bert_rolling_steps = np.convolve(bert_results['steps'], np.ones(window)/window, mode='valid')
simcse_rolling_steps = np.convolve(simcse_results['steps'], np.ones(window)/window, mode='valid')
ax3.plot(bert_rolling_steps, label='BERT', color='red')
ax3.plot(simcse_rolling_steps, label='SimCSE', color='blue')
ax3.set_xlabel('Episode')
ax3.set_ylabel(f'Avg Steps ({window}-ep window)')
ax3.set_title('Steps to Completion')
ax3.legend()
ax3.grid(True, alpha=0.3)

plt.tight_layout()

# Create the output directory first so saving does not fail when it is missing
plot_path = Path('../results/plots/toolbench_a2c_results.png')
plot_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_path, dpi=150)
plt.show()

print("\n=== Final Results ===")
print(f"BERT:   Success Rate = {bert_results['success_rate']*100:.1f}%, Avg Reward = {bert_results['avg_reward']:.2f}")
print(f"SimCSE: Success Rate = {simcse_results['success_rate']*100:.1f}%, Avg Reward = {simcse_results['avg_reward']:.2f}")

## 7. Analysis: Embedding Geometry → Tool Selection Quality

In [None]:
# Analyze how embedding geometry affects tool selection
from src.analysis.coverage import compute_coverage_metric

def _observed_relation(left, right, predicted):
    """Return the actual comparison symbol for left vs right and a flag when it differs from the predicted one."""
    if left > right:
        symbol = '>'
    elif left < right:
        symbol = '<'
    else:
        symbol = '='
    flag = '' if symbol == predicted else '  [contradicts prediction]'
    return symbol, flag


# k in the coverage metric ρ(k, N); used for the metric call, the dict lookup and the report
coverage_k = 50

# Coverage: how well does a random k-sample cover the action space?
bert_coverage_dict = compute_coverage_metric(bert_embs, k_values=[coverage_k], n_trials=50)
simcse_coverage_dict = compute_coverage_metric(simcse_embs, k_values=[coverage_k], n_trials=50)

# The metric result is indexed by k
bert_coverage = bert_coverage_dict[coverage_k]
simcse_coverage = simcse_coverage_dict[coverage_k]

print(f"Coverage ρ({coverage_k}, {len(bert_embs)}) - lower is better:")
print(f"  BERT:   {bert_coverage:.4f}")
print(f"  SimCSE: {simcse_coverage:.4f}")

# Report the relations actually measured instead of asserting the predicted ones,
# and flag every line where the data disagrees with the theory.
deff_rel, deff_flag = _observed_relation(simcse_deff, bert_deff, '>')
cov_rel, cov_flag = _observed_relation(simcse_coverage, bert_coverage, '<')
succ_rel, succ_flag = _observed_relation(simcse_results['success_rate'], bert_results['success_rate'], '>')

print("\nTheory prediction: High d_eff → Low ρ → Better exploration → Higher success rate")
print(f"  SimCSE d_eff={simcse_deff:.0f} {deff_rel} BERT d_eff={bert_deff:.0f}{deff_flag}")
print(f"  SimCSE coverage={simcse_coverage:.4f} {cov_rel} BERT coverage={bert_coverage:.4f}{cov_flag}")
print(f"  SimCSE success={simcse_results['success_rate']*100:.1f}% {succ_rel} BERT success={bert_results['success_rate']*100:.1f}%{succ_flag}")