# Step 4: Agentic Tool-Calling Loop

This notebook validates ArcLLM's unified interface by running **real API calls** through the Anthropic adapter in a complete agentic tool-calling loop.

**What we're testing:**
1. Simple text response — does `invoke()` return a properly typed `LLMResponse`?
2. Calculator tool loop — can the agent offer a tool, execute the LLM's tool call, and return the result?
3. Search tool loop — same pattern with a canned search tool
4. Multi-tool — both tools available, LLM picks which to use

**Prerequisites:**
- `ANTHROPIC_API_KEY` environment variable set
- `pip install -e ".[dev]"` completed

## 1. Setup

In [None]:
import os
import json

# Fail fast when the Anthropic credential is missing from the environment
api_key_present = bool(os.environ.get("ANTHROPIC_API_KEY"))
assert api_key_present, "Set ANTHROPIC_API_KEY env var first!"
print("API key found.")

In [None]:
from arcllm import (
    AnthropicAdapter,
    LLMResponse,
    Message,
    TextBlock,
    Tool,
    ToolCall,
    ToolResultBlock,
    ToolUseBlock,
    Usage,
    load_provider_config,
)

# Read the Anthropic provider configuration and summarize what it declares
config = load_provider_config("anthropic")
print(
    f"Provider: {config.provider.api_format}",
    f"Default model: {config.provider.default_model}",
    f"Available models: {list(config.models.keys())}",
    sep="\n",
)

In [None]:
# Build the adapter against a cheaper/faster model to keep test runs inexpensive
MODEL = "claude-haiku-4-5-20251001"
adapter = AnthropicAdapter(config, MODEL)

# NOTE(review): `_model_name` / `_model_meta` are underscore-prefixed attributes;
# they are read here for inspection only.
print(
    f"Adapter name: {adapter.name}",
    f"Model: {adapter._model_name}",
    f"Model meta: context_window={adapter._model_meta.context_window}, supports_tools={adapter._model_meta.supports_tools}",
    sep="\n",
)

## 2. Test 1: Simple Text Response

The simplest case — send a message, get text back. Validates:
- `invoke()` returns `LLMResponse`
- `content` is a string
- `usage` has token counts
- `stop_reason` is `end_turn`

In [None]:
messages = [
    Message(role="user", content="What is 2 + 2? Reply with just the number.")
]

response = await adapter.invoke(messages, max_tokens=50)

print(f"Type: {type(response).__name__}")
print(f"Content: {response.content}")
print(f"Stop reason: {response.stop_reason}")
print(f"Usage: in={response.usage.input_tokens}, out={response.usage.output_tokens}, total={response.usage.total_tokens}")
print(f"Tool calls: {response.tool_calls}")
print(f"Model: {response.model}")

# Type assertions
assert isinstance(response, LLMResponse)
assert isinstance(response.content, str)
assert response.stop_reason == "end_turn"
assert isinstance(response.usage, Usage)
assert response.usage.total_tokens > 0
assert response.tool_calls == []
print("\n✓ All assertions passed")

## 3. Test 2: Calculator Tool Loop

This is the core agentic pattern:
1. Agent sends a math problem with a `calculate` tool available
2. LLM responds with `stop_reason=tool_use` and a `ToolCall`
3. Agent executes the tool locally (a restricted arithmetic evaluator, for simplicity)
4. Agent packs the result as a `ToolResultBlock` and sends it back
5. LLM responds with final answer (`stop_reason=end_turn`)

In [None]:
# JSON Schema for the calculator's arguments: one required string, `expression`
calculator_parameters = {
    "type": "object",
    "properties": {
        "expression": {
            "type": "string",
            "description": "The math expression to evaluate, e.g. '2 + 3 * 4'",
        },
    },
    "required": ["expression"],
}

# Define the calculator tool
calculator_tool = Tool(
    name="calculate",
    description="Evaluate a mathematical expression. Returns the numeric result.",
    parameters=calculator_parameters,
)

print(
    f"Tool: {calculator_tool.name}",
    f"Schema: {json.dumps(calculator_tool.parameters, indent=2)}",
    sep="\n",
)

In [None]:
# Simple tool executor
def execute_calculate(arguments: dict) -> str:
    """Execute the calculate tool and return the result (or an error) as a string.

    The expression comes from the LLM, so it is treated as untrusted input.
    Instead of calling ``eval``, the expression is parsed with ``ast`` and only
    numeric literals, parentheses, unary ``+``/``-`` and the binary operators
    ``+ - * / // **`` are evaluated. Integer exponentiation is capped so that
    inputs such as ``9**9**9**9`` return an error instead of hanging the kernel.

    Args:
        arguments: Tool-call arguments; expected to contain a string under
            the key ``"expression"``.

    Returns:
        ``str(result)`` on success, otherwise a string starting with
        ``"Error:"``. This function never raises, so the error text can be
        sent back to the LLM as the tool result.
    """
    # Local imports keep this cell self-contained.
    import ast
    import operator

    expr = arguments.get("expression") if isinstance(arguments, dict) else None
    if not isinstance(expr, str):
        return "Error: missing or non-string 'expression' argument"

    # Cheap first gate: reject anything that is not a digit, operator,
    # parenthesis, dot or space before parsing.
    allowed = set("0123456789+-*/.() ")
    if not all(c in allowed for c in expr):
        return f"Error: unsafe expression '{expr}'"

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Pow: operator.pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}
    # Upper bound on the bit length of an integer power result (~3000 digits).
    max_result_bits = 10_000

    def evaluate(node):
        """Recursively evaluate a whitelisted arithmetic AST node."""
        if isinstance(node, ast.Expression):
            return evaluate(node.body)
        if isinstance(node, ast.Constant) and type(node.value) in (int, float):
            return node.value
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](evaluate(node.operand))
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            left = evaluate(node.left)
            right = evaluate(node.right)
            if (
                isinstance(node.op, ast.Pow)
                and isinstance(left, int)
                and isinstance(right, int)
                and right > 0
                and left.bit_length() * right > max_result_bits
            ):
                # Refuse before computing: big-int powers are unbounded in
                # time and memory.
                raise ValueError("result too large")
            return binary_ops[type(node.op)](left, right)
        raise ValueError("unsupported expression")

    try:
        # strip() mirrors eval(), which ignores surrounding whitespace.
        result = evaluate(ast.parse(expr.strip(), mode="eval"))
        return str(result)
    except Exception as e:
        return f"Error: {e}"

# Quick test: one integer result and one float result
for sample_expression in ("2 + 3 * 4", "(100 - 37) / 9"):
    print(execute_calculate({"expression": sample_expression}))

In [None]:
# --- Turn 1: Agent sends math problem with calculator tool ---

messages = [
    Message(role="system", content="You are a helpful assistant. Use the calculate tool for any math."),
    Message(role="user", content="What is 137 * 42 + 19?")
]

response_1 = await adapter.invoke(messages, tools=[calculator_tool], max_tokens=200)

print(f"Stop reason: {response_1.stop_reason}")
print(f"Content: {response_1.content}")
print(f"Tool calls: {len(response_1.tool_calls)}")

# Should be a tool call
assert response_1.stop_reason == "tool_use"
assert len(response_1.tool_calls) >= 1

tool_call = response_1.tool_calls[0]
print(f"\nTool call:")
print(f"  id: {tool_call.id}")
print(f"  name: {tool_call.name}")
print(f"  arguments: {tool_call.arguments}")

assert isinstance(tool_call, ToolCall)
assert tool_call.name == "calculate"
assert isinstance(tool_call.arguments, dict)
print("\n✓ Turn 1 assertions passed")

In [None]:
# --- Turn 2: Agent executes tool and sends result back ---
#
# NOTE: this cell appends to `messages`. Re-run the Turn 1 cell before
# re-running this one, otherwise the conversation gains duplicate turns.

# Execute EVERY tool call the LLM requested. Each ToolUseBlock echoed back in
# the assistant message needs a matching ToolResultBlock; answering only the
# first would leave unmatched tool_use ids if the model issued parallel calls.
# Only the calculator was offered, so every call is a `calculate` call.
tool_results = []
for tc in response_1.tool_calls:
    result = execute_calculate(tc.arguments)
    print(f"Tool result: {result}")
    tool_results.append(ToolResultBlock(tool_use_id=tc.id, content=result))

# Build the assistant message (what the LLM said) and tool result message
# The assistant's response becomes a message with ToolUseBlock content
assistant_content = []
if response_1.content:
    assistant_content.append(TextBlock(text=response_1.content))
for tc in response_1.tool_calls:
    assistant_content.append(ToolUseBlock(id=tc.id, name=tc.name, arguments=tc.arguments))

messages.append(Message(role="assistant", content=assistant_content))
messages.append(Message(role="tool", content=tool_results))

print(f"\nMessages in conversation: {len(messages)}")
for i, m in enumerate(messages):
    # String content is shown as-is; block lists are summarized by length
    content_preview = m.content if isinstance(m.content, str) else f"[{len(m.content)} blocks]"
    print(f"  {i}: role={m.role}, content={content_preview}")

In [None]:
# --- Turn 2 continued: Send the tool result back to the LLM ---

response_2 = await adapter.invoke(messages, tools=[calculator_tool], max_tokens=200)

print(f"Stop reason: {response_2.stop_reason}")
print(f"Content: {response_2.content}")
print(f"Tool calls: {response_2.tool_calls}")
print(f"Usage: in={response_2.usage.input_tokens}, out={response_2.usage.output_tokens}")

# Should be a final text response
assert response_2.stop_reason == "end_turn"
assert response_2.content is not None
assert response_2.tool_calls == []

# The answer should contain 5773 (137 * 42 + 19 = 5773)
assert "5773" in response_2.content or "5,773" in response_2.content, f"Expected 5773 in response: {response_2.content}"
print("\n✓ Calculator loop complete — all assertions passed")

## 4. Test 3: Search Tool Loop

Same agentic pattern but with a search tool that returns canned results.
Validates that string content in `ToolResultBlock` works correctly.

In [None]:
# JSON Schema for the search tool's arguments: one required string, `query`
search_parameters = {
    "type": "object",
    "properties": {
        "query": {
            "type": "string",
            "description": "The search query",
        },
    },
    "required": ["query"],
}

# Define the search tool
search_tool = Tool(
    name="web_search",
    description="Search the web for current information. Returns relevant search results.",
    parameters=search_parameters,
)

# Canned search results: a single fixed answer, so the loop can be exercised
# without real web access
SEARCH_RESULTS = dict(
    default="Search result: According to recent data, the answer you're looking for is available. Here are the key findings: The topic has been extensively studied with conclusive results.",
)

def execute_search(arguments: dict) -> str:
    """Return the canned search answer, prefixed with the requested query.

    A missing ``"query"`` key is treated as an empty query string.
    """
    query = arguments["query"] if "query" in arguments else ""
    return f"Search results for '{query}': {SEARCH_RESULTS['default']}"

# Smoke-check the tool definition and the canned executor
print(
    f"Tool: {search_tool.name}",
    f"Test: {execute_search({'query': 'test query'})}",
    sep="\n",
)

In [None]:
# --- Search loop: Turn 1 ---

search_messages = [
    Message(role="user", content="Search for information about the Eiffel Tower's height and tell me what you find.")
]

search_response_1 = await adapter.invoke(search_messages, tools=[search_tool], max_tokens=300)

print(f"Stop reason: {search_response_1.stop_reason}")
print(f"Tool calls: {len(search_response_1.tool_calls)}")

assert search_response_1.stop_reason == "tool_use"
assert len(search_response_1.tool_calls) >= 1

search_tc = search_response_1.tool_calls[0]
print(f"Tool: {search_tc.name}, args: {search_tc.arguments}")
assert search_tc.name == "web_search"

# Execute and return
search_result = execute_search(search_tc.arguments)
print(f"Result: {search_result[:100]}...")

# Build turn 2 messages
search_assistant_content = []
if search_response_1.content:
    search_assistant_content.append(TextBlock(text=search_response_1.content))
for tc in search_response_1.tool_calls:
    search_assistant_content.append(ToolUseBlock(id=tc.id, name=tc.name, arguments=tc.arguments))

search_messages.append(Message(role="assistant", content=search_assistant_content))
search_messages.append(Message(
    role="tool",
    content=[ToolResultBlock(tool_use_id=search_tc.id, content=search_result)]
))

search_response_2 = await adapter.invoke(search_messages, tools=[search_tool], max_tokens=300)

print(f"\nFinal stop reason: {search_response_2.stop_reason}")
print(f"Final content: {search_response_2.content}")

assert search_response_2.stop_reason == "end_turn"
assert search_response_2.content is not None
print("\n✓ Search loop complete — all assertions passed")

## 5. Test 4: Multi-Tool (Both Available)

Give the LLM both tools and ask a question that clearly calls for one of them (the calculator).
Validates that the adapter can pass multiple tools and that the LLM's chosen tool call is dispatched correctly.

In [None]:
# Tool dispatcher: maps the tool name the LLM requests to its local executor
TOOL_EXECUTORS = {
    "calculate": execute_calculate,
    "web_search": execute_search,
}

async def run_agentic_loop(
    adapter: AnthropicAdapter,
    messages: list[Message],
    tools: list[Tool],
    max_turns: int = 5,
) -> LLMResponse:
    """Run a complete agentic tool-calling loop.

    Repeatedly invokes the adapter. While the LLM stops with ``tool_use``,
    every requested tool is executed via ``TOOL_EXECUTORS`` and the results
    are sent back on the next turn.

    Args:
        adapter: Adapter used to call the LLM.
        messages: Conversation so far. This list is mutated in place: the
            assistant tool-use message and the tool-result message are
            appended on every tool turn.
        tools: Tools offered to the LLM on every turn.
        max_turns: Maximum number of LLM invocations.

    Returns:
        The first response whose ``stop_reason`` is not ``"tool_use"``.

    Raises:
        RuntimeError: If the LLM is still requesting tools after ``max_turns``.
    """
    for turn in range(max_turns):
        response = await adapter.invoke(messages, tools=tools, max_tokens=500)
        print(f"  Turn {turn + 1}: stop_reason={response.stop_reason}, tool_calls={len(response.tool_calls)}")

        if response.stop_reason != "tool_use":
            return response

        # Build assistant message with tool use blocks
        assistant_content = []
        if response.content:
            assistant_content.append(TextBlock(text=response.content))
        for tc in response.tool_calls:
            assistant_content.append(ToolUseBlock(id=tc.id, name=tc.name, arguments=tc.arguments))
        messages.append(Message(role="assistant", content=assistant_content))

        # Execute each tool and pack results. A failing or unknown tool is
        # reported to the LLM as an error string rather than raised: raising
        # here would abort after the assistant message was already appended,
        # leaving `messages` with tool_use blocks that have no results.
        tool_results = []
        for tc in response.tool_calls:
            executor = TOOL_EXECUTORS.get(tc.name)
            if executor is None:
                result = f"Error: unknown tool '{tc.name}'"
            else:
                try:
                    result = str(executor(tc.arguments))
                except Exception as exc:
                    result = f"Error: tool '{tc.name}' failed: {exc}"
            print(f"    Tool '{tc.name}': {result[:80]}")
            tool_results.append(ToolResultBlock(tool_use_id=tc.id, content=result))

        messages.append(Message(role="tool", content=tool_results))

    raise RuntimeError(f"Agentic loop did not complete in {max_turns} turns")

print("Agentic loop helper defined.")

In [None]:
# Multi-tool test: ask something that needs calculation
multi_messages = [
    Message(
        role="system",
        content="You have access to a calculator and web search. Use the appropriate tool."
    ),
    Message(
        role="user",
        content="What is 256 * 789? Use the calculate tool."
    )
]

print("Running multi-tool agentic loop...")
final_response = await run_agentic_loop(adapter, multi_messages, [calculator_tool, search_tool])

print(f"\nFinal response:")
print(f"  Content: {final_response.content}")
print(f"  Stop reason: {final_response.stop_reason}")
print(f"  Usage: in={final_response.usage.input_tokens}, out={final_response.usage.output_tokens}")

assert final_response.stop_reason == "end_turn"
assert final_response.content is not None
# 256 * 789 = 201984
assert "201984" in final_response.content or "201,984" in final_response.content, f"Expected 201984 in: {final_response.content}"
print("\n✓ Multi-tool loop complete — all assertions passed")

## 6. Summary & Type Verification

Verify that every object in the loop is correctly typed.

In [None]:
# Collect all responses for type checking. `response_1` (the calculator
# tool-use turn) is included so that a tool-call response from each loop is
# verified and counted in the token total.
# Only the final response of `run_agentic_loop` is captured; its intermediate
# turns are not, so the token total below is a lower bound.
all_responses = [
    response,
    response_1,
    response_2,
    search_response_1,
    search_response_2,
    final_response,
]

print("Type verification across all responses:")
for i, r in enumerate(all_responses):
    assert isinstance(r, LLMResponse), f"Response {i} is not LLMResponse"
    assert isinstance(r.usage, Usage), f"Response {i} usage is not Usage"
    assert isinstance(r.model, str), f"Response {i} model is not str"
    assert isinstance(r.stop_reason, str), f"Response {i} stop_reason is not str"
    assert isinstance(r.tool_calls, list), f"Response {i} tool_calls is not list"
    for tc in r.tool_calls:
        assert isinstance(tc, ToolCall), "Tool call is not ToolCall"
        assert isinstance(tc.arguments, dict), "Tool call arguments is not dict"
    print(f"  Response {i}: ✓ (stop={r.stop_reason}, tools={len(r.tool_calls)}, tokens={r.usage.total_tokens})")

print(f"\n✓ All {len(all_responses)} responses are correctly typed")
print(f"\nTotal tokens used: {sum(r.usage.total_tokens for r in all_responses)}")

In [None]:
# Clean up: close the adapter once all tests are done.
# This cell only runs if it is reached; after an earlier failure, run it
# manually so the adapter is still closed.
await adapter.close()
print("Adapter closed. Step 4 complete!")