### AI Agent Tool Call Testing

In [None]:
from deepeval.tracing import observe, update_current_span

#### AI Agent with Tools

In [88]:
from langchain.tools import tool
from deepeval.test_case import ToolCall
from langchain_community.tools import DuckDuckGoSearchRun

s_tool = DuckDuckGoSearchRun()

@tool
@observe(type="tool")
def search_tool(query: str):
    "Tool to search online for given query and return result"
    return s_tool.run(query)

@tool
@observe(type="tool")
def add_numbers(a: int, b: int):
    "Add two numbers and returns result"
    return int(a) + int(b)

@tool
@observe(type="tool")
def subtract_numbers(a: int, b: int):
    "Subtract two numbers and returns result"
    return int(a) - int(b)

tools = [add_numbers, subtract_numbers, search_tool]
AVAILABLE_TOOLS = [
    ToolCall(name=t.name, input_parameters={})
    for t in tools
]

#### Local LLMs

In [None]:
from langchain_ollama import ChatOllama
@observe(type="llm",
         name="Local LLM",
         model="qwen2.5:latest")
def local_llm():
    llm = ChatOllama(base_url="http://localhost:11434", 
                      model="qwen2.5:latest", 
                      temperature=0.5, 
                      max_token=250)
    return llm.bind_tools(tools=tools)
llm = local_llm()

In [None]:
from deepeval.models import OllamaModel
local_llm_deepseek = OllamaModel(
    model="deepseek-r1:8b",
    base_url="http://localhost:11434",
    temperature=0.5,
    generation_kwargs={"num_predict": 500}
)

#### Create ChatPrompt, agent

In [None]:
from langchain_classic.agents import create_tool_calling_agent, AgentExecutor
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder
from deepeval.test_case import ToolCall

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful AI assistant with access to tools."),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
    MessagesPlaceholder("agent_scratchpad"),
])
@observe(type="agent", 
         available_tools=["add_numbers", "subtract_numbers", "search_tool"])
def main_ai_agent(query):
    agent = create_tool_calling_agent(llm, tools, prompt)
    agent_executor = AgentExecutor(agent=agent,
                                   tools=tools,
                                   verbose=True,
                                   handle_parsing_errors=True,
                                   return_intermediate_steps=True
    )
    response = agent_executor.invoke({"input": query, "chat_history": []})
    return response

#### Extract Tools called from Response

In [None]:
from langchain_core.agents import AgentAction
from typing import List
def extract_tools_from_response(response: dict[str: any]) -> List[ToolCall]:
    deepeval_tools: List[ToolCall] = []
    intermediate_steps = response.get("intermediate_steps", [])
    for action, _ in intermediate_steps:
        if isinstance(action, AgentAction):
            tool_name = action.tool
            tool_args = action.tool_input
            deepeval_tools.append(
                ToolCall(name=tool_name, 
                         input_parameters=tool_args)
            ) 
    return deepeval_tools

In [None]:
search_tool_response = main_ai_agent("Tell me a fun fact about giraffes.")
extract_tools_from_response(search_tool_response)

#### Evaluation (Tool Correctness Metric)

In [None]:
from deepeval.dataset import Golden
tool_correctness_test_data = [
    {
        "input": "What is 15 + 20?",
        "expected_tool_calls": [
            ToolCall(name="add_numbers", input_parameters={"a": 15, "b": 20})
        ]
    },
    {
        "input": "What is the highest mountain in the world?",
        "expected_tool_calls": [
            ToolCall(name="search_tool", 
                     input_parameters={"query": "highest mountain in the world"})
        ]
    },
    {
        "input": "Calculate the difference between 500 and 150?",
        "expected_tool_calls": [
            ToolCall(name="subtract_numbers", 
                     input_parameters={"a": 500, "b": 150})
        ]
    },
    {
        "input": "Tell me a fun fact about giraffes.",
        "expected_tool_calls": [
            ToolCall(name="search_tool", 
                     input_parameters={"query": "fun fact about giraffes"})
        ]
    }
]

In [None]:
goldens_data = []
for test_data in tool_correctness_test_data:
    golden = Golden(
        input= test_data["input"],
        expected_tools=test_data["expected_tool_calls"]
    )
    goldens_data.append(golden)

In [89]:
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset
from deepeval.evaluate import evaluate
from deepeval.metrics import ToolCorrectnessMetric

data_set = EvaluationDataset(goldens=goldens_data)
processed_testcases:List[LLMTestCase] = []
for golden in data_set.goldens:
    response = main_ai_agent(golden.input)
    llm_testcase = LLMTestCase(
        input=golden.input,
        expected_tools=golden.expected_tools,
        tools_called=extract_tools_from_response(response)
    )
    processed_testcases.append(llm_testcase)
evaluate(processed_testcases, 
         metrics=[ToolCorrectnessMetric(model=local_llm_deepseek, 
                                        threshold=0.5, 
                                        available_tools=AVAILABLE_TOOLS)])




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `add_numbers` with `{'a': 15, 'b': 20}`


[0m[36;1m[1;3m35[0m[32;1m[1;3mThe result of 15 + 20 is 35.[0m

[1m> Finished chain.[0m
Full Response: {'input': 'What is 15 + 20?', 'chat_history': [], 'output': 'The result of 15 + 20 is 35.', 'intermediate_steps': [(ToolAgentAction(tool='add_numbers', tool_input={'a': 15, 'b': 20}, log="\nInvoking: `add_numbers` with `{'a': 15, 'b': 20}`\n\n\n", message_log=[AIMessageChunk(content='', additional_kwargs={}, response_metadata={'model': 'qwen2.5:latest', 'created_at': '2025-12-05T17:33:20.274984Z', 'done': True, 'done_reason': 'stop', 'total_duration': 3374990959, 'load_duration': 95396542, 'prompt_eval_count': 264, 'prompt_eval_duration': 1653555875, 'eval_count': 28, 'eval_duration': 1414231957, 'logprobs': None, 'model_name': 'qwen2.5:latest', 'model_provider': 'ollama'}, id='lc_run--f5c9045c-1d4d-49f4-8325-4ff0391e705f', tool_calls=[{'name': 'add_numbers', 'args

[32;1m[1;3mThe highest mountain in the world is Mount Everest. It has the highest elevation above sea level at approximately 8,849 meters (29,032 feet). However, if you consider the total height from its base below sea level to its peak, Mauna Kea in Hawaii stands taller.[0m

[1m> Finished chain.[0m
Full Response: {'input': 'What is the highest mountain in the world?', 'chat_history': [], 'output': 'The highest mountain in the world is Mount Everest. It has the highest elevation above sea level at approximately 8,849 meters (29,032 feet). However, if you consider the total height from its base below sea level to its peak, Mauna Kea in Hawaii stands taller.', 'intermediate_steps': [(ToolAgentAction(tool='search_tool', tool_input={'query': 'highest mountain in the world'}, log="\nInvoking: `search_tool` with `{'query': 'highest mountain in the world'}`\n\n\n", message_log=[AIMessageChunk(content='', additional_kwargs={}, response_metadata={'model': 'qwen2.5:latest', 'created_at': '2

[32;1m[1;3mThe difference between 500 and 150 is 350.[0m

[1m> Finished chain.[0m
Full Response: {'input': 'Calculate the difference between 500 and 150?', 'chat_history': [], 'output': 'The difference between 500 and 150 is 350.', 'intermediate_steps': [(ToolAgentAction(tool='subtract_numbers', tool_input={'a': 500, 'b': 150}, log="\nInvoking: `subtract_numbers` with `{'a': 500, 'b': 150}`\n\n\n", message_log=[AIMessageChunk(content='', additional_kwargs={}, response_metadata={'model': 'qwen2.5:latest', 'created_at': '2025-12-05T17:33:32.573232Z', 'done': True, 'done_reason': 'stop', 'total_duration': 2031476292, 'load_duration': 65913917, 'prompt_eval_count': 268, 'prompt_eval_duration': 193143458, 'eval_count': 30, 'eval_duration': 1552831127, 'logprobs': None, 'model_name': 'qwen2.5:latest', 'model_provider': 'ollama'}, id='lc_run--af3b5dd0-059f-4596-935b-18952bc3e35c', tool_calls=[{'name': 'subtract_numbers', 'args': {'a': 500, 'b': 150}, 'id': '3bb4d992-77b1-42e4-9972-5e116d

[38;5;200m[1;3mGiraffes are the tallest land animals on the planet, but height is only part of their story. From their unusual sleeping habits to the way they fight, eat, and even communicate, there's a lot more to learn about these gentle giants. Here are 12+ surprising and fun facts about giraffes that go beyond the basics: 1. Giraffes aren't one species â€” they're four For a long time, giraffes ... Find deals and low prices on giraffe book facts at Amazon.com. Browse & discover thousands of brands. Read customer reviews & find best sellers Explore 15+ fun and interesting facts about giraffes ! Learn surprising trivia about the tallest mammals, their unique features, and their role in nature. Giraffes need to eat about 75 pounds of leaves every day to get enough nutrition, so they spend between 16-20 hours a day just munching on vegetation. It's basically like having a full-time job where your only task is eating leaves all day long. Their favourite food is acacia leaves, which ar



Metrics Summary

  - âœ… Tool Correctness (score: 1.0, threshold: 0.5, strict: False, evaluation model: None, reason: [
	 Tool Calling Reason: All expected tools ['search_tool'] were called (order not considered).
	 Tool Selection Reason: The agent correctly selected the search_tool, which is appropriate for retrieving information like fun facts about giraffes. The other tools (add_numbers and subtract_numbers) are irrelevant to the task and were not used, indicating no unnecessary or missing tools were selected.
]
, error: None)

For test case:

  - input: Tell me a fun fact about giraffes.
  - actual output: None
  - expected output: None
  - context: None
  - retrieval context: None


Metrics Summary

  - âœ… Tool Correctness (score: 1.0, threshold: 0.5, strict: False, evaluation model: None, reason: [
	 Tool Calling Reason: All expected tools ['search_tool'] were called (order not considered).
	 Tool Selection Reason: The search_tool was appropriately selected to retrieve factual

EvaluationResult(test_results=[TestResult(name='test_case_3', success=True, metrics_data=[MetricData(name='Tool Correctness', threshold=0.5, success=True, score=1.0, reason="[\n\t Tool Calling Reason: All expected tools ['search_tool'] were called (order not considered).\n\t Tool Selection Reason: The agent correctly selected the search_tool, which is appropriate for retrieving information like fun facts about giraffes. The other tools (add_numbers and subtract_numbers) are irrelevant to the task and were not used, indicating no unnecessary or missing tools were selected.\n]\n", strict_mode=False, evaluation_model=None, error=None, evaluation_cost=0.0, verbose_logs='Expected Tools:\n[\n    ToolCall(\n        name="search_tool",\n        input_parameters={\n            "query": "fun fact about giraffes"\n        }\n    )\n] \n \nTools Called:\n[\n    ToolCall(\n        name="search_tool",\n        input_parameters={\n            "query": "fun facts about giraffes"\n        }\n    )\n] \