# Benchmark

Let's benchmark against all tool usage tasks. 

Expand the models list to benchmark with different models.

In [1]:
%load_ext autoreload
%autoreload 2

In [9]:
import datetime

from langsmith.client import Client

from langchain_benchmarks import (
    __version__,
    clone_public_dataset,
    model_registry,
    registry,
)

from langchain_benchmarks.tool_usage.agents import (
    StandardAgentFactory
)

from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

Prior to starting the tests, you may want to verify
that the task you're working with and the models are properly defined.

In [3]:
# Look up the task to work with by name in the langchain_benchmarks registry.
task = registry["Multiverse Math"]

In [4]:
# Display the instructions string attached to the selected task.
task.instructions

'You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your  innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.'

In [7]:
# Prompt for the agent: the system message is filled from an `instructions`
# variable and the human message from an `input` variable.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "{instructions}"),
        ("human", "{input}"),
        # Slot for the agent's intermediate messages — presumably earlier tool
        # calls and their results; confirm against StandardAgentFactory.
        MessagesPlaceholder("agent_scratchpad"),
    ]
)
# Chat model under test; change the model name to benchmark a different model.
model = ChatAnthropic(model="claude-3-opus-20240229")

# Factory built from the task, the model and the prompt; calling it returns an
# object with `.invoke` (see the invocation in the next cell).
factory = StandardAgentFactory(task, model, prompt)

In [8]:
# Smoke test: build one agent from the factory and run it on a single question.
factory().invoke({"question": "what is 132*232"})



{'input': 'what is 132*232',
 'output': [],
 'intermediate_steps': [(ToolAgentAction(tool='multiply', tool_input={'a': 132, 'b': 232}, log="\nInvoking: `multiply` with `{'a': 132, 'b': 232}`\nresponded: [{'text': '<thinking>\\nThe relevant tool to answer this request is multiply, since the user is asking to multiply two numbers together.\\n\\nThe multiply function requires two parameters:\\na (number): The user provided the value 132\\nb (number): The user provided the value 232\\n\\nSince the user provided values for both required parameters, I can proceed with the multiply function call to answer the question.\\n</thinking>', 'type': 'text'}, {'id': 'toolu_01FuWpKNrXQp3kZfbdsxjwdd', 'input': {'a': 132, 'b': 232}, 'name': 'multiply', 'type': 'tool_use'}]\n\n", message_log=[AIMessageChunk(content=[{'text': '<thinking>\nThe relevant tool to answer this request is multiply, since the user is asking to multiply two numbers together.\n\nThe multiply function requires two parameters:\na (nu

# OUT OF DATE (BELOW)

Let's make an experiment id

In [10]:
# Suffix appended to every LangSmith project name created by the run cell and
# stored in the project metadata under "id".
experiment_uuid = "meow"  # Or generate a random one using uuid.uuid4().hex[:4]

Define the test cases

In [10]:
# Combinations to benchmark. The run cell below iterates over this list for
# every selected task; uncomment an entry to include it.
tests = [
    # 2-tuple of (architecture, model name)
    # ("anthropic_tool_user", "claude-2.1"),
    ("openai_functions", "mistral-7b-instruct-v0.1"),
    # ("openai_functions", "gpt-3.5-turbo-1106"),
    # ("openai_functions", "gpt-3.5-turbo-0613"),
    # ("openai_functions", "gpt-4-1106-preview"),
    # ("openai_functions", "gpt-4-0613"),
    # ("openai_assistant", "gpt-4-1106-preview"),
    # ("openai_assistant", "gpt-3.5-turbo-1106"),
]

## Run

In [11]:
import os

In [None]:
client = Client()  # Launch langsmith client for cloning datasets
today = datetime.date.today().isoformat()
# NOTE(review): RateLimiter and the *Factory classes used below are not imported
# anywhere in the visible notebook (this section is marked out of date). Import
# them, or port this loop to StandardAgentFactory, before running.
rate_limiter = RateLimiter(requests_per_second=0.5)

for task in registry:
    # Only tool-usage tasks are benchmarked here.
    if task.type != "ToolUsageTask":
        continue

    # Restrict the run to tasks whose name contains "26"
    # (e.g. "Tool Usage - Typewriter (26 tools)").
    if "26" not in task.name:
        continue

    dataset_name = task.name
    clone_public_dataset(task.dataset_id, dataset_name=dataset_name)

    for arch, model in tests:
        print()
        print(f"Benchmarking {task.name} with model: {model} and arch: {arch}")
        eval_config = task.get_eval_config()

        # Pick the agent factory that matches the requested architecture.
        if arch == "openai_functions":
            agent_factory = OpenAIAgentFactory(
                task, model=model, rate_limiter=rate_limiter
            )
        elif arch == "custom_agent":
            agent_factory = CustomAgentFactory(
                task, model=model, rate_limiter=rate_limiter
            )
        elif arch == "anthropic_tool_user":
            agent_factory = AnthropicToolUserFactory(task)
        elif arch == "openai_assistant":
            agent_factory = OpenAIAssistantFactory(task, model=model)
        else:
            # Fail with a message that names the bad value instead of an
            # empty ValueError.
            raise ValueError(
                f"Unknown architecture {arch!r}; expected one of: "
                "openai_functions, custom_agent, anthropic_tool_user, "
                "openai_assistant"
            )

        # One LangSmith project per (model, task, date, experiment id); the
        # same fields are stored as project metadata.
        client.run_on_dataset(
            dataset_name=dataset_name,
            llm_or_chain_factory=agent_factory,
            evaluation=eval_config,
            verbose=False,
            project_name=f"{model}-{task.name}-{today}-{experiment_uuid}",
            tags=[model],
            concurrency_level=3,
            project_metadata={
                "model": model,
                "id": experiment_uuid,
                "task": task.name,
                "date": today,
                "langchain_benchmarks_version": __version__,
                "arch": arch,
            },
        )

Dataset Tool Usage - Typewriter (26 tools) already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/30239cd8-922f-4722-808d-897e1e722845/datasets/0c8a0acd-0308-4298-82bb-e28cec0ca5e1.

Benchmarking Tool Usage - Typewriter (26 tools) with model: mistral-7b-instruct-v0.1 and arch: openai_functions
View the evaluation results for project 'mistral-7b-instruct-v0.1-Tool Usage - Typewriter (26 tools)-2023-12-18-woof' at:
https://smith.langchain.com/o/30239cd8-922f-4722-808d-897e1e722845/datasets/0c8a0acd-0308-4298-82bb-e28cec0ca5e1/compare?selectedSessions=7e9ac2e4-1d33-49bc-9694-f2db6d2e6243

View all tests for Dataset Tool Usage - Typewriter (26 tools) at:
https://smith.langchain.com/o/30239cd8-922f-4722-808d-897e1e722845/datasets/0c8a0acd-0308-4298-82bb-e28cec0ca5e1
[>                                                 ] 0/20--- Formatting open ai tool messages ---
Input: []
Final messages: []
--- Formatting open ai tool messages ---
Input: []
Final messages: []


## Inspect

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from langsmith.client import Client

Let's fetch all the data that has the same experiment ID and place it in a dataframe.

In [None]:
# Project-name suffixes to collect. The run cell names projects
# "{model}-{task.name}-{today}-{experiment_uuid}", so these are experiment ids.
experiment_ids = ["3f3e"]


def _endswith(s, suffixes):
    return any(s.endswith(suffix) for suffix in suffixes)


client = Client()
# Keep only the projects whose name ends with one of the experiment ids.
projects = [
    project
    for project in client.list_projects()
    if _endswith(project.name, experiment_ids)
]

dfs = []
for project in projects:
    # Temporary way to get tag information
    project_info = client.read_project(project_id=project.id)
    try:
        test_results = client.get_test_results(project_name=project.name)
    except Exception as exc:
        # Best effort: keep going, but report which project was dropped so a
        # model/task missing from the statistics is not a silent surprise.
        print(f"Skipping project {project.name!r}: {exc!r}")
        continue

    # Copy each project-level metadata entry onto the results as a column.
    for k, v in project_info.extra["metadata"].items():
        test_results[k] = v

    dfs.append(test_results)

if not dfs:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise ValueError(
        f"No test results found for experiment ids {experiment_ids!r}"
    )

df = pd.concat(dfs)

Compute a standardized "correct" column. It uses "Correct Final State" for the Typewriter tasks, and "correctness" (which is based on the output) for the other tasks.

In [None]:
# Choose the feedback column that defines correctness for each row: tasks with
# "Typewriter" in their name use "Correct Final State", all others use
# "correctness". Done row by row so that a feedback column is only looked up
# for the rows that need it.
correct = [
    r["feedback.Correct Final State"]
    if "Typewriter" in r["task"]
    else r["feedback.correctness"]
    for r in df.to_dict(orient="records")
]

df["correct"] = correct
# Rows without feedback count as incorrect. Assign the result back rather than
# calling fillna(inplace=True) on the column selection: the chained in-place
# form is deprecated and does not update `df` under pandas Copy-on-Write.
df["correct"] = df["correct"].fillna(0)

Compute some statistics. We estimate the standard error of the mean, assuming a Bernoulli process.

In [None]:
# Aggregate per (task, model). Both group-bys use the same key order so the
# join below lines up on identical index levels instead of relying on pandas
# reordering MultiIndex levels by name.
num_correct = df.groupby(["task", "model"])["correct"].sum().to_frame("num_correct")
total = df.groupby(["task", "model"]).size().to_frame("total")
stats_df = total.join(num_correct)
# Despite the column name this is a fraction (num_correct / total), not a
# percentage.
stats_df["% correct"] = stats_df["num_correct"] / stats_df["total"]
# Standard error of a Bernoulli proportion: sqrt(p * (1 - p) / n).
stats_df["error"] = np.sqrt(
    stats_df["% correct"] * (1 - stats_df["% correct"]) / stats_df["total"]
)

# Models to keep in the statistics (and to plot), in display order.
models = [
    "llama-v2-70b-chat-fw",
    "mixtral-8x7b-instruct-fw",
    "claude-2",
    "claude-2.1",
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo-1106",
    "gpt-4-0613",
    "gpt-4-1106-preview",
]

# Tasks shown on the x-axis of the plot, in display order.
tasks = [
    "Tool Usage - Typewriter (1 tool)",
    "Tool Usage - Typewriter (26 tools)",
    "Multiverse Math",
    "Tool Usage - Relational Data",
]

# Turn the (task, model) index back into columns and drop unlisted models.
stats_df = stats_df.reset_index()
stats_df = stats_df[stats_df["model"].isin(models)]

Plot the result

In [None]:
x = np.arange(len(tasks))  # the label locations
width = 0.08  # the width of the bars
multiplier = 0  # number of models drawn so far; drives the horizontal offset

fig, ax = plt.subplots(layout="constrained", figsize=(16, 4))
colormap = plt.get_cmap("Set2").colors

# Index by model once instead of rebuilding the index on every iteration.
stats_by_model = stats_df.set_index("model")

for idx, model in enumerate(models):
    try:
        results = stats_by_model.loc[model]
    except KeyError:
        # No results for this model — leave it out of the chart. Only the
        # missing-label case is skipped; other errors still surface.
        continue

    # Wrap around the palette so a models list longer than the colormap does
    # not raise IndexError.
    color = colormap[idx % len(colormap)]

    # Order the rows to match the x-axis task order.
    results = results.set_index("task").loc[tasks]
    measurement = results["% correct"]

    values = [round(m, 2) for m in measurement]

    # Shift each model's bars to the right of the previous model's bars.
    offset = width * multiplier * 1.4
    rects = ax.bar(
        x + offset, values, width, label=model, yerr=results["error"], color=color
    )
    ax.bar_label(rects, padding=3)
    multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel("% Questions Answered Correctly")
ax.set_title("Tool Usage Performance")
# Tick positions are shifted right of each group's first bar.
ax.set_xticks(x + width + 0.3, tasks)
ax.legend(
    loc="center left", ncols=1, bbox_to_anchor=(1.0, 0.5), frameon=False, title="Model"
)
ax.set_ylim(0, 1.10)

plt.show()