# Benchmark

Let's benchmark against all tool usage tasks. 

Expand the `tests` list (and the `models` list used for plotting) to benchmark with different models.

In [1]:
import datetime
import uuid

from langchain.globals import set_verbose
from langsmith.client import Client

from langchain_benchmarks import (
    __version__,
    clone_public_dataset,
    model_registry,
    registry,
)
from langchain_benchmarks.rate_limiting import RateLimiter
from langchain_benchmarks.tool_usage.agents import (
    CustomAgentFactory,
    OpenAIAgentFactory,
)

Prior to starting the tests, you may want to verify
that the task you're working with and the models are properly defined.

In [2]:
# Smoke test: look up a single task in the registry and build a custom agent
# factory for it with the "claude-2.1" model.
task = registry["Tool Usage - Typewriter (1 tool)"]
agent_factory = CustomAgentFactory(task, "claude-2.1")

# Call the factory and invoke the object it returns on one sample question.
# As the last expression of the cell, the result is shown as the cell output.
agent_factory().invoke({"question": "abc"})

{'input': "Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \nWrite down your answer, but do not explain it. Input: `abc`",
 'output': " Thank you for the input and for confirming the output of each letter I printed. I simply followed the instructions to repeat the given string 'abc' by printing each letter one at a time without any additional explanations. Please let me know if you need me to repeat this process for a different input string.",
 'intermediate_steps': [(AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'a'}, log="\nInvoking type_letter: {'letter': 'a'}\n\t", message_log=[AIMessage(content='<tool>{\n    "tool_name": "type_letter",\n    "arguments": {\n        "letter": "a"\n    }\n}</tool>\n')]),
   'OK'),
  (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'b'}, log="\nInv

Define the test cases

Let's make an experiment id

In [13]:
# Short identifier appended to every project name and stored in the project
# metadata, so that the runs of this experiment can be found again later.
experiment_uuid = "3f3e"  # Or generate a random one using uuid.uuid4().hex[:4]

In [10]:
# Every entry is benchmarked against every tool usage task by the run loop.
tests = [
    # 2-tuple of (architecture, model name)
    # The run loop accepts the architectures "custom_agent" and
    # "openai_functions"; any other value makes it raise a ValueError.
    ("custom_agent", "mixtral-8x7b-instruct-fw"),
    ("custom_agent", "claude-2.1"),
    ("custom_agent", "claude-2"),
    ("custom_agent", "yi-34b-200k-fw"),
    ("custom_agent", "llama-v2-70b-chat-fw"),
    ("custom_agent", "llama-v2-13b-chat-fw"),
    ("openai_functions", "gpt-3.5-turbo-1106"),
    ("openai_functions", "gpt-3.5-turbo-0613"),
    ("openai_functions", "gpt-4-1106-preview"),
    ("openai_functions", "gpt-4-0613"),
]

## Run

In [11]:
client = Client()  # LangSmith client used to run the benchmarks
today = datetime.date.today().isoformat()
rate_limiter = RateLimiter(requests_per_second=2)

# Maps each architecture name used in `tests` to the factory that builds its agent.
agent_factory_by_arch = {
    "openai_functions": OpenAIAgentFactory,
    "custom_agent": CustomAgentFactory,
}

# Fail fast: reject unknown architectures before any dataset is cloned or any
# benchmark run is started, and say which value is wrong.
unknown_archs = sorted({arch for arch, _ in tests if arch not in agent_factory_by_arch})
if unknown_archs:
    raise ValueError(
        f"Unknown architecture(s) in `tests`: {unknown_archs}. "
        f"Expected one of {sorted(agent_factory_by_arch)}."
    )

for task in registry:
    # Only the tool usage tasks are benchmarked here.
    if task.type != "ToolUsageTask":
        continue

    dataset_name = task.name
    clone_public_dataset(task.dataset_id, dataset_name=dataset_name)

    for arch, model in tests:
        print()
        print(f"Benchmarking {task.name} with model: {model} and arch: {arch}")
        eval_config = task.get_eval_config()

        # Every factory takes the same arguments; all of them share one rate limiter.
        agent_factory = agent_factory_by_arch[arch](
            task, model=model, rate_limiter=rate_limiter
        )

        # The project name ends with the experiment id so that the results can
        # be looked up by that suffix later on.
        client.run_on_dataset(
            dataset_name=dataset_name,
            llm_or_chain_factory=agent_factory,
            evaluation=eval_config,
            verbose=False,
            project_name=f"{model}-{task.name}-{today}-{experiment_uuid}",
            tags=[model],
            concurrency_level=5,
            project_metadata={
                "model": model,
                "id": experiment_uuid,
                "task": task.name,
                "date": today,
                "langchain_benchmarks_version": __version__,
                "arch": arch,
            },
        )

Dataset Tool Usage - Typewriter (1 tool) already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3.

Benchmarking Tool Usage - Typewriter (1 tool) with model: gpt-3.5-turbo-1106 and arch: openai_functions
View the evaluation results for project 'gpt-3.5-turbo-1106-Tool Usage - Typewriter (1 tool)-2023-12-14-3f3e' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/ae274b6a-d1e3-4ef4-873b-888fb66fe003?eval=true

View all tests for Dataset Tool Usage - Typewriter (1 tool) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3
[------------------------------------------------->] 20/20
Benchmarking Tool Usage - Typewriter (1 tool) with model: gpt-3.5-turbo-0613 and arch: openai_functions
View the evaluation results for project 'gpt-3.5-turbo-0613-Tool Usage - Typewriter (1 tool)-2023-12-14-

Chain failed for example c4af9bd6-84c5-4b25-ac0a-04c307fc7441 with inputs {'question': 'information'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 17fb72e265f1b0b1f24060ad09aa56e5 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[----------->                                      ] 5/20

Chain failed for example 0e6b2b64-57b1-4e8a-8611-4edab1cea326 with inputs {'question': 'school'}
Error Type: RuntimeError, Message: generator raised StopIteration


[-------------->                                   ] 6/20

Chain failed for example ff31e6be-4d37-4c29-b869-9b3a2075ff25 with inputs {'question': 'computer'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID ad2bc2cc85acf13dd40aaf90d77a2e0c in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[----------------->                                ] 7/20

Chain failed for example 80264d75-4aa4-484a-bbdc-4f82f930a69d with inputs {'question': 'student'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID bce5cb60e964c615c03ae37b284390a0 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[------------------->                              ] 8/20

Chain failed for example 8af5bd36-fc11-4b23-9019-f642cfaf8a01 with inputs {'question': 'horse'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 6836043ac2fce7a5b8748e01895a1a32 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[--------------------------->                      ] 11/20

Chain failed for example de38ad8a-ca82-44d6-a4ba-ff3bd5a6640e with inputs {'question': 'cat'}
Error Type: RuntimeError, Message: generator raised StopIteration


[----------------------------->                    ] 12/20

Chain failed for example 607d5f26-c165-4034-b5f9-0f592913cb71 with inputs {'question': 'dog'}
Error Type: RuntimeError, Message: generator raised StopIteration


[---------------------------------->               ] 14/20

Chain failed for example c1a0336d-ae2f-4cf3-a204-58eec17330e7 with inputs {'question': 'house'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID ad6024af7b61d6309987d3994f8839ae in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[------------------------------------------------->] 20/20
Benchmarking Tool Usage - Typewriter (26 tools) with model: gpt-3.5-turbo-0613 and arch: openai_functions
View the evaluation results for project 'gpt-3.5-turbo-0613-Tool Usage - Typewriter (26 tools)-2023-12-14-3f3e' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/6860edf5-fc08-4e24-bd7c-9cfb53cbb18d?eval=true

View all tests for Dataset Tool Usage - Typewriter (26 tools) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478
[>                                                 ] 0/20

Chain failed for example 2d4e99fc-8495-468e-8429-6c25a2d176f3 with inputs {'question': 'keyboard'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 4c72ba8494a00bb7e98881ee6901c9cb in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[----------->                                      ] 5/20

Chain failed for example 0e6b2b64-57b1-4e8a-8611-4edab1cea326 with inputs {'question': 'school'}
Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': "'s()' does not match '^[a-zA-Z0-9_-]{1,64}$' - 'messages.2.function_call.name'", 'type': 'invalid_request_error', 'param': None, 'code': None}}


[-------------->                                   ] 6/20

Chain failed for example 8af5bd36-fc11-4b23-9019-f642cfaf8a01 with inputs {'question': 'horse'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 576744404b64d81872441e295412fc28 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[------------------------------------------------->] 20/20
Benchmarking Tool Usage - Typewriter (26 tools) with model: gpt-4-1106-preview and arch: openai_functions
View the evaluation results for project 'gpt-4-1106-preview-Tool Usage - Typewriter (26 tools)-2023-12-14-3f3e' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/c12566e8-c6a4-4c8d-88f4-b026f03c787b?eval=true

View all tests for Dataset Tool Usage - Typewriter (26 tools) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478
[------------------------------------------------->] 20/20
Benchmarking Tool Usage - Typewriter (26 tools) with model: gpt-4-0613 and arch: openai_functions
View the evaluation results for project 'gpt-4-0613-Tool Usage - Typewriter (26 tools)-2023-12-14-3f3e' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/0980e35d-c630-4be3-a67d-d7fafc872e3f?eval=true

View all tests for Data

Chain failed for example e5bc36cc-e077-4a6f-80a0-9ce1794a77e5 with inputs {'question': 'communication'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 87e0fa2a4f67038c758f2a48c4103836 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[--------->                                        ] 4/20

Chain failed for example 2d4e99fc-8495-468e-8429-6c25a2d176f3 with inputs {'question': 'keyboard'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 8b838371871961de094710e4507cbe9c in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[----------------->                                ] 7/20

Chain failed for example ff31e6be-4d37-4c29-b869-9b3a2075ff25 with inputs {'question': 'computer'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID e9517e6051dd41e312a14b77fe587df7 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[------------------->                              ] 8/20

Chain failed for example 23649150-3c39-4beb-ba5d-c50ff1c66c63 with inputs {'question': 'church'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID ff5617c09a4f8e1d0e19889b76ad4cd7 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[------------------------------------------------->] 20/20Dataset Tool Usage - Relational Data already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826.

Benchmarking Tool Usage - Relational Data with model: gpt-3.5-turbo-1106 and arch: openai_functions
View the evaluation results for project 'gpt-3.5-turbo-1106-Tool Usage - Relational Data-2023-12-14-3f3e' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/9c057691-1109-403c-8118-f5e84a0be76b?eval=true

View all tests for Dataset Tool Usage - Relational Data at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826
[------------------------------------------------->] 21/21
Benchmarking Tool Usage - Relational Data with model: gpt-3.5-turbo-0613 and arch: openai_functions
View the evaluation results for project 'gpt-3.5-turbo-0613-Tool U

Chain failed for example 4ac33c1a-62f0-4da4-9455-07b582f6ff52 with inputs {'question': 'calculate 101 to the power of 0.5 to 4 digits of precision'}
Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': '\'"{\\\\"a\\\\":101,\\\\"b\\\\":0.5}"\' does not match \'^[a-zA-Z0-9_-]{1,64}$\' - \'messages.2.function_call.name\'', 'type': 'invalid_request_error', 'param': None, 'code': None}}


[------------------------------------------------->] 10/10
Benchmarking Multiverse Math with model: gpt-4-0613 and arch: openai_functions
View the evaluation results for project 'gpt-4-0613-Multiverse Math-2023-12-14-3f3e' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/6b2036f7-0606-4eff-92ae-49262cea09b9?eval=true

View all tests for Dataset Multiverse Math at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0
[------------------------------------------------->] 10/10

## Inspect

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from langsmith.client import Client

Let's fetch all the data that has the same experiment ID and place it in a dataframe.

In [4]:
# Experiment ids to inspect: projects whose name ends with one of these are fetched.
experiment_ids = ["3f3e"]


def _endswith(s, suffixes):
    return any(s.endswith(suffix) for suffix in suffixes)


client = Client()
# Keep only the projects whose name ends with one of the experiment ids.
projects = [
    project
    for project in client.list_projects()
    if _endswith(project.name, experiment_ids)
]

dfs = []
for project in projects:
    # Temporary way to get tag information
    project_info = client.read_project(project_id=project.id)
    try:
        test_results = client.get_test_results(project_name=project.name)
    except Exception as e:
        # Best effort: keep going, but report the project so that results
        # missing from the dataframe do not go unnoticed.
        print(f"Skipping project {project.name!r}: {type(e).__name__}: {e}")
        continue

    # Add every project metadata entry as a column of the results.
    for k, v in project_info.extra["metadata"].items():
        test_results[k] = v

    dfs.append(test_results)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# raise the same exception type with an actionable message instead.
if not dfs:
    raise ValueError(f"No test results found for experiment ids {experiment_ids}.")

df = pd.concat(dfs)

Compute a standardized "correct" column. It uses "Correct Final State" for the Typewriter tasks, and "correctness" (which is based on output) for the other tasks.

In [9]:
# Pick the feedback column that defines correctness for each row: the final
# state for the Typewriter tasks, the output-based correctness otherwise.
correct = [
    r["feedback.Correct Final State"]
    if "Typewriter" in r["task"]
    else r["feedback.correctness"]
    for r in df.to_dict(orient="records")
]

df["correct"] = correct
# Rows without feedback count as incorrect. Assign the result back instead of
# calling fillna(inplace=True) on the selected column: that chained in-place
# form is deprecated in pandas and does not update `df` under Copy-on-Write.
df["correct"] = df["correct"].fillna(0)

Compute some statistics. We estimate the standard error of the mean assuming a Bernoulli process.

In [11]:
# Group once, with one key order, so that both frames share the same index
# layout and the join does not rely on implicit level reordering.
group_keys = ["task", "model"]
correct_by_group = df.groupby(group_keys)["correct"]

num_correct = correct_by_group.sum().to_frame("num_correct")
total = correct_by_group.size().to_frame("total")
stats_df = total.join(num_correct)

# "% correct" holds a fraction between 0 and 1.
stats_df["% correct"] = stats_df["num_correct"] / stats_df["total"]
# Standard error of the mean of a Bernoulli variable: sqrt(p * (1 - p) / n).
stats_df["error"] = np.sqrt(
    stats_df["% correct"] * (1 - stats_df["% correct"]) / stats_df["total"]
)

# Models to show, in plotting order.
# NOTE(review): "yi-34b-200k-fw" and "llama-v2-13b-chat-fw" are benchmarked in
# `tests` but not listed here - confirm that leaving them out is intentional.
models = [
    "llama-v2-70b-chat-fw",
    "mixtral-8x7b-instruct-fw",
    "claude-2",
    "claude-2.1",
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo-1106",
    "gpt-4-0613",
    "gpt-4-1106-preview",
]

# Tasks to show, in plotting order.
tasks = [
    "Tool Usage - Typewriter (1 tool)",
    "Tool Usage - Typewriter (26 tools)",
    "Multiverse Math",
    "Tool Usage - Relational Data",
]

# Turn the (task, model) index back into columns and keep the listed models only.
stats_df = stats_df.reset_index()
stats_df = stats_df[stats_df["model"].isin(models)]

Plot the result

In [None]:
x = np.arange(len(tasks))  # the label locations
width = 0.08  # the width of the bars
multiplier = 0  # number of models plotted so far

fig, ax = plt.subplots(layout="constrained", figsize=(16, 4))
colormap = plt.get_cmap("Set2").colors

for idx, model in enumerate(models):
    # Boolean selection always yields a DataFrame, even for a single row,
    # and lets a missing model be skipped without a bare `except`.
    results = stats_df[stats_df["model"] == model]
    if results.empty:
        continue

    # Cycle through the palette so that listing more models than it has
    # colors does not raise an IndexError.
    color = colormap[idx % len(colormap)]

    # Order the rows like `tasks` so that they line up with the positions in `x`.
    results = results.set_index("task").loc[tasks]
    measurement = results["% correct"]

    values = [round(m, 2) for m in measurement]

    # Each model's bars are shifted right of the previous model's bars.
    offset = width * multiplier * 1.4
    rects = ax.bar(
        x + offset, values, width, label=model, yerr=results["error"], color=color
    )
    ax.bar_label(rects, padding=3)
    multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel("% Questions Answered Correctly")
ax.set_title("Tool Usage Performance")
# Put each tick under the middle of its group: halfway between the first bar
# (offset 0) and the last bar that was actually plotted.
group_center = width * max(multiplier - 1, 0) * 1.4 / 2
ax.set_xticks(x + group_center, tasks)
ax.legend(
    loc="center left", ncols=1, bbox_to_anchor=(1.0, 0.5), frameon=False, title="Model"
)
ax.set_ylim(0, 1.10)

plt.show()