# Benchmark All

Here, we'll run benchmarks against all of the tool usage tasks.

Expand the `tests` list below to benchmark against different models.

In [1]:
import datetime
import uuid

from langchain.globals import set_verbose
from langsmith.client import Client

from langchain_benchmarks import (
    __version__,
    clone_public_dataset,
    model_registry,
    registry,
)
from langchain_benchmarks.rate_limiting import RateLimiter
from langchain_benchmarks.tool_usage.agents import (
    CustomAgentFactory,
    OpenAIAgentFactory,
)

Prior to starting the tests, you may want to verify
that the task you're working with and the models are properly defined.

In [2]:
# Look up a single tool usage task from the registry to sanity-check the setup.
task = registry["Tool Usage - Typewriter (1 tool)"]
# Build an agent factory for that task, passing "claude-2.1" as the model name.
agent_factory = CustomAgentFactory(task, "claude-2.1")

# Create an agent from the factory and invoke it on a sample question; this is
# the last expression of the cell, so its result is shown as the cell output.
agent_factory().invoke({"question": "abc"})

{'input': "Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \nWrite down your answer, but do not explain it. Input: `abc`",
 'output': ' Thank you for the input and for confirming the output of each letter I printed. I simply followed the instructions to repeat the given string "abc" by printing one letter at a time using the provided "type_letter" tool without any additional explanations. Please let me know if you need me to repeat this process with a different input string.',
 'intermediate_steps': [(AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'a'}, log="\nInvoking type_letter: {'letter': 'a'}\n\t", message_log=[AIMessage(content='<tool>{\n  "tool_name": "type_letter",\n  "arguments": {\n    "letter": "a"\n  }\n}</tool>\n')]),
   'OK'),
  (AgentActionMessageLog(tool='type_letter', tool_input={'l

Define the test cases

In [9]:
# Benchmark matrix: each entry is a 2-tuple of (architecture, model name).
# The architecture string selects which agent factory is used when running:
# "xml" or "openai_functions".
tests = [
    ("xml", "mixtral-8x7b-instruct-fw"),
    ("xml", "claude-2.1"),
    ("xml", "claude-2"),
    ("xml", "yi-34b-200k-fw"),
    ("xml", "llama-v2-70b-chat-fw"),
    ("xml", "llama-v2-13b-chat-fw"),
    ("openai_functions", "gpt-3.5-turbo-1106"),
    ("openai_functions", "gpt-3.5-turbo-0613"),
    # These two entries must be separate tuples; without the comma between
    # them Python would try to call the first tuple and raise a TypeError.
    ("openai_functions", "gpt-4-1106-preview"),
    ("openai_functions", "gpt-4-0613"),
]

## Run

In [10]:
# Short random hex tag (first 4 characters of a fresh UUID4) used below to label
# the project names and metadata produced by this benchmarking session.
experiment_uuid = uuid.uuid4().hex[0:4]

In [None]:
client = Client()  # LangSmith client used to launch the evaluation runs
today = datetime.date.today().isoformat()
# A single limiter instance is handed to every agent factory created below.
rate_limiter = RateLimiter(requests_per_second=1)

for task in registry:
    # Only tool usage tasks are benchmarked here. Filter before cloning so
    # datasets for tasks that will never be run are not copied.
    if task.type != "ToolUsageTask":
        continue

    dataset_name = task.name + f"_benchmarking_{today}"
    clone_public_dataset(task.dataset_id, dataset_name=dataset_name)

    for arch, model in tests:
        print()
        print(f"Benchmarking {task.name} with model: {model} and arch: {arch}")
        eval_config = task.get_eval_config()

        # Pick the agent factory that matches the requested architecture.
        if arch == "openai_functions":
            agent_factory = OpenAIAgentFactory(
                task, model=model, rate_limiter=rate_limiter
            )
        elif arch == "xml":
            agent_factory = CustomAgentFactory(
                task, model=model, rate_limiter=rate_limiter
            )
        else:
            raise ValueError(
                f"Unknown architecture {arch!r} for model {model!r}; "
                "expected 'openai_functions' or 'xml'."
            )

        client.run_on_dataset(
            dataset_name=dataset_name,
            llm_or_chain_factory=agent_factory,
            evaluation=eval_config,
            verbose=False,
            # The task name is part of the project name because every task is
            # benchmarked with the same models and the same experiment id;
            # without it the project names would repeat across tasks.
            project_name=f"{model}-{task.name}-{experiment_uuid}",
            tags=[model],
            concurrency_level=5,
            project_metadata={
                "model": model,
                "id": experiment_uuid,
                "task": task.name,
                "date": today,
                "langchain_benchmarks_version": __version__,
                "arch": arch,
            },
        )