# Benchmark All

Here, we'll run benchmarks against all of the tool usage tasks.

Expand the `models` list to benchmark additional models.

In [None]:
import datetime
import uuid

from langsmith.client import Client

from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.tool_usage import agents

In [None]:
# Short random suffix so that repeated runs on the same day get distinct
# project names.
experiment_uuid = uuid.uuid4().hex[:4]
models = ["gpt-3.5-turbo-16k"]
client = Client()  # LangSmith client used to run the evaluations below
today = datetime.date.today().isoformat()

for task in registry:
    # Only tool usage tasks are benchmarked here. Check the type *before*
    # cloning so that datasets of skipped tasks are not cloned needlessly.
    if task.type != "ToolUsageTask":
        continue

    dataset_name = f"{task.name}_benchmarking_{today}"
    clone_public_dataset(task.dataset_id, dataset_name=dataset_name)

    for model in models:
        print()
        print(f"Benchmarking {task.name} with model: {model}")
        eval_config = task.get_eval_config()
        agent_factory = agents.OpenAIAgentFactory(task, model=model)

        client.run_on_dataset(
            dataset_name=dataset_name,
            llm_or_chain_factory=agent_factory,
            evaluation=eval_config,
            verbose=False,
            project_name=f"{dataset_name}-{model}-{experiment_uuid}",
            tags=[model],
            concurrency_level=1,
            # Metadata recorded with the project so runs can be told apart
            # by model, experiment id, task and date.
            project_metadata={
                "model": model,
                "id": experiment_uuid,
                "task": task.name,
                "date": today,
            },
        )