# Benchmark All

Here, we'll run benchmarking against all tool usage task.

Expand the models list to benchmark against different models.

In [1]:
import datetime
import uuid

from langsmith.client import Client

from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.rate_limiting import RateLimiter
from langchain_benchmarks.tool_usage import agents
from agents.factory import CustomAgentFactory

In [2]:
requests_per_minute = 50
rate_limiter = RateLimiter(requests_per_second=requests_per_minute / 60)

In [3]:
model = 'mixtral-8x7b-instruct'

In [4]:
experiment_uuid = uuid.uuid4().hex[:4]
models = [model]
client = Client()  # Launch langsmith client for cloning datasets
today = datetime.date.today().isoformat()

for task in registry:
    dataset_name = task.name + f"_benchmarking_{today}"
    clone_public_dataset(task.dataset_id, dataset_name=dataset_name)

    if task.type != "ToolUsageTask":
        continue
        
    for model in models:
        print()
        print(f"Benchmarking {task.name} with model: {model}")
        eval_config = task.get_eval_config()
        agent_factory = CustomAgentFactory(task, model=model, rate_limiter=rate_limiter)

        client.run_on_dataset(
            dataset_name=dataset_name,
            llm_or_chain_factory=agent_factory,
            evaluation=eval_config,
            verbose=False,
            project_name=f"{dataset_name}-{model}-{experiment_uuid}",
            tags=[model],
            concurrency_level=5,
            project_metadata={
                "model": model,
                "id": experiment_uuid,
                "task": task.name,
                "date": today,
            },
        )

Dataset Tool Usage - Typewriter (1 tool)_benchmarking_2023-12-13 already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/05e959e4-08ca-43d6-a986-ebcc1cbe4d05.

Benchmarking Tool Usage - Typewriter (1 tool) with model: mixtral-8x7b-instruct
View the evaluation results for project 'Tool Usage - Typewriter (1 tool)_benchmarking_2023-12-13-mixtral-8x7b-instruct-f7d9' at:
https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/05e959e4-08ca-43d6-a986-ebcc1cbe4d05/compare?selectedSessions=6f95bc54-5337-4e1a-a1fd-5a52cb60d0f4

View all tests for Dataset Tool Usage - Typewriter (1 tool)_benchmarking_2023-12-13 at:
https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/05e959e4-08ca-43d6-a986-ebcc1cbe4d05
[------------------------------------------------->] 20/20Dataset Tool Usage - Typewriter (26 tools)_benchmarking_2023-12-13 already exists. Skipping.
You can access the dataset

Chain failed for example 92bd9151-bc66-4714-968d-835aab0cadac with inputs {'question': 'eve ate a serving of sushi, what allergens was she exposed to?'}
Error Type: ValidationError, Message: 1 validation error for get_food_nameSchemaSchema
food_id
  field required (type=value_error.missing)


[----------------------->                          ] 10/21

Chain failed for example ab101881-ef7f-4848-a399-2f319ac5224a with inputs {'question': 'If i eat a serving of pizza, how many calories will I consume?'}
Error Type: ValidationError, Message: 1 validation error for find_foods_by_nameSchemaSchema
food
  field required (type=value_error.missing)


[---------------------------->                     ] 12/21

Chain failed for example 4629f965-34c1-47e8-8bd3-5ebae45c5271 with inputs {'question': 'list the allergens in chocolate'}
Error Type: ValidationError, Message: 1 validation error for find_foods_by_nameSchemaSchema
food
  field required (type=value_error.missing)


[------------------------------------------------->] 21/21Dataset Multiverse Math_benchmarking_2023-12-13 already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/49a5ef50-6000-44c8-9d2c-3bff3d96f2d6.

Benchmarking Multiverse Math with model: mixtral-8x7b-instruct
View the evaluation results for project 'Multiverse Math_benchmarking_2023-12-13-mixtral-8x7b-instruct-f7d9' at:
https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/49a5ef50-6000-44c8-9d2c-3bff3d96f2d6/compare?selectedSessions=6d50f1b5-69d8-4e6d-8c74-4edbf544e6f4

View all tests for Dataset Multiverse Math_benchmarking_2023-12-13 at:
https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/49a5ef50-6000-44c8-9d2c-3bff3d96f2d6
[------------------->                              ] 4/10

Chain failed for example 68740737-531a-457c-8a71-d9ffb544716b with inputs {'question': 'after calculating the sin of 1.5 radians, divide the result by cos of 1.5 radians'}
Error Type: ValidationError, Message: 2 validation errors for divideSchemaSchema
a
  value is not a valid float (type=type_error.float)
b
  value is not a valid float (type=type_error.float)


[------------------------------------------------->] 10/10Dataset Email Extraction_benchmarking_2023-12-13 already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/704b5676-3373-45f7-82ce-87c7175b4afb.
Dataset Chat Extraction_benchmarking_2023-12-13 already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/3a2d68f0-da7d-4ae8-9b6b-2fb33974c865.
Dataset LangChain Docs Q&A_benchmarking_2023-12-13 already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/c699ccfc-5925-48d3-bd72-cd583625b85e.
Dataset Semi-structured Reports_benchmarking_2023-12-13 already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/a4810aeb-afc1-44fe-a345-6d4d0ce4f104.
Dataset Multi-modal slide decks_benchmarking_2023-12-13 al

## Inspect

In [5]:
client = Client()