https://langchain-ai.github.io/langchain-benchmarks/notebooks/tool_usage/intro.html#benchmarking

In [None]:
import datetime

from langsmith.client import Client

from langchain_benchmarks import (
    __version__,
    clone_public_dataset,
    model_registry,
    registry,
)
from langchain_benchmarks.rate_limiting import RateLimiter
from langchain_benchmarks.tool_usage.agents import StandardAgentFactory

In [None]:
# Names of the registered benchmark tasks this notebook should run.
selected_task_names = ("Tool Usage - Relational Data",)

# Keep only the registry entries whose name was selected above.
tasks = [
    candidate
    for candidate in registry.tasks
    if candidate.name in selected_task_names
]

# Show the first selected task as the cell output.
task = tasks[0]
task

In [None]:
import uuid

# Random hex identifier shared by every run launched from this notebook
# session; it is embedded in project names and metadata further down.
experiment_uuid = uuid.uuid4().hex
print(experiment_uuid)

In [None]:
from langchain_anthropic import ChatAnthropic
from langchain_cohere import ChatCohere
from langchain_fireworks import ChatFireworks
from langchain_mistralai import ChatMistralAI
from langchain_openai import ChatOpenAI
from langchain_google_vertexai import ChatVertexAI
from langchain_groq import ChatGroq


# (label, chat model) pairs to benchmark. The label is used for project names,
# tags and metadata; every model is created with temperature=0.
# NOTE: the "gpt-3.5", "cohere" and "groq" entries pass no model name, so the
# concrete model is whatever each integration package defaults to.
tests = [
    (
        "claude-3-opus-20240229",
        ChatAnthropic(model="claude-3-opus-20240229", temperature=0),
    ),
    ("gpt-3.5", ChatOpenAI(temperature=0)),
    ("cohere", ChatCohere(temperature=0)),
    (
        "fireworks",
        ChatFireworks(
            model="accounts/fireworks/models/firefunction-v1",
            temperature=0,
        ),
    ),
    (
        "mistral",
        ChatMistralAI(model="mistral-large-latest", temperature=0),
    ),
    ("vertex", ChatVertexAI(model_name="gemini-pro", temperature=0)),
    ("groq", ChatGroq(temperature=0)),
]

In [None]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Message layout shared by every agent: a system message filled from
# `instructions`, the human `input`, then the `agent_scratchpad` placeholder.
prompt_messages = [
    ("system", "{instructions}"),
    ("human", "{input}"),
    MessagesPlaceholder("agent_scratchpad"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

In [None]:
# Shared setup for every benchmark run launched below.
client = Client()  # Launch langsmith client for cloning datasets
today = datetime.date.today().isoformat()
# One limiter shared by all agent factories below (2 requests per second).
rate_limiter = RateLimiter(requests_per_second=2)

for task in tasks:
    # Only tool-usage tasks are benchmarked here; skip anything else.
    if task.type != "ToolUsageTask":
        continue

    # The dataset is cloned under the task's own name.
    dataset_name = task.name
    clone_public_dataset(task.dataset_id, dataset_name=dataset_name)

    for model_name, model in tests:
        print()
        print(f"Benchmarking {task.name} with model: {model_name}")
        eval_config = task.get_eval_config()

        # Pass the shared limiter to the factory; it was previously created
        # but never used, so the runs were not throttled.
        agent_factory = StandardAgentFactory(
            task, model, prompt, rate_limiter=rate_limiter
        )

        client.run_on_dataset(
            dataset_name=dataset_name,
            llm_or_chain_factory=agent_factory,
            evaluation=eval_config,
            verbose=False,
            # Date and experiment id are included in the project name.
            project_name=f"{model_name}-{task.name}-{today}-{experiment_uuid}",
            tags=[model_name],
            concurrency_level=5,
            project_metadata={
                "model": model_name,
                "id": experiment_uuid,
                "task": task.name,
                "date": today,
                "langchain_benchmarks_version": __version__,
            },
        )