# Benchmark

Let's benchmark against all tool usage tasks. 

Expand the `tests` list below to benchmark additional models.

In [1]:
import datetime

from langsmith.client import Client

from langchain_benchmarks import (
    __version__,
    clone_public_dataset,
    model_registry,
    registry,
)
from langchain_benchmarks.rate_limiting import RateLimiter
from langchain_benchmarks.tool_usage.agents import (
    AnthropicToolUserFactory,
    CustomAgentFactory,
    OpenAIAgentFactory,
)

Prior to starting the tests, you may want to verify
that the task you're working with and the models are properly defined.

In [2]:
# Look up a single registered task by name for the sanity check in the next cell.
task = registry["Multiverse Math"]

In [3]:
# Build an agent for the selected task and ask it a single question to check
# that the task and the model are wired up correctly.
# This model follows the OpenAI function format.
agent_factory = OpenAIAgentFactory(task, model="mistral-7b-instruct-v0.1")
# Alternative model for the same check:
# agent_factory = OpenAIAgentFactory(task, model="gpt-3.5-turbo-1106")
agent_factory().invoke({"question": "(2 + 5) and then to the power of 0.5"})

{'input': '(2 + 5) and then to the power of 0.5',
 'output': 'The answer is 192.54605765894036.',
 'intermediate_steps': [(OpenAIToolAgentAction(tool='add', tool_input={'a': 2, 'b': 5}, log="\nInvoking: `add` with `{'a': 2, 'b': 5}`\n\n\n", message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_8149a232b2db4b0c92d8c80576f0047a', 'function': {'arguments': '{"a": 2, "b": 5}', 'name': 'add'}, 'type': 'function'}]})], tool_call_id='call_8149a232b2db4b0c92d8c80576f0047a'),
   8.2),
  (OpenAIToolAgentAction(tool='power', tool_input={'a': 8.2, 'b': 0.5}, log="\nInvoking: `power` with `{'a': 8.2, 'b': 0.5}`\n\n\n", message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_f6a87225d4bd45348e60cbbe622495cf', 'function': {'arguments': '{"a": 8.2, "b": 0.5}', 'name': 'power'}, 'type': 'function'}]})], tool_call_id='call_f6a87225d4bd45348e60cbbe622495cf'),
   192.54605765894036)]}

Let's make an experiment id

In [4]:
# Suffix appended to every project name created by this run.
# Or generate a random one with uuid.uuid4().hex[:4] (requires `import uuid`).
experiment_uuid = "woof"

Define the test cases

In [5]:
tests = [
    # Each entry is a 2-tuple of (architecture, model name).
    # The architecture string selects the agent factory in the benchmark loop;
    # that loop handles "openai_functions", "custom_agent" and
    # "anthropic_tool_user" and raises ValueError for anything else.
    ("anthropic_tool_user", "claude-2.1"),
    ("openai_functions", "mistral-7b-instruct-v0.1"),
    ("openai_functions", "gpt-3.5-turbo-1106"),
    ("openai_functions", "gpt-3.5-turbo-0613"),
    ("openai_functions", "gpt-4-1106-preview"),
    ("openai_functions", "gpt-4-0613"),
]

## Run

In [7]:
import os

In [6]:
client = Client()  # LangSmith client used to run the evaluations below
today = datetime.date.today().isoformat()
# One limiter instance is shared by the OpenAI and custom agent factories.
rate_limiter = RateLimiter(requests_per_second=2)

# NOTE: this loop rebinds the notebook-level `task` defined in an earlier cell;
# after it finishes, `task` is the last task yielded by the registry.
for task in registry:
    # Only tool-usage tasks are benchmarked here.
    if task.type != "ToolUsageTask":
        continue

    # The cloned dataset is named after the task.
    dataset_name = task.name
    clone_public_dataset(task.dataset_id, dataset_name=dataset_name)

    for arch, model in tests:
        print()
        print(f"Benchmarking {task.name} with model: {model} and arch: {arch}")
        eval_config = task.get_eval_config()

        # Pick the agent factory that matches the requested architecture.
        if arch == "openai_functions":
            agent_factory = OpenAIAgentFactory(
                task, model=model, rate_limiter=rate_limiter
            )
        elif arch == "custom_agent":
            agent_factory = CustomAgentFactory(
                task, model=model, rate_limiter=rate_limiter
            )
        elif arch == "anthropic_tool_user":
            # NOTE(review): neither `model` nor `rate_limiter` is passed here, so
            # the model recorded in the project metadata below may not be the one
            # the factory actually uses -- confirm against the factory's defaults.
            agent_factory = AnthropicToolUserFactory(task)
        else:
            # Fail with a message that names the offending entry in `tests`.
            raise ValueError(
                f"Unknown architecture {arch!r} for model {model!r}; expected one of "
                "'openai_functions', 'custom_agent', 'anthropic_tool_user'."
            )

        # The project name embeds model, task, date and experiment id, so the
        # Inspect section can find the run again by its experiment-id suffix.
        client.run_on_dataset(
            dataset_name=dataset_name,
            llm_or_chain_factory=agent_factory,
            evaluation=eval_config,
            verbose=False,
            project_name=f"{model}-{task.name}-{today}-{experiment_uuid}",
            tags=[model],
            concurrency_level=5,
            project_metadata={
                "model": model,
                "id": experiment_uuid,
                "task": task.name,
                "date": today,
                "langchain_benchmarks_version": __version__,
                "arch": arch,
            },
        )

Dataset Tool Usage - Typewriter (1 tool) already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3.

Benchmarking Tool Usage - Typewriter (1 tool) with model: claude-2.1 and arch: anthropic_tool_user
View the evaluation results for project 'claude-2.1-Tool Usage - Typewriter (1 tool)-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3/compare?selectedSessions=2d749062-a6a5-474d-99ce-60f6c847375c

View all tests for Dataset Tool Usage - Typewriter (1 tool) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3
[------------------------------------------------->] 20/20
Benchmarking Tool Usage - Typewriter (1 tool) with model: mistral-7b-instruct-v0.1 and arch: openai_functions
View the evaluation results for project 'mistral-7b-instruct-

Chain failed for example 8a2b5450-dd16-4213-8b70-cb2583d6c7eb with inputs {'question': 'student'}
Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': "An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids did not have response messages: call_EXTKZSoSqyJufBYv64TAm018", 'type': 'invalid_request_error', 'param': 'messages', 'code': None}}


[------------------------------------------------->] 20/20
Benchmarking Tool Usage - Typewriter (1 tool) with model: gpt-4-1106-preview and arch: openai_functions
View the evaluation results for project 'gpt-4-1106-preview-Tool Usage - Typewriter (1 tool)-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3/compare?selectedSessions=81e16b5e-ab5b-4886-86ec-c08c3d6b92fb

View all tests for Dataset Tool Usage - Typewriter (1 tool) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3
[------->                                          ] 3/20

Chain failed for example c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a with inputs {'question': 'communication'}
Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': "An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids did not have response messages: call_YaB1u6onJCRe3TBblCZaVxkA", 'type': 'invalid_request_error', 'param': 'messages', 'code': None}}


[------------------------------------------------->] 20/20
Benchmarking Tool Usage - Typewriter (1 tool) with model: gpt-4-0613 and arch: openai_functions
View the evaluation results for project 'gpt-4-0613-Tool Usage - Typewriter (1 tool)-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3/compare?selectedSessions=5ca2a98a-8b30-4f90-97dd-9b2eefa9a591

View all tests for Dataset Tool Usage - Typewriter (1 tool) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3
[>                                                 ] 0/20

Chain failed for example 08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a with inputs {'question': 'university'}
Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': "An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids did not have response messages: call_X0Z2LtrSXeCpqF0qEdWPWMtH", 'type': 'invalid_request_error', 'param': 'messages', 'code': None}}


[->                                                ] 1/20

Chain failed for example 9017ddcc-d3bd-45a8-88dd-70906964586b with inputs {'question': 'dictionary'}
Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': "An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids did not have response messages: call_vd5tg67PGyjHRdFZdZ5hgoF9", 'type': 'invalid_request_error', 'param': 'messages', 'code': None}}


[----------->                                      ] 5/20

Chain failed for example 8a2b5450-dd16-4213-8b70-cb2583d6c7eb with inputs {'question': 'student'}
Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': "An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids did not have response messages: call_uc1oozJfGTvYEfIzzcsfXfOl", 'type': 'invalid_request_error', 'param': 'messages', 'code': None}}


[--------------------->                            ] 9/20

Chain failed for example 5daad87c-a008-49ab-841c-76916b150f4d with inputs {'question': 'house'}
Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': "An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids did not have response messages: call_lZDuTxM5Thzwc1bfJVKsN7Pm", 'type': 'invalid_request_error', 'param': 'messages', 'code': None}}


[------------------------------------------------->] 20/20Dataset Tool Usage - Typewriter (26 tools) already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478.

Benchmarking Tool Usage - Typewriter (26 tools) with model: claude-2.1 and arch: anthropic_tool_user
View the evaluation results for project 'claude-2.1-Tool Usage - Typewriter (26 tools)-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=59ef70f6-a0d2-4d3f-a1d9-265b9d736254

View all tests for Dataset Tool Usage - Typewriter (26 tools) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478
[------------------------------------------------->] 20/20
Benchmarking Tool Usage - Typewriter (26 tools) with model: mistral-7b-instruct-v0.1 and arch: openai_fun

Chain failed for example 23649150-3c39-4beb-ba5d-c50ff1c66c63 with inputs {'question': 'church'}
Error Type: InternalServerError, Message: Error code: 504 - {'generated_text': None, 'tool_calls': None, 'embedding_outputs': None, 'num_input_tokens': None, 'num_input_tokens_batch': None, 'num_generated_tokens': None, 'num_generated_tokens_batch': None, 'preprocessing_time': None, 'generation_time': None, 'timestamp': 1702701535.5182757, 'finish_reason': None, 'error': {'message': 'Your request has exceeded the timeout of 3 minutes. This may be caused by excessive traffic against Anyscale EndpointsPlease either use streaming to hold a longer connection, or update your prompt to shorten the response time. (Request ID: 8J-AVRZ2lIjAvoI83OBRYSd5VnsmsdPjU76WfZyRSOo)', 'internal_message': 'rayllm.backend.server.openai_compat.openai_exception.OpenAIHTTPException: 504 (Request ID: 8J-AVRZ2lIjAvoI83OBRYSd5VnsmsdPjU76WfZyRSOo)', 'code': 504, 'type': 'OpenAIHTTPException', 'param': {}}, 'num_total_t

[-------------------------------------------->     ] 18/20

Chain failed for example 818869de-fc7c-45e6-9d24-2871eda9c081 with inputs {'question': 'hand'}
Error Type: InternalServerError, Message: Error code: 504 - {'generated_text': None, 'tool_calls': None, 'embedding_outputs': None, 'num_input_tokens': None, 'num_input_tokens_batch': None, 'num_generated_tokens': None, 'num_generated_tokens_batch': None, 'preprocessing_time': None, 'generation_time': None, 'timestamp': 1702701543.895307, 'finish_reason': None, 'error': {'message': 'Your request has exceeded the timeout of 3 minutes. This may be caused by excessive traffic against Anyscale EndpointsPlease either use streaming to hold a longer connection, or update your prompt to shorten the response time. (Request ID: FuqWl9qSiivXLxvy_ZzLnl1js_I7ZOUZzIGno_iqPmU)', 'internal_message': 'rayllm.backend.server.openai_compat.openai_exception.OpenAIHTTPException: 504 (Request ID: FuqWl9qSiivXLxvy_ZzLnl1js_I7ZOUZzIGno_iqPmU)', 'code': 504, 'type': 'OpenAIHTTPException', 'param': {}}, 'num_total_toke

[----------------------------------------------->  ] 19/20

Chain failed for example 607d5f26-c165-4034-b5f9-0f592913cb71 with inputs {'question': 'dog'}
Error Type: InternalServerError, Message: Error code: 504 - {'generated_text': None, 'tool_calls': None, 'embedding_outputs': None, 'num_input_tokens': None, 'num_input_tokens_batch': None, 'num_generated_tokens': None, 'num_generated_tokens_batch': None, 'preprocessing_time': None, 'generation_time': None, 'timestamp': 1702701547.4154704, 'finish_reason': None, 'error': {'message': 'Your request has exceeded the timeout of 3 minutes. This may be caused by excessive traffic against Anyscale EndpointsPlease either use streaming to hold a longer connection, or update your prompt to shorten the response time. (Request ID: 3D-YtEj-K6-A3VswxzH0TgUdRboWP52pXwXCTDqnxCU)', 'internal_message': 'rayllm.backend.server.openai_compat.openai_exception.OpenAIHTTPException: 504 (Request ID: 3D-YtEj-K6-A3VswxzH0TgUdRboWP52pXwXCTDqnxCU)', 'code': 504, 'type': 'OpenAIHTTPException', 'param': {}}, 'num_total_toke

[------------------------------------------------->] 20/20
Benchmarking Tool Usage - Typewriter (26 tools) with model: gpt-3.5-turbo-1106 and arch: openai_functions
View the evaluation results for project 'gpt-3.5-turbo-1106-Tool Usage - Typewriter (26 tools)-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=a7a8b8e7-cefa-4147-ba34-929ac5bc12dd

View all tests for Dataset Tool Usage - Typewriter (26 tools) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478
[>                                                 ] 0/20

Chain failed for example 54e4c8e2-d85b-4652-99e8-f91496bc6c4e with inputs {'question': 'university'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 0f3c71115a5a990643ef7d0be5eb0b7c in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[---->                                             ] 2/20

Chain failed for example e5bc36cc-e077-4a6f-80a0-9ce1794a77e5 with inputs {'question': 'communication'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 086c0205400563e5fe66c893367c58db in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[--------->                                        ] 4/20

Chain failed for example ff31e6be-4d37-4c29-b869-9b3a2075ff25 with inputs {'question': 'computer'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 86a3452f2c554778caa5fcc4b7c3a5bc in your email.)', 'type': 'server_error', 'param': None, 'code': None}}
Chain failed for example 80264d75-4aa4-484a-bbdc-4f82f930a69d with inputs {'question': 'student'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 4b43112cf53a05987bd46a908574349c in your email.)', 'type': 'server_error', 'param': None, 'code': None}}

[-------------->                                   ] 6/20

Chain failed for example f78f3a19-9f51-474c-9632-c5c43ff8da2e with inputs {'question': 'teacher'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 95ab8156ff646e7b4048b5d3a784e7d9 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[----------------->                                ] 7/20

Chain failed for example 23649150-3c39-4beb-ba5d-c50ff1c66c63 with inputs {'question': 'church'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID c2f5b270d0e638f0a3c059540d6aefe6 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[------------------->                              ] 8/20

Chain failed for example 0e6b2b64-57b1-4e8a-8611-4edab1cea326 with inputs {'question': 'school'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 968fcd16776b0c7364df1b214a1c6b1a in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[--------------------->                            ] 9/20

Chain failed for example 8af5bd36-fc11-4b23-9019-f642cfaf8a01 with inputs {'question': 'horse'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID fcd45b1b69a26065316de72fd0bec5ee in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[------------------------>                         ] 10/20

Chain failed for example c1a0336d-ae2f-4cf3-a204-58eec17330e7 with inputs {'question': 'house'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 18a0312e0c33e0e24381676b67751b60 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[--------------------------->                      ] 11/20

Chain failed for example 7083dccd-2397-47ab-b2c4-216e6177f5eb with inputs {'question': 'head'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 74e03a7a3d7e58300261063f424c120d in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[----------------------------->                    ] 12/20

Chain failed for example 818869de-fc7c-45e6-9d24-2871eda9c081 with inputs {'question': 'hand'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 93a3d69e89477c416c13e3b68fcc6cf2 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}
Chain failed for example de38ad8a-ca82-44d6-a4ba-ff3bd5a6640e with inputs {'question': 'cat'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 5b1f5f98c1aba8c9e4b1730dd679a0ba in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[---------------------------------->               ] 14/20

Chain failed for example 607d5f26-c165-4034-b5f9-0f592913cb71 with inputs {'question': 'dog'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID dafebdfdd71fba8868825a6daefa530a in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[----------------------------------------------->  ] 19/20

Chain failed for example 45a3aa01-9158-4adc-807b-b79c5e11e7db with inputs {'question': 'dictionary'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID b0949a00e6975d982ce3b593e6c52808 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[------------------------------------------------->] 20/20
Benchmarking Tool Usage - Typewriter (26 tools) with model: gpt-3.5-turbo-0613 and arch: openai_functions
View the evaluation results for project 'gpt-3.5-turbo-0613-Tool Usage - Typewriter (26 tools)-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=d13d0ec3-151b-463a-85dd-ea6564f8c8c9

View all tests for Dataset Tool Usage - Typewriter (26 tools) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478
[>                                                 ] 0/20

Chain failed for example 2d4e99fc-8495-468e-8429-6c25a2d176f3 with inputs {'question': 'keyboard'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 3143d6b7e43251f800acb20f1c5b15cf in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[-------------->                                   ] 6/20

Chain failed for example 8af5bd36-fc11-4b23-9019-f642cfaf8a01 with inputs {'question': 'horse'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 4706b2d842a40fcadb5a5e1a38bdec55 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[------------------------------------------------->] 20/20
Benchmarking Tool Usage - Typewriter (26 tools) with model: gpt-4-1106-preview and arch: openai_functions
View the evaluation results for project 'gpt-4-1106-preview-Tool Usage - Typewriter (26 tools)-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=1c885b09-212d-4608-bd3b-6e1b27964a88

View all tests for Dataset Tool Usage - Typewriter (26 tools) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478
[------------------------------------------------->] 20/20
Benchmarking Tool Usage - Typewriter (26 tools) with model: gpt-4-0613 and arch: openai_functions
View the evaluation results for project 'gpt-4-0613-Tool Usage - Typewriter (26 tools)-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b

Chain failed for example c4af9bd6-84c5-4b25-ac0a-04c307fc7441 with inputs {'question': 'information'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID fa4be24c98131a3426aebab3538f9a92 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[-------------->                                   ] 6/20

Chain failed for example e5bc36cc-e077-4a6f-80a0-9ce1794a77e5 with inputs {'question': 'communication'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID a1bc46fafb2620ed793db3a29345e369 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[------------------------>                         ] 10/20

Chain failed for example f78f3a19-9f51-474c-9632-c5c43ff8da2e with inputs {'question': 'teacher'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 8a3c32e3a21f70d19927fe4a69f30bfc in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[------------------------------------------------->] 20/20Dataset Tool Usage - Relational Data already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826.

Benchmarking Tool Usage - Relational Data with model: claude-2.1 and arch: anthropic_tool_user
View the evaluation results for project 'claude-2.1-Tool Usage - Relational Data-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826/compare?selectedSessions=1aabaef6-fc88-4771-8211-d67a16810d1d

View all tests for Dataset Tool Usage - Relational Data at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826
[-------------------------------->                 ] 14/21

Chain failed for example f8657d6a-1cd9-4f9e-84e1-0f4a0703e494 with inputs {'question': 'Is it likely that Donna is awake right now?'}
Error Type: ValueError, Message: invalid literal for int() with base 10: 'find_users_by_name("Donna")[0].id'


[----------------------------------->              ] 15/21

Chain failed for example 33002acd-a844-4a3f-8a4e-99fdf3af1cb6 with inputs {'question': 'Is it likely that Donna is outside with an umbrella at this time?'}
Error Type: ValueError, Message: invalid literal for int() with base 10: 'find_users_by_name("Donna")[0].id'


[------------------------------------------------->] 21/21
Benchmarking Tool Usage - Relational Data with model: mistral-7b-instruct-v0.1 and arch: openai_functions
View the evaluation results for project 'mistral-7b-instruct-v0.1-Tool Usage - Relational Data-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826/compare?selectedSessions=a60f355a-94c6-47e9-8a14-9a26a465de28

View all tests for Dataset Tool Usage - Relational Data at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826
[------------------------------------------------->] 21/21
Benchmarking Tool Usage - Relational Data with model: gpt-3.5-turbo-1106 and arch: openai_functions
View the evaluation results for project 'gpt-3.5-turbo-1106-Tool Usage - Relational Data-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-e

Chain failed for example 2a20a13d-050e-4a16-84ff-22d9582f1449 with inputs {'question': 'after calculating the sin of 1.5 radians, divide the result by cos of 1.5 radians'}
Error Type: ZeroDivisionError, Message: float division by zero


[------------------------------------------------->] 10/10
Benchmarking Multiverse Math with model: gpt-3.5-turbo-1106 and arch: openai_functions
View the evaluation results for project 'gpt-3.5-turbo-1106-Multiverse Math-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0/compare?selectedSessions=91df1a9b-d6d3-44aa-99c6-6e98401f3ac7

View all tests for Dataset Multiverse Math at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0
[------------------------------------------------->] 10/10
Benchmarking Multiverse Math with model: gpt-3.5-turbo-0613 and arch: openai_functions
View the evaluation results for project 'gpt-3.5-turbo-0613-Multiverse Math-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0/compare?selectedSessions=b934ef76-8ad1-4b71-bae0-b95d4a1ef7d5

Vi

Chain failed for example 67867526-791a-452f-b534-ef2c1f5efd20 with inputs {'question': 'ecoli divides every 20 minutes. How many cells will be there after 2 hours if we start with 5 cells?'}
Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 28a9d136aed7faacc59b14d4351430dd in your email.)', 'type': 'server_error', 'param': None, 'code': None}}


[------------------------------------------------->] 10/10
Benchmarking Multiverse Math with model: gpt-4-0613 and arch: openai_functions
View the evaluation results for project 'gpt-4-0613-Multiverse Math-2023-12-15-woof' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0/compare?selectedSessions=ea96a024-5987-47d6-9650-832bbbd4aa67

View all tests for Dataset Multiverse Math at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0
[------------------------------------------------->] 10/10

## Inspect

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from langsmith.client import Client

Let's fetch all the data that has the same experiment ID and place it in a dataframe.

In [None]:
# Project-name suffixes to collect. The Run section names each project
# "{model}-{task}-{date}-{experiment_uuid}", so a project is selected when its
# name ends with one of these ids.
# NOTE(review): the Run section above uses experiment_uuid = "woof"; "3f3e"
# appears to refer to a different run -- confirm this is the intended experiment.
experiment_ids = ["3f3e"]


def _endswith(s, suffixes):
    return any(s.endswith(suffix) for suffix in suffixes)


client = Client()

# Keep only the projects whose name ends with one of the experiment ids.
projects = [
    project
    for project in client.list_projects()
    if _endswith(project.name, experiment_ids)
]

dfs = []
for project in projects:
    # Temporary way to get tag information
    project_info = client.read_project(project_id=project.id)
    try:
        test_results = client.get_test_results(project_name=project.name)
    except Exception as exc:
        # Best effort: one unreadable project should not abort the whole
        # collection, but report it so missing data is not silently ignored.
        print(f"Skipping project {project.name!r}: {type(exc).__name__}: {exc}")
        continue

    # Attach the project metadata (model, task, arch, ...) to every result row.
    for k, v in project_info.extra["metadata"].items():
        test_results[k] = v

    dfs.append(test_results)

if not dfs:
    # pd.concat([]) would raise a ValueError too, but without saying why.
    raise ValueError(
        f"No test results found for projects ending with any of {experiment_ids!r}."
    )

df = pd.concat(dfs)

Compute a standardized "correct" column. It uses "Correct Final State" for the Typewriter tasks, and "correctness" (which is based on the output) for the other tasks.

In [None]:
# Pick the feedback column that defines correctness for each row: Typewriter
# tasks are judged on their final state, every other task on its output.
correct = [
    r["feedback.Correct Final State"]
    if "Typewriter" in r["task"]
    else r["feedback.correctness"]
    for r in df.to_dict(orient="records")
]

df["correct"] = correct
# Rows with missing feedback count as incorrect. Assign the result back instead
# of calling fillna(..., inplace=True) on the column selection: that chained
# in-place form acts on a temporary object and does not update `df` when
# pandas Copy-on-Write is active.
df["correct"] = df["correct"].fillna(0)

Compute some statistics. We estimate the standard error of the mean assuming a Bernoulli process.

In [None]:
# Aggregate per (task, model): number of evaluated rows and number correct.
# Both aggregates use the same key order so the join aligns on identical indexes.
group_keys = ["task", "model"]
total = df.groupby(group_keys).size().to_frame("total")
num_correct = df.groupby(group_keys)["correct"].sum().to_frame("num_correct")
stats_df = total.join(num_correct)

# Despite its name, "% correct" holds a fraction (num_correct / total),
# not a percentage.
stats_df["% correct"] = stats_df["num_correct"] / stats_df["total"]
# Standard error of a Bernoulli proportion: sqrt(p * (1 - p) / n).
stats_df["error"] = np.sqrt(
    stats_df["% correct"] * (1 - stats_df["% correct"]) / stats_df["total"]
)

# Models to keep, in plotting order; rows for any other model are dropped below.
models = [
    "llama-v2-70b-chat-fw",
    "mixtral-8x7b-instruct-fw",
    "claude-2",
    "claude-2.1",
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo-1106",
    "gpt-4-0613",
    "gpt-4-1106-preview",
]

# Tasks in the order they are laid out along the x-axis of the plot.
tasks = [
    "Tool Usage - Typewriter (1 tool)",
    "Tool Usage - Typewriter (26 tools)",
    "Multiverse Math",
    "Tool Usage - Relational Data",
]

# Turn the (task, model) index back into columns and filter to the known models.
stats_df = stats_df.reset_index()
stats_df = stats_df[stats_df["model"].isin(models)]

Plot the result

In [None]:
x = np.arange(len(tasks))  # the label locations
width = 0.08  # the width of the bars
bar_step = width * 1.4  # horizontal distance between neighbouring models' bars
multiplier = 0  # number of models plotted so far

fig, ax = plt.subplots(layout="constrained", figsize=(16, 4))
colormap = plt.get_cmap("Set2").colors

for idx, model in enumerate(models):
    # Boolean-mask selection always yields a DataFrame (``.loc[model]`` returns a
    # Series when a model has a single row) and lets models without results be
    # skipped explicitly instead of through a bare ``except``.
    results = stats_df[stats_df["model"] == model]
    if results.empty:
        continue

    # Cycle through the palette so that having more models than colors
    # does not raise an IndexError.
    color = colormap[idx % len(colormap)]

    # Order the rows like `tasks` so that bar i belongs to tick i.
    results = results.set_index("task").loc[tasks]
    measurement = results["% correct"]

    values = [round(m, 2) for m in measurement]

    offset = bar_step * multiplier
    rects = ax.bar(
        x + offset, values, width, label=model, yerr=results["error"], color=color
    )
    ax.bar_label(rects, padding=3)
    multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel("% Questions Answered Correctly")
ax.set_title("Tool Usage Performance")
# Center each tick label under its group of bars, however many models were drawn.
group_center = bar_step * max(multiplier - 1, 0) / 2
ax.set_xticks(x + group_center, tasks)
ax.legend(
    loc="center left", ncols=1, bbox_to_anchor=(1.0, 0.5), frameon=False, title="Model"
)
ax.set_ylim(0, 1.10)

plt.show()