In [1]:
from langsmith import Client

client = Client()

In [2]:
from langchain.tools import tool

In [3]:
# Demo tool: ignores `city` and always returns the same canned report.
# NOTE(review): the docstring below is left untouched because the `@tool`
# decorator presumably uses it as the tool description — confirm before editing.
# NOTE(review): this tool is not part of the `tools` list in the visible cells,
# so the agent built in this notebook does not call it as written.
@tool
def get_weather(city: str) -> str:
    """Get a weather report for a given city."""
    # Hard-coded response; no weather service is queried.
    return "52 degrees F. Scattered showers throughout the day, becoming more strong at 11:35am."

In [4]:
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

In [7]:
# Text of the system message used when the chat prompt is built.
template = """You are a helpful assistant."""

In [8]:
# Chat prompt made of three parts, in order: the system message, the user's
# question (the "input" variable), and a placeholder for agent scratchpad messages.
prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("user", "{input}"),
    # Filled from the "agent_scratchpad" variable supplied to the prompt.
    MessagesPlaceholder(variable_name="agent_scratchpad")
])

In [10]:
from langchain import hub
from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad import format_to_openai_functions
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.chat_models import ChatOpenAI
from langchain.tools import DuckDuckGoSearchResults
from langchain.tools.render import format_tool_to_openai_function

# Chat model used by the agent; temperature is set to 0.
llm = ChatOpenAI(
    model="gpt-3.5-turbo-16k",
    temperature=0,
)

# Tools the agent is allowed to call. The expected trajectory stored in the
# dataset later in this notebook uses this same "duck_duck_go" name.
tools = [
    DuckDuckGoSearchResults(
        name="duck_duck_go"
    ),  # General internet search using DuckDuckGo
]

# Bind one OpenAI function definition per tool to the model.
llm_with_tools = llm.bind(functions=[format_tool_to_openai_function(t) for t in tools])

# Agent pipeline: map the incoming dict to the prompt variables, render the
# prompt, call the tool-aware model, then parse the model's reply.
runnable_agent = (
    {
        # Pass the user's question through unchanged.
        "input": lambda x: x["input"],
        # Convert the steps taken so far into scratchpad messages for the prompt.
        "agent_scratchpad": lambda x: format_to_openai_functions(
            x["intermediate_steps"]
        ),
    }
    | prompt
    | llm_with_tools
    | OpenAIFunctionsAgentOutputParser()
)

# Executor that runs the agent with the tool list above.
# NOTE(review): handle_parsing_errors=True presumably feeds parser failures back
# to the model instead of raising — confirm against the AgentExecutor docs.
agent_executor = AgentExecutor(
    agent=runnable_agent, tools=tools, handle_parsing_errors=True
)

In [39]:
# Example questions: one that explicitly asks for a web search and one plain greeting.
inputs = [
    "What is LangSmith? Use duck duck go to look it up",
    "Hi",
]

# Run the agent once per question in a single batch call.
# NOTE(review): with return_exceptions=True, failures are presumably returned
# inside `results` rather than raised — confirm. `results` is not inspected in
# the visible cells.
results = agent_executor.batch([{"input": x} for x in inputs], return_exceptions=True)

In [41]:
# Reference answers, aligned by position with `inputs` (the two lists are
# zipped together when the dataset examples are created).
outputs = [
    "LangSmith is a unified platform for debugging, testing, and monitoring language model applications and agents powered by LangChain",
    "Hi"
]

In [42]:
# Name of the LangSmith dataset that stores the question / reference-answer pairs.
dataset_name = "agent-qa-0124"

# zip() silently drops trailing items when its arguments differ in length, so
# check alignment up front, before an incomplete dataset is created.
if len(inputs) != len(outputs):
    raise ValueError(
        f"inputs and outputs must have the same length, "
        f"got {len(inputs)} and {len(outputs)}"
    )

# NOTE(review): the dataset is created unconditionally; re-running this cell
# with the same name will presumably fail or duplicate data — confirm.
dataset = client.create_dataset(
    dataset_name, description="An example dataset of questions over the LangSmith documentation."
)

# One example per question: the question is the input, the reference answer the output.
for query, answer in zip(inputs, outputs):
    client.create_example(inputs={"input": query}, outputs={"output": answer}, dataset_id=dataset.id)

In [43]:
from langchain.chat_models import ChatOpenAI
from langchain.agents import AgentType, initialize_agent, load_tools, AgentExecutor
from langchain.agents.format_scratchpad import format_to_openai_functions
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.tools.render import format_tool_to_openai_function
from langchain import hub


# Since chains can be stateful (e.g. they can have memory), we provide
# a way to initialize a new chain for each row in the dataset. This is done
# by passing in a factory function that returns a new chain for each row.
def agent_factory(return_intermediate_steps: bool = False):
    """Build and return a new AgentExecutor each time it is called.

    Uses the notebook-level `llm`, `tools` and `prompt` objects.

    Args:
        return_intermediate_steps: Forwarded to ``AgentExecutor``. Defaults to
            ``False``, so calling the factory with no arguments behaves as
            before. Pass ``True`` when an evaluator needs the
            "intermediate_steps" entry in the run outputs.

    Returns:
        A freshly constructed ``AgentExecutor``.
    """
    # Bind one OpenAI function definition per tool to the model.
    llm_with_tools = llm.bind(
        functions=[format_tool_to_openai_function(t) for t in tools]
    )
    runnable_agent = (
        {
            # Pass the user's question through unchanged.
            "input": lambda x: x["input"],
            # Convert the steps taken so far into scratchpad messages.
            "agent_scratchpad": lambda x: format_to_openai_functions(x["intermediate_steps"]),
        }
        | prompt
        | llm_with_tools
        | OpenAIFunctionsAgentOutputParser()
    )
    return AgentExecutor(
        agent=runnable_agent,
        tools=tools,
        handle_parsing_errors=True,
        return_intermediate_steps=return_intermediate_steps,
    )

In [44]:
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig

# Evaluation settings for the first run: only the built-in QA evaluator is
# enabled and no custom evaluators are registered.
evaluation_config = RunEvalConfig(
    # Evaluators can either be an evaluator type (e.g., "qa", "criteria", "embedding_distance", etc.) or a configuration for that evaluator
    evaluators=[
        # Measures whether a QA response is "Correct", based on a reference answer
        # You can also select via the raw string "qa"
        EvaluatorType.QA,

    ],
    # You can add custom StringEvaluator or RunEvaluator objects here as well, which will automatically be
    # applied to each prediction. Check out the docs for examples.
    custom_evaluators=[],
)

In [45]:
from langchain.smith import (
    arun_on_dataset,
    run_on_dataset, 
)

# Run the agent produced by `agent_factory` over the dataset and apply
# `evaluation_config` to the results.
# NOTE(review): the project name is a fixed string; re-running this cell may
# require choosing a new name — confirm.
chain_results = run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=agent_factory,
    evaluation=evaluation_config,
    verbose=True,
    client=client,
    project_name="runnable-agent-test-5d466cbc-0124",
    tags=["testing-notebook", "prompt:5d466cbc"],  # Optional, adds a tag to the resulting chain runs
)

View the evaluation results for project 'runnable-agent-test-5d466cbc-0124' at:
https://dev.smith.langchain.com/o/40ec80b0-9ee0-4eb0-aa65-e97158d191c5/projects/p/6282437a-da57-4317-9e0b-1759b9ecc923
[------------------------------------------------->] 2/2
 Eval quantiles:
             0.25  0.5  0.75  mean  mode
correctness   1.0  1.0   1.0   1.0   1.0


# How to test intermediate results

Dataset prep: define a new dataset that also stores the expected intermediate steps for each example. Note that you can store whatever extra fields your evaluation logic needs.

In [51]:
# Name of the LangSmith dataset that also stores the expected tool calls.
dataset_name = "agent-qa-intermediate-0124"

# Expected tool names per example, aligned by position with `inputs` and `outputs`.
intermediate_steps = [
    # First datapoint we expect to call duck_duck_go once
    ["duck_duck_go"],
    # The next datapoint we expect to call no tools
    [],
]

# zip() silently drops trailing items when its arguments differ in length, so
# check alignment up front, before an incomplete dataset is created.
if not (len(inputs) == len(outputs) == len(intermediate_steps)):
    raise ValueError(
        f"inputs, outputs and intermediate_steps must have the same length, "
        f"got {len(inputs)}, {len(outputs)} and {len(intermediate_steps)}"
    )

# NOTE(review): the dataset is created unconditionally; re-running this cell
# with the same name will presumably fail or duplicate data — confirm.
dataset = client.create_dataset(
    dataset_name, description="An example dataset of questions over the LangSmith documentation."
)

# Each example stores the reference answer under "output" and the expected
# tool names under "steps".
for query, answer, steps in zip(inputs, outputs, intermediate_steps):
    client.create_example(inputs={"input": query}, outputs={"output": answer, "steps": steps}, dataset_id=dataset.id)

First: add `return_intermediate_steps=True` to the `AgentExecutor`.

In [46]:
def agent_factory():
    """Build a new AgentExecutor that also returns its intermediate steps.

    Uses the notebook-level `llm`, `tools` and `prompt` objects and constructs
    a fresh executor on every call.
    """
    # One OpenAI function definition per tool, bound to the model.
    function_specs = [format_tool_to_openai_function(t) for t in tools]
    model_with_functions = llm.bind(functions=function_specs)

    # Maps the executor's input dict onto the two variables the prompt expects.
    prompt_inputs = {
        "input": lambda x: x["input"],
        "agent_scratchpad": lambda x: format_to_openai_functions(x["intermediate_steps"]),
    }

    pipeline = (
        prompt_inputs
        | prompt
        | model_with_functions
        | OpenAIFunctionsAgentOutputParser()
    )

    executor = AgentExecutor(
        agent=pipeline,
        tools=tools,
        handle_parsing_errors=True,
        return_intermediate_steps=True,
    )
    return executor

Second: define a custom evaluator to evaluate the agent trajectory

In [59]:
from typing import Optional
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run


class AgentTrajectoryEvaluator(RunEvaluator):
    """Score a run by comparing the tools it called with the expected tool names.

    The run must expose an "intermediate_steps" output, and the dataset example
    must store the expected tool names under the "steps" output key.
    """

    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        """Return a 0/1 score for whether the tool sequence matches exactly.

        Args:
            run: The agent run to score; its outputs must contain
                "intermediate_steps".
            example: The dataset example holding the expected tool names under
                outputs["steps"].

        Returns:
            An EvaluationResult with key "Intermediate steps correctness" and a
            score of 1 when the sequences are equal, otherwise 0.

        Raises:
            ValueError: If the run has no outputs or no "intermediate_steps",
                or if the example is missing or has no "steps" output.
        """
        if run.outputs is None:
            raise ValueError("Run outputs cannot be None")
        if "intermediate_steps" not in run.outputs:
            raise ValueError(
                "Run outputs have no 'intermediate_steps'; build the AgentExecutor "
                "with return_intermediate_steps=True"
            )
        # `example` is Optional in the signature, so guard before dereferencing it.
        if example is None or example.outputs is None:
            raise ValueError("A reference example with outputs is required")
        if "steps" not in example.outputs:
            raise ValueError("Example outputs have no 'steps' entry to compare against")

        # This is the output of each run
        intermediate_steps = run.outputs["intermediate_steps"]
        # Each intermediate step is an (action, observation) pair; only the
        # name of the tool used by the action is compared.
        trajectory = [action.tool for action, _ in intermediate_steps]
        # This is what we uploaded to the dataset
        expected_trajectory = example.outputs["steps"]
        # Just score it based on whether it is correct or not
        score = int(trajectory == expected_trajectory)
        return EvaluationResult(key="Intermediate steps correctness", score=score)

Third: add that evaluator to the `custom_evaluators` list

Also, specify the reference key (since we have multiple outputs in our dataset now)

In [60]:
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig

# Evaluation settings for the intermediate-steps run: the built-in QA evaluator
# plus the custom trajectory evaluator, with "output" as the reference field.
evaluation_config = RunEvalConfig(
    # Evaluators can either be an evaluator type (e.g., "qa", "criteria", "embedding_distance", etc.) or a configuration for that evaluator
    evaluators=[
        # Measures whether a QA response is "Correct", based on a reference answer
        # You can also select via the raw string "qa"
        EvaluatorType.QA,

    ],
    # You can add custom StringEvaluator or RunEvaluator objects here as well, which will automatically be
    # applied to each prediction. Check out the docs for examples.
    custom_evaluators=[AgentTrajectoryEvaluator()],
    # We now need to specify this because we have multiple outputs in our dataset
    reference_key="output",
)

In [61]:
from langchain.smith import (
    arun_on_dataset,
    run_on_dataset, 
)

# Run the agent produced by `agent_factory` over the intermediate-steps dataset
# and apply `evaluation_config` to the results.
# NOTE(review): the project name is a fixed string; re-running this cell may
# require choosing a new name — confirm.
chain_results = run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=agent_factory,
    evaluation=evaluation_config,
    verbose=True,
    client=client,
    project_name="runnable-agent-test-5d466cbc-0130",
    tags=["testing-notebook", "prompt:5d466cbc"],  # Optional, adds a tag to the resulting chain runs
)

View the evaluation results for project 'runnable-agent-test-5d466cbc-0130' at:
https://dev.smith.langchain.com/o/40ec80b0-9ee0-4eb0-aa65-e97158d191c5/projects/p/c1cabd91-60d0-4f55-b8c5-498380eb81ef
[------------------------>                         ] 1/2[]
[------------------------------------------------->] 2/2['duck_duck_go']

 Eval quantiles:
                                0.25  0.5  0.75  mean  mode
Intermediate steps correctness  1.00  1.0  1.00   1.0   1.0
correctness                     0.25  0.5  0.75   0.5   0.0
