In [17]:
from langsmith import Client

client = Client()

In [30]:
from langchain.chat_models import ChatOpenAI
from langchain.agents import AgentType, initialize_agent, load_tools, AgentExecutor
from langchain.agents.format_scratchpad import format_to_openai_functions
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.tools.render import format_tool_to_openai_function
from langchain import hub


# Since chains can be stateful (e.g. they can have memory), we provide
# a way to initialize a new chain for each row in the dataset. This is done
# by passing in a factory function that returns a new chain for each row.
def agent_factory():    
    llm_with_tools = llm.bind(
        functions=[format_tool_to_openai_function(t) for t in tools]
    )
    runnable_agent = (
            {
                "input": lambda x: x["input"],
                "agent_scratchpad": lambda x: format_to_openai_functions(x['intermediate_steps'])
            } 
             | prompt 
             | llm_with_tools 
             | OpenAIFunctionsAgentOutputParser()
    )
    return  AgentExecutor(agent=runnable_agent, tools=tools, handle_parsing_errors=True)

In [31]:
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig

evaluation_config = RunEvalConfig(
    # Evaluators can either be an evaluator type (e.g., "qa", "criteria", "embedding_distance", etc.) or a configuration for that evaluator
    evaluators=[
        # Measures whether a QA response is "Correct", based on a reference answer
        # You can also select via the raw string "qa"
        EvaluatorType.QA,

    ],
    # You can add custom StringEvaluator or RunEvaluator objects here as well, which will automatically be
    # applied to each prediction. Check out the docs for examples.
    custom_evaluators=[],
)

# How to test intermediate results

Dataset prep: define a new dataset, now we are adding the expected intermediate steps to this dataset. Note that here you can add whatever you want to use in your evaluation logic.

In [36]:
import uuid

questions = [
    (
        "Why was was a $10 calculator app one of the best-rated Nintendo Switch games?", 
        {
            "reference": "It became an internet meme due to its high price point.",
            "expected_steps": ["duck_duck_go"], 
        }
    ),
    (
        "hi", 
        {
            "reference": "Hello, how can I assist you?",
            "expected_steps": [], # Expect a direct response
        }
    ),
    (
        "Who is Dejan Trajkov?", 
         {
             "reference": "Macedonian Professor, Immunologist and Physician",
             "expected_steps": ["duck_duck_go"], 
         }
    ),
    (
         "Who won the 2023 U23 world wresting champs (men's freestyle 92 kg)", 
         {
             "reference": "Muhammed Gimri from turkey",
             "expected_steps": ["duck_duck_go"], 
         }
    ),
    (
        "What's my first meeting on Friday?", 
        {
             "reference": 'Your first meeting is 8:30 AM for "Team Standup"',
             "expected_steps": ["check_calendar"], # Only expect calendar tool
         }
    ),
]

uid = uuid.uuid4()
dataset_name = f"Agent Eval Example {uid}"
ds = client.create_dataset(
    dataset_name=dataset_name,
    description="An example agent evals dataset using search and calendar checks."
)
client.create_examples(
    inputs=[{"question": q[0]} for q in questions],
    outputs=[q[1] for q in questions],
    dataset_id=ds.id,
)

In [45]:
from langchain import hub
from langchain.tools import WikipediaQueryRun
from langchain.utilities import WikipediaAPIWrapper
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad import format_to_openai_functions
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.chat_models import ChatOpenAI
from langchain.tools import DuckDuckGoSearchResults
from langchain.tools.render import format_tool_to_openai_function
from langchain.tools import tool
from datetime import datetime

@tool
def check_calendar(date: datetime, run_manager = None) -> list:
    """Check the user's calendar for a meetings on the specified day."""
    # A placeholder to demonstrate with multiple tools.
    # It's easy to mock tools when testing.
    if date.weekday() == 4:
        return [
            "8:30 : Team Standup",
            "9:00 : 1 on 1",
            "9:45 design review",
        ]
    return ["Focus time"] # If only...
    
    

def agent_factory():    
    llm = ChatOpenAI(
        model="gpt-3.5-turbo-16k",
        temperature=0,
    )
    tools = [
        DuckDuckGoSearchResults(
            name="duck_duck_go"
        ),  # General internet search using DuckDuckGo
        check_calendar,
    ]        
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful assistant."),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
        ("user", "{input}"),
    ])

    runnable_agent = (
            {
                "input": lambda x: x["question"],
                "agent_scratchpad": lambda x: format_to_openai_functions(x['intermediate_steps'])
            } 
             | prompt 
             | llm_with_tools 
             | OpenAIFunctionsAgentOutputParser()
    )
    
    return AgentExecutor(
        agent=runnable_agent, 
        tools=tools, 
        handle_parsing_errors=True, 
        return_intermediate_steps=True,
    )

## 2. define a custom evaluator to evaluate the agent trajectory

In [52]:
from typing import Optional
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run


class AgentTrajectoryEvaluator(RunEvaluator):

    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        if run.outputs is None:
            raise ValueError("Run outputs cannot be None")
        # This is the output of each run
        intermediate_steps = run.outputs["intermediate_steps"]
        # Since we are comparing to the tool names, we now need to get that
        # Intermediate steps is a Tuple[AgentAction, Any]
        # The first element is the action taken
        # The second element is the observation from taking that action
        trajectory = [action.tool for action, _ in intermediate_steps]
        # This is what we uploaded to the dataset
        expected_trajectory = example.outputs['expected_steps']
        # Just score it based on whether it is correct or not
        score = int(trajectory == expected_trajectory)
        return EvaluationResult(key="Intermediate steps correctness", score=score)

Third: add that evaluator to the `custom_evaluators` list

Also, specify the reference key (since we have multiple outputs in our dataset now)

In [53]:
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig

evaluation_config = RunEvalConfig(
    # Evaluators can either be an evaluator type (e.g., "qa", "criteria", "embedding_distance", etc.) or a configuration for that evaluator
    evaluators=[
        # Measures whether a QA response is "Correct", based on a reference answer
        # You can also select via the raw string "qa"
        EvaluatorType.QA,

    ],
    # You can add custom StringEvaluator or RunEvaluator objects here as well, which will automatically be
    # applied to each prediction. Check out the docs for examples.
    custom_evaluators=[AgentTrajectoryEvaluator()],
    # We now need to specify this because we have multiple outputs in our dataset
    reference_key="reference",
)

In [None]:
from langchain.smith import (
    arun_on_dataset,
    run_on_dataset, 
)

chain_results = run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=agent_factory,
    evaluation=evaluation_config,
    verbose=True,
    client=client,
    # project_name=f"runnable-agent-test-5d466cbc-0130",
    tags=["testing-notebook", "prompt:5d466cbc"],  # Optional, adds a tag to the resulting chain runs
)

View the evaluation results for project 'test-pertinent-payment-16' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/c04d309b-9751-4972-a901-b7864b48bdaa
[--------->                                        ] 1/5

Chain failed for example 3b626f03-d0db-4cbf-b085-2f0fb414d1fd with inputs {'question': "What's my first meeting on Friday?"}
Error Type: ValidationError, Message: 1 validation error for check_calendarSchemaSchema
date
  invalid datetime format (type=value_error.datetime)


[--------------------------------------->          ] 4/5