## Agent Evaluation

When building a chat bot, it can be hard to evaluate the multi-turn interactions.

One way to evaluate is by simulating a conversation with a user, then evaluating the outcomes and trajectory.

Below is an example of how to do this in LangGraph.

In [1]:
%%capture --no-stderr
%pip install -U langchain langchain_openai langsmith pandas

In [2]:
import getpass
import os
import uuid


def _set_if_undefined(var: str) -> None:
    """Prompt for an environment variable when it is unset or empty.

    Args:
        var: Name of the environment variable to populate.

    The value is read without echo and stored in ``os.environ``; a variable
    that already has a non-empty value is left untouched.
    """
    if not os.environ.get(var):
        # ``getpass`` is imported as a module, so the prompt function has to
        # be reached as ``getpass.getpass``; calling the module object itself
        # raises ``TypeError: 'module' object is not callable``.
        os.environ[var] = getpass.getpass(f"Please provide your {var}")


# Ask for every credential that is not already present in the environment.
_set_if_undefined("OPENAI_API_KEY")
_set_if_undefined("LANGCHAIN_API_KEY")
_set_if_undefined("TAVILY_API_KEY")

# Optional: turn on LangSmith tracing, which helps you visualize and debug
# the control flow, and group the runs under a dedicated project name.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "Agent Simulation Evaluation",
    }
)

## Make a user proxy 

This is fairly unstructured right now; the UX of this step still needs further thought.

In [3]:
# Create a human proxy
import operator
from typing import Annotated, List, TypedDict

from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

from langgraph.graph import END, StateGraph


def create_user_proxy(llm):
    """Build the runnable that plays the simulated human user.

    Args:
        llm: Chat model that role-plays the human character.

    Returns:
        A runnable that fills the role-play prompt from the ``name``,
        ``system_prompt`` and ``messages`` input keys, calls ``llm``, and
        wraps the reply text as ``{"messages": [HumanMessage(...)]}``.
    """
    # We would likely want to provide more system prompt customization here
    role_play_instructions = (
        "You are role-playing the human character: {name}. "
        "You are not an AI assistant and you are not supposed to help or assist."
        " You must behave as this human would throughout the conversation below."
        " Your response will be labeled as a human message."
        " You will be evaluated based on how realistic your impersonation of"
        " this character is. Here are the details for your character, {name}:\n"
        "{system_prompt}"
        '\n\nWhen you are finished with the conversation, respond with a single word "FINISHED"'
    )
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", role_play_instructions),
            MessagesPlaceholder(variable_name="messages"),
        ]
    )
    # The model's reply is relabeled as a human message before it is returned.
    return (
        prompt
        | llm
        | (lambda reply: {"messages": [HumanMessage(content=reply.content)]})
    )

## Define the simulation constructor

Construct the graph that connects the user proxy and the evaluated model.

In [4]:
from typing import Callable

from langchain_core.runnables import chain
from langchain_openai import ChatOpenAI


class Environment(TypedDict):
    """State schema handed to ``StateGraph`` for the simulated conversation."""

    # Conversation history. Annotated with ``operator.add``; presumably
    # LangGraph uses this as the reducer so that each node's returned
    # ``messages`` list is concatenated onto the existing one — confirm
    # against the LangGraph state documentation.
    messages: Annotated[List[BaseMessage], operator.add]
    # Character description; fills the ``{system_prompt}`` slot of the
    # user-proxy prompt.
    system_prompt: str
    # Character name; fills the ``{name}`` slots of the user-proxy prompt and
    # labels the opening message built in ``enter``.
    name: str
    # Text of the opening user message (turned into a message by ``enter``).
    input: str


def should_continue(state: Environment):
    """Choose the edge to follow after the simulated user has spoken.

    Returns ``"end"`` when the content of the latest message, ignoring
    surrounding whitespace, ends with ``FINISHED``; otherwise ``"continue"``.
    """
    latest_text = state["messages"][-1].content
    user_is_done = latest_text.strip().endswith("FINISHED")
    return "end" if user_is_done else "continue"


@chain
def next_message(state: Environment):
    """Extract the latest message's text as the agent's ``input`` payload."""
    latest = state["messages"][-1]
    return {"input": latest.content}


def enter(inputs: dict):
    """Seed the conversation with the opening user message.

    Adds a ``messages`` entry to ``inputs`` (in place) holding one
    ``HumanMessage`` built from ``inputs["input"]`` and named after
    ``inputs["name"]``, then returns the same dict.
    """
    opening = HumanMessage(content=inputs["input"], name=inputs["name"])
    inputs["messages"] = [opening]
    return inputs


def exit(agent_output):
    """Convert the agent's ``output`` text into a ``messages`` state update."""
    # If we do an ai message here, the user proxy llm
    # will usually forget it's acting.
    agent_reply = HumanMessage(content=agent_output["output"])
    return {"messages": [agent_reply]}


def create_simulation(user_proxy_llm, agent_constructor: Callable):
    """Assemble the graph that lets a simulated user talk to the agent.

    Args:
        user_proxy_llm: Chat model that plays the human user.
        agent_constructor: Zero-argument callable returning the agent to
            evaluate.

    Returns:
        A runnable named "Agent Simulation" that first runs ``enter`` on the
        input and then the compiled graph. The graph starts at the agent,
        always hands over to the user, and loops back to the agent until
        ``should_continue`` reports "end".
    """
    simulated_user = create_user_proxy(user_proxy_llm)
    agent_under_test = agent_constructor()
    # give the agent a single string input, and turn its reply back into
    # a message update for the graph state
    agent_node = next_message | agent_under_test | exit

    workflow = StateGraph(Environment)
    workflow.add_node("user", simulated_user)
    workflow.add_node("agent", agent_node)
    workflow.add_edge("agent", "user")
    workflow.add_conditional_edges(
        "user",
        should_continue,
        {"end": END, "continue": "agent"},
    )
    workflow.set_entry_point("agent")

    simulation = enter | workflow.compile()
    return simulation.with_config(run_name="Agent Simulation")

## Create your agent

We make a constructor. There's some additional data munging to make the simulator output work with your model.

In [5]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory


def creat_conversational_agent():
    """Build the agent you want to evaluate.

    This may return any runnable; the one built here takes
    ``{"input": <text>}`` and returns ``{"output": <text>}``. Every call to
    this constructor creates its own history list, which the returned chain
    appends to on each invocation.
    """
    history = []

    def add_user_message(inputs):
        # Record the incoming user turn, then hand the full history to the prompt.
        history.append(HumanMessage(content=inputs["input"]))
        return {"messages": history}

    def add_ai_message(output):
        # Record the model's reply and expose only its text to the caller.
        history.append(output)
        return {"output": output.content}

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "you are a helpful assistant"),
            MessagesPlaceholder(variable_name="messages"),
        ]
    )
    model = ChatOpenAI(model="gpt-3.5-turbo")
    return add_user_message | prompt | model | add_ai_message

## Create a dataset

In [6]:
import langsmith

# LangSmith client; used here to seed the dataset and later to run the evaluation.
client = langsmith.Client()

# Each row is (character name, character description, opening message).
examples = [
    (
        "Adam",
        "You really needs to find the restroom. You only speak English.",
        "Hi there, where's the restroom?",
    ),
    (
        "Steve",
        "You are a developer who needs help with an error.",
        "How do I fix this? NameError: module terminator cannot be found",
    ),
    ("Jezebel", "You want to learn more about langchain", "So what's langchain?"),
]

dataset_name = "Agent Simulation"
# Only create and populate the dataset when it does not exist yet.
if not client.has_dataset(dataset_name=dataset_name):
    ds = client.create_dataset(dataset_name=dataset_name)
    client.create_examples(
        dataset_id=ds.id,
        inputs=[
            {"system_prompt": persona, "name": character, "input": opener}
            for character, persona, opener in examples
        ],
    )
# else:
#     client.delete_dataset(dataset_name=dataset_name)

## Define evaluators

We will use an LLM as judge here. In reality, you would probably add additional
context to the ground truth to make the results more reliable.

In [7]:
from langchain.load import load
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langsmith.evaluation import run_evaluator


class GradeConversation(BaseModel):
    """Grade the conversation based on quality."""

    # NOTE: the class docstring above is kept verbatim because it is part of
    # the schema sent to the model.
    #
    # Descriptions and bounds are declared through ``Field``. Plain string
    # metadata inside ``Annotated[...]`` is ignored by pydantic, and the
    # bound keywords are ``ge``/``le`` (``gte``/``lte`` are unknown extras
    # that declare no constraint).
    reasoning: str = Field(
        ...,
        description=(
            "The step-by-step reasoning explaining the conversation quality score"
        ),
    )
    score: int = Field(
        ...,
        ge=0,
        le=5,
        description="The score on a scale from 0 to 5",
    )


@run_evaluator
def conversation_quality(run, example):
    """Grade a simulated conversation with an LLM judge.

    Args:
        run: The evaluated run; its ``outputs`` hold the serialized
            simulation state, including the ``messages`` of the conversation.
        example: The dataset example for the run (currently unused).

    Returns:
        A dict with ``score`` normalized to the 0-1 range and ``comment``
        holding the judge's reasoning. A run without outputs scores 0 with
        the comment "Unfinished".
    """
    if not run.outputs:
        return {"score": 0, "comment": "Unfinished"}
    # TODO: Incorporate examples
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                # The leading space on the second literal keeps the two
                # sentences from being glued together as "conversation.Submit".
                "You are grading the quality of the following conversation."
                " Submit your response using the GradeConversation function.",
            ),
            MessagesPlaceholder(variable_name="messages"),
            ("system", "Review the conversation above and submit your score."),
        ]
    )
    # Force the judge to answer through the GradeConversation function so the
    # reply can be parsed as JSON.
    bound = ChatOpenAI(model="gpt-4-1106-preview").bind_functions(
        functions=[GradeConversation], function_call="GradeConversation"
    )
    grader = prompt | bound | JsonOutputFunctionsParser()
    # ``load`` revives the serialized outputs (e.g. the messages) into objects.
    result = grader.invoke(load(run.outputs))
    # The parser returns the raw function arguments without validating them,
    # so clamp the grade to the documented 0-5 scale before normalizing.
    raw_score = min(max(result["score"], 0), 5)
    score = raw_score / 5
    return {"score": score, "comment": result["reasoning"]}

In [8]:
# llm = ChatOpenAI(model="gpt-4-1106-preview")
# graph = create_simulation(llm, creat_conversational_agent)

# graph.invoke(
#     {
#         "system_prompt": "You really needs to find the restroom. You only speak English.",
#         "name": "Adam",
#         "input": "I really need to go",
#     }
# )

## Run Evaluation

In [9]:
import functools

from langchain.smith import RunEvalConfig

# Grade every run with the LLM-as-judge evaluator defined above.
config = RunEvalConfig(custom_evaluators=[conversation_quality])

# The simulation is handed over as a factory (a partial of create_simulation)
# rather than as an already-built graph; the agent constructor keeps its
# message history per constructed agent.
results = client.run_on_dataset(
    dataset_name=dataset_name,
    evaluation=config,
    llm_or_chain_factory=functools.partial(
        create_simulation,
        ChatOpenAI(model="gpt-4-1106-preview"),
        creat_conversational_agent,
    ),
)

  warn_deprecated(


View the evaluation results for project 'ample-color-99' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/e5fde795-82eb-4ca2-8f2e-9afa666b5c0f/compare?selectedSessions=61f4e55a-1496-49e7-8ef3-e39376a64d8a

View all tests for Dataset Agent Simulation at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/e5fde795-82eb-4ca2-8f2e-9afa666b5c0f
[>                                                 ] 0/3

  warn_beta(


[------------------------------------------------->] 3/3