In [1]:
import os
from uuid import uuid4

In [2]:

unique_id = uuid4().hex[0:8]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"Tracing Walkthrough - {unique_id}"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY")

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [3]:
from langsmith import Client

client = Client()

In [4]:
from langchain import hub
from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad import format_to_openai_function_messages
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_openai import ChatOpenAI

# Fetches the latest version of this prompt
prompt = hub.pull("wfh/langsmith-agent-prompt:5d466cbc")

llm = ChatOpenAI(
    model="gpt-3.5-turbo-16k",
    temperature=0,
)

tools = [
    DuckDuckGoSearchResults(
        name="duck_duck_go"
    ),  # General internet search using DuckDuckGo
]

llm_with_tools = llm.bind_functions(tools)

runnable_agent = (
    {
        "input": lambda x: x["input"],
        "agent_scratchpad": lambda x: format_to_openai_function_messages(
            x["intermediate_steps"]
        ),
    }
    | prompt
    | llm_with_tools
    | OpenAIFunctionsAgentOutputParser()
)

agent_executor = AgentExecutor(
    agent=runnable_agent, tools=tools, handle_parsing_errors=True
)

In [5]:
inputs = [
    "What is Tesla?",
    "What's Elon Musk's Net Worth?",
    "When was Llama-v2 released?",
    "What is the langsmith cookbook?",
    "When did langchain first announce the hub?",
]

results = agent_executor.batch([{"input": x} for x in inputs], return_exceptions=True)

In [6]:
results[:2]

[{'input': 'What is Tesla?',
  'output': "Tesla, Inc. is an American electric vehicle and clean energy company. It was founded in 2003 by a group of engineers who wanted to prove that electric cars could be better than gasoline-powered cars. Tesla is known for its innovative electric vehicles, such as the Model S, Model 3, Model X, and Model Y, as well as its energy storage products like the Powerwall and Powerpack. The company is also involved in solar energy through its acquisition of SolarCity. Tesla's mission is to accelerate the world's transition to sustainable energy."},
 {'input': "What's Elon Musk's Net Worth?",
  'output': "According to Forbes, Elon Musk's net worth is estimated to be $240.7 billion as of July 2023. Bloomberg and Forbes rank him as the world's richest person, with estimates ranging from $198 billion to $220 billion as of November 2023."}]

## Create a LangSmith Datset

In [7]:
outputs = [
    "LangChain is an open-source framework for building applications using large language models. It is also the name of the company building LangSmith.",
    "LangSmith is a unified platform for debugging, testing, and monitoring language model applications and agents powered by LangChain",
    "July 18, 2023",
    "The langsmith cookbook is a github repository containing detailed examples of how to use LangSmith to debug, evaluate, and monitor large language model-powered applications.",
    "September 5, 2023",
]

In [8]:
dataset_name = f"agent-qa-{unique_id}"

dataset = client.create_dataset(
    dataset_name,
    description="An example dataset of questions over the LangSmith documentation.",
)

client.create_examples(
    inputs=[{"input": query} for query in inputs],
    outputs=[{"output": answer} for answer in outputs],
    dataset_id=dataset.id,
)

In [9]:
from langchain import hub
from langchain.agents import AgentExecutor, AgentType, initialize_agent, load_tools
from langchain.agents.format_scratchpad import format_to_openai_function_messages
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain_openai import ChatOpenAI


# Since chains can be stateful (e.g. they can have memory), we provide
# a way to initialize a new chain for each row in the dataset. This is done
# by passing in a factory function that returns a new chain for each row.
def create_agent(prompt, llm_with_tools):
    runnable_agent = (
        {
            "input": lambda x: x["input"],
            "agent_scratchpad": lambda x: format_to_openai_function_messages(
                x["intermediate_steps"]
            ),
        }
        | prompt
        | llm_with_tools
        | OpenAIFunctionsAgentOutputParser()
    )
    return AgentExecutor(agent=runnable_agent, tools=tools, handle_parsing_errors=True)

In [10]:
from langsmith.evaluation import EvaluationResult, run_evaluator
from langsmith.schemas import Example, Run


@run_evaluator
def check_not_idk(run: Run, example: Example):
    """Illustration of a custom evaluator."""
    agent_response = run.outputs["output"]
    if "don't know" in agent_response or "not sure" in agent_response:
        score = 0
    else:
        score = 1
    # You can access the dataset labels in example.outputs[key]
    # You can also access the model inputs in run.inputs[key]
    return EvaluationResult(
        key="not_uncertain",
        score=score,
    )

In [11]:
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig

evaluation_config = RunEvalConfig(
    # Evaluators can either be an evaluator type (e.g., "qa", "criteria", "embedding_distance", etc.) or a configuration for that evaluator
    evaluators=[
        # Measures whether a QA response is "Correct", based on a reference answer
        # You can also select via the raw string "qa"
        EvaluatorType.QA,
        # Measure the embedding distance between the output and the reference answer
        # Equivalent to: EvalConfig.EmbeddingDistance(embeddings=OpenAIEmbeddings())
        EvaluatorType.EMBEDDING_DISTANCE,
        # Grade whether the output satisfies the stated criteria.
        # You can select a default one such as "helpfulness" or provide your own.
        RunEvalConfig.LabeledCriteria("helpfulness"),
        # The LabeledScoreString evaluator outputs a score on a scale from 1-10.
        # You can use default criteria or write our own rubric
        RunEvalConfig.LabeledScoreString(
            {
                "accuracy": """
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor errors or omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference."""
            },
            normalize_by=10,
        ),
    ],
    # You can add custom StringEvaluator or RunEvaluator objects here as well, which will automatically be
    # applied to each prediction. Check out the docs for examples.
    custom_evaluators=[check_not_idk],
)

In [12]:
from langchain import hub

# We will test this version of the prompt
prompt = hub.pull("wfh/langsmith-agent-prompt:798e7324")

In [13]:
import functools

from langchain.smith import arun_on_dataset, run_on_dataset

chain_results = run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=functools.partial(
        create_agent, prompt=prompt, llm_with_tools=llm_with_tools
    ),
    evaluation=evaluation_config,
    verbose=True,
    client=client,
    project_name=f"runnable-agent-test-5d466cbc-{unique_id}",
    # Project metadata communicates the experiment parameters,
    # Useful for reviewing the test results
    project_metadata={
        "env": "testing-notebook",
        "model": "gpt-3.5-turbo",
        "prompt": "5d466cbc",
    },
)

# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.
# These are logged as warnings here and captured as errors in the tracing UI.

View the evaluation results for project 'runnable-agent-test-5d466cbc-706e4766' at:
https://smith.langchain.com/o/ec3ee2ee-c547-52f1-bb9a-30955085cc0b/datasets/f31cb9c7-1279-47cf-bc9f-9c02908d3092/compare?selectedSessions=c00f3f1b-8a35-4ede-b9b0-eb92d91d3281

View all tests for Dataset agent-qa-706e4766 at:
https://smith.langchain.com/o/ec3ee2ee-c547-52f1-bb9a-30955085cc0b/datasets/f31cb9c7-1279-47cf-bc9f-9c02908d3092
[>                                                 ] 0/5

  warn_deprecated(
Task was destroyed but it is pending!
task: <Task pending name='Task-22' coro=<AsyncDDGS.__aexit__() running at /Users/brianroepke/Projects/deadpool-llm/venv/lib/python3.11/site-packages/duckduckgo_search/duckduckgo_search_async.py:46>>
  self._ready.clear()
Task was destroyed but it is pending!
task: <Task pending name='Task-2' coro=<AsyncCurl._force_timeout() running at /Users/brianroepke/Projects/deadpool-llm/venv/lib/python3.11/site-packages/curl_cffi/aio.py:168> wait_for=<Future pending cb=[Task.__wakeup()]>>
Task was destroyed but it is pending!
task: <Task pending name='Task-27' coro=<AsyncDDGS.__aexit__() running at /Users/brianroepke/Projects/deadpool-llm/venv/lib/python3.11/site-packages/duckduckgo_search/duckduckgo_search_async.py:46>>
Task was destroyed but it is pending!
task: <Task pending name='Task-5' coro=<AsyncCurl._force_timeout() running at /Users/brianroepke/Projects/deadpool-llm/venv/lib/python3.11/site-packages/curl_cffi/aio.py:168> wait_for=<F

[------------------------------------------------->] 5/5

Unnamed: 0,feedback.correctness,feedback.embedding_cosine_distance,feedback.helpfulness,feedback.score_string:accuracy,feedback.not_uncertain,error,execution_time,run_id
count,5.0,5.0,5.0,5.0,5.0,0.0,5.0,5
unique,,,,,,0.0,,5
top,,,,,,,,1ac50369-9c90-4951-adc8-3acc5c514932
freq,,,,,,,,1
mean,0.6,0.195385,1.0,0.94,1.0,,7.655407,
std,0.547723,0.123317,0.0,0.134164,0.0,,2.562278,
min,0.0,0.042215,1.0,0.7,1.0,,5.701448,
25%,0.0,0.138944,1.0,1.0,1.0,,6.259131,
50%,1.0,0.153003,1.0,1.0,1.0,,7.043874,
75%,1.0,0.30356,1.0,1.0,1.0,,7.159218,


In [14]:
runs = client.list_runs(project_name=chain_results["project_name"], execution_order=1)

In [15]:
# After some time, these will be populated.
client.read_project(project_name=chain_results["project_name"]).feedback_stats

{'correctness': {'n': 5, 'avg': 0.6},
 'embedding_cosine_distance': {'n': 5, 'avg': 0.19536},
 'helpfulness': {'n': 5, 'avg': 1.0},
 'not_uncertain': {'n': 5, 'avg': 1.0},
 'score_string:accuracy': {'n': 5, 'avg': 0.9400000000000001}}