# LangWatch Evaluation Tracking

## Step 1: Define our LLM pipeline

Let's create a simple RAG pipeline using LangChain, guaranteeing that we can get the output and the retrieved documents used during generation.

In [1]:
from dotenv import load_dotenv

load_dotenv(dotenv_path="langwatch/python-sdk/.env")

import langwatch

from langchain.prompts import ChatPromptTemplate

from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores.faiss import FAISS
from langchain_core.vectorstores.base import VectorStoreRetriever
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.tools.retriever import create_retriever_tool
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain.tools import BaseTool, StructuredTool, tool
from langchain_core.documents import Document


loader = WebBaseLoader("https://docs.langwatch.ai")
docs = loader.load()
documents = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
).split_documents(docs)

vector = FAISS.from_documents(documents, OpenAIEmbeddings())
retriever = vector.as_retriever()

retrieved_documents = []

# Wrap the FAISS retriever so that we can capture which documents were used to generate the response
@tool
def langwatch_search(
    query: str
) -> list[Document]:
    """"Search for information about LangWatch. For any questions about LangWatch, use this tool if you didn't already"""

    global retrieved_documents
    retrieved_documents = retriever.get_relevant_documents(query)
    return retrieved_documents

tools = [langwatch_search]
model = ChatOpenAI(model="gpt-4o-mini")
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant that only reply in short tweet-like responses, use tools only once.\n\n{agent_scratchpad}",
        ),
        ("human", "{question}"),
    ]
)
agent = create_tool_calling_agent(model, tools, prompt)
executor = AgentExecutor(agent=agent, tools=tools, verbose=False)  # type: ignore

@langwatch.trace()
def execute_rag_pipeline(question: str):
    response = executor.invoke({"question": question})["output"]
    contexts = [d.page_content for d in retrieved_documents]
    return response, contexts

response, contexts = execute_rag_pipeline("What is LangWatch?")

print("")
print("retrieved_documents:", contexts)
print("output:", response)

USER_AGENT environment variable not set, consider setting it to identify your requests.


2025-05-24 10:04:14,630 - langwatch.utils.initialization - INFO - Setting up LangWatch client...
2025-05-24 10:04:14,636 - langwatch.client - INFO - Configuring OTLP exporter with endpoint: http://localhost:5560/api/otel/v1/traces
2025-05-24 10:04:14,636 - langwatch.client - INFO - Registering atexit handler to flush tracer provider on exit
2025-05-24 10:04:14,636 - langwatch.client - INFO - Successfully configured tracer provider with OTLP exporter
2025-05-24 10:04:14,637 - langwatch.utils.initialization - INFO - LangWatch client setup complete


  retrieved_documents = retriever.get_relevant_documents(query)



retrieved_documents: ['Introduction - LangWatchLangWatch home pageSearch...llms.txtSupportDashboardlangwatch/langwatchlangwatch/langwatchSearch...NavigationGet StartedIntroductionDocumentationOpen DashboardGitHub RepoGet StartedIntroductionSelf HostingCookbooksLLM ObservabilityOverviewConceptsLanguage APIs & SDKsUser EventsMonitoring & AlertsCode ExamplesLLM EvaluationOffline EvaluationReal-Time EvaluationList of EvaluatorsDatasetsAnnotationsLLM DevelopmentPrompt Optimization StudioDSPy VisualizationLangWatch MCPPrompt VersioningAPI EndpointsTracesPromptsAnnotationsDatasetsSupportTroubleshooting and SupportStatus PageGet StartedIntroductionCopy pageWelcome to LangWatch, the all-in-one open-source LLMops platform.LangWatch allows you to track, monitor, guardrail and evaluate your LLMs apps for measuring quality and alert on issues.\nFor domain experts, it allows you to easily sift through conversations, see topics being discussed and annotate and score messages', 'For domain experts, i

## Step 2: Run the Offline Evaluation

Now we can use the dataset we have from LangWatch to run a batch evaluation experiment through our LLM pipeline, to see the results and tweak it for optimizations.

In [2]:
import langwatch
import pandas as pd

# Create a dataset
df = pd.DataFrame(
    [
        {
            "question": "What is LangWatch?",
            "answer": "LangWatch is a platform for evaluating and improving language models.",
        },
        {
            "question": "How do I use LangWatch?",
            "answer": "You can use LangWatch by installing the LangWatch SDK and then calling the LangWatch API.",
        },
        {
            "question": "Does LangWatch support multiple language models?",
            "answer": "Yes, LangWatch is compatible with all language models by using LiteLLM under the hood.",
        },
        {
            "question": "Can I visualize evaluation metrics in LangWatch?",
            "answer": "Yes, LangWatch provides dashboards for visualizing key evaluation metrics.",
        },
        {
            "question": "Is there a free tier for LangWatch?",
            "answer": "LangWatch offers a free tier with limited usage, ideal for small projects and evaluation.",
        },
        {
            "question": "Where can I find documentation for LangWatch?",
            "answer": "You can find the official documentation on the LangWatch website or GitHub repository.",
        },
    ]
)
# Or retrieve it from LangWatch:
# df = langwatch.dataset.get_dataset("CEtFivQeya4kyAzy9eJht").to_pandas()  # dataset--rSAYL4HxQRXHSayq6c7A

evaluation = langwatch.evaluation.init("my-incredible-experiment")

for index, row in evaluation.loop(df.iterrows()):
    def evaluate(index, row):
        response, contexts = execute_rag_pipeline(row["question"])

        evaluation.run(
            "ragas/faithfulness",
            name="Faithfulness",
            index=index,
            data={
                "input": row["question"],
                "output": response,
                "contexts": contexts,
            },
            settings={
                "model": "openai/gpt-4o-mini",
                "max_tokens": 2048,
                "autodetect_dont_know": True,
            },
        )
    evaluation.submit(evaluate, index, row)

Follow the results at: http://localhost:5560/inbox-narrator/experiments/my-incredible-experiment?runId=hopping-goat-of-glory


Evaluating:   0%|          | 0/6 [00:00<?, ?it/s]

Failed to detach context
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/.venv/lib/python3.11/site-packages/opentelemetry/context/__init__.py", line 155, in detach
    _RUNTIME_CONTEXT.detach(token)
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/.venv/lib/python3.11/site-packages/opentelemetry/context/contextvars_context.py", line 53, in detach
    self._current_context.reset(token)
ValueError: <Token var=<ContextVar name='current_context' default={} at 0x10b2d4d10> at 0x31ab882c0> was created in a different Context
Failed to detach context
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/.venv/lib/python3.11/site-packages/pydantic/type_adapter.py", line 271, in _init_core_attrs
    self.core_schema = _getattr_no_parents(self._type, '__pydantic_core_schema__')
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/r