In [1]:
import docs
import search_tools

In [2]:
from dataclasses import dataclass
from pydantic import BaseModel

In [3]:
@dataclass
class AgentConfig:
    """Tunable settings for the documentation search agent.

    The three retrieval fields are passed positionally, in this order, to
    ``search_tools.prepare_search_tools``; ``model`` is passed to ``Agent``.
    """

    # Chunk size handed to search_tools (units are defined there -- presumably characters; verify).
    chunk_size: int = 2000
    # Chunk step handed to search_tools (presumably the stride between chunk starts; verify).
    chunk_step: int = 1000
    # Presumably the number of hits returned per search -- TODO confirm in search_tools.
    top_k: int = 5

    # Model name given to the Agent constructor.
    model: str = "gpt-4o-mini"

In [4]:
# System prompt for the search agent; it is passed as `instructions` when the Agent is built.
# It refers to the two tools registered for the agent (`search`, `read_file`) and to the
# `found_answer` field of SearchResultArticle.
# NOTE(review): the 6-search cap is stated only in this prompt; the code in this notebook
# bounds runs with `max_turns` instead, so the cap is not enforced programmatically.
search_instructions = """
You are a search assistant for the Evidently documentation.

Evidently is an open-source Python library and cloud platform for evaluating, testing, and monitoring data and AI systems.
It provides evaluation metrics, testing APIs, and visual reports for model and data quality.

Your task is to help users find accurate, relevant information about Evidently's features, usage, and integrations.

You have access to the following tools:

- search — Use this to explore the topic and retrieve relevant snippets or documentation.
- read_file — Use this to retrieve or verify the complete content of a file when:
    * A code snippet is incomplete, truncated, or missing definitions.
    * You need to check that all variables, imports, and functions referenced in code are defined.
    * You must ensure the code example is syntactically correct and runnable.

If `read_file` cannot be used or the file content is unavailable, clearly state:
> "Unable to verify with read_file."

Search Strategy

- For every user query:
    * Perform at least 3 and at most 6 distinct searches to gather enough context.
    * Each search must use a different phrasing or keyword variation of the user's question.
    * Make sure that the search requests are relevant to evidently, testing, evaluating and monitoring AI systems.
    * No need to include "Evidently" in the search text.

- After collecting search results:
    1. Synthesize the information into a concise, accurate answer.
    2. If your answer includes code, always validate it with `read_file` before finalizing.
    3. If a code snippet or reference is incomplete, explicitly mention it.

Important:
- The 6-search limit applies only to `search` calls.
- You may call `read_file` at any time, even after the search limit is reached.
- `read_file` calls are verification steps and do not count toward the 6-search limit.

Code Verification and Completeness Rules

- All variables, functions, and imports in your final code examples must be defined or imported.
- Never shorten, simplify, or truncate code examples. Always present the full, verified version.
- When something is missing or undefined in the search results:
    * Call `read_file` with the likely filename to retrieve the complete file content.
    * Replace any partial code with the full verified version.
- If the file is not available or cannot be verified:
    * Include a clear note: "Unable to verify this code."
- Do not reformat, rename variables, or omit lines from the verified code.

Output Format

- Write your answer clearly and accurately.
- Include a "References" section listing the search queries or file names you used.
- If you couldn't find a complete answer after 6 searches, set found_answer = False.
"""


In [6]:
# One cited source in the agent's structured answer.
# Documented with comments rather than a docstring: pydantic may copy a class docstring into
# the JSON schema, and this model is part of the schema used as the agent's output_type.
class Reference(BaseModel):
    title: str  # printed as the link label by SearchResultHandler
    filename: str  # printed as the link target by SearchResultHandler

# One section of the generated article: a heading, its body text, and the sources cited for it.
# Comments are used instead of a docstring so the JSON schema derived from this model is unchanged.
class Section(BaseModel):
    heading: str  # rendered as a "## " heading by SearchResultHandler
    content: str  # streamed to the output chunk by chunk by SearchResultHandler
    references: list[Reference]  # may be empty (see the captured outputs in this notebook)

# Top-level structured answer; used as the `output_type` of the search agent.
# Comments are used instead of a docstring so the JSON schema derived from this model is unchanged.
class SearchResultArticle(BaseModel):
    found_answer: bool  # the prompt asks the model to set this to False when no complete answer was found
    title: str  # rendered as the "# " document title by SearchResultHandler
    sections: list[Section]
    references: list[Reference]  # article-level references, separate from the per-section lists


In [7]:
# Use the default settings declared on AgentConfig.
config = AgentConfig()

# Build the tools object that exposes `search` and `read_file` (both are registered with the
# agent in a later cell). Arguments are positional: chunk size, chunk step, top_k.
tools = search_tools.prepare_search_tools(
    config.chunk_size,
    config.chunk_step,
    config.top_k
)

In [8]:
from agents import Agent, function_tool

# Wrap the two retrieval helpers so the agent can call them as tools (search first, then read_file).
agent_tools = [function_tool(tool_fn) for tool_fn in (tools.search, tools.read_file)]

# The agent answers with a SearchResultArticle, i.e. its final output is structured.
search_agent = Agent(
    name='search',
    instructions=search_instructions,
    model=config.model,
    tools=agent_tools,
    output_type=SearchResultArticle,
)

In [11]:
from agents import Runner

# NOTE(review): `input` shadows the Python builtin of the same name. Later cells read this
# variable, so it is left as-is here; rename it across the whole notebook in one go.
input = 'data drift'
# Non-streaming run: the awaited call returns the run result once the agent has finished.
result = await Runner.run(search_agent, input=input)


In [13]:
# Display the structured answer (a SearchResultArticle, per the agent's output_type).
result.final_output

SearchResultArticle(found_answer=True, title='Data Drift Overview', sections=[Section(heading='What is Data Drift?', content='Data drift refers to the changes in data distribution over time, which can impact the performance of machine learning models. It is essential to monitor data drift because a model trained on historical data may fail to perform as expected when deployed on new data that has shifted significantly from the training set.', references=[]), Section(heading='How to Detect Data Drift', content='Evidently provides several methods to detect data drift. The core of its functionality is the `DataDriftPreset` that evaluates shifts in data distribution between current and reference datasets. Here’s a basic example of how to use this:\n\n```python\nreport = Report([\n    DataDriftPreset(),\n])\n\nmy_eval = report.run(current, reference)\n```\n\nThis preset checks for:\n- **Column Drift**: Evaluates each feature for shifts in distribution.\n- **Target/Prediction Drift**: If the

In [None]:
from openai.types.responses import ResponseTextDeltaEvent

result = Runner.run_streamed(
    search_agent,
    input=input,
    max_turns=3
)

async for event in result.stream_events():
    if event.type == "run_item_stream_event":
        if event.item.type == "tool_call_item":
            tool_call = event.item.raw_item
            f_name = tool_call.name
            args = tool_call.arguments
            print(f"TOOL CALL ({event.item.agent.name}): {f_name}({args})")
    
    if event.type == "raw_response_event" and isinstance(event.data, ResponseTextDeltaEvent):
        print(event.data.delta, end='', flush=True)


TOOL CALL (search): search({"query":"data drift in machine learning"})
TOOL CALL (search): search({"query":"how to detect data drift"})
TOOL CALL (search): search({"query":"data drift monitoring techniques"})
TOOL CALL (search): search({"query":"impact of data drift on AI models"})
TOOL CALL (search): search({"query":"strategies to handle data drift"})
TOOL CALL (search): read_file({"filename":"metrics/explainer_drift.mdx"})
TOOL CALL (search): read_file({"filename":"metrics/preset_data_drift.mdx"})
{"found_answer":true,"title":"Data Drift Overview","sections":[{"heading":"What is Data Drift?","content":"Data drift refers to changes in the statistical properties of a dataset over time. This phenomenon can occur when the data used to train a machine learning model no longer reflects the characteristics of the current operational environment, potentially leading to a degradation in model performance.","references":[]},{"heading":"How Data Drift Works in Evidently","content":"Evidently de

In [16]:
from jaxn import StreamingJSONParser, JSONParserHandler

class SearchResultHandler(JSONParserHandler):
    """Render a streamed ``SearchResultArticle`` JSON document as Markdown on stdout.

    Rendering rules:
      * the top-level ``title`` becomes a ``# `` heading;
      * every ``heading`` becomes a ``## `` heading;
      * ``content`` values are echoed chunk by chunk while they stream in;
      * each ``references`` array gets a "References" header whose level is
        derived from the nesting depth of its path, followed by one Markdown
        link per item.

    The "References" header is held back until the first item of the array
    arrives, so an empty ``references`` list no longer prints a dangling header.
    """

    # Header text of the `references` array currently being parsed, or None.
    # Kept as a class-level default so the base-class constructor is not overridden.
    _pending_references_header = None

    def on_field_start(self, path: str, field_name: str):
        """Remember the header for a `references` array; printing is deferred."""
        if field_name == "references":
            # Top-level path "" -> "##"; each additional "/" nests one level deeper.
            level = path.count("/") + 2
            self._pending_references_header = f"\n{'#' * level} References\n"

    def on_field_end(self, path, field_name, value, parsed_value=None):
        """Print headings once their value is complete and close open sections."""
        if field_name == "title" and path == "":
            # Only the article title; titles of Reference items have a non-empty path.
            print(f"# {value}")
        elif field_name == "heading":
            print(f"\n\n## {value}\n")
        elif field_name == "content":
            # The content itself was already streamed by on_value_chunk; just end the paragraph.
            print("\n")
        elif field_name == "references":
            # Array finished: drop a header that was never needed (empty list).
            self._pending_references_header = None

    def on_value_chunk(self, path, field_name, chunk):
        """Echo `content` text incrementally, as soon as each chunk is parsed."""
        if field_name == "content":
            print(chunk, end="", flush=True)

    def on_array_item_end(self, path, field_name, item=None):
        """Print one Markdown link per completed reference item."""
        if field_name != "references" or item is None:
            # `item` defaults to None; without this guard `item.get` would raise.
            return

        if self._pending_references_header is not None:
            # First item of this array: emit the deferred header now.
            print(self._pending_references_header)
            self._pending_references_header = None

        title = item.get("title", "")
        filename = item.get("filename", "")
        print(f"- [{title}]({filename})")

# Handler instance that the streaming cells below pass to their parsers.
handler = SearchResultHandler()
parser = StreamingJSONParser(handler)

# Usage: feed every streamed text delta to the parser as it arrives, e.g.
# parser.parse_incremental(event.data.delta)


In [17]:
result = Runner.run_streamed(
    search_agent,
    input=input,
    max_turns=3
)

parser = StreamingJSONParser(handler)

async for event in result.stream_events():
    if event.type == "run_item_stream_event":
        if event.item.type == "tool_call_item":
            tool_call = event.item.raw_item
            f_name = tool_call.name
            args = tool_call.arguments
            print(f"TOOL CALL ({event.item.agent.name}): {f_name}({args})")
    
    if event.type == "raw_response_event" and isinstance(event.data, ResponseTextDeltaEvent):
        parser.parse_incremental(event.data.delta)

TOOL CALL (search): search({"query":"data drift monitoring in machine learning"})
TOOL CALL (search): search({"query":"how to detect data drift in models"})
TOOL CALL (search): search({"query":"data drift evaluation metrics"})
TOOL CALL (search): search({"query":"preventing data drift in AI systems"})
TOOL CALL (search): search({"query":"data drift examples in AI"})
TOOL CALL (search): read_file({"filename":"metrics/explainer_drift.mdx"})
# Understanding Data Drift


## What is Data Drift?

Data drift refers to changes in the statistical properties of the data that a machine learning model is trained on. It can have a significant impact on model performance and can occur over time due to various reasons such as changing environments, new trends, or altered user behavior.


### References



## How Data Drift Detection Works

Evidently provides a default Data Drift Detection algorithm that compares the distributions of values in specified columns between two datasets, typically a curren

In [18]:
# New query for the following cells; in the captured runs below it ends with "too many turns".
input = 'llm as a judge'

In [19]:
from agents.exceptions import MaxTurnsExceeded

# Same streaming loop as before, now wrapped so that hitting `max_turns` does not abort the cell.
try:
    parser = StreamingJSONParser(handler)

    result = Runner.run_streamed(
        search_agent,
        input=input,
        max_turns=3
    )

    async for event in result.stream_events():
        if event.type == "run_item_stream_event":
            if event.item.type == "tool_call_item":
                tool_call = event.item.raw_item
                f_name = tool_call.name
                args = tool_call.arguments
                print(f"TOOL CALL ({event.item.agent.name}): {f_name}({args})")
        
        if event.type == "raw_response_event" and isinstance(event.data, ResponseTextDeltaEvent):
            parser.parse_incremental(event.data.delta)
except MaxTurnsExceeded as e:
    print('too many turns')
    # Assemble a follow-up conversation: everything exchanged so far plus a nudge to finish.
    finish_prompt = 'System message: The number of searches has exceeded the limit. Proceed to finishing the writeup'
    finish_message = [{'role': 'user', 'content': finish_prompt}]
    messages = result.to_input_list() + finish_message
    # NOTE(review): `messages` is only built here; this cell never re-runs the agent with it,
    # so `result.final_output` stays None. `run_stream` (defined later) performs that extra run.

TOOL CALL (search): search({"query":"using LLM as a judge in legal contexts"})
TOOL CALL (search): search({"query":"AI judge applications legal system"})
TOOL CALL (search): search({"query":"legal decision making AI models"})
TOOL CALL (search): search({"query":"AI in court decisions"})
TOOL CALL (search): search({"query":"machine learning models for judicial decisions"})
TOOL CALL (search): read_file({"filename":"examples/LLM_judge.mdx"})
TOOL CALL (search): read_file({"filename":"quickstart_llm.mdx"})
too many turns


In [22]:
# After the interrupted run above there is no final output yet (the captured output is None).
print(result.final_output)

None


In [23]:
async def run_stream(agent, input, handler, max_turns=3, finish_attempts=1):
    """Stream an agent run, rendering its JSON answer through ``handler``.

    Tool calls are logged to stdout as they happen, and text deltas of the
    final answer are fed to a ``StreamingJSONParser`` built around ``handler``.

    If the run exceeds ``max_turns``, the conversation so far is extended with
    a message asking the agent to finish the writeup, and the agent is run
    again with ``max_turns=1``. The number of such finishing runs is bounded
    by ``finish_attempts`` so a model that keeps calling tools cannot trigger
    unbounded recursion (and unbounded API calls).

    Args:
        agent: Agent to run.
        input: User query string, or a list of input messages.
        handler: JSON parser handler that renders the streamed answer.
        max_turns: Turn limit for this run.
        finish_attempts: How many finishing runs may still be started after
            ``MaxTurnsExceeded``. With 0 the exception is re-raised.

    Returns:
        The streaming result object of the run that completed.

    Raises:
        MaxTurnsExceeded: If the turn limit is hit and no finishing attempts remain.
    """
    # Created outside the `try` so `result` is always bound inside the handler below.
    result = Runner.run_streamed(
        agent,
        input=input,
        max_turns=max_turns
    )
    parser = StreamingJSONParser(handler)

    try:
        async for event in result.stream_events():
            if event.type == "run_item_stream_event":
                if event.item.type == "tool_call_item":
                    tool_call = event.item.raw_item
                    f_name = tool_call.name
                    args = tool_call.arguments
                    print(f"TOOL CALL ({event.item.agent.name}): {f_name}({args})")

            if event.type == "raw_response_event" and isinstance(event.data, ResponseTextDeltaEvent):
                parser.parse_incremental(event.data.delta)
    except MaxTurnsExceeded:
        if finish_attempts <= 0:
            # Budget exhausted: surface the error instead of recursing forever.
            raise

        print('too many turns')
        finish_prompt = 'System message: The number of searches has exceeded the limit. Proceed to finishing the writeup'
        finish_message = [{'role': 'user', 'content': finish_prompt}]
        # Carry over the whole conversation so the finishing run can use the gathered results.
        messages = result.to_input_list() + finish_message
        return await run_stream(
            agent,
            input=messages,
            handler=handler,
            max_turns=1,
            finish_attempts=finish_attempts - 1,
        )

    return result


In [24]:
# Stream the answer with a fresh handler; run_stream handles the "too many turns" case itself.
result = await run_stream(search_agent, 'llm as a judge', SearchResultHandler())

TOOL CALL (search): search({"query":"using AI as a judge in legal systems"})
TOOL CALL (search): search({"query":"AI in judicial decision making"})
TOOL CALL (search): search({"query":"machine learning algorithms in law"})
TOOL CALL (search): search({"query":"automated legal systems AI"})
TOOL CALL (search): search({"query":"ethical implications of AI judges"})
TOOL CALL (search): read_file({"filename":"examples/LLM_judge.mdx"})
TOOL CALL (search): read_file({"filename":"docs/platform/evals_no_code.mdx"})
TOOL CALL (search): read_file({"filename":"metrics/customize_llm_judge.mdx"})
TOOL CALL (search): read_file({"filename":"docs/platform/monitoring_overview.mdx"})
too many turns
# Using LLMs as Judges


## Overview

AI language models (LLMs) can be effectively utilized as judges in evaluating text for various applications such as customer support, content moderation, and educational assessments. They enable automated evaluations that can save time and increase consistency in decision-m