In [21]:
import sys
sys.path.insert(0, '..')

In [36]:
import search_agent

In [22]:
import pickle

In [100]:
with open('eval-run-v2-2025-10-22-22-25.bin', 'rb') as f_in:
    rows = pickle.load(f_in)

In [125]:
len(rows)

26

In [110]:
import docs

In [115]:
github_data = docs.read_github_data()
parsed_data = docs.parse_data(github_data)

file_index = {d['filename']: d['content'] for d in parsed_data}

In [109]:
rows[0]['original_question']['filename']

'metrics/all_metrics.mdx'

In [102]:
import json

In [103]:
from pydantic_ai import Agent
from pydantic import BaseModel

In [140]:
judge_instructions = """
Use this checklist to evaluate the quality of an AI agent’s answer
(<ANSWER>) to a user question (<QUESTION>). We also include the
entire log (<LOG>) for analysis. In <REFERENCE> you will see
the file, from which the user question was generated. 

For each item of the checklist, check if the condition is met. 

Checklist:

- instructions_follow: The agent followed the user’s instructions (in <INSTRUCTIONS>)
- instructions_avoid: The agent avoided doing things it was told not to do  
- answer_relevant: The response directly addresses the user’s question  
- answer_match: The ANSWER is similar to the REFERENCE
- answer_clear: The answer is clear and correct  
- answer_citations: The response includes proper citations or sources when required  
- completeness: The response is complete and covers all key aspects of the request
- tool_call_search: Is the search tool invoked? 

Output true/false for each check and provide a short explanation for your judgment.
"""

In [119]:
from typing import Literal

In [120]:
class EvaluationCheck(BaseModel):
    check_name: str
    reasoning: str
    check_pass: bool

class EvaluationChecklist(BaseModel):
    checklist: list[EvaluationCheck]
    summary: str

In [141]:
judge = Agent(
    name="judge",
    instructions=judge_instructions,
    model="gpt-5-nano",
    output_type=EvaluationChecklist
)

In [122]:
import asyncio
from tqdm.auto import tqdm

async def map_progress(seq, f, max_concurrency=6):
    """Asynchronously map async function f over seq with progress bar."""
    semaphore = asyncio.Semaphore(max_concurrency)

    async def run(el):
        async with semaphore:
            return await f(el)

    # create one coroutine per element
    coros = [run(el) for el in seq]

    # turn them into tasks that complete as they finish
    completed = asyncio.as_completed(coros)

    results = []

    for coro in tqdm(completed, total=len(seq)):
        result = await coro
        results.append(result)

    return results

In [142]:
async def run_eval(row):
    original_filename = row['original_question']['filename']
    reference = file_index[original_filename]
    user_prompt = f"""
<INSTRUCTIONS>{search_agent.search_instructions}</INSTRUCTIONS>
<QUESTION>{row['question']}</QUESTION>
<ANSWER>{row['answer']}</ANSWER>
<REFERENCE>{reference}</REFERENCE>
<LOG>{json.dumps(row['messages'])}</LOG>
""".strip()
    output = await judge.run(user_prompt=user_prompt)
    return row, output

In [143]:
results = await map_progress(rows, run_eval, max_concurrency=10)

  0%|          | 0/26 [00:00<?, ?it/s]

In [53]:
from toyaikit.pricing import PricingConfig
pricing = PricingConfig()

def calculate_cost(model, all_results):
    input_tokens = 0
    output_tokens = 0
    
    for _, r in all_results:
        usage = r.usage()
        input_tokens = input_tokens + usage.input_tokens
        output_tokens = output_tokens + usage.output_tokens

    cost = pricing.calculate_cost(model, input_tokens, output_tokens)
    return cost

In [54]:
calculate_cost('gpt-5-nano', results)

CostInfo(input_cost=0.0016259, output_cost=0.030878000000000003, total_cost=0.0325039)

In [144]:
calculate_cost('gpt-5-nano', results)

CostInfo(input_cost=0.0087156, output_cost=0.0351392, total_cost=0.0438548)

In [145]:
all_checks = []

for original_row, result in results[1:]:
    checks = result.output.checklist
    checks_formatted = {
        'question': original_row['question']
    }
    for check in checks:
        checks_formatted[check.check_name] = check.check_pass
    all_checks.append(checks_formatted)

In [146]:
import pandas as pd

In [147]:
df_eval = pd.DataFrame(all_checks)

In [148]:
df_eval[df_eval.columns[1:]].mean()

instructions_follow       0.625
instructions_avoid          1.0
answer_relevant        0.958333
answer_match           0.708333
answer_clear           0.916667
answer_citations       0.958333
completeness           0.833333
tool_call_search           0.64
dtype: object

In [98]:
from enum import Enum
from pydantic import BaseModel, Field

class CheckName(str, Enum):
    instructions_follow = "instructions_follow"
    instructions_avoid = "instructions_avoid" 
    answer_relevant = "answer_relevant"
    answer_clear = "answer_clear"
    answer_citations = "answer_citations"
    completeness = "completeness"
    tool_call_search = "tool_call_search"

CHECK_DESCRIPTIONS = {
    CheckName.instructions_follow: "The agent followed the user's instructions (in <INSTRUCTIONS>)",
    CheckName.instructions_avoid: "The agent avoided doing things it was told not to do",
    CheckName.answer_relevant: "The response directly addresses the user's question",
    CheckName.answer_clear: "The answer is clear and correct",
    CheckName.answer_citations: "The response includes proper citations or sources when required",
    CheckName.completeness: "The response is complete and covers all key aspects of the request",
    CheckName.tool_call_search: "Is the search tool invoked?"
}

class EvaluationCheck(BaseModel):
    check_name: CheckName = Field(description="The type of evaluation check")
    reasoning: str = Field(description="The reasoning behind the check result")
    check_pass: bool = Field(description="Whether the check passed (True) or failed (False)")
    
class EvaluationChecklist(BaseModel):
    checklist: list[EvaluationCheck] = Field(description="List of all evaluation checks")
    summary: str = Field(description="Evaluation summary")

In [96]:
def generate_checklist_text():
    checklist_items = []
    for check_name in CheckName:
        description = CHECK_DESCRIPTIONS[check_name]
        checklist_items.append(f"- {check_name.value}: {description}")
    return "\n".join(checklist_items)

In [97]:
eval_instructions = f"""
Use this checklist to evaluate the quality of an AI agent’s answer (<ANSWER>) to a user question (<QUESTION>).
We also include the entire log (<LOG>) for analysis.

For each item, check if the condition is met. 

Checklist:

{generate_checklist_text()}

Output true/false for each check and provide a short explanation for your judgment.
"""