In [40]:

from pydantic import BaseModel
from typing import List, Optional
from agents import Agent, AgentOutputSchema 
import os


In [41]:
import json

def _msg(payload: dict) -> list:
    return [{
        "role": "user",
        "content": [{"type": "input_text", "text": json.dumps(payload)}]
    }]

# One scored rubric criterion as returned by the judge agent.
class Criteria(BaseModel):
    # Name of the rubric criterion being scored; the reporting code in this
    # notebook uses it to build the "<name> - Score" / "<name> - Reasoning" columns.
    criteria_name: str 
    # Numeric score for this criterion (the judge prompt asks for values in
    # [1, 5], rounded to 2 decimals).
    score: float
    # Free-text justification; the judge prompt asks for key strengths or
    # weaknesses from the rubric perspective.
    notes: str 

# Structured output of the judge agent for a single evaluation request.
class JudgeReport(BaseModel):
    # The search query the report refers to.
    query: str
    # One entry per scored criterion.
    criteria: List[Criteria]


# LLM judge that scores search results against a JSON rubric and returns a
# structured JudgeReport. It has no tools: everything it needs is in the
# message payload (query, apis, rubric_json).
#
# Prompt fixes relative to the previous version:
#   * steps 4 and 5 now end with a newline, so they no longer run into each
#     other and into the "Scoring Details" heading after string concatenation;
#   * rule 2 now refers to the 'criteria' field that JudgeReport actually has,
#     instead of 'per_api' / 'winner', which are not part of the output schema.
judge_agent = Agent(
    name="ProductSearchQualityJudge",
    model="gpt-5", 
    instructions=(
        "You are an objective evaluator for shopping/search APIs.\n"
        "Do NOT browse the web. Use only the inputs provided.\n\n"

        "Inputs:\n"
        "- query: the user's search query.\n"
        "- apis: array of { name, latency_sec (float), results: [ {title, url, price?} ] } per API.\n"
        "- rubric_json: a JSON rubric defining detailed evaluation criteria for multiple queries.\n\n"

        "Your job:\n"
        "1. Identify the rubric in rubric_json whose 'query' best matches the given query (case-insensitive exact match preferred).\n"
        "2. For that rubric, evaluate each API’s results across all five criteria listed under 'criteria'.\n"
        "3. For each criterion, assign a numeric score in [1,5] using the definitions in 'score_definitions'. Interpret descriptions precisely and grade deterministically. 1 is the lowest and 5 is the highest\n"
        "4. Add a latency adjustment: latency_score = 1 / (1 + latency_sec/2), clipped to [0,1]\n"
        "5. Round all numeric scores to 2 decimals\n\n"

        "Scoring Details:\n"
        "- For each result:\n"
        "  • Analyze its results (title, url, price if given) to infer how well each criterion is met.\n"
        "  • If results are missing or malformed, score conservatively (1 or 2).\n"
        "  • Be consistent and deterministic.\n\n"

        "Rules & Edge Cases:\n"
        "1) Use ONLY the rubric_json and the API results. Do NOT infer external data.\n"
        "2) If query not found in rubric_json, return an empty 'criteria' list.\n"
        "3) 'notes' must summarize key strengths or weaknesses from the rubric perspective.\n"
    ),
    tools=[],  # pure rubric-based judging
    # Strict JSON schema: the model's answer must conform to JudgeReport.
    output_type=AgentOutputSchema(JudgeReport, strict_json_schema=True)
)


In [42]:
import json
import subprocess

# Load the evaluation rubric from disk; the complete document is kept in
# `rubric_data` because it is forwarded verbatim to the judge agent.
with open("evals/product_search_rubric.json", "r", encoding="utf-8") as rubric_file:
    rubric_data = json.load(rubric_file)

# Reduce every rubric entry to the two fields the evaluation loop reads.
# Missing keys become None; a missing "rubrics" list yields no evals.
evals = [
    {"query": rubric.get("query"), "criteria": rubric.get("criteria")}
    for rubric in rubric_data.get("rubrics", [])
]




In [None]:
import time
import asyncio
from agents.run import Runner
import pandas as pd
import sys
import subprocess
import re



async def run_search(
    script_name,
    query,
    search_dir="/Users/karthikapurushothaman/projects/search",
    python_bin="/Users/karthikapurushothaman/projects/search/.venv/bin/python",
):
    """Run a search script as a subprocess and return its JSON output with timing.

    The script is invoked as ``<python_bin> <search_dir>/<script_name> -q <query> -f json``
    and is expected to print a single JSON document on stdout.

    Args:
        script_name: File name of the search script inside ``search_dir``.
        query: Search query passed to the script via ``-q``.
        search_dir: Directory containing the search scripts. Defaults to the
            original hard-coded location; override it to run elsewhere.
        python_bin: Interpreter used to run the script. Defaults to the
            original hard-coded virtualenv interpreter.

    Returns:
        A ``(results, latency)`` tuple: the decoded JSON output and the
        wall-clock seconds (float) from just before the process is spawned
        until it has exited, so interpreter start-up time is included.

    Raises:
        json.JSONDecodeError: If stdout is not valid JSON. The message includes
            the script's exit code and stderr so the real failure is visible.
    """
    script_path = f"{search_dir}/{script_name}"

    # perf_counter is monotonic, so the interval cannot be distorted by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    process = await asyncio.create_subprocess_exec(
        python_bin,
        script_path, "-q", query, "-f", "json",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE
    )
    stdout, stderr = await process.communicate()
    latency = time.perf_counter() - start_time

    try:
        results = json.loads(stdout.decode())
    except json.JSONDecodeError as exc:
        # Surface the captured stderr instead of silently discarding it;
        # the exception type is unchanged for any existing handlers.
        detail = stderr.decode(errors="replace").strip() or "<empty>"
        raise json.JSONDecodeError(
            f"{script_name} (exit code {process.returncode}) did not print valid JSON; "
            f"stderr: {detail}",
            exc.doc,
            exc.pos,
        ) from exc
    return results, latency


async def evaluate_result(api_name, query, idx, item, latency, rubric_data):
    """Have the judge agent score one search hit.

    Args:
        api_name: Label of the API that produced the hit (sent as ``name``).
        query: The search query the hit was returned for.
        idx: Position of the hit in its result list; passed through untouched.
        item: Raw result dict; only ``title``, ``url`` and ``price`` are
            forwarded to the judge.
        latency: Search latency in seconds, forwarded as ``latency_sec``.
        rubric_data: Rubric document forwarded to the judge as ``rubric_json``.

    Returns:
        ``(api_name, query, idx, item, judge_output)`` where ``judge_output``
        is the agent's final output as a ``JudgeReport``.
    """
    # Missing title/url fall back to an empty string; a missing price is None.
    hit = {
        'title': item.get('title', ''),
        'url': item.get('url', ''),
        'price': item.get('price'),
    }
    api_entry = {'name': api_name, 'latency_sec': latency, 'results': [hit]}
    judge_input = _msg({
        'query': query,
        'apis': [api_entry],
        'rubric_json': rubric_data,
    })

    run_result = await Runner.run(judge_agent, judge_input)
    return (api_name, query, idx, item, run_result.final_output_as(JudgeReport))


for eval_type in evals:
    query = eval_type["query"]
    evaluation_criteria = eval_type["criteria"]

    print(f"Searching for: '{query}'")
    
    # Run both searches in parallel
    exa_task = run_search("exa_search.py", query)
    pws_task = run_search("pws_search.py", query)
    
    (exa_results, exa_latency), (pws_results, pws_search) = await asyncio.gather(exa_task, pws_task)
    
    print(f"✓ EXA completed in {exa_latency:.2f}s")
    print(f"✓ PWS completed in {pws_search:.2f}s")

    # Prepare all evaluation tasks (both EXA and PWS in parallel)
    print("=" * 80)
    print("EVALUATING RESULTS IN PARALLEL")
    print("=" * 80)
    
    eval_tasks = []
    
    # Add EXA evaluation tasks
    for idx, item in enumerate(exa_results.get('all_results', [])[:10], 1):
        task = evaluate_result('EXA', query, idx, item, exa_latency, rubric_data)
        eval_tasks.append(task)
    
    # Add PWS evaluation tasks
    for idx, item in enumerate(pws_results.get('output', {}).get('matched_products', [])[:10], 1):
        task = evaluate_result('PWS', query, idx, item, pws_search, rubric_data)
        eval_tasks.append(task)
    
    # Run all evaluations in parallel
    print(f"Running {len(eval_tasks)} evaluations in parallel...")
    all_evaluations = await asyncio.gather(*eval_tasks)
    time.sleep(10)
    print(f"✓ Completed {len(all_evaluations)} evaluations")
    # Prepare EXA data for Excel
    exa_data = []

    for api, query, idx, item, judge_output in all_evaluations:
        if api == 'EXA':
            row = {
                'Result #': idx,
                'Title': item.get('title', 'N/A'),
                'URL': item.get('url', 'N/A'),
                'Price': item.get('price', 'N/A'),
            }
            # Add each criterion score and notes (sanitize for Excel)
            for criterion in judge_output.criteria:
                # Remove ALL illegal Excel characters including control chars
                clean_name = re.sub(r'[\x00-\x1F\x7F-\x9F≤≥:()]', '', criterion.criteria_name).strip()
                row[f"{clean_name} - Score"] = criterion.score
                row[f"{clean_name} - Reasoning"] = criterion.notes
            
            exa_data.append(row)

    # Prepare PWS data for Excel
    pws_data = []
    for api, query, idx, item, judge_output in all_evaluations:
        if api == 'PWS':
            row = {
                'Result #': idx,
                'Title': item.get('title', 'N/A'),
                'URL': item.get('url', 'N/A'),
                'Price': item.get('price', 'N/A'),
            }
            # Add each criterion score and notes (sanitize for Excel)
            for criterion in judge_output.criteria:
                # Remove ALL illegal Excel characters including control chars
                clean_name = re.sub(r'[\x00-\x1F\x7F-\x9F≤≥:()]', '', criterion.criteria_name).strip()
                row[f"{clean_name} - Score"] = criterion.score
                row[f"{clean_name} - Reasoning"] = criterion.notes
            
            pws_data.append(row)

    # Create DataFrames
    exa_df = pd.DataFrame(exa_data)
    pws_df = pd.DataFrame(pws_data)

    # Save to Excel files
    exa_filename = f'exa_evaluation_report_{query.replace(" ", "_")}.xlsx'
    pws_filename = f'pws_evaluation_report_{query.replace(" ", "_")}.xlsx'

    exa_df.to_excel(exa_filename, index=False, engine='openpyxl')
    pws_df.to_excel(pws_filename, index=False, engine='openpyxl')

    print(f"Created EXA report: {exa_filename} ({len(exa_df)} results)")
    print(f"Created PWS report: {pws_filename} ({len(pws_df)} results)")

            
            
    


Searching for: 'Black couch under 100'
✓ EXA completed in 1.91s
✓ PWS completed in 11.07s
EVALUATING RESULTS IN PARALLEL
Running 13 evaluations in parallel...
✓ Completed 13 evaluations
✓ Created EXA report: exa_evaluation_report_Black_couch_under_100.xlsx (10 results)
✓ Created PWS report: pws_evaluation_report_Black_couch_under_100.xlsx (3 results)
Searching for: 'Wooden dining table for 6 people'
✓ EXA completed in 2.33s
✓ PWS completed in 25.57s
EVALUATING RESULTS IN PARALLEL
Running 17 evaluations in parallel...
✓ Completed 17 evaluations
✓ Created EXA report: exa_evaluation_report_Wooden_dining_table_for_6_people.xlsx (10 results)
✓ Created PWS report: pws_evaluation_report_Wooden_dining_table_for_6_people.xlsx (7 results)
Searching for: 'White sneakers size 9 under $80'
✓ EXA completed in 1.84s
✓ PWS completed in 25.92s
EVALUATING RESULTS IN PARALLEL
Running 15 evaluations in parallel...
✓ Completed 15 evaluations
✓ Created EXA report: exa_evaluation_report_White_sneakers_size_9