In [1]:
import sys
# Put the parent directory first on the module search path; presumably this
# is what lets `import main` later in the notebook resolve — TODO confirm.
sys.path.insert(0, '..')

In [2]:
import pandas as pd
# Load the ground-truth CSV and keep a list-of-dicts copy of every row
# (one dict per record, keyed by column name).
df_ground_truth = pd.read_csv('./ground_truth_evidently.csv')
ground_truth_all = df_ground_truth.to_dict(orient='records')

In [3]:
# Draw 25 rows; the fixed random_state keeps the sample reproducible.
df_sample = df_ground_truth.sample(25, random_state=1)

In [4]:
# Evaluation set: the sampled records plus record 150 of the full set.
# NOTE(review): index 150 is hard-coded and the reason is not visible here;
# if that row was also drawn by the sample it will appear twice — confirm.
ground_truth = df_sample.to_dict(orient='records')
ground_truth.append(ground_truth_all[150])

In [5]:
import asyncio
from tqdm.auto import tqdm

async def map_progress(seq, f, max_concurrency=6):
    """Apply the async function ``f`` to every element of ``seq`` concurrently.

    At most ``max_concurrency`` calls of ``f`` are in flight at any time
    (enforced with a semaphore), and a tqdm progress bar advances each time
    a call finishes.

    Args:
        seq: Any iterable of inputs. It is materialised into a list up front,
            so generators and other unsized iterables are accepted.
        f: Async callable taking one element and returning its result.
        max_concurrency: Upper bound on simultaneously running calls of ``f``;
            must be at least 1.

    Returns:
        A list with one result per element, in COMPLETION order (not input
        order). Have ``f`` return its input alongside its output if the
        pairing matters.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
        Exception: The first exception raised by a call of ``f`` propagates
            from here.
    """
    if max_concurrency < 1:
        # Semaphore(0) would never let any call start, i.e. hang forever.
        raise ValueError('max_concurrency must be at least 1')

    # Snapshot the input once: we need both to iterate it and to know its
    # length for the progress bar, which a generator cannot provide.
    items = list(seq)
    semaphore = asyncio.Semaphore(max_concurrency)

    async def run(el):
        # Hold a semaphore slot for the whole duration of the call.
        async with semaphore:
            return await f(el)

    # One coroutine per element; as_completed yields awaitables in the
    # order the underlying calls finish.
    coros = [run(el) for el in items]

    results = []
    for next_done in tqdm(asyncio.as_completed(coros), total=len(items)):
        results.append(await next_done)

    return results

In [6]:
import main

# Agent under evaluation; run_agent awaits agent.run(...) on it.
agent = main.agent

In [7]:
import traceback

async def run_agent(q):
    """Run the agent on one ground-truth record, tolerating ordinary failures.

    Args:
        q: Record (dict) whose ``'question'`` entry is passed to ``agent.run``.

    Returns:
        ``(q, result)`` when the run succeeds, where ``result`` is whatever
        ``agent.run`` returned; ``(None, None)`` when it raised, after
        printing the failing record and the traceback. Callers must skip
        the ``(None, None)`` placeholders.
    """
    try:
        result = await agent.run(q['question'])
    except Exception:
        # Deliberately `Exception`, not a bare `except:`: task cancellation
        # (asyncio.CancelledError) and KeyboardInterrupt derive from
        # BaseException and must propagate instead of being reported as a
        # failed question.
        print(f'error processing {q}')
        traceback.print_exc()
        return (None, None)

    return (q, result)

In [8]:
# Run the agent over every ground-truth record, at most 10 at a time.
# all_results holds one tuple per record as returned by run_agent:
# (question, result) on success, (None, None) for a run that raised.
all_results = await map_progress(
    ground_truth,
    run_agent,
    max_concurrency=10
)

  0%|          | 0/26 [00:00<?, ?it/s]

forcing output
forcing output
forcing output
forcing output
forcing output
forcing output
forcing output
forcing output


In [18]:
from toyaikit.pricing import PricingConfig
pricing = PricingConfig()

def calculate_cost(model, all_results):
    """Total the token usage of an evaluation run and price it for ``model``.

    Args:
        model: Model name handed to ``pricing.calculate_cost``.
        all_results: Iterable of ``(question, result)`` pairs. Each result
            exposes ``usage()`` with ``input_tokens`` and ``output_tokens``.
            Pairs whose result is ``None`` (the ``(None, None)`` placeholder
            ``run_agent`` returns for a failed run) are skipped.

    Returns:
        Whatever ``pricing.calculate_cost`` returns for the summed input and
        output token counts.
    """
    input_tokens = 0
    output_tokens = 0

    for _, r in all_results:
        if r is None:
            # Failed run: no usage to account for.
            continue

        usage = r.usage()
        input_tokens += usage.input_tokens
        output_tokens += usage.output_tokens

    return pricing.calculate_cost(model, input_tokens, output_tokens)

In [19]:
calculate_cost('gpt-4o-mini', all_results)

CostInfo(input_cost=0.05735985, output_cost=0.0088374, total_cost=0.06619725)

In [40]:
# NOTE(review): same call as the cell above; the differing recorded outputs
# suggest all_results was not the same between the two executions — confirm
# which run is current and drop the stale cell.
calculate_cost('gpt-4o-mini', all_results)

CostInfo(input_cost=0.8702436, output_cost=0.15745079999999997, total_cost=1.0276944)

In [9]:
import json

def _parse_tool_args(raw_args):
    """Return tool-call arguments as a Python object.

    Accepts a JSON string (decoded with ``json.loads``; a blank string gives
    ``{}``), ``None`` (gives ``{}``), or an already-decoded object, which is
    returned unchanged.
    """
    if raw_args is None:
        return {}
    if isinstance(raw_args, str):
        return json.loads(raw_args) if raw_args.strip() else {}
    return raw_args


def simplify_messages(messages):
    """Flatten a message history into a list of small, plain dicts.

    Every part of every message becomes one dict with a ``'kind'`` entry
    (the part's ``part_kind``), except that:

    * ``'tool-return'`` parts are dropped;
    * ``'tool-call'`` parts for the ``final_result`` tool are dropped;
    * other ``'tool-call'`` parts also carry ``'tool_name'`` and decoded
      ``'args'``;
    * ``'user-prompt'`` and ``'text'`` parts also carry ``'content'``.

    Parts of any other kind are kept with only their ``'kind'``.

    Args:
        messages: Iterable of objects exposing ``.parts``; each part exposes
            ``part_kind`` plus the attributes read for its kind.

    Returns:
        A flat list of dicts, in message order and then part order.
    """
    messages_simplified = []

    for m in messages:
        for original_part in m.parts:
            kind = original_part.part_kind
            part = {'kind': kind}

            if kind == 'tool-return':
                continue
            elif kind == 'tool-call':
                if original_part.tool_name == 'final_result':
                    continue
                part['tool_name'] = original_part.tool_name
                # args may arrive as a JSON string or already decoded.
                part['args'] = _parse_tool_args(original_part.args)
            elif kind in ('user-prompt', 'text'):
                part['content'] = original_part.content

            messages_simplified.append(part)

    return messages_simplified

In [10]:
# Build one flat record per successful agent run: the question, the formatted
# answer, the simplified message trace, usage counters, and the raw
# question/result objects for later inspection.
rows = []

for q, r in all_results:
    if q is None or r is None:
        # run_agent reports a failed question as (None, None); there is
        # nothing to record for it.
        continue

    usage = r.usage()
    rows.append({
        'question': q['question'],
        'answer': r.output.format_article(),
        'messages': simplify_messages(r.new_messages()),
        'tool_call_number': usage.tool_calls,
        'requests': usage.requests,
        'original_question': q,
        'original_result': r
    })

In [25]:
# Tabular view of the per-question records collected in rows.
df_run = pd.DataFrame(rows)

In [12]:
import pickle

In [13]:
# Persist the collected rows. The file name (timestamp included) is
# hard-coded and the file is opened with 'wb', so re-running this cell
# overwrites that same file.
# Pickle files must only ever be loaded from trusted sources.
with open('eval-run-v2-2025-10-22-22-25.bin', 'wb') as f_out:
    pickle.dump(rows, f_out)

In [14]:
!ls -lh

total 4.8M
-rw-rw-rw- 1 codespace codespace 3.5M Oct 22 09:11 all-results-v1.bin
-rw-rw-rw- 1 codespace codespace 576K Oct 22 09:29 eval-run-v2-2025-10-22-11-28.bin
-rw-rw-rw- 1 codespace codespace 569K Oct 22 20:25 eval-run-v2-2025-10-22-22-25.bin
-rw-rw-rw- 1 codespace codespace 8.9K Oct 22 20:25 evaluations-agent-run.ipynb
-rw-rw-rw- 1 codespace codespace  19K Oct 22 20:24 evaluations-judge.ipynb
-rw-rw-rw- 1 codespace codespace  19K Oct 20 22:36 ground-truth.ipynb
-rw-rw-rw- 1 codespace codespace  98K Oct 20 22:20 ground_truth_evidently.csv
-rw-rw-rw- 1 codespace codespace  12K Oct 20 23:34 retrieval-evaluation.ipynb
