In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.insert(0, '..')

In [26]:
from minsearch import Index
import json

In [31]:
import tiktoken

In [3]:
import docs

# Read the raw GitHub data through the project-local ``docs`` module, then
# parse it. ``parsed_data`` is what ``evaluate_params`` later chunks and indexes.
github_data = docs.read_github_data()
parsed_data = docs.parse_data(github_data)

In [10]:
import pandas as pd

# Load the ground-truth CSV as a list of row dicts; ``evaluate`` reads the
# 'question' and 'filename' keys from each row by default.
df_ground_truth = pd.read_csv('../evals/ground_truth_evidently.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [13]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: A sized collection with one entry per query. Each
            entry is a sequence of booleans marking whether the result at that
            position is the expected document.

    Returns:
        The share of entries that contain ``True``, as a float in ``[0, 1]``.
        Returns ``0.0`` for an empty ``relevance_total`` instead of raising
        ``ZeroDivisionError``.
    """
    total = len(relevance_total)
    if total == 0:
        # No queries were evaluated, so there is nothing to average over.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total


def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query the score is ``1 / position`` of the first relevant result
    (positions start at 1), or ``0`` when no result is relevant. The scores
    are averaged over the number of queries.

    Args:
        relevance_total: A sized collection with one entry per query. Each
            entry is a sequence of booleans in rank order marking whether the
            result at that position is the expected document.

    Returns:
        The mean reciprocal rank as a float in ``[0, 1]``. Returns ``0.0`` for
        an empty ``relevance_total`` instead of raising ``ZeroDivisionError``.
    """
    total = len(relevance_total)
    if total == 0:
        # No queries were evaluated, so there is nothing to average over.
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for position, is_relevant in enumerate(line, start=1):
            # Equality (not truthiness) keeps the match rule identical to the
            # ``True in line`` membership test used by hit_rate.
            if is_relevant == True:  # noqa: E712
                total_score += 1 / position
                break  # only the first relevant result counts

    return total_score / total

In [54]:
# Tokenizer that tiktoken maps to the 'gpt-4o-mini' model; used by
# calculate_num_tokens to measure the size of search results.
encoder = tiktoken.encoding_for_model('gpt-4o-mini')

def calculate_num_tokens(search_results):
    """Count the tokens in the JSON serialization of ``search_results``.

    The results are dumped to a JSON string with ``json.dumps`` and encoded
    with the module-level ``encoder``; the number of produced tokens is
    returned.
    """
    serialized = json.dumps(search_results)
    token_ids = encoder.encode(serialized)
    return len(token_ids)

In [55]:
from tqdm.auto import tqdm 

def evaluate(
        ground_truth,
        search_function,
        question_column='question',
        id_column='filename'
):
    """Score a search function against ground-truth question/document pairs.

    For every ground-truth record the question is passed to
    ``search_function``. A returned result counts as relevant when its
    ``id_column`` value equals the record's ``id_column`` value.

    Args:
        ground_truth: Iterable of mappings, each holding at least
            ``question_column`` and ``id_column``.
        search_function: Callable that takes the question and returns a
            sequence of result mappings. Every result must contain
            ``id_column``, and the sequence is passed to
            ``calculate_num_tokens``.
        question_column: Key of the question text in a ground-truth record.
        id_column: Key of the document identifier, used both in ground-truth
            records and in search results.

    Returns:
        A dict with three entries: ``'hit_rate'``, ``'mrr'`` and
        ``'num_tokens'`` (the mean token count of the results per question).
        All three are ``0.0`` when ``ground_truth`` yields no records, instead
        of raising ``ZeroDivisionError``.
    """
    relevance_total = []
    total_number_tokens = []

    for q in ground_truth:
        doc_id = q[id_column]
        results = search_function(q[question_column])

        num_tokens = calculate_num_tokens(results)
        total_number_tokens.append(num_tokens)

        # One boolean per returned result, in rank order.
        relevance = [d[id_column] == doc_id for d in results]
        relevance_total.append(relevance)

    if not relevance_total:
        # Nothing was evaluated: report zeroed metrics rather than dividing
        # by zero in the averages below.
        return {
            'hit_rate': 0.0,
            'mrr': 0.0,
            'num_tokens': 0.0
        }

    avg_num_tokens = sum(total_number_tokens) / len(total_number_tokens)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
        'num_tokens': avg_num_tokens
    }

In [56]:
# Default parameters for a single run: chunk size, chunk step and number of
# search results (presumably passed to evaluate_params — confirm).
size, step, top_k = 2000, 1000, 5

In [57]:
def evaluate_params(size, step, top_k):
    """Evaluate search quality for one chunking / retrieval configuration.

    ``parsed_data`` is chunked with ``docs.chunk_documents`` using ``size``
    and ``step``, the chunks are indexed with a fresh ``Index``, and a search
    that returns ``top_k`` results is scored against ``ground_truth`` with
    ``evaluate``.

    Returns:
        The metrics dict produced by ``evaluate``.
    """
    chunked_docs = docs.chunk_documents(parsed_data, size=size, step=step)

    searchable_fields = ["content", "filename", "title", "description"]
    index = Index(text_fields=searchable_fields)
    index.fit(chunked_docs)

    def search(query: str):
        # Closure over the freshly built index and the requested top_k.
        return index.search(query=query, num_results=top_k)

    return evaluate(ground_truth, search)

{'hit_rate': 0.6659090909090909,
 'mrr': 0.3565785919896001,
 'num_tokens': 9355.861363636364}

In [62]:
# Candidate values for the grid search.
sizes = [1000, 2000, 3000, 5000]
steps = [1000, 2000, 3000]
top_ks = [5, 10, 15]

# Every (size, step, top_k) combination, in the same order as nested loops
# would visit them; combinations whose step exceeds the chunk size are skipped.
param_grid = [
    (size, step, top_k)
    for size in sizes
    for step in steps
    if step <= size
    for top_k in top_ks
]

results = []

for size, step, top_k in param_grid:
    result = evaluate_params(size, step, top_k)
    record = dict(
        size=size,
        step=step,
        top_k=top_k,
        hit_rate=result['hit_rate'],
        num_tokens=result['num_tokens'],
    )
    print(record)
    results.append(record)
            

{'size': 1000, 'step': 1000, 'top_k': 5, 'hit_rate': 0.3795454545454545, 'num_tokens': 1360.0227272727273}
{'size': 1000, 'step': 1000, 'top_k': 10, 'hit_rate': 0.4727272727272727, 'num_tokens': 2758.4977272727274}
{'size': 1000, 'step': 1000, 'top_k': 15, 'hit_rate': 0.5659090909090909, 'num_tokens': 4120.754545454545}
{'size': 2000, 'step': 1000, 'top_k': 5, 'hit_rate': 0.39090909090909093, 'num_tokens': 2514.2204545454547}
{'size': 2000, 'step': 1000, 'top_k': 10, 'hit_rate': 0.47954545454545455, 'num_tokens': 5122.143181818182}
{'size': 2000, 'step': 1000, 'top_k': 15, 'hit_rate': 0.5636363636363636, 'num_tokens': 7621.579545454545}
{'size': 2000, 'step': 2000, 'top_k': 5, 'hit_rate': 0.40454545454545454, 'num_tokens': 2380.3977272727275}
{'size': 2000, 'step': 2000, 'top_k': 10, 'hit_rate': 0.5522727272727272, 'num_tokens': 4715.877272727273}
{'size': 2000, 'step': 2000, 'top_k': 15, 'hit_rate': 0.6045454545454545, 'num_tokens': 7030.809090909091}
{'size': 3000, 'step': 1000, 'top

In [63]:
# One row per evaluated (size, step, top_k) combination collected in ``results``.
df_search_eval = pd.DataFrame(results)

In [75]:
# score = hit_rate ** alpha / (num_tokens / 1000) ** beta
# alpha is the exponent on hit rate; beta is the exponent on the token count
# (expressed in thousands of tokens).
alpha = 2
beta = 0.5
quality_term = df_search_eval['hit_rate'] ** alpha
cost_term = (df_search_eval['num_tokens'] / 1000) ** beta
df_search_eval['score'] = quality_term / cost_term

In [78]:
# Keep the five highest-scoring parameter combinations, best first.
df_best = df_search_eval.sort_values(by='score', ascending=False).head(n=5)

In [82]:
# Convert the top rows to a list of dicts, one per candidate configuration.
best_candidates = df_best.to_dict(orient='records')

In [83]:
best_candidates

[{'size': 1000,
  'step': 1000,
  'top_k': 15,
  'hit_rate': 0.5659090909090909,
  'num_tokens': 4120.754545454545,
  'score': 0.15776293166330635},
 {'size': 2000,
  'step': 2000,
  'top_k': 10,
  'hit_rate': 0.5522727272727272,
  'num_tokens': 4715.877272727273,
  'score': 0.1404513594105386},
 {'size': 2000,
  'step': 2000,
  'top_k': 15,
  'hit_rate': 0.6045454545454545,
  'num_tokens': 7030.809090909091,
  'score': 0.13783365334705797},
 {'size': 1000,
  'step': 1000,
  'top_k': 10,
  'hit_rate': 0.4727272727272727,
  'num_tokens': 2758.4977272727274,
  'score': 0.13455040263772092},
 {'size': 3000,
  'step': 3000,
  'top_k': 15,
  'hit_rate': 0.634090909090909,
  'num_tokens': 9505.179545454546,
  'score': 0.13041360229484228}]

In [3]:
import search_agent

In [4]:
# Agent settings matching the top-scoring grid-search candidate shown above
# (size=1000, step=1000, top_k=15).
config = search_agent.AgentConfig(
    chunk_size=1000,
    chunk_step=1000,
    top_k=15,
)

In [5]:
# Build the agent from the configuration above.
agent = search_agent.create_agent(config)

In [6]:
from evals.eval_orchestrator import run_full_evaluation

In [7]:
# Top-level ``await`` relies on the notebook's running event loop (IPython
# autoawait); it is not valid in a plain Python script.
await run_full_evaluation(agent, csv_path='../evals/gt-sample.csv')


Start time: 2025-10-27 14:12:20
Configuration:
  Ground truth: ../evals/gt-sample.csv
  Agent model: gpt-4o-mini
  Judge model: gpt-5-nano
  Max concurrency: 10

Loaded 26 ground truth questions
Running agent evaluation...


  0%|          | 0/26 [00:00<?, ?it/s]

forcing output
forcing output
forcing output
forcing output
forcing output
forcing output
forcing output
forcing output
forcing output
forcing output
forcing output
forcing output
forcing output
forcing output
forcing output
Total cost: $0.1451
Results saved to reports/eval-run-2025-10-27-14-13.bin

✓ Agent evaluation completed
  Evaluated: 26 questions
  Results saved: reports/eval-run-2025-10-27-14-13.bin
  Agent Run Costs:
    Input tokens cost:  $  0.1352
    Output tokens cost: $  0.0099
    Total cost:         $  0.1451

Loading evaluation results from reports/eval-run-2025-10-27-14-13.bin...
Loaded 26 evaluation results
Loading reference documents...
Creating judge agent...
Running judge evaluation...


  0%|          | 0/26 [00:00<?, ?it/s]

Total cost: $0.0252
Judge results saved to: reports/eval-judge-2025-10-27-14-13.bin

✓ Judge evaluation completed
  Evaluated: 26 results
  Judge results saved: reports/eval-judge-2025-10-27-14-13.bin
  Judge Evaluation Costs:
    Input tokens cost:  $  0.0093
    Output tokens cost: $  0.0159
    Total cost:         $  0.0252


Execution Time:
  Duration: 104.1 seconds

Dataset:
  Questions evaluated: 26

Evaluation Metrics:
  ✓ CheckName.tool_call_search  92.3%
  ✓ CheckName.instructions_follow 100.0%
  ✓ CheckName.instructions_avoid 100.0%
  ⚠ CheckName.answer_relevant  75.0%
  ⚠ CheckName.answer_clear     71.4%
  ⚠ CheckName.answer_match     71.4%
  ⚠ CheckName.answer_citations  71.4%
  ⚠ CheckName.completeness     71.4%

Overall Score: 81.6%

Agent Run Costs:
  Input tokens cost:  $  0.1352
  Output tokens cost: $  0.0099
  Total cost:         $  0.1451
Judge Evaluation Costs:
  Input tokens cost:  $  0.0093
  Output tokens cost: $  0.0159
  Total cost:         $  0.0252
TOTAL Cos

{'run_results_path': 'reports/eval-run-2025-10-27-14-13.bin',
 'judge_results_path': 'reports/eval-judge-2025-10-27-14-13.bin',
 'run_cost': CostInfo(input_cost=0.13517625, output_cost=0.0099498, total_cost=0.14512605),
 'judge_cost': CostInfo(input_cost=0.0092816, output_cost=0.015917999999999998, total_cost=0.025199599999999996),
 'total_cost': CostInfo(input_cost=0.14445785, output_cost=0.025867799999999996, total_cost=0.17032565),
 'df_run':                                              question  \
 0                 how to use TextEvals() in Evidently   
 1                          ColumnCount() metric usage   
 2                     LLM judge metrics customization   
 3                   Evidently telemetry version 0.4.0   
 4   how to benchmark model quality with dummy metrics   
 5                         examples of JSONSchemaMatch   
 6               performing quality checks on datasets   
 7                 examples of incorrect LLM responses   
 8   example of mapping regre