In [3]:
import requests
import pandas as pd

# Download the FAQ documents.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
# A bounded timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() makes an HTTP error (404/500) fail here with a clear
# message instead of surfacing later as a JSON decoding error.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Download the ground truth Q&A pairs; each CSV row becomes one dict.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

# Sanity check
print(f"✅ Loaded {len(documents)} documents")
print(f"✅ Loaded {len(ground_truth)} ground truth question-answer pairs")


✅ Loaded 948 documents
✅ Loaded 4627 ground truth question-answer pairs


In [4]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains a relevant hit.

    Args:
        relevance_total: A sized collection with one entry per query; each
            entry is a sequence of relevance flags, one per returned result.

    Returns:
        The share of entries that contain ``True`` (a float in [0, 1]), or
        ``0.0`` when ``relevance_total`` is empty.
    """
    # No queries evaluated: report 0.0 instead of dividing by zero.
    if not relevance_total:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: A sized collection with one entry per query; each
            entry is a sequence of relevance flags ordered by result rank.

    Returns:
        The average over all entries of ``1 / rank`` for the first flag equal
        to ``True`` (rank is 1-based); an entry with no relevant result
        contributes 0. Returns ``0.0`` when ``relevance_total`` is empty.
    """
    # No queries evaluated: report 0.0 instead of dividing by zero.
    if not relevance_total:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, flag in enumerate(line, start=1):
            # Equality (not truthiness) is kept on purpose so this agrees with
            # a `True in line` membership check on the same data.
            if flag == True:
                total_score += 1 / rank
                break  # only the first relevant result counts
    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Score a search function against ground-truth records.

    For every record, ``search_function(record)`` is called and each returned
    result's ``'id'`` is compared with the record's ``'document'`` value.

    Args:
        ground_truth: Iterable of records, each indexable by ``'document'``.
        search_function: Callable taking one record and returning an iterable
            of results, each indexable by ``'id'``.

    Returns:
        A dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    def relevance_flags(record):
        # Read the expected id first, then flag each result that matches it.
        expected_id = record['document']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_flags(record) for record in tqdm(ground_truth)]

    hit_rate_score = hit_rate(relevance_total)
    mrr_score = mrr(relevance_total)
    return {'hit_rate': hit_rate_score, 'mrr': mrr_score}




In [5]:
from minsearch import Index

# Build the index over the FAQ documents: "question", "section" and "text" are
# declared as text fields, "course" and "id" as keyword fields.
index = Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"],
)
index.fit(documents)

def search_minsearch(query):
    """Search the module-level ``index`` with the record's question text.

    Args:
        query: A record indexable by ``"question"``.

    Returns:
        Whatever ``index.search`` returns when asked for 5 results.
    """
    question_text = query["question"]
    # Boost values passed per field: "question" 1.5, "section" 0.1.
    field_boosts = {'question': 1.5, 'section': 0.1}
    # An empty filter dict is passed, i.e. no keyword filter is supplied here.
    no_filter = {}
    return index.search(
        question_text,
        filter_dict=no_filter,
        boost_dict=field_boosts,
        num_results=5,
    )

# Score the minsearch-backed search over every ground-truth record and print
# the resulting metrics dict.
results = evaluate(
    ground_truth=ground_truth,
    search_function=search_minsearch,
)
print(results)


100%|██████████| 4627/4627 [00:34<00:00, 135.46it/s]

{'hit_rate': 0.8013831856494489, 'mrr': 0.6815251062603574}



