In [None]:
pip install -U minsearch qdrant_client

In [3]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# Documents to index. The evaluation below compares each search result's
# `id` with the ground-truth `document` value.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth records; later cells read their `question`, `course` and
# `document` keys.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [4]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Parameters
    ----------
    relevance_total : sequence of sequences
        One inner sequence per query; each element is True when the result
        at that position is the expected document.

    Returns
    -------
    float
        Number of queries whose relevance list contains True, divided by the
        number of queries. Returns 0.0 when there are no queries (instead of
        raising ZeroDivisionError).
    """
    if len(relevance_total) == 0:
        return 0.0

    # `True in line` is an equality-based membership test, as in the
    # original counting loop.
    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query only the FIRST relevant result counts: it contributes
    1 / rank (rank is 1-based). A query with no relevant result contributes 0.

    Parameters
    ----------
    relevance_total : sequence of sequences
        One inner sequence per query; each element is True when the result
        at that position is the expected document.

    Returns
    -------
    float
        Sum of the per-query reciprocal ranks divided by the number of
        queries. Returns 0.0 when there are no queries (instead of raising
        ZeroDivisionError).
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality test (not truthiness) kept to match `True in line`
            # as used by hit_rate.
            if is_relevant == True:  # noqa: E712
                total_score += 1 / rank
                # Stop at the first hit; without this, several matching
                # results in one list would each add a reciprocal rank and
                # the score could exceed 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    For every record, `search_function` is called with the record itself and
    each returned result's `id` is compared with the record's `document`
    value. The per-query relevance lists are then summarised.

    Parameters
    ----------
    ground_truth : iterable of dict
        Records that provide a `document` key (the expected document id).
    search_function : callable
        Takes one ground-truth record and returns an iterable of results,
        each supporting `result['id']`.

    Returns
    -------
    dict
        `{'hit_rate': ..., 'mrr': ...}` computed by `hit_rate` and `mrr`.
    """
    relevance_total = []

    for query in tqdm(ground_truth):
        expected_id = query['document']
        hits = search_function(query)
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import minsearch

# --------------------
# Create the Minsearch index
# --------------------
# "question", "text" and "section" are registered as text fields; "course"
# and "id" as keyword fields. The search function below filters on the
# "course" keyword field.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)
# Fit the index on the documents downloaded earlier in the notebook.
index.fit(documents)

# --------------------
# Search function with boosting
# --------------------
def minsearch_search(query_obj):
    """Search the module-level minsearch `index` for one ground-truth record.

    The record's `question` is the query, results are filtered to the
    record's `course`, the `question` field is boosted by 1.5 and the
    `section` field by 0.1, and 5 results are requested.

    Parameters
    ----------
    query_obj : dict
        Record providing `question` and `course` keys.

    Returns
    -------
    Whatever `index.search` returns for this query.
    """
    return index.search(
        query=query_obj['question'],
        filter_dict={'course': query_obj['course']},
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

In [12]:
# Run the boosted minsearch text search on every ground-truth record and
# print the resulting hit rate and MRR.
results = evaluate(ground_truth, minsearch_search)
print(results)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:14<00:00, 326.25it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}





Q2: Vector search using embeddings of the question field only

In [14]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Embed only the `question` field of each document. A comprehension keeps the
# loop temporaries out of the notebook namespace.
texts = [doc['question'] for doc in documents]

# TF-IDF (terms must occur in at least 3 texts) followed by a truncated SVD
# down to 128 components; random_state is fixed so reruns are reproducible.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [15]:
# --------------------
# Fit vector index
# --------------------
# `X` was produced by `pipeline.fit_transform(texts)`, where `texts` was built
# from `documents` in order, so the two are fitted together. "course" is the
# keyword field the search function filters on at query time.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x77eb622ad280>

In [16]:
def vector_search_fn(q):
    """Vector-search the module-level `vindex` for one ground-truth record.

    The record's `question` is transformed with the module-level `pipeline`
    and the resulting vector is searched within the record's `course`,
    requesting 5 results.

    NOTE: `pipeline` and `vindex` are looked up when the function is called,
    and both names are reassigned later in this notebook, so calling this
    after those later cells have run uses the newer objects.

    Parameters
    ----------
    q : dict
        Record providing `question` and `course` keys.

    Returns
    -------
    Whatever `vindex.search` returns for this query.
    """
    query_vector = pipeline.transform([q['question']])
    course_filter = {'course': q['course']}
    return vindex.search(query_vector, filter_dict=course_filter, num_results=5)

In [17]:
# Run the question-only vector search on every ground-truth record and print
# the resulting hit rate and MRR.
results = evaluate(ground_truth, vector_search_fn)
print(results)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:07<00:00, 599.34it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3571284489590088}





Q3: Vector search using embeddings of the question and text fields combined

In [21]:
# Embed each document as its question and answer text joined by one space.
texts = [' '.join((doc['question'], doc['text'])) for doc in documents]

# Same recipe as before: TF-IDF with min_df=3, then a 128-component truncated
# SVD with a fixed random_state. This rebinds `pipeline` and `X`.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)

X = pipeline.fit_transform(texts)

In [22]:
# --------------------
# Fit vector index
# --------------------
# Rebuilds `vindex` from the question+text matrix `X` computed in the previous
# cell, replacing the index fitted earlier in the notebook. "course" is the
# keyword field the search function filters on at query time.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x77eb62d483b0>

In [23]:
def vector_search_combined(q):
    """Vector-search the question+text index for one ground-truth record.

    Only the record's `question` is used as the query text; it is transformed
    with the module-level `pipeline` and searched in the module-level
    `vindex`, filtered to the record's `course`, requesting 5 results.

    Parameters
    ----------
    q : dict
        Record providing `question` and `course` keys.

    Returns
    -------
    Whatever `vindex.search` returns for this query.
    """
    vec = pipeline.transform([q['question']])
    return vindex.search(
        vec,
        filter_dict={'course': q['course']},
        num_results=5,
    )

In [24]:
# Run the question+text vector search on every ground-truth record and print
# the resulting hit rate and MRR.
results = evaluate(ground_truth, vector_search_combined)
print(results)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:08<00:00, 531.51it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717707657949719}



