In [1]:
!uv pip install minsearch qdrant_client

[2K[2mResolved [1m31 packages[0m [2min 689ms[0m[0m                                        [0m
[2K[2mPrepared [1m1 package[0m [2min 94ms[0m[0m                                               
[2K[2mInstalled [1m6 packages[0m [2min 262ms[0m[0m                               [0m
 [32m+[39m [1mjoblib[0m[2m==1.5.1[0m
 [32m+[39m [1mminsearch[0m[2m==0.0.4[0m
 [32m+[39m [1mpandas[0m[2m==2.3.1[0m
 [32m+[39m [1mscikit-learn[0m[2m==1.7.1[0m
 [32m+[39m [1mscipy[0m[2m==1.16.1[0m
 [32m+[39m [1mthreadpoolctl[0m[2m==3.6.0[0m


In [2]:
import minsearch

# Show the installed minsearch version so the run records which release
# produced the numbers below.
minsearch.__version__

'0.0.4'

In [3]:
import requests
import pandas as pd

# Base location of the evaluation assets in the DataTalksClub/llm-zoomcamp repo.
url_prefix = (
    "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/"
)

# Document collection (JSON). A timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() turns an HTTP error into an explicit
# exception instead of a confusing JSON decode failure on the error page.
docs_url = url_prefix + "search_evaluation/documents-with-ids.json"
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Ground truth (CSV) converted to a list of dicts, one per query. Later cells
# read the "question", "course" and "document" keys of each record.
ground_truth_url = url_prefix + "search_evaluation/ground-truth-data.csv"
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient="records")

In [4]:
from tqdm.auto import tqdm


def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of relevance flags for that query's results.

    Returns:
        Number of entries containing ``True`` divided by the number of entries.

    Raises:
        ZeroDivisionError: If ``relevance_total`` is empty.
    """
    # Membership test (not truthiness) decides whether a query counts as a hit.
    queries_with_hit = sum(1 for flags in relevance_total if True in flags)
    return queries_with_hit / len(relevance_total)


def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query only the first relevant result contributes, with score
    ``1 / rank`` (ranks start at 1). A query with no relevant result
    contributes 0.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of relevance flags ordered by result rank.

    Returns:
        Sum of the per-query reciprocal ranks divided by the number of queries.

    Raises:
        ZeroDivisionError: If ``relevance_total`` is empty.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first relevant result: counting later ones too
                # would let a single query score above 1.0.
                break

    return total_score / len(relevance_total)


def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth queries.

    Args:
        ground_truth: Iterable of query records; each record must provide a
            "document" key holding the id of the expected document.
        search_function: Callable that receives one full query record and
            returns an iterable of result dicts, each with an "id" key.

    Returns:
        Dict with the keys "hit_rate" and "mrr" computed over all queries.
    """
    per_query_flags = []

    for record in tqdm(ground_truth):
        expected_id = record["document"]
        hits = search_function(record)
        # One flag per returned result: does it match the expected document?
        per_query_flags.append([hit["id"] == expected_id for hit in hits])

    hit_rate_value = hit_rate(per_query_flags)
    mrr_value = mrr(per_query_flags)
    return {"hit_rate": hit_rate_value, "mrr": mrr_value}

In [5]:
import minsearch

# Text-search index over the documents: "question", "section" and "text" are
# registered as text fields, "course" and "id" as keyword fields. The "course"
# keyword is what minsearch_search filters on.
index = minsearch.Index(
    text_fields=["question", "section", "text"], keyword_fields=["course", "id"]
)

# Fit on the downloaded documents; as the last expression, the returned object
# is what the cell displays.
index.fit(documents)

<minsearch.minsearch.Index at 0x10628fb30>

In [6]:
def minsearch_search(query, course):
    """Query the module-level text index, restricted to a single course.

    Args:
        query: Query text passed to ``index.search``.
        course: Value the "course" keyword field is filtered on.

    Returns:
        Whatever ``index.search`` returns for at most 5 results, with the
        "question" field boosted by 1.5 and the "section" field by 0.1.
    """
    return index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict={"question": 1.5, "section": 0.1},
        num_results=5,
    )

In [7]:
# Relevance flags for the minsearch text search: one list per ground-truth
# query, holding True wherever a returned document's "id" equals the query's
# expected "document" id. Same per-query logic as evaluate(), but the flags are
# kept in relevance_total so the next cell can score them.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q["document"]
    results = minsearch_search(query=q["question"], course=q["course"])
    relevance = [d["id"] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [8]:
# (hit rate, MRR) for the relevance flags collected from the text search.
hit_rate(relevance_total), mrr(relevance_total)

(0.848714069591528, 0.7288235717887772)

In [9]:
from minsearch import VectorSearch

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [37]:
# Corpus for the first embedding run: only the "question" field of each document.
texts = []

for doc in documents:
    t = doc["question"]
    texts.append(t)

# TF-IDF features (min_df=3) followed by a truncated SVD down to 128
# components; random_state is fixed so the SVD is repeatable across runs.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3), TruncatedSVD(n_components=128, random_state=1)
)
# X holds one embedding row per entry of `texts`.
X = pipeline.fit_transform(texts)

In [48]:
# Transposed view of the embeddings for inspection: after .T each column
# corresponds to one input text and each row to one SVD component.
df_docs = pd.DataFrame(X).T
# Rounded to two decimals for display only; df_docs itself is not modified.
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
0,0.20,0.27,0.25,0.33,0.25,0.23,0.25,0.22,0.30,0.23,...,0.10,0.11,0.09,0.28,0.25,0.14,0.19,0.22,0.01,0.20
1,-0.19,-0.34,-0.24,-0.21,-0.33,-0.14,-0.17,-0.24,-0.27,-0.08,...,0.00,0.06,0.08,0.10,0.08,0.12,0.45,0.29,0.01,-0.04
2,-0.10,-0.14,-0.11,-0.05,-0.05,0.14,-0.06,-0.08,-0.09,0.10,...,0.05,-0.08,-0.05,-0.27,-0.24,-0.08,0.10,0.13,-0.02,0.29
3,0.16,0.27,0.24,0.13,0.28,-0.01,0.13,0.22,0.18,0.11,...,-0.09,-0.09,-0.15,-0.22,-0.21,-0.14,0.34,0.23,-0.03,-0.09
4,-0.14,-0.17,-0.17,-0.02,-0.24,-0.18,0.05,-0.16,-0.24,0.03,...,-0.18,-0.09,-0.19,-0.05,-0.04,-0.03,-0.16,0.05,-0.02,0.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,-0.05,0.01,0.01,-0.04,0.01,-0.01,-0.01,-0.01,0.01,-0.04,...,-0.09,-0.04,-0.02,0.01,-0.01,0.00,-0.01,-0.09,-0.06,0.02
124,0.05,0.01,-0.01,-0.02,-0.04,0.01,0.08,-0.02,-0.01,-0.01,...,0.03,0.01,-0.01,-0.01,-0.01,-0.04,-0.11,0.02,0.01,0.05
125,0.04,-0.05,0.03,-0.06,-0.04,0.00,0.04,-0.04,-0.03,-0.04,...,-0.08,-0.02,-0.00,-0.01,-0.02,0.02,0.12,0.04,-0.03,0.12
126,0.03,0.01,-0.02,0.02,-0.01,-0.03,0.02,-0.04,-0.06,-0.04,...,0.06,-0.03,-0.02,-0.03,-0.02,-0.04,-0.08,-0.03,-0.02,0.04


In [38]:
# Vector index built from the embeddings in X together with the source
# documents; "course" is registered as a keyword field, which
# minsearch_vector_search uses as its filter.
vindex = VectorSearch(keyword_fields={"course"})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x13d59f620>

In [72]:
def minsearch_vector_search(vector, course):
    """Query the module-level vector index, restricted to a single course.

    Args:
        vector: Query embedding handed to ``vindex.search`` unchanged.
        course: Value the "course" keyword field is filtered on.

    Returns:
        Whatever ``vindex.search`` returns for at most 5 results.
    """
    course_filter = {"course": course}
    hits = vindex.search(vector, filter_dict=course_filter, num_results=5)
    return hits


def question_text_vector(q):
    """Embed a ground-truth query and run it through the vector search.

    Args:
        q: Query record providing the "question" and "course" keys.

    Returns:
        The result of ``minsearch_vector_search`` for the embedded question,
        filtered to the record's course.
    """
    # Read both keys up front so a malformed record fails before any embedding work.
    question_text, course_name = q["question"], q["course"]

    # The pipeline expects a batch, so the single question is wrapped in a list.
    embedded_question = pipeline.transform([question_text])

    return minsearch_vector_search(embedded_question, course_name)

In [73]:
# Relevance flags for the vector search: same per-query comparison as before
# (returned "id" vs. expected "document" id), but results now come from
# question_text_vector. relevance_total is rebuilt from scratch.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q["document"]
    results = question_text_vector(q)
    relevance = [d["id"] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [74]:
# (hit rate, MRR) for the relevance flags collected from the vector search.
hit_rate(relevance_total), mrr(relevance_total)

(0.48173762697212014, 0.3572833369353793)

In [75]:
# Second corpus: each document's "question" and "text" joined by a single space.
texts = []

for doc in documents:
    t = doc["question"] + " " + doc["text"]
    texts.append(t)

# Refit the existing pipeline object on the new corpus. This replaces its
# previously fitted state and overwrites X, so later transform() calls use the
# question + text model.
X = pipeline.fit_transform(texts)

In [76]:
# Rebuild the vector index from the current X and rebind `vindex`, so
# minsearch_vector_search (which reads the module-level name) now searches the
# new embeddings.
vindex = VectorSearch(keyword_fields={"course"})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x13d651f10>

In [77]:
# Re-collect the relevance flags with question_text_vector, which now uses
# whatever `pipeline` and `vindex` currently hold. relevance_total is rebuilt
# from scratch.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q["document"]
    results = question_text_vector(q)
    relevance = [d["id"] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [78]:
# (hit rate, MRR) for the most recently collected relevance flags.
hit_rate(relevance_total), mrr(relevance_total)

(0.8210503566025502, 0.6717347453353508)