## Evaluation data

In [1]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# Documents to search over: a JSON list of records, each carrying an 'id'.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=30)
# Fail here with a clear HTTP error instead of an opaque JSON decode error
# further down when the download did not succeed.
docs_response.raise_for_status()
documents = docs_response.json()

# Ground truth: one record per evaluation question, holding the question text,
# its course and the id of the document expected to answer it.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [2]:
# Preview the first three documents to check the record layout.
documents[:3]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '1f6520ca'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware

In [3]:
# Preview the first three ground-truth records to check the record layout.
ground_truth[:3]

[{'question': 'When does the course begin?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I get the course schedule?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What is the link for course registration?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'}]

In [4]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: one sequence of booleans per query; the entry at
            position ``i`` is true when the result at rank ``i`` is the
            expected document.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``relevance_total`` is empty.
    """
    if len(relevance_total) == 0:
        # Nothing was evaluated: report 0.0 rather than raise ZeroDivisionError.
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query only the first relevant result counts: it contributes
    ``1 / rank`` (ranks start at 1). A query with no relevant result
    contributes 0.

    Args:
        relevance_total: one sequence of booleans per query; the entry at
            position ``i`` is true when the result at rank ``i`` is the
            expected document.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``relevance_total`` is empty.
    """
    if len(relevance_total) == 0:
        # Nothing was evaluated: report 0.0 rather than raise ZeroDivisionError.
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit; summing later hits as well would
                # let a single query score more than 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Every ground-truth record is handed unchanged to ``search_function``.
    Each returned result is compared, by its ``'id'``, with the record's
    ``'document'`` value, which yields one list of booleans per record.

    Returns:
        A dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    def relevance_of(record):
        # Read the expected id first, then run the search for this record.
        expected_id = record['document']
        return [hit['id'] == expected_id for hit in search_function(record)]

    per_query = [relevance_of(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(per_query),
        'mrr': mrr(per_query),
    }

## Evaluate minsearch

In [5]:
import minsearch

# Text-search index over the documents: "question", "text" and "section" are
# registered as text fields, "course" and "id" as keyword fields.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

# Fit on the full document list; `index` is read as a global by minsearch_search.
index.fit(documents)

<minsearch.minsearch.Index at 0x71d2153c90a0>

In [6]:
def minsearch_search(query, course, boost=None, num_results=5):
    """Search the notebook-level ``index`` for ``query`` within one course.

    Args:
        query: the query text.
        course: value the ``'course'`` field of a result must match.
        boost: optional dict mapping field name to boost weight. When omitted,
            ``{'question': 3.0, 'section': 0.5}`` is used.
        num_results: maximum number of results to request (default 5).

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    if boost is None:
        # Build the default per call so no dict object is shared between calls.
        boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [7]:
# Field boosts for this run; they are passed explicitly, so they take the place
# of minsearch_search's default boosts.
boost = {'question': 1.5, 'section': 0.1}

# Each ground-truth record `q` supplies the query text and the course to filter by.
minsearch_eval = evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course'], boost=boost))

  0%|          | 0/4627 [00:00<?, ?it/s]

In [8]:
# Report both text-search metrics with four decimal places.
print(f"Hit Rate: {minsearch_eval['hit_rate']:.4f}")
print(f"MRR: {minsearch_eval['mrr']:.4f}")

Hit Rate: 0.8487
MRR: 0.7288


## Embeddings

In [9]:
from minsearch import VectorSearch

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [11]:
# Corpus for the first embedding experiment: only the question of each document.
texts = [doc['question'] for doc in documents]

In [12]:
# TF-IDF followed by truncated SVD: turns each text into a 128-component vector.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),  # min_df=3: ignore terms that appear in fewer than 3 texts
    TruncatedSVD(n_components=128, random_state=1)  # fixed seed for repeatable output
)

# Fit on the corpus and get one vector per text. The fitted `pipeline` is read
# as a global by minsearch_vector_search to embed queries.
X = pipeline.fit_transform(texts)

In [13]:
# Vector index over the embedded documents; 'course' is declared as a keyword
# field, and it is the key used in filter_dict when searching.
vindex = VectorSearch(keyword_fields={'course'})
# NOTE(review): presumably row i of X belongs to documents[i] — confirm against minsearch.
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x71d1e1c46480>

In [14]:
def minsearch_vector_search(vector_index, query, course):
    """Embed ``query`` and return up to 5 matches from ``vector_index`` for one course.

    The query is embedded with the notebook-level ``pipeline`` (it is not a
    parameter), so that object is expected to be the same fitted pipeline
    that produced the vectors stored in ``vector_index``.

    Args:
        vector_index: index object exposing ``search``.
        query: the query text.
        course: value the ``'course'`` field of a result must match.

    Returns:
        Whatever ``vector_index.search`` returns for these arguments.
    """
    embedded = pipeline.transform([query])
    course_filter = {'course': course}
    return vector_index.search(embedded, filter_dict=course_filter, num_results=5)

In [15]:
# Evaluate vector search over the question-only embeddings. `vindex` is looked
# up each time the lambda runs, i.e. whichever index is bound to that name then.
vector_eval = evaluate(ground_truth, lambda q: minsearch_vector_search(vindex, q['question'], q['course']))

# Report both metrics with four decimal places.
print(f"Hit Rate: {vector_eval['hit_rate']:.4f}")
print(f"MRR: {vector_eval['mrr']:.4f}")

  0%|          | 0/4627 [00:00<?, ?it/s]

Hit Rate: 0.4817
MRR: 0.3571


## Vector search for question and answer

In [16]:
# Corpus for the second embedding experiment: question and answer text of each
# document joined by a single space. This rebinds `texts`.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

In [17]:
# Refit the same pipeline object on the current `texts`. This rebinds X and
# vindex, so the vectors and index built by the earlier cells are replaced and
# queries are embedded with the refitted pipeline from here on.
X = pipeline.fit_transform(texts)

vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x71d1f1efb860>

In [18]:
# Evaluate vector search again; `vindex` is looked up each time the lambda runs,
# so this uses whichever index is currently bound to that name.
upd_vector_eval = evaluate(ground_truth, lambda q: minsearch_vector_search(vindex, q['question'], q['course']))

# Report both metrics with four decimal places.
print(f"Hit Rate: {upd_vector_eval['hit_rate']:.4f}")
print(f"MRR: {upd_vector_eval['mrr']:.4f}")

  0%|          | 0/4627 [00:00<?, ?it/s]

Hit Rate: 0.8211
MRR: 0.6718


## Qdrant

In [19]:
# Embedding model name; passed as model= to models.Document both when the
# documents are upserted and when a query is sent.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [None]:
from qdrant_client import QdrantClient, models

In [None]:

# Client for a Qdrant instance addressed at localhost:6333.
client = QdrantClient("http://localhost:6333")

collection_name = 'evaluation-search'

# Create the collection only when it does not exist yet.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            # NOTE(review): 512 presumably equals the output dimension of model_handle — confirm.
            size=512,
            distance=models.Distance.COSINE,
        )
    )

In [30]:
def qdrant_search(query, course, limit=1):
    """Query the Qdrant collection for ``query`` and return the payloads found.

    Reads the notebook-level ``client``, ``collection_name`` and
    ``model_handle``.

    Args:
        query: the query text, sent as a ``models.Document`` with ``model_handle``.
        course: value the ``"course"`` payload field must match.
        limit: maximum number of points to request. The default is 1; the
            evaluation cell passes ``limit=5``.

    Returns:
        A list with the payload of each returned point, in response order.
    """
    query_document = models.Document(text=query, model=model_handle)
    course_filter = models.Filter(
        must=[
            models.FieldCondition(key="course", match=models.MatchValue(value=course)),
        ]
    )

    response = client.query_points(
        collection_name=collection_name,
        query=query_document,
        limit=limit,
        with_payload=True,
        query_filter=course_filter,
    )
    return [point.payload for point in response.points]

In [23]:
# One point per document: the point id is the document's position in `documents`,
# the text to embed is question + ' ' + text, and the original fields are kept
# as payload so that search results can be matched by 'id'.
points = [
    models.PointStruct(
        id=idx,
        vector=models.Document(text=doc['question'] + ' ' + doc['text'], model=model_handle),
        payload={key: doc[key] for key in ("text", "section", "question", "course", "id")},
    )
    for idx, doc in enumerate(documents)
]

client.upsert(collection_name=collection_name, points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [32]:
# Evaluate Qdrant search; limit=5 is passed explicitly because qdrant_search
# defaults to limit=1.
qdrant_eval = evaluate(ground_truth, lambda q: qdrant_search(q['question'], q['course'], limit=5))

  0%|          | 0/4627 [00:00<?, ?it/s]

In [33]:
# Report both Qdrant metrics with four decimal places.
print(f"Hit Rate: {qdrant_eval['hit_rate']:.4f}")
print(f"MRR: {qdrant_eval['mrr']:.4f}")

Hit Rate: 0.9300
MRR: 0.8518
