## Homework

In [105]:
import requests
import pandas as pd
from minsearch import Index, VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from tqdm.auto import tqdm
import warnings
import numpy as np
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding


warnings.filterwarnings("ignore", category=RuntimeWarning, module="sklearn.utils.extmath")
warnings.filterwarnings("ignore", category=RuntimeWarning)

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [52]:
len(documents), len(ground_truth)

(948, 4627)

In [53]:
documents[:3]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '1f6520ca'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware

In [54]:
ground_truth[:3]

[{'question': 'When does the course begin?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I get the course schedule?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What is the link for course registration?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'}]

In [55]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

### Q1 - minserach text

In [56]:
boost = {'question': 1.5, 'section': 0.1}

index = Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"],
)
index.fit(documents)

result_evaluation = evaluate( 
    ground_truth, 
    lambda q: index.search(
        query=q['question'], filter_dict={"course": q["course"]}, boost_dict=boost, num_results=5
    )
)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [57]:
print(f"hit_rate: {result_evaluation['hit_rate']:.4f}")

hit_rate: 0.8487


### Q2 - evaluate MRR for embedding index

In [58]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)


<minsearch.vector.VectorSearch at 0x131d8d250>

In [None]:
result_vindex = evaluate(
    ground_truth, 
    lambda q: vindex.search(
        pipeline.transform([q['question']]),
        filter_dict={"course": q["course"]}, 
        num_results=5
    )
)
print("MRR:", result_vindex['mrr'])


  0%|          | 0/4627 [00:00<?, ?it/s]

MRR: 0.3573085512571141


### Q3 - vector search for question and answer

In [60]:
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)


<minsearch.vector.VectorSearch at 0x132e3fe00>

In [63]:
result_vindex = evaluate(
    ground_truth, 
    lambda q: vindex.search(
        pipeline.transform([q['question']]),
        filter_dict={"course": q["course"]}, 
        num_results=5
    )
)
print("hit rate:", result_vindex['hit_rate'])

  0%|          | 0/4627 [00:00<?, ?it/s]

hit rate: 0.8210503566025502


### Q4 - MRR in Qdrant

In [111]:
qd_client = QdrantClient("http://localhost:6333")
model_handle = "jinaai/jina-embeddings-v2-small-en"
collection_name = "qdrant_search_evaluation"
qd_client.delete_collection(collection_name)
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=512,
        distance=models.Distance.COSINE
    ),
)

True

In [72]:
points = []

for i, doc in enumerate(documents):
    text = doc["question"] + " " + doc["text"] 
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)


qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [99]:
def search_qdrant(query, limit=5):

    results = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model=model_handle 
        ),
        limit=limit, # top closest matches
        with_payload=True #to get metadata in the results
    )

    return [r.payload for r in results.points]

In [102]:
result_qdrant = evaluate(
    ground_truth, 
    lambda q: search_qdrant(
        q["question"]
    )
)

  0%|          | 0/4627 [00:00<?, ?it/s]

hit rate: 0.8210503566025502


In [103]:
print("MRR:", result_vindex['mrr'])

MRR: 0.6717347453353508


### Q5 - cosine similarity

In [124]:
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u.T))
    v_norm = np.sqrt(v.dot(v.T))
    return (u.dot(v.T) / (u_norm * v_norm)).item()

In [113]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [114]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [115]:
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [125]:
cosine_pairs = []

for i, row in df_results.iterrows():
    v_llm = pipeline.transform([row['answer_llm']])
    v_orig = pipeline.transform([row['answer_orig']])
    similarity = cosine(v_llm, v_orig)
    cosine_pairs.append(similarity)

average_cosine = np.mean(cosine_pairs)


In [126]:
print("Average cosine similarity:", average_cosine)

Average cosine similarity: 0.8415841233490403


### Q6 - rouge

In [127]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [128]:
rouge_pairs = []

for i, row in df_results.iterrows():
    scores = rouge_scorer.get_scores(row.answer_llm, row.answer_orig)[0]
    rouge_pairs.append(scores['rouge-1']['f'])

In [129]:
print("Average R1 F1 score:", np.mean(rouge_pairs))

Average R1 F1 score: 0.3516946452113943
