In [3]:
import requests
import pandas as pd

# Download the FAQ documents
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

# Download the ground truth Q&A pairs
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

# Sanity check
print(f"✅ Loaded {len(documents)} documents")
print(f"✅ Loaded {len(ground_truth)} ground truth question-answer pairs")


✅ Loaded 948 documents
✅ Loaded 4627 ground truth question-answer pairs


In [4]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt = cnt + 1
    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)
                break
    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }




In [None]:
# Question 1

from minsearch import Index

index = Index(text_fields=["question", "section", "text"],
              keyword_fields=["course", "id"])
index.fit(documents)

def search_minsearch(query):
    return index.search(
        query["question"],
        filter_dict={},  # no keyword filter except id matching in evaluation wrapper
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5
    )

results = evaluate(ground_truth, search_minsearch)
print(results)


100%|██████████| 4627/4627 [00:34<00:00, 135.46it/s]

{'hit_rate': 0.8013831856494489, 'mrr': 0.6815251062603574}





In [11]:
queries = [
    {
        "query": "How to set up MLflow?",
        "course": "mlops-zoomcamp"
    },
    {
        "query": "What is the difference between precision and recall?",
        "course": "ml-zoomcamp"
    },
    {
        "query": "What is batch size?",
        "course": "ml-zoomcamp"
    },
    {
        "query": "What is Weaviate?",
        "course": "llm-zoomcamp"
    },
    {
        "query": "What are agents in LangChain?",
        "course": "llm-zoomcamp"
    }
]




In [17]:
# 🧾 Step 1: Confirm the structure of ground_truth
print("Sample from ground_truth:")
print(ground_truth[0])


Sample from ground_truth:
{'question': 'When does the course begin?', 'course': 'data-engineering-zoomcamp', 'document': 'c02e79ef'}


In [None]:
# Question 2:

from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Step 1: Extract questions
texts = [doc['question'] for doc in documents]

# Step 2: Create embeddings
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

# Step 3: Index with vector search
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)


In [18]:
from tqdm import tqdm

# ✅ Evaluation metrics
def hit_rate(results):
    return sum([any(r) for r in results]) / len(results)

def mrr(results):
    total = 0
    for rel in results:
        for i, is_relevant in enumerate(rel):
            if is_relevant:
                total += 1 / (i + 1)
                break
    return total / len(results)

# ✅ Evaluation wrapper
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

# ✅ Corrected vector search using the correct key ('question' instead of 'query')
def search_vector(query):
    query_text = query['question']  # <-- adjust key here if different
    x = pipeline.transform([query_text])
    results = vindex.search(x)
    return results

# ✅ Run evaluation
results = evaluate(ground_truth, search_vector)
print(results)


100%|██████████| 4627/4627 [00:15<00:00, 290.58it/s]

{'hit_rate': 0.4696347525394424, 'mrr': 0.29996269309539225}





In [None]:
# Question 3: 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from tqdm import tqdm

# Step 1: Combine question and answer text
texts = [doc["question"] + " " + doc["text"] for doc in documents]

# Step 2: TF-IDF + SVD Pipeline
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128)
)

# Step 3: Fit + transform
X = pipeline.fit_transform(texts)

# Step 4: Rebuild vector index using `.fit()` with correct order
vindex = VectorSearch(keyword_fields=["question"])
vindex.fit(X, documents)

# Step 5: Define search function
def search_vector(query):
    x = pipeline.transform([query["question"]])
    return vindex.search(x)


# Step 6: Evaluate
results = evaluate(ground_truth, search_vector)
print(results)


100%|██████████| 4627/4627 [00:23<00:00, 198.82it/s]

{'hit_rate': 0.8428787551329155, 'mrr': 0.6234184445336386}





In [28]:
texts = []
for doc in documents:
    text = doc['question'] + ' ' + doc['text']
    texts.append(text)


In [34]:
documents_small = documents[:1000]
ground_truth_small = ground_truth[:200]


In [35]:
texts = [doc['question'] + ' ' + doc['text'] for doc in documents_small]


In [None]:
# Question 4:

from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm

# Load from cache
model = SentenceTransformer("jinaai/jina-embeddings-v2-small-en")

# Move to GPU if available
if torch.cuda.is_available():
    model = model.to("cuda")

# Batch encode for speed
batch_size = 32
vectors = []

for i in tqdm(range(0, len(texts), batch_size)):
    batch = texts[i:i+batch_size]
    batch_vecs = model.encode(batch, show_progress_bar=False)
    vectors.extend(batch_vecs)


Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

In [37]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
import numpy as np

client = QdrantClient(":memory:")  # super fast

client.recreate_collection(
    collection_name="q4_test",
    vectors_config=VectorParams(size=len(vectors[0]), distance=Distance.COSINE)
)

# Upload vectors to Qdrant
points = [
    PointStruct(id=i, vector=vec, payload=documents_small[i])
    for i, vec in enumerate(vectors)
]

client.upload_points(collection_name="q4_test", points=points, parallel=2)


  client.recreate_collection(


In [41]:
def search_qdrant(query):
    query_text = query['question']  # ← FIXED HERE
    query_vec = model.encode(query_text)

    results = client.search(
        collection_name="q4_test",
        query_vector=query_vec,
        limit=5
    )
    return [{"id": r.payload['id']} for r in results]



In [42]:
evaluate(ground_truth_small, search_qdrant)


  results = client.search(
100%|██████████| 200/200 [00:06<00:00, 28.93it/s]


{'hit_rate': 0.345, 'mrr': 0.23283333333333334}

In [None]:
# Question 5: 

import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# ✅ Load the local CSV file
df_results = pd.read_csv("results-gpt4o-mini.csv")

# ✅ Create a TF-IDF + SVD pipeline
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# ✅ Fit on all text: LLM answers, original answers, and questions
pipeline.fit(
    df_results["answer_llm"] + " " +
    df_results["answer_orig"] + " " +
    df_results["question"]
)

# ✅ Transform the LLM and original answers
v_llm = pipeline.transform(df_results["answer_llm"])
v_orig = pipeline.transform(df_results["answer_orig"])

# ✅ Cosine similarity function (row-wise)
def cosine_batch(u, v):
    u_norm = np.sqrt((u * u).sum(axis=1))
    v_norm = np.sqrt((v * v).sum(axis=1))
    dot_product = (u * v).sum(axis=1)
    return dot_product / (u_norm * v_norm)

# ✅ Compute average cosine similarity
cosines = cosine_batch(v_llm, v_orig)
average_cosine = cosines.mean()

print("✅ Average cosine similarity:", round(average_cosine, 4))


✅ Average cosine similarity: 0.8416


In [None]:
# Question 6: 

from rouge import Rouge
from tqdm import tqdm

rouge = Rouge()

# Store Rouge-1 F1 scores
rouge_1_f1_scores = []

for _, row in tqdm(df_results.iterrows(), total=len(df_results)):
    try:
        scores = rouge.get_scores(row['answer_llm'], row['answer_orig'])[0]
        rouge_1_f1_scores.append(scores['rouge-1']['f'])
    except:
        # In case of empty or problematic rows
        rouge_1_f1_scores.append(0)

# Compute average
avg_rouge_1_f1 = sum(rouge_1_f1_scores) / len(rouge_1_f1_scores)
print(f"✅ Average Rouge-1 F1: {avg_rouge_1_f1:.4f}")


100%|██████████| 1830/1830 [00:13<00:00, 139.95it/s]

✅ Average Rouge-1 F1: 0.3517



