In [1]:
pip install -U minsearch qdrant_client

Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [3]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import minsearch

# --------------------
# Create the Minsearch index
# --------------------
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)
index.fit(documents)

# --------------------
# Search function with boosting
# --------------------
def minsearch_search(query_obj):
    boost = {'question': 1.5, 'section': 0.1}

    results = index.search(
        query=query_obj['question'],
        filter_dict={'course': query_obj['course']},
        boost_dict=boost,
        num_results=5
    )
    return results

In [5]:
results = evaluate(ground_truth, minsearch_search)
print(results)

100%|██████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:14<00:00, 323.54it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}





Q2

In [6]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [7]:
# --------------------
# Fit vector index
# --------------------
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x76dca4d48d40>

In [8]:
def vector_search_fn(q):
    vec = pipeline.transform([q['question']])
    return vindex.search(vec, filter_dict={'course': q['course']}, num_results=5)

In [9]:
results = evaluate(ground_truth, vector_search_fn)
print(results)

100%|██████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:07<00:00, 592.14it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3571284489590088}





Q3

In [10]:
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

X = pipeline.fit_transform(texts)

In [11]:
# --------------------
# Fit vector index
# --------------------
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x76dca520b920>

In [12]:
def vector_search_combined(q):
    query_text = q['question']  # only question used for query
    vec = pipeline.transform([query_text])
    return vindex.search(vec, filter_dict={'course': q['course']}, num_results=5)

In [13]:
results = evaluate(ground_truth, vector_search_combined)
print(results)

100%|██████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:08<00:00, 538.21it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717707657949719}





Q4

docker pull qdrant/qdrant

docker run -p 6333:6333 -p 6334:6334 -v "$(pwd)/qdrant_storage:/qdrant/storage:z" qdrant/qdrant

In [10]:
from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models
import numpy as np
from qdrant_client import QdrantClient, models

EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
client = QdrantClient("http://localhost:6333") #connecting to local Qdrant instance

In [31]:
collection_name = "zoomcamp-evaluations"
client.delete_collection(collection_name=collection_name)

True

In [32]:

# Create the collection with specified vector parameters
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)

True

In [33]:
client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword" # exact matching on string metadata fields
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [34]:
points = []

for i, doc in enumerate(documents):
    text = doc['question'] + ' ' + doc['text']
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

In [35]:
client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [36]:
def search_qdrant(query_dict):
    
    # Extract question from the query_dict
    question = query_dict['question']
    course = query_dict['course']
    
    query_points = client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle 
        ),
        query_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=5,
        with_payload=True
    )

    results = []
    for point in query_points.points:
        results.append(point.payload)  # Return the payload (document data)

    return results

In [37]:
results = evaluate(ground_truth, search_qdrant)
print(results)

100%|███████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [01:12<00:00, 63.89it/s]

{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}





Q5

In [1]:
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

In [4]:
import pandas as pd

results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)
df_results_dict = df_results.to_dict(orient='records')

In [7]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [8]:
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [11]:
cosine_scores = []

for doc in df_results_dict:
    v_llm = pipeline.transform([doc['answer_llm']])
    v_orig = pipeline.transform([doc['answer_orig']]) 

    cosine_doc = cosine(v_llm[0], v_orig[0])
    cosine_scores.append(cosine_doc)

avg_cosine = np.mean(cosine_scores)

In [12]:
avg_cosine

np.float64(0.8415841233490402)

Q6

In [13]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [14]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [15]:
rouge_scores_rouge1_f1 = []

for doc in df_results_dict:

    rouge_score = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
    rouge_score_rouge1_f1 = rouge_score['rouge-1']['f']
    rouge_scores_rouge1_f1.append(rouge_score_rouge1_f1)

np.mean(rouge_scores_rouge1_f1)

np.float64(0.4545454495454545)