# Q1. Getting the embeddings model

In [1]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Encode the sample user question into an embedding vector with `embedding_model`.
# `v` is reused by later cells as the query vector.

user_question = "I just discovered the course. Can I still join it?"
v = embedding_model.encode(user_question)

In [3]:
v[0:10]

array([ 0.07822265, -0.04013119,  0.03861362, -0.00017901,  0.08923462,
       -0.05045912, -0.01050266,  0.03710563, -0.04187142,  0.03480854],
      dtype=float32)

> First value of resulting vector: 0.078

## Prepare the documents 

Now we will create the embeddings for the documents.

In [4]:
import requests

# `?raw=1` is appended to the GitHub blob URL so that the response body is the
# JSON file itself (it is parsed with `.json()` below).
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
# Fail loudly on an HTTP error (4xx/5xx) instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents = docs_response.json()

In [5]:
# documents

In [6]:
# Keep only the documents of the Machine Learning Zoomcamp course.
# Note: this rebinds `documents` to the filtered list; the unfiltered list is no longer available.
documents = [d for d in documents if d['course'] == "machine-learning-zoomcamp"]
len(documents)

375

# Q2. Creating the embeddings

Now, for each document, we will create a single embedding from its question and answer (`text`) fields combined.

In [7]:
from tqdm.auto import tqdm


# One embedding per document, appended in the same order as `documents`.
embeddings = []

for d in tqdm(documents):
    # Embed the question and the answer text together as a single string.
    qa_text = '{question} {text}'.format(**d)
    v_qa = embedding_model.encode(qa_text)
    embeddings.append(v_qa)

  0%|          | 0/375 [00:00<?, ?it/s]

In [8]:
import numpy as np
X = np.array(embeddings)
X.shape

(375, 768)

> Answer: shape of X is (375, 768)

# Q3. Search

In [9]:
scores = X.dot(v)

In [10]:
scores.max()

np.float32(0.6506576)

> The highest score in the results is 0.65.

In [16]:
scores.argmax()

np.int64(14)

In [17]:
documents[14]

{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'section': 'General course-related questions',
 'question': 'The course has already started. Can I still join it?',
 'course': 'machine-learning-zoomcamp',
 'id': 'ee58a693'}

## Vector search

In [11]:
import numpy as np

class VectorSearchEngine:
    """
    A simple brute-force vector search engine.

    Every document embedding is scored against the query vector with a dot
    product and the best-scoring documents are returned. The dot product is
    equal to cosine similarity only when both the document embeddings and the
    query vector are L2-normalized; this class performs no normalization.

    Attributes:
        documents (list): The documents to search within.
        embeddings (numpy.ndarray): Matrix of embeddings, where row ``i`` is the
                                    embedding of ``documents[i]``.
    """

    def __init__(self, documents, embeddings):
        """
        Initializes the search engine with documents and their corresponding embeddings.

        Args:
            documents (list): List of documents.
            embeddings (array-like): Matrix of embeddings, one row per document.
                Any array-like is accepted (e.g. a plain list of vectors).

        Raises:
            ValueError: If the number of embedding rows does not match the
                number of documents.
        """
        self.documents = documents
        # np.asarray leaves an existing ndarray untouched (no copy) and converts
        # lists of vectors so that `.dot` is available in `search`.
        self.embeddings = np.asarray(embeddings)

        n_rows = self.embeddings.shape[0] if self.embeddings.ndim else 0
        if n_rows != len(self.documents):
            raise ValueError(
                "Number of embeddings ({}) does not match number of documents ({})".format(
                    n_rows, len(self.documents)
                )
            )

    def search(self, v_query, num_results=10):
        """
        Finds and returns the documents whose embeddings have the highest
        dot-product score with the query vector.

        Args:
            v_query (array-like): The query vector.
            num_results (int): The number of top-scoring documents to return.

        Returns:
            list: Up to 'num_results' documents, ordered from highest to lowest score.
        """
        # Score every document against the query in one matrix-vector product
        scores = self.embeddings.dot(v_query)

        # Negate the scores so that argsort's ascending order yields the best matches first
        idx = np.argsort(-scores)[:num_results]

        # Map the selected row indices back to their documents
        return [self.documents[i] for i in idx]


In [12]:
# Initialize the search engine with documents and their embeddings
search_engine = VectorSearchEngine(documents=documents, embeddings=X)

# Perform a search with the query vector 'v' and retrieve top 5 results
search_engine.search(v, num_results=5)


[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693'},
 {'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\nOr you c

# Q4. Hit-rate for our search engine

Let's evaluate the performance of our own search engine. We will use the hitrate metric for evaluation.

The following code chunks load a ground truth dataset for a course Q&A and then evaluate a vector-based search engine's accuracy in retrieving relevant documents. They define two key metrics—**Hit Rate** and **Mean Reciprocal Rank (MRR)**—to assess how well the search engine retrieves documents that match each query. The `numpy_cosine_search()` function encodes a query into a vector and finds the top 5 most similar documents using cosine similarity on precomputed document embeddings. Finally, the `evaluate()` function uses the ground truth data to measure Hit Rate and MRR for `numpy_cosine_search`, providing insight into the search engine's retrieval performance.

In [13]:
# Load the ground truth dataset

import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
# Keep only the queries for the same course the documents were filtered to.
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
# One dict per row; the evaluation code below reads the 'question' and 'document' keys.
ground_truth = df_ground_truth.to_dict(orient='records')

### Using the code from the module to calculate the hitrate of `VectorSearchEngine` with num_results=5.

In [18]:
# Function to calculate Hit Rate
def hit_rate(relevance_total):
    """
    Fraction of queries for which at least one retrieved document is relevant.

    Args:
        relevance_total (list[list[bool]]): One relevance list per query; each
            entry tells whether the result at that position is a relevant document.

    Returns:
        float: Number of queries with at least one relevant result divided by
        the total number of queries, or 0.0 when there are no queries.
    """
    total = len(relevance_total)

    # Nothing to evaluate: avoid a ZeroDivisionError and report no hits
    if total == 0:
        return 0.0

    # Count the number of queries where at least one relevant document is retrieved
    hits = sum(1 for line in relevance_total if any(line))

    return hits / total

In [19]:
# Function to calculate Mean Reciprocal Rank (MRR)
def mrr(relevance_total):
    """
    Mean Reciprocal Rank over a set of queries.

    For each query, the score is 1 / rank of the first relevant result
    (rank starts at 1), or 0 when no result is relevant.

    Args:
        relevance_total (list[list[bool]]): One relevance list per query; each
            entry tells whether the result at that position is a relevant document.

    Returns:
        float: Average reciprocal rank across all queries, or 0.0 when there
        are no queries.
    """
    total = len(relevance_total)

    # Nothing to evaluate: avoid a ZeroDivisionError
    if total == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        # Only the first relevant document contributes to the query's score
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                break

    return total_score / total


In [21]:
# Function to evaluate a search function's performance using Hit Rate and MRR
def evaluate(ground_truth, search_function):
    """
    Score a search function against a set of ground-truth queries.

    Args:
        ground_truth (list[dict]): Queries to run; each must carry a 'document'
            key holding the ID of the document the query is expected to retrieve.
        search_function (callable): Takes one query dict and returns a list of
            result documents, each with an 'id' key.

    Returns:
        dict: {'hit_rate': ..., 'mrr': ...} computed over all queries.
    """
    def relevance_for(query):
        # Flag each returned document by whether it is the expected one
        expected_id = query['document']
        return [hit['id'] == expected_id for hit in search_function(query)]

    relevance_total = [relevance_for(query) for query in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }


Now encode a query using `embedding_model`, then search for the top 5 most similar documents using search_engine.search() based on cosine similarity of vectors:

In [22]:
def numpy_cosine_search(q):
    """
    Search the in-memory vector engine for a ground-truth query.

    The query's 'question' text is embedded with `embedding_model` and the
    resulting vector is handed to `search_engine.search`.

    Args:
        q (dict): Query record; only its 'question' field is used.

    Returns:
        list: The 5 best-matching documents as returned by `search_engine.search`.
    """
    query_vector = embedding_model.encode(q['question'])
    return search_engine.search(query_vector, num_results=5)


In [23]:
numpy_cosine_search(ground_truth[10])

[{'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'section': 'General course-related questions',
  'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'id': '5170565b'},
 {'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'section': 'General course-related questions',
  'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp',
  'id': '39fda9f0'},
 {'text': 'We won’t re-record the course videos. The focus of the course and the skills we want to teach remained the sam

In [24]:
evaluate(ground_truth, numpy_cosine_search)

  0%|          | 0/1830 [00:00<?, ?it/s]

{'hit_rate': 0.9398907103825137, 'mrr': 0.8502823315118397}

> Ans: the hit rate of `VectorSearchEngine` with `num_results=5` is 0.93.

# Q5. Indexing with Elasticsearch

Now let's index these documents with Elasticsearch.

- Create the index with the same settings as in the module (but change the dimensions)
- Index the embeddings (note: you've already computed them)
- After indexing, let's perform the search of the same query from Q1.

In [25]:
from elasticsearch import Elasticsearch

# Initialize Elasticsearch Client:
es_client = Elasticsearch('http://localhost:9200') 

# Define Index Settings:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # Vector field filled in by the indexing cell; `dims` must equal the
            # embedding width (X.shape is (375, 768) above).
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions-homework"

# create index
# Any existing index with this name is deleted first, so re-running the cell starts
# from an empty index (ignore_unavailable=True presumably makes the delete a no-op
# when the index does not exist yet).
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions-homework'})

In [26]:
# Index Documents with Embeddings:
# NOTE: the embedding is attached to each dict in `documents` in place, so every
# holder of those dicts (e.g. `search_engine.documents`) also sees the new
# 'question_text_vector' key after this cell has run.
for d, emb in zip(tqdm(documents), embeddings):
    d['question_text_vector'] = emb
    es_client.index(index=index_name, document=d)

  0%|          | 0/375 [00:00<?, ?it/s]

In [36]:
documents[0] # let's see what `question_text_vector` looks like:

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872',
 'question_text_vector': array([ 8.80590826e-02,  1.55936303e-02,  7.92557970e-02,  2.52757017e-02,
         7.55764470e-02, -3.90596874e-02, -4.13813516e-02,  2.52917148e-02,
         2.43241936e-02,  3.62589653e-03, -7.28291832e-03, -3.28751504e-02,
         6.12956844e-02, -5.71100302e-02,  1.16774160e-02, -1.79440994e-02,
         4.49206010e-02, -5.41605838e-02, -1.92251673e-03,  1.48329176e-02,
         7.91349541e-03, -3.43126915e-0

In [27]:
# Define KNN search function

def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """
    Run a kNN search against the Elasticsearch index `index_name`.

    Args:
        field (str): Name of the dense-vector field to search on.
        vector: Query vector to compare against the values stored in `field`.
        course (str): Only documents whose 'course' field equals this value
            are considered (applied as a `term` filter on the kNN search).
        k (int): Number of nearest neighbours to return. Defaults to 5.
        num_candidates (int): Value sent as the kNN `num_candidates` option.
            Defaults to 10000. Per the Elasticsearch kNN documentation it must
            not be smaller than `k`.

    Returns:
        list[dict]: The `_source` of every hit, in the order Elasticsearch
        returned them. Only the text, section, question, course and id fields
        are requested, so the stored vector is not included.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        # Leave the large vector field out of the returned documents
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]


The next line performs a KNN search on `question_text_vector` using the query vector v, returning the top 5 similar documents within the "machine-learning-zoomcamp" course:

In [29]:
# Run search
elastic_search_knn('question_text_vector', v, 'machine-learning-zoomcamp')


[{'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'id': 'ee58a693'},
 {'question': 'I just joined. What should I do next? How can I access course materials?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the vid

> The ID of the doc with the highest score is ee58a693.

# Q6. Hit-rate for Elasticsearch

The search engine we used in Q4 computed the similarity between the query and ALL the vectors in our database. Usually this is not practical, as we may have a lot of data.

Elasticsearch uses approximate techniques to make it faster.

Let's evaluate how much worse the results are when we switch from exact search (as in Q4) to approximate search with Elasticsearch.

---------------------

The following code defines a function `question_vector_knn()` which performs a K-Nearest Neighbors (KNN) search using a query from a provided dataset. It takes the question text from the query, encodes it into a vector, and uses this vector to search for the most similar documents in a pre-existing Elasticsearch index. The search is restricted to the "machine-learning-zoomcamp" course.

The code then uses the evaluate() function to assess the search performance of question_vector_knn() on a ground truth dataset, measuring metrics like Hit Rate and Mean Reciprocal Rank (MRR).

In [33]:
def question_vector_knn(q):
    """
    Search Elasticsearch for a ground-truth query via kNN.

    The query's 'question' text is embedded with `embedding_model`, and the
    vector is passed to `elastic_search_knn` on the 'question_text_vector'
    field, restricted to the 'machine-learning-zoomcamp' course.

    Args:
        q (dict): Query record; only its 'question' field is used.

    Returns:
        list: The documents returned by `elastic_search_knn` for this query.
    """
    query_vector = embedding_model.encode(q['question'])

    return elastic_search_knn(
        'question_text_vector',
        query_vector,
        'machine-learning-zoomcamp',
    )

In [34]:
# Example usage of the question_vector_knn function
# Runs the KNN search on the 11th query in the ground truth dataset
question_vector_knn(ground_truth[10])

[{'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'id': '5170565b'},
 {'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'id': '39fda9f0'},
 {'question': 'The course videos are from the previous iteration. Will you release new ones or we’ll use the videos from 

In [32]:
# Evaluate the performance of the question_vector_knn function
# Measures Hit Rate and MRR based on the ground truth data
evaluate(ground_truth, question_vector_knn)

  0%|          | 0/1830 [00:00<?, ?it/s]

{'hit_rate': 0.9398907103825137, 'mrr': 0.8502823315118397}

> The hitrate for our dataset for elastic is 0.93, i.e. the same as in exact search in Q4