# Homework: Vector Search


### Imports

In [30]:
from sentence_transformers import SentenceTransformer
import requests 
import numpy as np
from tqdm.notebook import trange, tqdm
import pandas as pd
from elasticsearch import Elasticsearch

## Q1. Getting the embeddings model

First we obtain the model that will be used to generate the embeddings.

In [23]:
# Name of the pretrained sentence-transformers checkpoint used for the embeddings.
model_name = 'multi-qa-distilbert-cos-v1'
# Load the model once; this instance encodes both the documents and the queries below.
embedding_model = SentenceTransformer(model_name)

Then, a user question is defined and its embedding is computed.

In [3]:
user_question = "I just discovered the course. Can I still join it?"
# Encode the question into its embedding vector.
user_question_emb = embedding_model.encode(user_question)
# Show the first component of the embedding (the value reported as the Q1 answer).
print(user_question_emb[0])

0.078222655


**Answer**: 0.07

## Q2. Creating the embeddings

First, we will prepare the documents list (with ids already included).

In [4]:
# Location of the FAQ documents (ids already included) in the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail right here with a clear HTTP error instead of a confusing JSON decode
# error on the next line when the server answers with an error page.
docs_response.raise_for_status()
documents = docs_response.json()

In [24]:
# Inspect the first record to see which fields each document carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

We will only need the subset of questions regarding *machine-learning-zoomcamp*. Let's filter.

In [25]:
# Restrict the corpus to the FAQ entries of a single course.
target_course = 'machine-learning-zoomcamp'
documents_filtered = [doc for doc in documents if doc['course'] == target_course]
print(len(documents_filtered))

375


Now it is time to generate one embedding per document, computed from the concatenation of its question and answer (`text`) fields.

In [26]:
# One embedding per filtered document, collected in document order.
embeddings = []

for doc in tqdm(documents_filtered):
    # Question and answer are embedded together as a single string.
    qa_text = f"{doc['question']} {doc['text']}"
    # Keep the vector on the document too, so it travels with the document
    # when it is indexed in Elasticsearch later on.
    doc['question_text_vector'] = question_text_vector = embedding_model.encode(qa_text)
    embeddings.append(question_text_vector)

# Stack the per-document vectors into a matrix with one row per document.
X = np.array(embeddings)

  0%|          | 0/375 [00:00<?, ?it/s]

In [27]:
# Rows are documents, columns are embedding components.
print(X.shape)

(375, 768)


**Answer**: (375, 768)

## Q3. Search

Let's calculate the similarity between the user question from Q1 and the documents from Q2, using the calculated embeddings and the dot product.

In [9]:
# One similarity score per document: dot product of each row of X with the query embedding.
# NOTE(review): this equals cosine similarity only if the embeddings are unit-length;
# the "-cos-" in the model name suggests they are, but confirm against the model card.
scores = X.dot(user_question_emb)

In [10]:
# Highest similarity among all documents. ndarray.max() reduces inside NumPy
# instead of iterating the array element by element like the builtin max().
scores.max()

0.6506573

**Answer**: 0.65

## Q4. Hit-rate for our search engine

First we need to define the class that performs the vector search.

In [11]:
class VectorSearchEngine():
    """Brute-force vector search over an in-memory embedding matrix.

    `documents` is an indexable collection and `embeddings` an array exposing
    `.dot`; row `i` of `embeddings` is taken to be the vector of `documents[i]`.
    """

    def __init__(self, documents, embeddings):
        """Store the documents and their embedding matrix without copying them."""
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return up to `num_results` documents ranked by dot product with `v_query`.

        The result is ordered from the highest score to the lowest.
        """
        similarity = self.embeddings.dot(v_query)
        # Sorting the negated scores ascending yields a descending ranking.
        ranking = np.argsort(-similarity)
        best_positions = ranking[:num_results]

        matches = []
        for position in best_positions:
            matches.append(self.documents[position])
        return matches

Use the class defined above to perform a vector search.

In [12]:
# Build the in-memory engine from the filtered documents and their embedding matrix.
search_engine = VectorSearchEngine(documents_filtered, X)
# Five best matches for the user question from Q1.
search_engine.search(v_query=user_question_emb, num_results=5)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693'},
 {'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\nOr you c

It can be seen that the retrieved documents are related to the user question from Q1. In order to evaluate the performance of the search engine (using the hit-rate metric in this case), we first need to download the ground truth dataset.

In [13]:
# Location of the ground-truth CSV in the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# Load the full table, then keep only the rows of the course under evaluation.
df_ground_truth = pd.read_csv(ground_truth_url)
is_ml_zoomcamp = df_ground_truth['course'] == 'machine-learning-zoomcamp'
df_ground_truth = df_ground_truth.loc[is_ml_zoomcamp]

# One dict per row, which is convenient for the evaluation loop.
ground_truth = df_ground_truth.to_dict(orient='records')

In [14]:
# Inspect the first ground-truth record.
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

Each element in the list corresponds to a question (generated using an LLM) and the id of the main document answering that question. When answering these questions, our search engine should include the related document in the first K retrieved results. This is what we are going to measure with the hit-rate metric.

In [15]:
# How many documents are retrieved for each question
num_results = 5

# Size of the evaluation set
q_total = len(ground_truth)

# Number of questions whose expected document was retrieved
hit_count = 0

# search_engine was built from the filtered documents and their embeddings,
# so it can be queried directly.
for q in tqdm(ground_truth):
    # Embed the ground-truth question and fetch its top results
    q_emb = embedding_model.encode(q['question'])
    results = search_engine.search(q_emb, num_results=num_results)
    # A question scores at most one hit, however many results match
    retrieved_ids = [r['id'] for r in results]
    if q['document'] in retrieved_ids:
        hit_count += 1

  0%|          | 0/1830 [00:00<?, ?it/s]

In [16]:
# Fraction of ground-truth questions whose expected document was among the retrieved results.
hit_rate = hit_count / q_total
print(hit_rate)

0.9398907103825137


**Answer**: 0.93

## Q5. Indexing with Elasticsearch

First, let's run Elasticsearch and create the index as in the videos (but changing the dimension of the vector to 768).

In [31]:
# Connect with elasticsearch
es_client = Elasticsearch("http://localhost:9200")
es_client.info()

ObjectApiResponse({'name': 'e2cd10f17273', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'fa3BioAuRB6AKxi3t1PgiA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [32]:
# Mapping of the embedding field: an indexed 768-dimensional dense vector
# compared with cosine similarity.
vector_mapping = {
    "type": "dense_vector",
    "dims": 768,
    "index": True,
    "similarity": "cosine",
}

# Index definition: a single shard, no replicas, plus the document fields.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_text_vector": vector_mapping,
        }
    },
}

index_name = "course-questions"

# Drop any existing index of that name first, so the cell can be re-run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

Now that the index has been created, the documents can be indexed.

In [33]:
# Index the documents one request at a time. Each document already carries its
# 'question_text_vector', attached when the embeddings were generated.
# NOTE(review): no explicit id is passed, so re-running this cell without
# recreating the index presumably adds duplicate entries — confirm.
for doc in tqdm(documents_filtered):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/375 [00:00<?, ?it/s]

Let's create a function that allows us to search the indexed documents for any question.

In [55]:
def elastic_search_knn(query_vector, k=5, num_candidates=10000):
    """Run a kNN search on the 'question_text_vector' field of the index.

    Uses the module-level `es_client` and `index_name`.

    Args:
        query_vector: Embedding of the query, sent as the kNN `query_vector`.
        k: Number of nearest neighbours requested (default 5).
        num_candidates: Value sent as the kNN `num_candidates` option
            (default 10000).

    Returns:
        A list with one dict per hit, in the order Elasticsearch returned
        them. Each dict holds the 'text', 'section', 'question', 'course'
        and 'id' fields of the document plus the hit's 'score'.
    """
    # Single source of truth for the fields requested and copied to the result.
    source_fields = ["text", "section", "question", "course", "id"]

    search_query = {
        "knn": {
            "field": "question_text_vector",
            "query_vector": query_vector,
            "k": k,
            "num_candidates": num_candidates,
        },
        # The stored vector itself is left out of the response.
        "_source": source_fields,
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    result_docs = []

    for hit in es_results['hits']['hits']:
        result_dict = {field: hit['_source'][field] for field in source_fields}
        result_dict['score'] = hit['_score']
        result_docs.append(result_dict)

    return result_docs

Finally, we can use the previous function to run the search for the user question from Q1.

In [56]:
# Run the kNN search with the embedding of the user question from Q1
results = elastic_search_knn(user_question_emb)

# Display the retrieved documents together with their scores
results

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693',
  'score': 0.82532895},
 {'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first laun

In [58]:
# Report the score and document id of the first hit.
# NOTE(review): relies on Elasticsearch returning hits sorted by descending score — confirm.
print(f"The max score belongs to the first result. Score: {results[0]['score']}, Document ID: {results[0]['id']}")

The max score belongs to the first result. Score: 0.82532895, Document ID: ee58a693


**Answer**: ee58a693

## Q6. Hit-rate for Elasticsearch
