# Q1. Getting the embeddings model

In [23]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [2]:
# Name of the pre-trained sentence-transformers checkpoint used for every embedding in this notebook.
MODEL_EMB = 'multi-qa-distilbert-cos-v1'
# Instantiating the model fetches its files on first use (see the download progress output below).
embedding_model = SentenceTransformer(MODEL_EMB)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.52k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
# Sample query reused by the search cells further down.
user_question = "I just discovered the course. Can I still join it?"

# Display only the first component of the encoded question (the Q1 answer).
embedding_model.encode(user_question)[0]

0.07822262

What's the first value of the resulting vector?

- -0.24
- -0.04
- **0.07**
- 0.27

# Q2. Creating the embeddings

In [5]:
import requests 

# Download the FAQ documents (each with an id) from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents = docs_response.json()

In [6]:
# Peek at the first few records to check which fields each document carries.
documents[:4]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '1f6520ca'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware

In [10]:
# Embed every FAQ entry as one vector built from its question followed by its answer text.
X = []
for entry in tqdm(documents):
    combined = f"{entry['question']} {entry['text']}"
    X.append(embedding_model.encode(combined).tolist())

What's the shape of X? (X.shape). Include the parentheses.

In [12]:
import numpy as np
 
# Turn the list of per-document embeddings into a single 2-D array (one row per document).
X = np.array(X) 
X.shape

(948, 768)

# Q3. Search

In [44]:
# Score every document against the sample query with a dot product, then show the best score.
v = embedding_model.encode(user_question)

scores = np.dot(X, v)
scores.max()

0.6506573544114257

What's the highest score in the results?

- 65.0
- 6.5
- **0.65**
- 0.065


In [40]:
class VectorSearchEngine():
    """Exhaustive dot-product search over a matrix of document embeddings.

    ``documents`` and ``embeddings`` are stored as given; row ``i`` of
    ``embeddings`` is treated as the vector of ``documents[i]``.
    """

    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return up to ``num_results`` documents ordered by descending dot product with ``v_query``."""
        similarity = self.embeddings.dot(v_query)
        # Negating the scores makes the ascending argsort yield best-first positions.
        ranking = np.argsort(-similarity)
        top_positions = ranking[:num_results]

        matches = []
        for position in top_positions:
            matches.append(self.documents[position])
        return matches

# Build the in-memory engine over all documents and show the five best matches for the sample query.
search_engine = VectorSearchEngine(documents=documents, embeddings=X)
search_engine.search(v, num_results=5)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'Star the r

# Q4. Hit-rate for our search engine

In [41]:
import pandas as pd

# Fetch the evaluation questions and keep only those for the machine-learning-zoomcamp course.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url).loc[
    lambda frame: frame['course'] == 'machine-learning-zoomcamp'
]
ground_truth = df_ground_truth.to_dict(orient='records')

In [66]:
# For each ground-truth question, record which of the top-5 hits is the expected document.
search_engine = VectorSearchEngine(documents=documents, embeddings=X)
relevance_total = []

for record in tqdm(ground_truth):
    expected_id = record['document']
    query_vector = embedding_model.encode(record['question'])
    top_docs = search_engine.search(query_vector, num_results=5)
    relevance_total.append([found['id'] == expected_id for found in top_docs])

100%|██████████| 1830/1830 [00:45<00:00, 40.21it/s]


In [67]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: one sequence of booleans per query; an entry is
            True when the result at that rank was the expected document.

    Returns:
        The share of queries whose sequence contains True. An empty
        ``relevance_total`` yields 0.0 instead of raising ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        # No queries were evaluated, so there is nothing to average over.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

In [68]:
# Hit rate of the in-memory vector search over the relevance lists collected above.
hit_rate(relevance_total)

0.9218579234972678

Now use the code from the module to calculate the hitrate of VectorSearchEngine with num_results=5.

What did you get?

- **0.93**
- 0.73
- 0.53
- 0.33 

# Q5. Indexing with Elasticsearch

In [105]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node expected at localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

# Display the node/cluster details returned by the server.
es_client.info()

ObjectApiResponse({'name': 'c91feff7d39f', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'ynhtqtUUTH6TxZdcVOnMyw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [108]:
# The length of one encoded vector sets the dense_vector size used in the index mapping below.
v = embedding_model.encode(user_question)
dims = len(v)
dims

768

In [109]:
# Every embedding field shares the same dense_vector definition, so declare it once.
dense_vector_mapping = {
    "type": "dense_vector",
    "dims": dims,
    "index": True,
    "similarity": "cosine",
}

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # Separate copies so the three field definitions stay independent.
            "question_vector": dict(dense_vector_mapping),
            "text_vector": dict(dense_vector_mapping),
            "question_text_vector": dict(dense_vector_mapping),
        }
    }
}

index_name = "course-questions"

# Drop any previous copy of the index so the cell can be re-run from scratch.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
# Pass settings and mappings as explicit keyword arguments instead of the
# deprecated catch-all `body=` parameter.
es_client.indices.create(
    index=index_name,
    settings=index_settings["settings"],
    mappings=index_settings["mappings"],
)

  es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [100]:
# Attach three embeddings to every document: question only, answer text only, and both combined.
for entry in tqdm(documents):
    q_text = entry['question']
    a_text = entry['text']

    entry.update(
        question_vector=embedding_model.encode(q_text),
        text_vector=embedding_model.encode(a_text),
        question_text_vector=embedding_model.encode(q_text + ' ' + a_text),
    )

  0%|          | 0/948 [00:00<?, ?it/s]

In [110]:
# Send every document to the index, one request per document.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [121]:
# kNN lookup of the sample question against the question-only vectors.
field = 'question_vector'

knn = dict(
    field=field,
    query_vector=v,
    k=5,
    num_candidates=10000,
)

search_query = dict(
    knn=knn,
    _source=["text", "section", "question", "course", "id"],
)

es_results = es_client.search(index=index_name, body=search_query)

# Split the hits into parallel lists of scores and stored documents.
hits = es_results['hits']['hits']
result_score = [hit['_score'] for hit in hits]
result_docs = [hit['_source'] for hit in hits]

  es_results = es_client.search(


In [123]:
# Position of the highest-scoring hit; argsort(...)[0] would pick the lowest score instead.
idmax = np.argmax(result_score)

In [126]:
# Id of the first returned hit.
# NOTE(review): index 0 is used rather than idmax — presumably Elasticsearch returns hits best-first; confirm.
result_docs[0]['id']

'ee58a693'

# Q6. Hit-rate for Elasticsearch

In [127]:
def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a course-filtered kNN query against the Elasticsearch index.

    Args:
        field: name of the dense_vector field to search.
        vector: query embedding to compare against ``field``.
        course: value the ``course`` keyword field must match.
        k: number of nearest neighbours requested (default 5, the value
            that was previously hard-coded).
        num_candidates: candidate pool size passed to the kNN query
            (default 10000, the value that was previously hard-coded).

    Returns:
        A list with the ``_source`` dict of every returned hit, in the
        order Elasticsearch returned them.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        # Restrict the neighbours to documents of the requested course.
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        # Only return the original document fields, not the stored vectors.
        "_source": ["text", "section", "question", "course", "id"]
    }

    # NOTE(review): the recorded output shows a warning pointing at this call —
    # presumably the client's `body=` deprecation; confirm against the installed
    # client version before switching to keyword arguments.
    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [128]:
def question_vector_knn(q):
    """Search the ``question_vector`` field for one ground-truth record.

    ``q`` must provide ``'question'`` (the text to embed) and ``'course'``
    (passed on as the course filter). Returns whatever
    ``elastic_search_knn`` returns for that query.

    NOTE(review): the exact search in Q4 scores against embeddings of
    question + text and applies no course filter, whereas this queries the
    question-only field with a course filter, so the two hit rates are not
    a like-for-like comparison.
    """
    query_text, course_name = q['question'], q['course']
    query_embedding = embedding_model.encode(query_text)
    return elastic_search_knn('question_vector', query_embedding, course_name)

In [129]:
def evaluate(ground_truth, search_function):
    """Compute the hit rate of ``search_function`` over ``ground_truth``.

    Each record supplies the expected document id under ``'document'``;
    ``search_function`` is called with the whole record and must return
    result dicts that carry an ``'id'`` key.
    """
    def relevance_for(record):
        # One boolean per returned result: does it match the expected document?
        expected_id = record['document']
        return [found['id'] == expected_id for found in search_function(record)]

    return hit_rate([relevance_for(record) for record in tqdm(ground_truth)])

In [130]:
# Hit rate of the Elasticsearch kNN search on the question-only vectors.
evaluate(ground_truth, question_vector_knn)

  0%|          | 0/1830 [00:00<?, ?it/s]

  es_results = es_client.search(


0.8076502732240437

Let's evaluate how much worse the results are when we switch from exact search (as in Q4) to approximate search with Elastic.

What's the hit rate for our dataset with Elastic?

- 0.93
- **0.73**
- 0.53
- 0.33
