## Q1. Getting the embeddings model

In [6]:
import os
# Point the Hugging Face cache root (HF_HOME) at a models directory three levels up.
# NOTE(review): this is set before `sentence_transformers` is imported in the next cell,
# presumably so the library picks the location up at import time — confirm before reordering.
os.environ['HF_HOME'] = '../../../models/'

In [7]:
from sentence_transformers import SentenceTransformer

In [8]:
# Name of the pretrained sentence-embedding model used for every question and document below.
model_name = 'multi-qa-distilbert-cos-v1'
# NOTE(review): presumably fetches the weights on first use and reuses the local cache afterwards — confirm.
embedding_model = SentenceTransformer(model_name)

In [9]:
# Sample user question for Q1; its embedding `v` is the query vector reused in Q3.
user_question = "I just discovered the course. Can I still join it?"
v = embedding_model.encode(user_question)

In [10]:
print("Ans:", v[0])

Ans: 0.07822262


### Preparing the documents
We will use only a subset of the questions: those for "machine-learning-zoomcamp". After filtering, you should have exactly 375 documents.


In [11]:
import requests 

# Download the FAQ documents (with stable ids) from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents = docs_response.json()

In [12]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [13]:
# Keep only the FAQ entries that belong to the machine-learning-zoomcamp course.
documents = [
    entry
    for entry in documents
    if entry['course'] == 'machine-learning-zoomcamp'
]

In [14]:
len(documents)

375

## Q2. Creating the embeddings

In [15]:
from tqdm.auto import tqdm

In [16]:
# One embedding per document, in the same order as `documents`.
embeddings = []

for doc in tqdm(documents):
    # Embed question and answer text together as a single string.
    qa_text = f"{doc['question']} {doc['text']}"
    embeddings.append(embedding_model.encode(qa_text))

  0%|          | 0/375 [00:00<?, ?it/s]

In [17]:
import numpy as np

In [18]:
# Stack the per-document vectors into one 2-D matrix (one row per document),
# so that a single `X.dot(v)` scores every document at once.
X = np.array(embeddings)
print("Ans:", X.shape)

Ans: (375, 768)


## Q3. Search
- `v` from Q1
- `X` from Q2

In [19]:
# Sanity check: v·v evaluates to 1.0, i.e. the query embedding is unit-length.
# With unit-length vectors a plain dot product equals cosine similarity
# (assumes the document embeddings are normalized the same way — TODO confirm).
v.dot(v)

1.0

In [20]:
# One dot product per row of X: total_scores[i] is the similarity of document i to the query v.
total_scores = X.dot(v)

In [21]:
print("Ans:", total_scores.max())

Ans: 0.65065736


### Vector search
We can now compute the similarity between a query vector and all the embeddings.

Let's use this to implement our own vector search

In [37]:
class VectorSearchEngine():
    """Brute-force vector search over a fixed matrix of document embeddings.

    Parameters
    ----------
    documents : sequence
        Items to return from a search; ``documents[i]`` belongs to ``embeddings[i]``.
    embeddings : numpy.ndarray
        2-D array holding one embedding per row.
    """

    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return the best-matching documents, highest score first.

        Parameters
        ----------
        v_query : numpy.ndarray
            Query vector with the same dimensionality as the embedding rows.
        num_results : int, default 10
            Maximum number of documents to return. Values larger than the
            number of documents return every document; values <= 0 return [].

        Returns
        -------
        list
            Up to ``num_results`` documents ordered by descending dot-product score.
        """
        scores = self.embeddings.dot(v_query)

        # Clamp k: argpartition raises ValueError when kth is out of bounds.
        k = min(num_results, len(scores))
        if k <= 0:
            return []

        # argpartition selects the k highest scores in linear time, but it
        # gives no ordering guarantee inside that selection ...
        top_k = np.argpartition(-scores, k - 1)[:k]
        # ... so rank just those k candidates by descending score.
        idx = top_k[np.argsort(-scores[top_k])]
        return [self.documents[i] for i in idx]

# Build the in-memory engine over the filtered documents and their embedding matrix X.
search_engine = VectorSearchEngine(documents=documents, embeddings=X)
# Example usage: search_engine.search(v, num_results=5)

## Q4. Hit-rate for our search engine

In [23]:
# Fetch the evaluation questions and keep only the rows for the course
# whose documents were indexed above.
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = (
    pd.read_csv(ground_truth_url)
    .loc[lambda frame: frame.course == 'machine-learning-zoomcamp']
)
# One dict per ground-truth row, keyed by column name.
ground_truth = df_ground_truth.to_dict(orient='records')

In [24]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one hit.

    Parameters
    ----------
    relevance_total : sequence of sequences of bool
        One inner sequence per query; each element says whether the result at
        that rank is the expected document.

    Returns
    -------
    float
        Number of queries with at least one ``True`` divided by the number of
        queries, or ``0.0`` when there are no queries at all.
    """
    total = len(relevance_total)
    # With nothing to evaluate, report 0.0 rather than dividing by zero.
    if total == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

In [25]:
# For every ground-truth question, record for each of the top-5 results
# whether it is the document the question was generated from.
hit_results = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    # Use a loop-local name: rebinding the notebook-global `v` here would
    # silently change what the Q3 cells compute if they are re-run later.
    v_question = embedding_model.encode(q['question'])
    results = search_engine.search(v_question, num_results=5)

    hit_results.append([d['id'] == doc_id for d in results])

  0%|          | 0/1830 [00:00<?, ?it/s]

In [26]:
print("Ans:", hit_rate(hit_results))

Ans: 0.9398907103825137


## Q5. Indexing with Elasticsearch

In [27]:
len(v)

768

In [28]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch instance expected at localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

# Index definition: the FAQ fields plus one dense vector per document.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # Embedding of "question + text"; dims matches the 768-wide rows of X.
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

# Drop any existing index before creating it, so this cell can be re-run.
# NOTE(review): ignore_unavailable=True presumably suppresses the error when
# the index does not exist yet — confirm against the client documentation.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [29]:
# Attach each document's embedding (row i of X) under the field name declared in the index mapping.
# Note: this mutates the dicts in `documents` in place, and `search_engine` holds the same list.
for i, doc in enumerate(documents):
    doc["question_text_vector"] = X[i]

In [30]:
# Index the documents one request at a time, with a progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/375 [00:00<?, ?it/s]

In [31]:
# Re-encode the Q1 question here so this section does not depend on
# whatever `v` happens to hold from earlier cells.
user_question = "I just discovered the course. Can I still join it?"
v = embedding_model.encode(user_question)

In [32]:
# kNN search against the dense-vector field for the 5 nearest documents.
# NOTE(review): num_candidates (10000) is far above the 375 indexed documents,
# so presumably every document is considered — confirm against the kNN docs.
knn_query = {
    "field": "question_text_vector",
    "query_vector": v,
    "k": 5,
    "num_candidates": 10000
}

# `source` restricts the returned fields to the listed ones.
response = es_client.search(index=index_name, 
                       knn=knn_query, 
                       source=["text", "section", "question", "course",'id'])

# response["hits"]["hits"]

In [33]:
print("Ans:", response["hits"]["hits"][0]["_source"]["id"])

Ans: ee58a693


## Q6. Hit-rate for Elasticsearch

In [34]:
# Same evaluation as for the in-memory engine, but the top-5 results now
# come from the Elasticsearch kNN index.
hit_results = []

# Fields returned for every hit.
source_fields = ["text", "section", "question", "course", 'id']

for q in tqdm(ground_truth):
    doc_id = q['document']
    # Loop-local names: rebinding the notebook-globals `v`, `knn_query` and
    # `response` here would change what the Q5 answer cell prints on re-run.
    v_question = embedding_model.encode(q['question'])

    es_knn_query = {
        "field": "question_text_vector",
        "query_vector": v_question,
        "k": 5,
        "num_candidates": 10000
    }

    es_response = es_client.search(index=index_name,
                                   knn=es_knn_query,
                                   source=source_fields)
    results = [hit['_source'] for hit in es_response["hits"]["hits"]]
    hit_results.append([d['id'] == doc_id for d in results])

  0%|          | 0/1830 [00:00<?, ?it/s]

In [35]:
hit_rate(hit_results)

0.9398907103825137

Hit-rate of our own search engine:
0.9398907103825137

 **Elasticsearch achieves the same hit-rate as our own engine, but it takes more time to search.**


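The identical hit-rate is probably because the dataset is small enough that *Elasticsearch* can score every data point, so its approximate kNN search behaves like an exact search.

This is probably related to this parameter in *knn_query*: `"num_candidates": 10000`, which is larger than the number of indexed documents:

10000 > 375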

In [36]:
len(documents)

375