# Q1. Getting the embeddings model

In [14]:
import numpy as np
from tqdm import tqdm

In [1]:
from sentence_transformers import SentenceTransformer

# Sentence-transformers checkpoint used for every embedding computed in this notebook
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


HBox(children=(HTML(value='modules.json'), FloatProgress(value=0.0, max=349.0), HTML(value='')))




HBox(children=(HTML(value='config_sentence_transformers.json'), FloatProgress(value=0.0, max=116.0), HTML(valu…




HBox(children=(HTML(value='README.md'), FloatProgress(value=0.0, max=9522.0), HTML(value='')))




HBox(children=(HTML(value='sentence_bert_config.json'), FloatProgress(value=0.0, max=53.0), HTML(value='')))




HBox(children=(HTML(value='config.json'), FloatProgress(value=0.0, max=523.0), HTML(value='')))




HBox(children=(HTML(value='model.safetensors'), FloatProgress(value=0.0, max=265462608.0), HTML(value='')))




HBox(children=(HTML(value='tokenizer_config.json'), FloatProgress(value=0.0, max=333.0), HTML(value='')))




HBox(children=(HTML(value='vocab.txt'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))




HBox(children=(HTML(value='tokenizer.json'), FloatProgress(value=0.0, max=466247.0), HTML(value='')))




HBox(children=(HTML(value='special_tokens_map.json'), FloatProgress(value=0.0, max=112.0), HTML(value='')))




HBox(children=(HTML(value='1_Pooling/config.json'), FloatProgress(value=0.0, max=190.0), HTML(value='')))




In [27]:
# Sample question; its embedding is reused as the query vector in the search section
user_question = "I just discovered the course. Can I still join it?"
q_embedding = embedding_model.encode(user_question)

In [6]:
# First component of the user-question embedding (the Q1 answer).
# Uses `q_embedding`: the name `embedding` is only assigned later, inside the Q2 loop,
# so referencing it here fails on a fresh kernel.
q_embedding[0]

0.07822265

In [13]:
# Dimensionality of the user-question embedding.
# Uses `q_embedding`: the name `embedding` is only assigned later, inside the Q2 loop.
q_embedding.shape

(768,)

In [7]:
import requests

# Download the FAQ documents (with stable ids) from the course repository
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

# Bound the wait and fail loudly on an HTTP error instead of trying to
# JSON-decode an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

In [58]:
# Course whose FAQ entries are filtered, indexed and queried in the cells below
course = "machine-learning-zoomcamp"

In [9]:
# Keep only the FAQ entries that belong to the selected course
filtered_documents = [
    entry
    for entry in documents
    if entry['course'] == course
]

In [11]:
# Peek at one record to see which fields each document carries
filtered_documents[0]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872'}

# Q2. Creating the embeddings

In [15]:
# Text to embed for each document: the question followed by its answer text
qa_texts = [
    f'{entry["question"]} {entry["text"]}'
    for entry in filtered_documents
]

# Encode one text at a time, in document order, with a progress bar
embeddings = [embedding_model.encode(qa_text) for qa_text in tqdm(qa_texts)]

100%|██████████| 375/375 [01:14<00:00,  5.03it/s]


In [16]:
# Stack the list of per-document vectors into a single 2-D array (one row per document)
X = np.array(embeddings)
X.shape

(375, 768)

# Q3. Search

In [28]:
# Dot product of every document vector with the user-question vector.
# NOTE(review): presumably the model emits unit-length vectors, making this a
# cosine similarity — confirm against the model card.
v = q_embedding
scores = X.dot(v)

In [29]:
# Highest similarity score across all documents
max(scores)

0.6506574

In [40]:
class VectorSearchEngine:
    """Brute-force vector search over an in-memory embedding matrix.

    ``documents[i]`` is the record whose vector is row ``i`` of ``embeddings``.
    """

    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return up to ``num_results`` documents, highest dot-product score first."""
        similarity = self.embeddings.dot(v_query)
        # Negate so that the ascending argsort ranks the largest scores first
        ranking = np.argsort(-similarity)
        hits = []
        for position in ranking[:num_results]:
            hits.append(self.documents[position])
        return hits

# Index the course documents with their question+text embeddings and
# show the five best matches for the user-question vector
search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)
search_engine.search(v, num_results=5)

[{'text': 'I’ve seen LinkedIn users list DataTalksClub as Experience with titles as:\nMachine Learning Fellow\nMachine Learning Student\nMachine Learning Participant\nMachine Learning Trainee\nPlease note it is best advised that you do not list the experience as an official “job” or “internship” experience since DataTalksClub did not hire you, nor financially compensate you.\nOther ways you can incorporate the experience in the following sections:\nOrganizations\nProjects\nSkills\nFeatured\nOriginal posts\nCertifications\nCourses\nBy Annaliese Bronz\nInteresting question, I put the link of my project into my CV as showcase and make posts to show my progress.\nBy Ani Mkrtumyan',
  'section': 'Miscellaneous',
  'question': 'Any advice for adding the Machine Learning Zoomcamp experience to your LinkedIn profile?',
  'course': 'machine-learning-zoomcamp',
  'id': 'c6a22665'},
 {'text': 'When you post about what you learned from the course on your social media pages, use the tag #mlzoomcamp

# Q4. Hit-rate for our search engine

In [31]:
import pandas as pd

# Ground-truth (question, expected document id) pairs used for evaluation
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
# Filter on the shared `course` variable rather than repeating the literal, so the
# evaluation set always matches the documents selected earlier in the notebook.
df_ground_truth = df_ground_truth[df_ground_truth.course == course]
ground_truth = df_ground_truth.to_dict(orient='records')

In [45]:
# For every ground-truth question, record at which of the top-5 ranks (if any)
# the expected document appears.
relevance_total = []

for gt_row in tqdm(ground_truth):
    expected_id = gt_row['document']
    # Use a dedicated name: reusing `v` here would overwrite the user-question
    # vector that the earlier search cell depends on when it is re-run.
    question_vector = embedding_model.encode(gt_row['question'])
    top_hits = search_engine.search(question_vector, num_results=5)
    relevance_total.append([hit['id'] == expected_id for hit in top_hits])

100%|██████████| 1830/1830 [00:53<00:00, 34.20it/s]


In [46]:
def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: One sequence of booleans per query; an entry is True
            when the result at that rank is the expected document.

    Returns:
        Share of queries with at least one True flag, as a float.
        An empty ``relevance_total`` yields 0.0 instead of dividing by zero.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    hits = sum(1 for relev in relevance_total if True in relev)
    return hits / total

# Hit rate of the in-memory vector search over the ground-truth questions
hit_rate(relevance_total)

0.9398907103825137

# Q5. Indexing with Elasticsearch

In [49]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node addressed at localhost:9200
es_client = Elasticsearch('http://localhost:9200') 

In [66]:
index_name = "course-questions"

# All three vector fields share the same mapping; each field gets its own copy.
dense_vector_mapping = {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"}
vector_fields = ("text_vector", "question_vector", "question_text_vector")

properties = {
    "text": {"type": "text"},
    "section": {"type": "text"},
    "question": {"type": "text"},
    "course": {"type": "keyword"},
    "id": {"type": "keyword"},
}
for field_name in vector_fields:
    properties[field_name] = dict(dense_vector_mapping)

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": properties},
}

# Drop any previous copy of the index so the cell can be re-run from scratch
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [67]:
# Attach three embeddings to every document (in place): answer text only,
# question only, and question followed by text.
for doc in tqdm(filtered_documents):
    question, text = doc['question'], doc['text']
    vector_inputs = {
        "text_vector": text,
        "question_vector": question,
        "question_text_vector": question + ' ' + text,
    }
    for field_name, content in vector_inputs.items():
        doc[field_name] = embedding_model.encode(content)

100%|██████████| 375/375 [01:29<00:00,  4.17it/s]


In [68]:
# Send every document (including its three vectors) to the index
for doc in tqdm(filtered_documents):
    es_client.index(index=index_name, document=doc)

# Newly indexed documents only become searchable after a refresh; force one so the
# queries below see the full index even when the notebook is run top to bottom
# without pauses. The trailing semicolon suppresses the response output.
es_client.indices.refresh(index=index_name);

100%|██████████| 375/375 [00:03<00:00, 97.94it/s] 


In [97]:
def query(user_question, course, num_results=5):
    """Return the top FAQ documents for a question using Elasticsearch vector scoring.

    The question is embedded with ``embedding_model``; every document of the given
    course is scored by the sum of its cosine similarities to the three stored
    vectors (question, text, question + text) plus a constant offset.

    Args:
        user_question: Question text to embed and search for.
        course: Value matched against the ``course`` keyword field.
        num_results: Maximum number of documents to return (default 5, the
            previously hard-coded value).

    Returns:
        List of the hits' ``_source`` dicts (text, section, question, course, id),
        in the order Elasticsearch returned them.
    """
    vector_search_term = embedding_model.encode(user_question)
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": [
                    {
                        "script_score": {
                            "query": {
                                "term": {
                                    "course": course
                                }
                            },
                            # NOTE(review): the trailing "+ 1" presumably keeps the score
                            # non-negative; with three summed cosines the theoretical
                            # minimum is -3, so confirm the offset is sufficient.
                            "script": {
                                "source": """
                                    cosineSimilarity(params.query_vector, 'question_vector') + 
                                    cosineSimilarity(params.query_vector, 'text_vector') + 
                                    cosineSimilarity(params.query_vector, 'question_text_vector') + 
                                    1
                                """,
                                "params": {
                                    "query_vector": vector_search_term
                                }
                            }
                        }
                    }
                ],
                # Same course restriction as the inner term query of script_score above
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        },
        # Leave the stored vectors out of the response
        "_source": ["text", "section", "question", "course", "id"]
    }
    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    return [hit['_source'] for hit in es_results['hits']['hits']]

In [98]:
# Best Elasticsearch match for the sample user question
query(user_question, course)[0]

{'question': 'The course has already started. Can I still join it?',
 'course': 'machine-learning-zoomcamp',
 'section': 'General course-related questions',
 'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'id': 'ee58a693'}

# Q6. Hit-rate for Elasticsearch

In [100]:
elastic_relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = query(q['question'], course)
    relevance = [d['id'] == doc_id for d in results]
    elastic_relevance_total.append(relevance)

100%|██████████| 1830/1830 [01:30<00:00, 20.26it/s]


In [102]:
# Hit rate of the Elasticsearch-based search over the same ground-truth questions
hit_rate(elastic_relevance_total)

0.9218579234972678