In [23]:
from sentence_transformers import SentenceTransformer
import numpy as np

model_name = 'multi-qa-distilbert-cos-v1'
user_question = "I just discovered the course. Can I still join it?"

embedding_model = SentenceTransformer(model_name)
embedding_vector = embedding_model.encode(user_question)

In [8]:
embedding_vector[0]

0.078222655

In [None]:
#####

In [9]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [20]:
def filterCourse(q):
    return q['course'] == 'machine-learning-zoomcamp'

filtered_documents = list(filter(filterCourse, documents))
len(filtered_documents)

375

In [26]:
embeddings = []

for document in filtered_documents:
    qa_text = f"{document['question']} {document['text']}"
    embedding = embedding_model.encode(qa_text)
    embeddings.append(embedding)

In [27]:
X = np.array(embeddings)
print(X.shape)

(375, 768)


In [None]:
#####

In [29]:
scores = X.dot(embedding_vector)

In [31]:
max(scores)

0.6506573

In [None]:
###

In [33]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=documents, embeddings=X)
search_engine.search(embedding_vector, num_results=5)

[{'text': 'You can find the latest and up-to-date deadlines here: https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml\nAlso, take note of Announcements from @Au-Tomator for any extensions or other news. Or, the form may also show the updated deadline, if Instructor(s) has updated it.',
  'section': 'General course-related questions',
  'question': 'Homework - What are homework and project deadlines?',
  'course': 'data-engineering-zoomcamp',
  'id': 'a1daf537'},
 {'text': 'After you submit your homework it will be graded based on the amount of questions in a particular homework. You can see how many points you have right on the page of the homework up top. Additionally in the leaderboard you will find the sum of all points you’ve earned - points for Homeworks, FAQs and Learning in Public. If homework is clear, others work as follows: if you submit something to FAQ, you get one point, for each learning i

In [34]:
###

In [35]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [42]:
record = ground_truth[0]
print(record)

query = record['question']
document_id = record['document']

{'question': 'Where can I sign up for the course?', 'course': 'machine-learning-zoomcamp', 'document': '0227b872'}


In [53]:
v_query = embedding_model.encode(query)

In [54]:
results = search_engine.search(v_query, num_results=5)

In [55]:
results

[{'text': "✅SOLUTION: pip install confluent-kafka[avro].\nFor some reason, Conda also doesn't include this when installing confluent-kafka via pip.\nMore sources on Anaconda and confluent-kafka issues:\nhttps://github.com/confluentinc/confluent-kafka-python/issues/590\nhttps://github.com/confluentinc/confluent-kafka-python/issues/1221\nhttps://stackoverflow.com/questions/69085157/cannot-import-producer-from-confluent-kafka",
  'section': 'Module 6: streaming with kafka',
  'question': "ModuleNotFoundError: No module named 'avro'",
  'course': 'data-engineering-zoomcamp',
  'id': '1edd4630'},
 {'text': 'GitHub Codespaces offers you computing Linux resources with many pre-installed tools (Docker, Docker Compose, Python).\nYou can also open any GitHub repository in a GitHub Codespace.',
  'section': 'General course-related questions',
  'question': 'Environment - Is GitHub codespaces an alternative to using cli/git bash to ingest the data and create a docker file?',
  'course': 'data-engi

In [56]:
for result in results:
    print('document_id: ' + document_id)
    print('result[id]: ' + result['id'])

document_id: c6a22665
result[id]: 1edd4630
document_id: c6a22665
result[id]: ac25d3af
document_id: c6a22665
result[id]: d970a0da
document_id: c6a22665
result[id]: ce508f3c
document_id: c6a22665
result[id]: 6b26d73c


In [None]:
###

In [51]:
hits = 0
for record in ground_truth:
    query = record['question']
    document_id = record['document']
    v_query = embedding_model.encode(query)
    results = search_engine.search(v_query, num_results=5)
    for result in results:
        if document_id in result['id']:
            hits += 1
            break

hitrate = hits / len(ground_truth)

In [52]:
hits

0

In [41]:
len(ground_truth)

1830