In [3]:
from sentence_transformers import SentenceTransformer
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the embedding model
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

In [5]:
# Function to encode a query using the embedding model
def get_query_vector(query):
    return embedding_model.encode(query)

In [6]:
# Encode the specific user question and print the first vector value
user_question = "I just discovered the course. Can I still join it?"
query_embedding = get_query_vector(user_question)
print("First vector value of the encoded user query:", query_embedding[0])

First vector value of the encoded user query: 0.078222655


In [7]:
# Load documents from GitHub
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [8]:
# Filter documents for the 'machine-learning-zoomcamp' course
filtered_data = [item for item in documents if item.get('course') == 'machine-learning-zoomcamp']


In [19]:
# Generate embeddings for the filtered documents
embeddings = []
for doc in filtered_data:
    qa_text = f'{doc["question"]} {doc["text"]}'
    doc["text_vector"] = embedding_model.encode(qa_text).tolist()
    embeddings.append(doc["text_vector"])

X = np.array(embeddings)
(X.shape)

(375, 768)

In [10]:
# Compute similarity scores between the query vector and the document embeddings matrix
scores = X.dot(query_embedding)
highest_score = np.max(scores)
print("The highest cosine similarity score is:", highest_score)

The highest cosine similarity score is: 0.6506573240979582


In [11]:
# Define the VectorSearchEngine class
class VectorSearchEngine:
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, query, num_results=10):
        v_query = get_query_vector(query)  # Use the get_query_vector function
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]


In [12]:
# Create the search engine instance
search_engine = VectorSearchEngine(documents=filtered_data, embeddings=X)

In [13]:
# Test with the specific user question
test_results = search_engine.search(user_question, num_results=5)
print("Test Results for User Question:")
for result in test_results:
    print(result)


Test Results for User Question:
{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.', 'section': 'General course-related questions', 'question': 'The course has already started. Can I still join it?', 'course': 'machine-learning-zoomcamp', 'id': 'ee58a693', 'text_vector': [0.0806286409497261, -0.06663887947797775, 0.02527306228876114, -0.013004408217966557, 0.07587282359600067, -0.05946270748972893, -0.02188383787870407, 0.0029000400099903345, 0.0007928351988084614, -0.0052224150858819485, -0.03365179896354675, -0.027913514524698257, 0.05811655521392822, 0.039748333394527435, 0.05441855639219284, -0.038251712918281555, 0.06305737048387527, -0.03896987810730934

In [14]:
# Load ground truth data
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'
df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [15]:
# Define hit rate calculation function
def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt += 1
    return cnt / len(relevance_total)

In [16]:
# Define MRR calculation function
def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank]:
                total_score += 1 / (rank + 1)
                break
    return total_score / len(relevance_total)

In [17]:
# Define the evaluate function
def evaluate(ground_truth, search_function):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q['question'])
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }


In [18]:
# Perform evaluation
results = evaluate(ground_truth, lambda q: search_engine.search(q, num_results=5))
print(f"Hit Rate: {results['hit_rate']:.2f}")
print(f"MRR: {results['mrr']:.2f}")

100%|██████████| 1830/1830 [01:39<00:00, 18.40it/s]

Hit Rate: 0.94
MRR: 0.85





In [33]:
# Define Elasticsearch connection
from elasticsearch import Elasticsearch, helpers

# Initialize the Elasticsearch client
es = Elasticsearch('http://localhost:9300')

# Check if Elasticsearch is running
print(es.info())

ModuleNotFoundError: No module named 'elasticsearch'