In [1]:
from sentence_transformers import SentenceTransformer
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm


## Q1

In [2]:
user_question = "I just discovered the course. Can I still join it?"
v = embedding_model.encode(user_question)
v[0]



0.07822264

In [76]:
len(v)

768

In [3]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [4]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [5]:
filtered_documents = [d for d in documents if d['course'] == "machine-learning-zoomcamp"]

## Q2

In [6]:
from tqdm.auto import tqdm

In [7]:
embeddings = []

for d in tqdm(filtered_documents):
    qa_text = f'{d["question"]} {d["text"]}'
    embeddings.append(embedding_model.encode(qa_text))

100%|██████████| 375/375 [00:54<00:00,  6.85it/s]


In [8]:
import numpy as np
x = np.array(embeddings)
print(x.shape)

(375, 768)


## Q3

In [9]:
scores = x.dot(v)

In [10]:
max(scores)

0.6506573

In [11]:
np.dot(v,v)

0.99999994

In [12]:
X = x

## Q4

In [66]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argpartition(-scores, num_results - 1)[:num_results]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=documents, embeddings=X)
search_engine.search(v, num_results=5)

[{'text': 'You can find the latest and up-to-date deadlines here: https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml\nAlso, take note of Announcements from @Au-Tomator for any extensions or other news. Or, the form may also show the updated deadline, if Instructor(s) has updated it.',
  'section': 'General course-related questions',
  'question': 'Homework - What are homework and project deadlines?',
  'course': 'data-engineering-zoomcamp',
  'id': 'a1daf537'},
 {'text': 'After you submit your homework it will be graded based on the amount of questions in a particular homework. You can see how many points you have right on the page of the homework up top. Additionally in the leaderboard you will find the sum of all points you’ve earned - points for Homeworks, FAQs and Learning in Public. If homework is clear, others work as follows: if you submit something to FAQ, you get one point, for each learning i

In [67]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [68]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [69]:
def mrr(relevance_total):
    score = 0
    for line in relevance_total:
        if True in line:
            score += 1 / (relevance_total.index(line)+1)
    return score / len(relevance_total)

In [70]:
def evaluate(ground_truth, search_function):
    
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(embedding_model.encode(q['question']))
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total)
    }

In [71]:
vse = VectorSearchEngine(filtered_documents, x)

In [144]:
evaluate(ground_truth, lambda q: vse.search(q, num_results=5))

100%|██████████| 1830/1830 [03:12<00:00,  9.49it/s]


{'hit_rate': 0.9398907103825137, 'mrr': 0.3331114958075696}

In [74]:
len(ground_truth)

1830

## Q5

In [78]:
filtered_documents[0]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872'}

In [106]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "question_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [107]:
for doc in tqdm(filtered_documents):
    question = doc['question']
    doc['question_vector'] = embedding_model.encode(question)

100%|██████████| 375/375 [00:12<00:00, 28.89it/s]


In [108]:
for doc in tqdm(filtered_documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 375/375 [00:04<00:00, 81.28it/s] 


In [109]:
def elastic_search_knn(field, vector):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
    }

    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs


In [155]:
def question_vector_knn(question):
    v_q = embedding_model.encode(question)

    return elastic_search_knn('question_vector', v_q)


In [156]:
question_vector_knn("I just discovered the course. Can I still join it?")

[{'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp'},
 {'question': 'I just joined. What should I do next? How can I access course materials?',
  'course': 'machine-learning-zoomcamp'},
 {'question': "I don't know math. Can I take the course?",
  'course': 'machine-learning-zoomcamp'},
 {'question': 'I’m new to Slack and can’t find the course channel. Where is it?',
  'course': 'machine-learning-zoomcamp'},
 {'question': 'How much time do I need for this course?',
  'course': 'machine-learning-zoomcamp'}]

## Q6

In [164]:
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [173]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "document": {"type": "keyword"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "question_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [174]:
for doc in tqdm(ground_truth):
    question = doc['question']
    doc['question_vector'] = embedding_model.encode(question)

100%|██████████| 1830/1830 [01:22<00:00, 22.31it/s]


In [175]:
for doc in tqdm(ground_truth):
    es_client.index(index=index_name, document=doc)

100%|██████████| 1830/1830 [00:13<00:00, 131.32it/s]


In [181]:
def elastic_search_knn(field, vector):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
    }

    search_query = {
        "knn": knn,
        "_source": ["question", "course", "document"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs


In [182]:
def question_vector_knn2(q):
    question = q['question']
    v_q = embedding_model.encode(question)

    return elastic_search_knn('question_vector', v_q)


In [183]:
def evaluate2(ground_truth, search_function):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['document'] == doc_id for d in results]
        relevance_total.append(relevance)
    
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total)
    }

In [184]:
question_vector_knn2(ground_truth[0])

[{'question': 'Where can I sign up for the course?',
  'document': '0227b872',
  'course': 'machine-learning-zoomcamp'},
 {'question': 'How can I earn a certificate in this course?',
  'document': '2eba08e3',
  'course': 'machine-learning-zoomcamp'},
 {'question': 'Does this course have a GitHub repository for the sign-up link?',
  'document': '0227b872',
  'course': 'machine-learning-zoomcamp'},
 {'question': 'What is the initial step after joining the course?',
  'document': '0a278fb2',
  'course': 'machine-learning-zoomcamp'},
 {'question': 'How can I view the content of the course?',
  'document': '0a278fb2',
  'course': 'machine-learning-zoomcamp'}]

In [185]:
evaluate2(ground_truth, lambda q: question_vector_knn2(q))

100%|██████████| 1830/1830 [01:52<00:00, 16.31it/s]


{'hit_rate': 0.9972677595628415, 'mrr': 0.18233066097251494}