In [2]:
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from tqdm.autonotebook import tqdm
import numpy as np
import pandas as pd
import requests

In [142]:
course_name = 'machine-learning-zoomcamp'
user_question = "I just discovered the course. Can I still join it?"

In [15]:
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'

def get_json(endpoint):
  docs_url = f'{base_url}/{endpoint}?raw=1'
  docs_response = requests.get(docs_url)
  return docs_response.json()

def get_csv(endpoint):
  docs_url = f'{base_url}/{endpoint}?raw=1'
  return pd.read_csv(docs_url)

# Question 1

In [143]:
model_name = 'multi-qa-distilbert-cos-v1'

embedding_model = SentenceTransformer(model_name)
embedding_vector = embedding_model.encode(user_question)

In [144]:
embedding_vector[0]

0.078222655

# Question 2

In [7]:
documents = get_json('03-vector-search/eval/documents-with-ids.json')

def filterCourse(q):
    return q['course'] == course_name

filtered_documents = list(filter(filterCourse, documents))
len(filtered_documents)

375

In [8]:
embeddings = []

for document in filtered_documents:
    qa_text = f"{document['question']} {document['text']}"
    embedding = embedding_model.encode(qa_text)
    embeddings.append(embedding)

In [9]:
X = np.array(embeddings)

In [10]:
X.shape

(375, 768)

# Question 3

In [11]:
scores = X.dot(embedding_vector)

In [12]:
max(scores)

0.6506573

# Question 4

In [50]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argpartition(-scores, num_results)[:num_results]
        return [self.documents[i] for i in idx]

## usage
# search_engine = VectorSearchEngine(documents=documents, embeddings=X)
# search_engine.search(embedding_vector, num_results=5)

In [61]:
df_ground_truth = get_csv('03-vector-search/eval/ground-truth-data.csv')
df_ground_truth = df_ground_truth[df_ground_truth.course == course_name]
ground_truth = df_ground_truth.to_dict(orient='records')

search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)

In [139]:
q4_results = [];

for record in ground_truth:
    query = record['question']
    v_query = embedding_model.encode(query)
    current_results = search_engine.search(v_query, num_results=5)
    q4_results.append(current_results)

In [140]:
q4_hits = 0

for idx, arr in enumerate(q4_results):
    record = ground_truth[idx]
    for result in arr:
        if record['document'] == result['id']:
            q4_hits += 1
            break

In [141]:
print(q4_hits)
print(q4_hits / len(ground_truth))

1720
0.9398907103825137


# Question 5

In [99]:
es_client = Elasticsearch('http://localhost:9200')

In [114]:
index_name = "documents"

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} ,
            "text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

In [115]:
# es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'documents'})

In [116]:
for doc in tqdm(filtered_documents):
    es_client.index(index=index_name, document=doc)

100%|██████████████████████████████████████████████████████████████████████████| 375/375 [00:09<00:00, 40.18it/s]


In [120]:
def es_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": { "multi_match": { "query": query } }
            }
        }
    }

    return es_client.search(index=index_name, body=search_query)

In [123]:
response = es_search(user_question)

In [124]:
response['hits']['hits'][0]

{'_index': 'documents',
 '_id': 'x5nKoZABHBbT7Y1hrOH1',
 '_score': 25.625399,
 '_source': {'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693'}}

# Question 6

In [136]:
q6_results = [];

for record in ground_truth:
    query = record['question']
    current_results = es_search(query)
    q6_results.append(current_results)

In [137]:
q6_hits = 0

for idx, arr in enumerate(q6_results):
    record = ground_truth[idx]
    for result in arr['hits']['hits']:
        if record['document'] == result['_source']['id']:
            q6_hits += 1
            break

In [138]:
print(q6_hits)
print(q6_hits / len(ground_truth))

1662
0.9081967213114754
