In [1]:
import json

with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [2]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [3]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 948/948 [00:06<00:00, 150.13it/s]


In [4]:
def elastic_search(query, course):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs


In [5]:
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [6]:
import pandas as pd

In [7]:
df_ground_truth = pd.read_csv("ground-truth-data.csv")


In [9]:
df_ground_truth

Unnamed: 0,question,course,document
0,"When exactly does the course begin, including ...",data-engineering-zoomcamp,c02e79ef
1,What is the purpose of this FAQ document?,data-engineering-zoomcamp,c02e79ef
2,Where can I find the course schedule?,data-engineering-zoomcamp,c02e79ef
3,How can I register for the course before it st...,data-engineering-zoomcamp,c02e79ef
4,Where can I join for course-related announceme...,data-engineering-zoomcamp,c02e79ef
...,...,...,...
4610,How do I destroy AWS infrastructure created us...,mlops-zoomcamp,886d1617
4611,What are the necessary commands to destroy inf...,mlops-zoomcamp,886d1617
4612,Who provided the solution for destroying infra...,mlops-zoomcamp,886d1617
4613,What is the first command to run to destroy AW...,mlops-zoomcamp,886d1617


In [8]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [26]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = elastic_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 4615/4615 [00:22<00:00, 202.96it/s]


- Hit-rate (recall)
- Mean Reciprocal Rank (MRR)

In [27]:
def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt = cnt + 1
    return cnt / len(relevance_total)
hit_rate(relevance_total)

0.7328277356446371

In [28]:
def mrr(relevance_total):
    score = 0
    for line in relevance_total:
        if True in line:
            score += 1 / (relevance_total.index(line)+1)
    return score / len(relevance_total)

In [29]:
mrr(relevance_total)

0.17514161525037764

In [20]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-07-14 17:54:07--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8000::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3,7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-14 17:54:07 (41,8 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [21]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.Index at 0x7fd88defb280>

In [23]:
def minsearch_search(query, course):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [24]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)


100%|██████████| 4615/4615 [00:33<00:00, 139.69it/s]


In [25]:
hit_rate(relevance_total), mrr(relevance_total)

(0.7763813651137594, 0.1946593689887839)

In [31]:
def evaluate(ground_truth, search_function):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total)
    }

In [32]:
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))

100%|██████████| 4615/4615 [00:21<00:00, 211.60it/s]


{'hit_rate': 0.7328277356446371, 'mrr': 0.17514161525037764}

In [33]:
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

100%|██████████| 4615/4615 [00:26<00:00, 174.17it/s]


{'hit_rate': 0.7763813651137594, 'mrr': 0.1946593689887839}