In [1]:
import json

# JSON text is UTF-8 by specification; pin the encoding so the load does not
# depend on the platform's default locale encoding (e.g. cp1252 on Windows).
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [3]:
from elasticsearch import Elasticsearch

In [4]:
# Client for the Elasticsearch node at http://localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

# Index definition: one shard, no replicas.
# `text`, `section` and `question` are mapped as full-text (`text`) fields;
# `course` and `id` are mapped as `keyword` fields (`course` is the field the
# `term` filter in `elastic_search` matches on).
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    }
}

index_name = "course-questions"

# Delete the index before creating it so this cell can be re-run;
# `ignore_unavailable=True` is passed so a missing index is not treated as an error.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [5]:
from tqdm.auto import tqdm

# Index the documents one request per document; tqdm displays the progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [6]:
def elastic_search(query, course, num_results=5):
    """Search the Elasticsearch index for documents answering a question.

    Builds a `bool` query whose `must` clause is a `multi_match` of type
    `best_fields` over the `question`, `text` and `section` fields
    (`question` boosted 3x), and whose `filter` clause is a `term` match on
    `course`. The request is sent through the module-level `es_client`
    against the module-level `index_name`.

    Args:
        query: Free-text question to search for.
        course: Course identifier; only documents whose `course` field equals
            this value are returned.
        num_results: Maximum number of hits to request. Defaults to 5, the
            value that used to be hard-coded, so existing callers are unaffected.

    Returns:
        list[dict]: The `_source` document of every hit, in the order the
        hits appear in the Elasticsearch response.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents; scores and other hit metadata are dropped.
    return [hit['_source'] for hit in response['hits']['hits']]

In [7]:
# Smoke test: run one query against a single course and inspect the returned documents.
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [8]:
import pandas as pd

In [9]:
# Ground-truth set; the columns read later in this notebook are
# `question`, `course` and `document`.
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [10]:
# One dict per CSV row, so each entry can be accessed by column name.
ground_truth = df_ground_truth.to_dict(orient='records')

In [11]:
# Evaluate Elasticsearch retrieval against the ground truth.
# Each element of relevance_results is a list of booleans for one query:
# one flag per retrieved document, in result order, True where the
# document's id equals the expected document id.
relevance_results = []

for query_data in tqdm(ground_truth):
    # ID of the document this ground-truth question is expected to retrieve
    expected_doc_id = query_data['document']
    # Run the Elasticsearch query, restricted to the question's course
    retrieved_docs = elastic_search(query=query_data['question'], course=query_data['course'])
    # Flag each retrieved document: does its id match the expected one?
    relevance_flags = [doc['id'] == expected_doc_id for doc in retrieved_docs]
    # Collect the per-query flags
    relevance_results.append(relevance_flags)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [13]:
def hit_rate(relevance_results):
    """Return the fraction of queries whose results contain a relevant document.

    `relevance_results` holds one sequence of booleans per query. A query
    counts as a hit when any of its flags is truthy. An empty
    `relevance_results` yields 0 rather than dividing by zero.
    """
    if not relevance_results:
        return 0

    queries_with_hit = 0
    for flags in relevance_results:
        if any(flags):
            queries_with_hit += 1

    return queries_with_hit / len(relevance_results)


In [14]:
def mrr(relevance_results):
    """Return the Mean Reciprocal Rank over all queries.

    `relevance_results` holds one sequence of booleans per query, in ranked
    order. Each query contributes 1 / rank of its first truthy flag (ranks
    start at 1), or nothing when it has no truthy flag. The total is divided
    by the number of queries. An empty `relevance_results` yields 0.0.
    """
    if not relevance_results:
        return 0.0

    reciprocal_rank_sum = 0.0
    for flags in relevance_results:
        for position, flag in enumerate(flags, start=1):
            if flag:
                # Only the first relevant document counts for this query.
                reciprocal_rank_sum += 1 / position
                break

    return reciprocal_rank_sum / len(relevance_results)

In [15]:
# Hand-made relevance lists for 12 queries, used to sanity-check hit_rate and mrr.
# The trailing comment on each row is that query's reciprocal rank.
example = [
    [True, False, False, False, False], # 1
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [False, False, True, False, False],  # 1/3
    [False, False, False, False, False], # 0
]

# MRR uses the reciprocal rank of the first True in each row:
#   first True at position 1 => 1
#   position 2 => 1 / 2 = 0.5
#   position 3 => 1 / 3 = 0.3333
#   position 4 => 1 / 4 = 0.25
#   position 5 => 1 / 5 = 0.2
#   in general: rank => 1 / rank
#   no True in the row => 0

In [16]:
# 7 of the 12 example rows contain a True, so the expected value is 7 / 12.
hit_rate(example)

0.5833333333333334

In [17]:
# Six rows hit at rank 1 and one at rank 3, so the expected value is (6 * 1 + 1/3) / 12.
mrr(example)

0.5277777777777778

In [18]:
# Hit rate and MRR for the Elasticsearch results collected in the loop above.
hit_rate(relevance_results), mrr(relevance_results)

(0.7395720769397017, 0.6024385851163465)

In [19]:

import minsearch

# minsearch index over the same documents, using the same field split as the
# Elasticsearch mapping: question/text/section as text fields, course/id as
# keyword fields.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.Index at 0x15495f3efa0>

In [20]:

def minsearch_search(query, course, num_results=5):
    """Search the minsearch index for documents answering a question.

    Queries the module-level `index`, filtering on `course` and boosting the
    `question` field by 3.0 and the `section` field by 0.5.

    Args:
        query: Free-text question to search for.
        course: Course identifier passed as the `course` filter value.
        num_results: Maximum number of results to request. Defaults to 5, the
            value that used to be hard-coded, so existing callers are unaffected.

    Returns:
        Whatever `index.search` returns; the rest of this notebook iterates
        over it and reads each item's `id` key.
    """
    # `text` has no explicit entry, so it gets minsearch's default weight
    # (presumably 1.0 -- TODO confirm against the minsearch version in use).
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [21]:
# Same evaluation loop as for Elasticsearch, now using minsearch.
# NOTE: this rebinds `relevance_results`, replacing the Elasticsearch flags.
relevance_results = []

for q in tqdm(ground_truth):
    # ID of the document this ground-truth question is expected to retrieve
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    # One flag per returned document, in result order
    relevance = [d['id'] == doc_id for d in results]
    relevance_results.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [22]:
# Hit rate and MRR for the minsearch results collected in the loop above.
hit_rate(relevance_results), mrr(relevance_results)

(0.7722066133563864, 0.6609862401844251)

In [23]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth set.

    For each ground-truth entry, `search_function` is called with the entry
    itself, and every returned document's `id` is compared with the entry's
    `document` value to build that query's relevance flags.

    Returns:
        dict with keys `hit_rate` and `mrr`, computed over all queries.
    """
    def relevance_for(entry):
        # Read the expected id first, then run the search for this entry.
        expected_id = entry['document']
        return [found['id'] == expected_id for found in search_function(entry)]

    per_query_flags = [relevance_for(entry) for entry in tqdm(ground_truth)]

    return dict(
        hit_rate=hit_rate(per_query_flags),
        mrr=mrr(per_query_flags),
    )

In [24]:
# Elasticsearch through the generic evaluate(); the lambda unpacks each
# ground-truth entry into the (query, course) arguments elastic_search expects.
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.7395720769397017, 'mrr': 0.6024385851163465}

In [25]:
# minsearch through the generic evaluate(); the lambda unpacks each
# ground-truth entry into the (query, course) arguments minsearch_search expects.
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.7722066133563864, 'mrr': 0.6609862401844251}