## Evaluation of Text Retrieval Techniques

### Importing Required Libraries

In [1]:
import json
import pandas as pd
import minsearch

from tqdm.auto import tqdm

from elasticsearch import Elasticsearch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the FAQ documents. The encoding is given explicitly so decoding does not
# depend on the platform's default locale encoding (the texts contain
# non-ASCII characters such as curly quotes).
with open("documents-with-ids.json", "rt", encoding="utf-8") as f:
    documents = json.load(f)

In [3]:
documents[:3]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '1f6520ca'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware

### Indexing these documents in Elasticsearch

In [4]:
es_client = Elasticsearch("http://localhost:9200")

In [5]:
# Index definition passed to Elasticsearch when the index is created below.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Fields queried with multi_match in elastic_search
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # "course" is used in a term filter in elastic_search;
            # "id" is compared against the ground-truth document_id
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    }
}

# Name shared by the create, index and search calls in this notebook.
index_name = "course-questions"

# Delete first, then create, so the cell starts from a fresh index each run.
# ignore_unavailable=True presumably makes the delete a no-op when the index
# does not exist yet - confirm against the client documentation.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [6]:
# Send the documents to Elasticsearch one request per document; tqdm shows progress.
# NOTE(review): no explicit `id=` is passed, so re-running this cell without
# re-creating the index first would presumably index the documents a second
# time - confirm.
for doc in tqdm(documents, desc="Indexing documents"):
    es_client.index(index=index_name, document=doc)

Indexing documents: 100%|██████████| 948/948 [00:54<00:00, 17.35it/s]


In [7]:
def elastic_search(query, course):
    """Search the Elasticsearch index for documents matching a question.

    Runs a `multi_match` query over the `question` (listed as `question^3`),
    `text` and `section` fields, combined with a `term` filter on `course`,
    and requests at most 5 hits.

    Args:
        query: Free-text question to search for.
        course: Value used in the `term` filter on the `course` field.

    Returns:
        A list containing the `_source` dict of every hit, in response order.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields"
        }
    }
    course_filter = {"term": {"course": course}}

    search_query = {
        "size": 5,
        "query": {"bool": {"must": text_match, "filter": course_filter}}
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored document of each hit, dropping score/metadata.
    return [hit['_source'] for hit in response['hits']['hits']]

In [10]:
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [8]:
# Load the ground-truth pairs (question, course, expected document_id) used
# for the evaluation below.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index saved into the CSV; index_col=0 may be what is intended - confirm.
df_ground_truth = pd.read_csv("ground_truth_dataset.csv")

# Preview the first 10 rows.
df_ground_truth.head(10)

Unnamed: 0,question,course,document_id
0,When does the course begin?,data-engineering-zoomcamp,c02e79ef
1,How can I get the course schedule?,data-engineering-zoomcamp,c02e79ef
2,What is the link for course registration?,data-engineering-zoomcamp,c02e79ef
3,How can I receive course announcements?,data-engineering-zoomcamp,c02e79ef
4,Where do I join the Slack channel?,data-engineering-zoomcamp,c02e79ef
5,Where can I find the prerequisites for this co...,data-engineering-zoomcamp,1f6520ca
6,How do I check the prerequisites for this course?,data-engineering-zoomcamp,1f6520ca
7,Where are the course prerequisites listed?,data-engineering-zoomcamp,1f6520ca
8,What are the requirements for joining this cou...,data-engineering-zoomcamp,1f6520ca
9,Where is the list of prerequisites for the cou...,data-engineering-zoomcamp,1f6520ca


In [9]:
ground_truth = df_ground_truth.to_dict(orient="records")

### Evaluating the relevance of the Elasticsearch search results

In [18]:
# One entry per ground-truth question; each entry is a list of booleans,
# one per search result returned for that question.
relevance_total = []

# Run every ground-truth question through Elasticsearch and record, for each
# returned result, whether it is the document the question was generated from.
for q in tqdm(ground_truth): 
    # ID of the document this question is expected to retrieve
    doc_id = q["document_id"]
    # Search with the question text, restricted to the question's course
    results = elastic_search(query=q["question"], course=q["course"])
    # One flag per result, in ranking order: True where the result is the
    # expected document (the list is empty when the search returns no hits)
    relevance = [d['id'] == doc_id for d in results]
    # Collect the per-question flags
    relevance_total.append(relevance)

100%|██████████| 5088/5088 [04:15<00:00, 19.93it/s]


In [None]:
# `relevance` still holds the value from the last iteration of the loop above:
# one boolean per search result returned for the final ground-truth question.
relevance

[True, False, False, False, False]

In [20]:
relevance_total

[[True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [False, False, False, False, True],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [],
 [],
 [],
 [],
 [],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, True, False, False],
 [False, True, False, False, False],
 [False, True, False, False, False],
 [True

### Evaluation metrics

1. **hit-rate (recall)**

2. **Mean Reciprocal Rank (MRR)**

In [None]:
# Subset of relevance_total for the first 14 questions.
# Each row holds the flags for one question, in ranking order.
# The trailing comment is 1 when the row contains a True (a hit), else 0.
examples = [[True, False, False, False, False], # 1 - hit at position 1
            [False, False, False, False, False], # 0
            [False, False, False, False, False], # 0
            [False, False, False, False, False],  # 0
            [False, False, False, False, False], # 0
            [True, False, False, False, False], # 1 - hit at position 1
            [True, False, False, False, False], # 1 - hit at position 1
            [True, False, False, False, False], # 1 - hit at position 1
            [True, False, False, False, False], # 1 - hit at position 1
            [True, False, False, False, False], # 1 - hit at position 1
            [False, False, True, False, False], # 1 - hit at position 3
            [False, False, False, False, False], # 0
            [False, False, False, True, False], # 1 - hit at position 4
            [False, False, False, False, True]] # 1 - hit at position 5

#### 1. Hit Rate (Recall)

In [24]:
# Hit-rate (recall): fraction of example rows that contain at least one True.
# The result gets its own name so that the `hit_rate` function defined further
# down in this notebook is not replaced by a float when this cell is re-run.
examples_hit_rate = sum(any(r) for r in examples) / len(examples)

print(f"Hit-rate (recall): {examples_hit_rate:.2%}")

Hit-rate (recall): 64.29%


In [26]:
# The same hit-rate worked out by hand: 9 of the example rows contain a True.
Total_hits = 9

# Hit-rate (recall) = rows with a hit / total rows.
# Stored under its own name so the `hit_rate` function defined further down in
# this notebook is not replaced by a float when this cell is re-run.
manual_hit_rate = Total_hits / len(examples)

print(f"Hit-rate (recall): {manual_hit_rate:.2%}")

Hit-rate (recall): 64.29%


In [17]:
def hit_rate(relevance_total: list) -> float:
    """Calculate the hit-rate (recall) from relevance results.

    Args:
        relevance_total: One list of booleans per query; each flag tells
            whether the search result at that position is the expected
            document. A query with no results contributes an empty list.

    Returns:
        The fraction of queries whose list contains at least one True.
        Returns 0.0 when `relevance_total` is empty.
    """
    # Guard the division: with no queries there is nothing to average.
    if not relevance_total:
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)

In [29]:
hitrate = hit_rate(examples)

print(f"Hit-rate (recall): {hitrate:.2%}")

Hit-rate (recall): 64.29%


#### 2. Mean Reciprocal Rank (MRR)

Similar to the hit-rate, we first identify the relevant documents for each question
and then calculate the Mean Reciprocal Rank (MRR) based on the rank of the first relevant document.

Example 

[True, False, False, False, False] -> 1 (hit rate) & MRR = 1 / 1 = 1

[False, False, False, False, False] -> 0 (hit rate) & MRR = 0

[False, True, False, False, False] -> 1 (hit rate) & MRR = 1 / 2 = 0.5

[False, False, True, False, False] -> 1 (hit rate) & MRR = 1 / 3 = 0.333

[False, False, False, False, True] -> 1 (hit rate) & MRR = 1 / 5 = 0.2

##### **Ranking Logic:**
If a relevant document is found, the reciprocal rank for that question is 1 / position (positions start at 1). If no relevant document is found, the reciprocal rank is 0. MRR is the mean of these per-question values over all questions.
- 1 -> MRR = 1 / 1 = 1
- 2 -> MRR = 1 / 2 = 0.5
- 3 -> MRR = 1 / 3 = 0.333...
- 4 -> MRR = 1 / 4 = 0.25
- 5 -> MRR = 1 / 5 = 0.2

MRR => 1 / rank 

None => 0

In [18]:
def mean_reciprocal_rank(relevance_total: list) -> float:
    """Calculate the Mean Reciprocal Rank (MRR) from relevance results.

    Args:
        relevance_total: One list of booleans per query, in ranking order;
            each flag tells whether the result at that position is the
            expected document.

    Returns:
        The average over all queries of 1 / position of the first True flag
        (positions start at 1). A query with no True flag contributes 0.
        Returns 0.0 when `relevance_total` is empty.
    """
    # Guard the division: with no queries there is nothing to average.
    if not relevance_total:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        # enumerate from 1 so `position` is the rank itself, not a 0-based index
        for position, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / position
                # Only the first relevant result counts for this query
                break

    return total_score / len(relevance_total)

In [34]:
mean_reciprocal_rank(examples)

0.4845238095238095

### Evaluation Summary for Elastic Search: 

In [37]:
hit_rate(relevance_total), mean_reciprocal_rank(relevance_total)

(0.6759040880503144, 0.5491810796645706)

### 2. MinSearch 

In [11]:
# Build a minsearch index over the same documents that were sent to
# Elasticsearch. minsearch_search later boosts "question"/"section" and
# filters on "course" through this index.
# NOTE(review): presumably text_fields are the fields matched against the
# query and keyword_fields the ones usable in filter_dict - confirm against
# the minsearch documentation.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x1bb5228bbb0>

In [None]:
def minsearch_search(query, course):
    """Search the minsearch index for documents matching a question.

    Args:
        query: Free-text question to search for.
        course: Value passed as the `course` entry of `filter_dict`.

    Returns:
        Whatever `index.search` returns for the query, requested with
        `num_results=5` and boosts of 3.0 on `question` and 0.5 on `section`.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5
    )

In [14]:
# Same evaluation loop as the Elasticsearch one, now calling minsearch_search.
# NOTE: this rebinds relevance_total, so the Elasticsearch flags collected
# earlier are no longer available under that name once this cell has run.
relevance_total = []

for q in tqdm(ground_truth):
    # ID of the document this question is expected to retrieve
    doc_id = q['document_id']
    results = minsearch_search(query=q['question'], course=q['course'])
    # One flag per returned result: True where it is the expected document
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 5088/5088 [00:15<00:00, 328.23it/s]


In [19]:
hit_rate(relevance_total), mean_reciprocal_rank(relevance_total)

(0.7012578616352201, 0.6004913522012583)

#### Results: 

1. Elastic Search: 
```
(0.6759040880503144, 0.5491810796645706)
```

2. Min Search: 
```
(0.7012578616352201, 0.6004913522012583)
```

In [22]:
# Scores copied at full precision from the evaluation outputs shown earlier
# in this notebook, as (hit_rate, mean_reciprocal_rank) per engine.
ELASTIC_HIT_RATE, ELASTIC_MRR = 0.6759040880503144, 0.5491810796645706
MINSEARCH_HIT_RATE, MINSEARCH_MRR = 0.7012578616352201, 0.6004913522012583

# Both differences are minsearch minus Elasticsearch.
difference_hit_rate = MINSEARCH_HIT_RATE - ELASTIC_HIT_RATE

# Despite its name, this is the MRR difference between the two engines.
difference_min_search = MINSEARCH_MRR - ELASTIC_MRR


print(f"Difference in hit-rate: {difference_hit_rate:.5f}")
print(f"Difference in minsearch MRR: {difference_min_search:.5f}")

Difference in hit-rate: 0.02535
Difference in minsearch MRR: 0.05131


In [23]:
def evaluate(ground_truth, search_function):
    """Evaluate a search function against the ground truth dataset.

    Args:
        ground_truth: Iterable of records with the keys 'document_id',
            'question' and 'course'.
        search_function: Callable accepting `query` and `course` keyword
            arguments and returning an iterable of dicts that have an 'id'.

    Returns:
        A dict with the keys "hit_rate" and "mean_reciprocal_rank", computed
        over the relevance flags of every ground-truth record.
    """
    def relevance_flags(record):
        # One flag per result: True where the result is the expected document.
        expected_id = record['document_id']
        hits = search_function(query=record['question'], course=record['course'])
        return [hit['id'] == expected_id for hit in hits]

    relevance_total = [relevance_flags(record) for record in tqdm(ground_truth)]

    return {
        "hit_rate": hit_rate(relevance_total),
        "mean_reciprocal_rank": mean_reciprocal_rank(relevance_total)
    }

In [27]:
evaluate(ground_truth, elastic_search)

100%|██████████| 5088/5088 [04:18<00:00, 19.65it/s]


{'hit_rate': 0.6757075471698113, 'mean_reciprocal_rank': 0.5486504192872121}

In [28]:
evaluate(ground_truth, minsearch_search)

100%|██████████| 5088/5088 [00:21<00:00, 236.84it/s]


{'hit_rate': 0.7012578616352201, 'mean_reciprocal_rank': 0.6004913522012583}