# Keyword Search (Weighted by BM25)

> Prerequisite: follow [this guide](https://www.elastic.co/guide/en/elasticsearch/reference/current/getting-started.html) to set up a self-managed Elasticsearch system.

## 1. Connect to Elasticsearch Server

In [1]:
from elasticsearch import Elasticsearch

# Create the Elasticsearch client used by every cell below. It targets the
# node at https://localhost:9200 and authenticates with an API key.
# NOTE(review): the API key is a literal committed with the notebook —
# consider loading it from an environment variable and rotating this one.
client = Elasticsearch(
  "https://localhost:9200",
  api_key="SEVzTDVvOEJpRjU2U19wVFhrUXM6a204U3lERi1UTUtRNzlQRF9NUENqdw==",  # Adjust
  ca_certs="http_ca.crt"  # presumably the CA certificate produced by the Elasticsearch setup — TODO confirm path
)
# Fetch basic cluster information; doubles as a connectivity check.
client.info()

ObjectApiResponse({'name': 'dbd6ede12f79', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'CHnpZicfTCa-YpXISknWzQ', 'version': {'number': '8.13.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'da95df118650b55a500dcc181889ac35c6d8da7c', 'build_date': '2024-05-06T22:04:45.107454559Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [23]:
# Adjust the paths below to the benchmark being evaluated.
# The context benchmark only needs `benchmark`; the row summaries (last line,
# commented out) are only needed when ingesting summaries instead of contexts.
import pandas as pd
benchmark = pd.read_csv("bx/BX1_public_bi.csv").fillna("")  # Missing cells (NaN) become "" before indexing; presumably NaN is what Elasticsearch cannot index — TODO confirm
# summaries = pd.read_csv("bc/row_summaries_chicago.csv")

## Ingest Contexts/Contents

In [24]:
def ingest_contexts(benchmark: pd.DataFrame):
    """
    Rebuild the 'benchmark' index with one document per benchmark row.

    Any existing 'benchmark' index is dropped first; then each row's `table`
    and `context` values are indexed together as a single document.

    Args:
        benchmark: Frame with at least the columns `table` and `context`.
    """
    # Per-request options replace the `ignore=` keyword, which the 8.x client
    # deprecates. 400/404 are tolerated so a missing index is not an error.
    client.options(ignore_status=[400, 404]).indices.delete(index='benchmark')
    # Walk the two columns positionally so the frame's index labels (e.g. after
    # filtering or sampling) cannot cause a failed label lookup.
    for table, context in zip(benchmark["table"], benchmark["context"]):
        client.index(
            index="benchmark",
            document={
                "table": table,
                "context": context,
            },
        )

In [25]:
def ingest_contents(summaries: pd.DataFrame):
    """
    Rebuild the 'benchmark' index with one document per summary row.

    Any existing 'benchmark' index is dropped first; then each row's `table`
    and `summary` values are indexed together as a single document.

    Args:
        summaries: Frame with at least the columns `table` and `summary`.
    """
    # Per-request options replace the `ignore=` keyword, which the 8.x client
    # deprecates. 400/404 are tolerated so a missing index is not an error.
    client.options(ignore_status=[400, 404]).indices.delete(index='benchmark')
    # Walk the two columns positionally so the frame's index labels (e.g. after
    # filtering or sampling) cannot cause a failed label lookup.
    for table, summary in zip(summaries["table"], summaries["summary"]):
        client.index(
            index="benchmark",
            document={
                "table": table,
                "summary": summary,
            },
        )

In [26]:
import numpy as np
def get_sample_summaries(summaries: pd.DataFrame, sample_percentage=1):
    """
    This is to get certain percentage of the summaries.
    """
    # Prepare to sample summaries (category refers to the tables)
    category_counts = summaries['table'].value_counts()
    sample_sizes = np.ceil(category_counts * sample_percentage).astype(int)

    # Perform stratified sampling
    sampled_summaries = summaries.copy(deep=True)
    sampled_summaries["table_copy"] = sampled_summaries["table"]
    sampled_summaries = sampled_summaries.groupby('table_copy', group_keys=False).apply(lambda x: x.sample(n=sample_sizes[x.name], random_state=42), include_groups=False)
    return sampled_summaries.reset_index(drop=True)

In [27]:
# ingest_contents(get_sample_summaries(summaries, 1))

In [28]:
# Example for the context benchmark: index every (table, context) row.
ingest_contexts(benchmark)

  client.indices.delete(index='benchmark', ignore=[400, 404])  # Ignore if already removed


## Evaluate Keyword Search

In [29]:
def get_tables_ranks(hits):
    """
    Pair each hit's table with its rank, where equal scores share a rank.

    `hits` is walked in the given order; the rank starts at 1 and grows by one
    each time a hit's `_score` is lower than the previous hit's score.
    E.g., [(table_1, 1), (table_2, 1), (table_3, 2), ...]

    Args:
        hits: Sequence of hit dicts, each with `_score` and `_source['table']`.

    Returns:
        List of (table, rank) tuples in the same order as `hits`; an empty
        list when there are no hits.
    """
    if not hits:
        # A query that matches nothing yields no hits; there is nothing to rank.
        return []

    rank = 1
    prev_score = hits[0]['_score']
    tables_ranks = []
    for hit in hits:
        score = hit['_score']
        if score < prev_score:
            rank += 1
        tables_ranks.append((hit['_source']['table'], rank))
        prev_score = score
    return tables_ranks

In [30]:
import ast

def evaluate_contexts(benchmark: pd.DataFrame, k=1):
    """
    Evaluate keyword search over the `context` field of the 'benchmark' index.

    For every benchmark row, the `question` is sent as a `match` query against
    `context`, and the top-`k` hits are compared with the row's expected tables.

    Args:
        benchmark: Frame with the columns `table` (a Python-literal string,
            parsed with `ast.literal_eval`) and `question`.
        k: Number of hits requested per question.

    Returns:
        Dict with the keys "accuracy" (share of questions with an expected
        table among the hits), "Mean Precision@1" (share of questions whose
        first matching hit has rank 1), and "MRR" (mean of 1 / position of the
        first matching hit). All three are 0.0 for an empty benchmark.
    """
    total = benchmark.shape[0]
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return {"accuracy": 0.0, "Mean Precision@1": 0.0, "MRR": 0.0}

    accuracy_sum = 0
    precision_at_1_sum = 0
    reciprocal_rank_sum = 0
    # Walk the columns positionally so the frame's index labels are irrelevant.
    for raw_tables, question in zip(benchmark["table"], benchmark["question"]):
        expected_tables = ast.literal_eval(raw_tables)
        search_query = {
            "size": k,
            "query": {
                "match": {
                    "context": question
                }
            }
        }
        result = client.search(index="benchmark", body=search_query).body
        hits = result["hits"]["hits"]
        if not hits:
            # No document matched the question: a miss on every metric.
            continue
        tables_ranks = get_tables_ranks(hits)

        # Measure the performance
        # tables_ranks is the sorted list of tables and ranks retrieved by system.
        # NOTE(review): `table` is whatever was stored in the hit's `table`
        # field; confirm it has the same form as what `expected_tables` holds.
        for position, (table, rank) in enumerate(tables_ranks, start=1):
            if table in expected_tables:
                accuracy_sum += 1
                if rank == 1:
                    precision_at_1_sum += 1
                reciprocal_rank_sum += 1 / position
                break  # Only the first matching hit counts.
    return {
        "accuracy": accuracy_sum / total,
        "Mean Precision@1": precision_at_1_sum / total,
        "MRR": reciprocal_rank_sum / total,
    }

In [31]:
def evaluate_contents(benchmark: pd.DataFrame, k=1):
    """
    Evaluate keyword search over the `summary` field of the 'benchmark' index.

    For every benchmark row, the `question` is sent as a `match` query against
    `summary`, and the top-`k` hits are compared with the row's expected table.

    Args:
        benchmark: Frame with the columns `table` (the expected table for the
            question) and `question`.
        k: Number of hits requested per question.

    Returns:
        Dict with the keys "accuracy" (share of questions whose expected table
        is among the hits), "Mean Precision@1" (share of questions whose first
        matching hit has rank 1), and "MRR" (mean of 1 / position of the first
        matching hit). All three are 0.0 for an empty benchmark.
    """
    total = benchmark.shape[0]
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return {"accuracy": 0.0, "Mean Precision@1": 0.0, "MRR": 0.0}

    accuracy_sum = 0
    precision_at_1_sum = 0
    reciprocal_rank_sum = 0
    # Walk the columns positionally so the frame's index labels are irrelevant.
    for expected_table, question in zip(benchmark["table"], benchmark["question"]):
        search_query = {
            "size": k,
            "query": {
                "match": {
                    "summary": question
                }
            }
        }
        result = client.search(index="benchmark", body=search_query).body
        hits = result["hits"]["hits"]
        if not hits:
            # No document matched the question: a miss on every metric.
            continue
        tables_ranks = get_tables_ranks(hits)

        # Measure the performance
        # tables_ranks is the sorted list of tables and ranks retrieved by system.
        for position, (table, rank) in enumerate(tables_ranks, start=1):
            if table == expected_table:
                accuracy_sum += 1
                if rank == 1:
                    precision_at_1_sum += 1
                reciprocal_rank_sum += 1 / position
                break  # Only the first matching hit counts.
    return {
        "accuracy": accuracy_sum / total,
        "Mean Precision@1": precision_at_1_sum / total,
        "MRR": reciprocal_rank_sum / total,
    }

In [33]:
# Run the context evaluation, requesting the top 10 hits per question,
# and print the resulting metrics dict.
results = evaluate_contexts(benchmark, k=10)  # Adjust k
print(results)

{'accuracy': 0.843394323584498, 'Mean Precision@1': 0.7118570811122317, 'MRR': 0.7688133314123798}
