# Hybrid Search

## 0. Create Custom Retriever

> Hybrid Retriever: BM25 + Vector Retriever

In [38]:
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core import QueryBundle


class HybridRetrieverOriginal:
    """Hybrid retriever that merges BM25 and vector hits and reranks them.

    The results of both retrievers are concatenated (BM25 first),
    de-duplicated by node id, and handed to a ``SentenceTransformerRerank``
    post-processor configured with ``BAAI/bge-reranker-base``.
    """

    def __init__(self, bm25_retriever, vector_retriever):
        self.bm25_retriever = bm25_retriever
        self.vector_retriever = vector_retriever
        self.reranker = SentenceTransformerRerank(model="BAAI/bge-reranker-base")

    def retrieve(self, query, top_k=10):
        """Return the reranker's output for the merged candidates of ``query``.

        ``top_k`` is written to ``similarity_top_k`` on both retrievers and to
        ``top_n`` on the reranker before anything is retrieved.
        """
        # Apply the same cut-off to every stage.
        self.vector_retriever.similarity_top_k = top_k
        self.bm25_retriever.similarity_top_k = top_k
        self.reranker.top_n = top_k

        bm25_hits = self.bm25_retriever.retrieve(query)
        vector_hits = self.vector_retriever.retrieve(query)

        # A dict keeps insertion order, so the first hit seen for each node id
        # wins and the BM25-then-vector ordering is preserved.
        unique_hits = {}
        for hit in bm25_hits + vector_hits:
            unique_hits.setdefault(hit.node.node_id, hit)
        candidates = list(unique_hits.values())

        return self.reranker.postprocess_nodes(
            candidates,
            query_bundle=QueryBundle(query),
        )

In [39]:
class HybridRetriever:
    """Hybrid retriever that linearly fuses BM25 and vector scores.

    Each retriever's scores are min-max normalized independently, then
    combined as ``alpha * bm25 + (1 - alpha) * vector``. A node returned by
    only one retriever contributes 0.0 for the missing score.

    Args:
        bm25_retriever: Object exposing a writable ``similarity_top_k``
            attribute and a ``retrieve(query)`` method.
        vector_retriever: Object with the same interface.
        alpha: Weight given to the normalized BM25 score; the vector score
            is weighted by ``1 - alpha``.
    """

    def __init__(self, bm25_retriever, vector_retriever, alpha=0.5):
        self.bm25_retriever = bm25_retriever
        self.vector_retriever = vector_retriever
        self.alpha = alpha

    def process_nodes(self, nodes):
        """Min-max normalize ``node.score`` in place and index nodes by id.

        Args:
            nodes: Retrieved nodes, each with ``score`` and ``id_`` attributes.

        Returns:
            Dict mapping ``node.id_`` to the (mutated) node. Empty input
            yields an empty dict. When all scores are equal, every node gets
            a score of 1.
        """
        if not nodes:
            # max()/min() raise on an empty sequence; nothing to normalize.
            return {}

        scores = [node.score for node in nodes]
        max_score = max(scores)
        min_score = min(scores)
        score_range = max_score - min_score

        processed_nodes = {}
        for node in nodes:
            if score_range == 0:
                # Avoid dividing by zero when every score is identical.
                node.score = 1
            else:
                node.score = (node.score - min_score) / score_range
            processed_nodes[node.id_] = node
        return processed_nodes

    def retrieve(self, query, top_k=10):
        """Return up to ``top_k`` nodes ordered by fused score, best first.

        ``top_k`` is written to ``similarity_top_k`` on both retrievers before
        retrieving. The returned nodes have ``score`` overwritten with the
        fused score. For a node found by both retrievers the BM25 node object
        is the one returned.
        """
        # 1. Change the top_k of the retrievers
        self.vector_retriever.similarity_top_k = top_k
        self.bm25_retriever.similarity_top_k = top_k

        # 2. Use both retrievers to get top-k results + normalization
        bm25_nodes = self.process_nodes(self.bm25_retriever.retrieve(query))
        vector_nodes = self.process_nodes(self.vector_retriever.retrieve(query))

        # 3. Linearly combine the scores of each node.
        # dict.fromkeys de-duplicates while keeping first-seen order (BM25
        # first), so nodes with equal fused scores come back in a stable order
        # instead of depending on set/hash iteration order.
        all_nodes = []
        for node_id in dict.fromkeys([*bm25_nodes, *vector_nodes]):
            bm25_node = bm25_nodes.get(node_id)
            vector_node = vector_nodes.get(node_id)
            bm25_score = bm25_node.score if bm25_node is not None else 0.0
            cosine_score = vector_node.score if vector_node is not None else 0.0

            node = bm25_node if bm25_node is not None else vector_node
            node.score = self.alpha * bm25_score + (1 - self.alpha) * cosine_score
            all_nodes.append(node)

        # sorted() is stable, also with reverse=True, so ties keep that order.
        return sorted(all_nodes, key=lambda node: node.score, reverse=True)[:top_k]

## 1. Load Embedding Model and Benchmark Data

In [40]:
from llama_index.core import (
    Settings,
    VectorStoreIndex,
    Document,
)

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.retrievers.bm25 import BM25Retriever

import pandas as pd

In [41]:
# Assign the BAAI/bge-small-en-v1.5 HuggingFace embedding model to the
# llama_index Settings object.
# NOTE(review): presumably VectorStoreIndex below picks this up as its
# embedding model — confirm against the llama_index version in use.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [42]:
# Load the benchmark CSV. Later cells read its "question", "table" and
# "context" columns.
benchmark = pd.read_csv("bx/BX1_chicago.csv")
# Disabled alternative input: a summaries CSV for create_content_documents.
# summaries = pd.read_csv("bc/row_summaries_public_bi.csv")

## 2. Index Contexts/Contents

In [43]:
def create_context_documents(df):
    """Build one ``Document`` per row of ``df`` from its ``context`` column.

    Each document uses the row's ``context`` value as text, stores the row's
    ``table`` value in its metadata, and gets the id ``doc_'<table>'_<index>``
    where ``<index>`` is the row's index label.

    Returns:
        List of documents in the order of ``df.index``.
    """

    def _row_to_document(idx):
        # Look the row up by index label, table first and then the text.
        table = df["table"][idx]
        answer = df["context"][idx]
        return Document(
            text=answer,
            metadata={"table": table},
            doc_id=f"doc_'{table}'_{idx}",
        )

    return [_row_to_document(idx) for idx in df.index]

In [44]:
def create_content_documents(df):
    """Build one ``Document`` per row of ``df`` from its ``summary`` column.

    Each document uses the row's ``summary`` value as text, stores the row's
    ``table`` value in its metadata, and gets the id ``doc_'<table>'_<index>``
    where ``<index>`` is the row's index label.

    Returns:
        List of documents in the order of ``df.index``.
    """

    def _row_to_document(idx):
        # Look the row up by index label, table first and then the text.
        table = df["table"][idx]
        table_summary = df["summary"][idx]
        return Document(
            text=table_summary,
            metadata={"table": table},
            doc_id=f"doc_'{table}'_{idx}",
        )

    return [_row_to_document(idx) for idx in df.index]

In [45]:
import numpy as np

def get_sample_summaries(summaries: pd.DataFrame, sample_percentage=1):
    """Return a stratified random sample of ``summaries``, grouped by table.

    For every distinct value of the ``table`` column,
    ``ceil(group_size * sample_percentage)`` rows are drawn without
    replacement using a fixed seed (``random_state=42``), so repeated calls
    give the same sample.

    Args:
        summaries: Frame with at least a ``table`` column. It is not modified.
        sample_percentage: Fraction of each table's rows to keep, expressed
            as a value in ``[0, 1]`` (e.g. ``0.5`` keeps half), not 0-100.

    Returns:
        A new DataFrame with the same columns as ``summaries``, containing
        only the sampled rows, ordered by table, with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``sample_percentage`` is outside ``[0, 1]``.
    """
    if not 0 <= sample_percentage <= 1:
        # Fail early with a clear message instead of deep inside
        # DataFrame.sample, which also rejects these values.
        raise ValueError(
            f"sample_percentage must be within [0, 1], got {sample_percentage!r}"
        )

    # Iterate the groups directly rather than using groupby.apply: this avoids
    # the `include_groups` keyword (pandas >= 2.2 only) and the helper-column
    # workaround needed to keep the "table" column in the result.
    sampled_groups = [
        group.sample(
            n=int(np.ceil(len(group) * sample_percentage)),
            random_state=42,
        )
        for _, group in summaries.groupby("table")
    ]

    if not sampled_groups:
        # pd.concat raises on an empty list; return an empty frame that keeps
        # the input's columns.
        return summaries.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled_groups).reset_index(drop=True)

In [46]:
# Build the documents to index from the benchmark's "context" column
# (one document per benchmark row).
documents = create_context_documents(benchmark)
# Cell output: number of documents built.
len(documents)

1356

In [47]:
# documents = create_content_documents(summaries)

In [48]:
# Build a VectorStoreIndex from the documents created above; the print is
# only a completion marker for the notebook.
vector_index = VectorStoreIndex(documents)
print("Index created")

Index created


## 3. Evaluate Hybrid Search

In [49]:
def convert_retrieved_data_to_tables_ranks(retrieved_data):
    """Convert retrieved results to ``(table, rank)`` pairs.

    Ranks start at 1 and increase by one each time a result's score is
    strictly lower than the previous result's score, so results with equal
    scores share a rank. The input is expected to be ordered best-first.

    Args:
        retrieved_data: Sequence of results exposing ``get_score()`` and an
            ``id_`` of the form ``doc_'<table>'_<idx>``.

    Returns:
        List of ``(table, rank)`` tuples in input order; empty for empty input.
    """
    tables_ranks = []
    if not retrieved_data:
        # Nothing retrieved: there is no first score to seed the ranking with.
        return tables_ranks

    rank = 1
    prev_score = retrieved_data[0].get_score()
    for data in retrieved_data:
        score = data.get_score()
        if score < prev_score:
            rank += 1
        # Take everything between the first and the last apostrophe so a
        # table name that itself contains an apostrophe is not truncated.
        # E.g., "doc_'chicago_open_data/22u3-xenr'_12" -> "chicago_open_data/22u3-xenr"
        table = data.id_.split("'", 1)[1].rsplit("'", 1)[0]
        tables_ranks.append((table, rank))
        prev_score = score
    return tables_ranks

In [50]:
def evaluate(retriever, benchmark_df, top_k=1):
    """Evaluate ``retriever`` on a question -> table benchmark.

    For each row, the ``question`` is passed to ``retriever.retrieve(query,
    top_k)`` and the first result whose table equals the row's ``table`` is
    scored:

    * ``accuracy``: fraction of rows whose expected table appears anywhere in
      the results.
    * ``Mean Precision@1``: fraction of rows whose first match has the
      tie-aware rank 1 (results sharing the top score all have rank 1).
    * ``MRR``: mean of ``1 / position`` of the first match, where position is
      its 1-based place in the result list.

    Running sums are printed every 100 rows as a progress indicator.

    Args:
        retriever: Object exposing ``retrieve(query, top_k)``.
        benchmark_df: DataFrame with ``question`` and ``table`` columns; any
            index is accepted.
        top_k: Number of results requested per query.

    Returns:
        Dict with keys ``"accuracy"``, ``"Mean Precision@1"`` and ``"MRR"``.
        All three are 0.0 for an empty benchmark.
    """
    total = len(benchmark_df)
    if total == 0:
        # Avoid dividing by zero; there is nothing to evaluate.
        return {"accuracy": 0.0, "Mean Precision@1": 0.0, "MRR": 0.0}

    accuracy_sum = 0
    precision_at_1_sum = 0
    reciprocal_rank_sum = 0
    # Iterate the column values positionally so a filtered or re-indexed
    # frame (non-RangeIndex) works as well.
    rows = zip(benchmark_df["question"], benchmark_df["table"])
    for i, (query, expected_table) in enumerate(rows):
        retrieved_data = retriever.retrieve(query, top_k)
        # A query with no results simply counts as a miss.
        tables_ranks = (
            convert_retrieved_data_to_tables_ranks(retrieved_data)
            if retrieved_data
            else []
        )
        for position, (table, rank) in enumerate(tables_ranks, start=1):
            if table == expected_table:
                accuracy_sum += 1
                if rank == 1:
                    precision_at_1_sum += 1
                reciprocal_rank_sum += 1 / position
                break  # Only the first match counts.
        if i % 100 == 0:
            print(i)
            print("Accuracy:", accuracy_sum)
            print("Prec@1:", precision_at_1_sum)
            print("Reciprocal Rank:", reciprocal_rank_sum)
    return {
        "accuracy": accuracy_sum / total,
        "Mean Precision@1": precision_at_1_sum / total,
        "MRR": reciprocal_rank_sum / total,
    }

In [53]:
# Evaluate the score-fusion HybridRetriever (default alpha=0.5) on the
# benchmark, requesting 5 results per query.
vector_retriever = vector_index.as_retriever()
# NOTE(review): the vector index is passed positionally to from_defaults —
# confirm which parameter it binds to in the installed llama_index version.
BM25_retriever = BM25Retriever.from_defaults(vector_index)
hybrid_retriever = HybridRetriever(BM25_retriever, vector_retriever)
result = evaluate(hybrid_retriever, benchmark, top_k=5)
print(result)

0
Accuracy: 1
Prec@1: 1
Reciprocal Rank: 1.0
100
Accuracy: 95
Prec@1: 88
Reciprocal Rank: 83.76666666666668
200
Accuracy: 192
Prec@1: 184
Reciprocal Rank: 176.76666666666668
300
Accuracy: 286
Prec@1: 274
Reciprocal Rank: 267.75
400
Accuracy: 383
Prec@1: 365
Reciprocal Rank: 358.66666666666663
500
Accuracy: 478
Prec@1: 456
Reciprocal Rank: 448.91666666666663
600
Accuracy: 568
Prec@1: 542
Reciprocal Rank: 533.0333333333333
700
Accuracy: 661
Prec@1: 630
Reciprocal Rank: 619.2333333333335
800
Accuracy: 759
Prec@1: 727
Reciprocal Rank: 714.2333333333335
900
Accuracy: 855
Prec@1: 820
Reciprocal Rank: 806.6000000000003
1000
Accuracy: 953
Prec@1: 913
Reciprocal Rank: 899.7833333333338
1100
Accuracy: 1050
Prec@1: 1005
Reciprocal Rank: 992.7833333333339
1200
Accuracy: 1145
Prec@1: 1093
Reciprocal Rank: 1083.316666666667
1300
Accuracy: 1240
Prec@1: 1183
Reciprocal Rank: 1173.9833333333336
{'accuracy': 0.9542772861356932, 'Mean Precision@1': 0.9085545722713865, 'MRR': 0.9026671583087514}


In [54]:
# Same evaluation as the previous cell, rebuilt from scratch, but requesting
# 10 results per query.
vector_retriever = vector_index.as_retriever()
# NOTE(review): the vector index is passed positionally to from_defaults —
# confirm which parameter it binds to in the installed llama_index version.
BM25_retriever = BM25Retriever.from_defaults(vector_index)
hybrid_retriever = HybridRetriever(BM25_retriever, vector_retriever)
result = evaluate(hybrid_retriever, benchmark, top_k=10)
print(result)

0
Accuracy: 1
Prec@1: 1
Reciprocal Rank: 1.0
100
Accuracy: 99
Prec@1: 86
Reciprocal Rank: 84.09007936507938
200
Accuracy: 197
Prec@1: 180
Reciprocal Rank: 178.75674603174605
300
Accuracy: 295
Prec@1: 270
Reciprocal Rank: 270.2757936507936
400
Accuracy: 392
Prec@1: 358
Reciprocal Rank: 361.02579365079356
500
Accuracy: 489
Prec@1: 449
Reciprocal Rank: 452.56746031746025
600
Accuracy: 583
Prec@1: 534
Reciprocal Rank: 539.2297619047617
700
Accuracy: 679
Prec@1: 622
Reciprocal Rank: 627.3642857142856
800
Accuracy: 779
Prec@1: 718
Reciprocal Rank: 722.6003968253967
900
Accuracy: 877
Prec@1: 810
Reciprocal Rank: 815.7210317460317
1000
Accuracy: 975
Prec@1: 902
Reciprocal Rank: 908.8043650793652
1100
Accuracy: 1075
Prec@1: 993
Reciprocal Rank: 1002.196031746032
1200
Accuracy: 1174
Prec@1: 1081
Reciprocal Rank: 1093.2
1300
Accuracy: 1271
Prec@1: 1170
Reciprocal Rank: 1184.1583333333333
{'accuracy': 0.9778761061946902, 'Mean Precision@1': 0.8989675516224189, 'MRR': 0.9106624877089479}
