# Hybrid Search + LLM

> An LLM acts as a relevance filter over the rankings returned by a hybrid (BM25 + vector) retriever.

In [None]:
# Uncomment to choose GPU if needed
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# import setproctitle
# setproctitle.setproctitle("python")

In [None]:
from llama_index.core import (
    Settings,
    VectorStoreIndex,
    Document,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# These packages are available in the benchmark_generator/context directory
from utils.pipeline_initializer import initialize_pipeline
from utils.prompting_interface import prompt_pipeline

import torch
import warnings
import pandas as pd

## 1. Initialize Models & Index Documents

In [None]:
# Embedding model assigned to LlamaIndex's global Settings.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Pipeline used as the relevance judge in filter_retrieved_data. torch.bfloat16 is
# passed to the project helper (presumably the load dtype; confirm in utils.pipeline_initializer).
pipe = initialize_pipeline("mistralai/Mistral-7B-Instruct-v0.3", torch.bfloat16)

In [None]:
# Benchmark CSV. The code in this file reads its `question`, `table` and `context`
# columns, plus `relevant_tables` when that column is present.
benchmark = pd.read_csv("BX1_chicago_corrected.csv")  # Adjust name
# Optional summaries file; only needed if create_content_documents is used instead.
# summaries = pd.read_csv("summaries_public_bi.csv")

In [None]:
def create_content_documents(df):
    """Build one Document per row of ``df`` from its ``summary`` column.

    Each Document carries the row's ``table`` value in its metadata and gets
    the id ``doc_'<table>'_<row label>``.
    """
    docs = []
    for row_label in df.index:
        table_name = df["table"][row_label]
        docs.append(
            Document(
                text=df["summary"][row_label],
                metadata={"table": table_name},
                doc_id=f"doc_'{table_name}'_{row_label}",
            )
        )
    return docs

In [None]:
def create_context_documents(df):
    """Build one Document per row of ``df`` from its ``context`` column.

    Each Document carries the row's ``table`` value in its metadata and gets
    the id ``doc_'<table>'_<row label>``.
    """

    def build(row_label):
        # Read the table first, then the context, for the given index label.
        table_name = df["table"][row_label]
        context_text = df["context"][row_label]
        return Document(
            text=context_text,
            metadata={"table": table_name},
            doc_id=f"doc_'{table_name}'_{row_label}",
        )

    return [build(row_label) for row_label in df.index]

In [None]:
# One Document per benchmark row, with the row's `context` value as its text.
documents = create_context_documents(benchmark)

In [None]:
# Build the vector index from the documents created above.
vector_index = VectorStoreIndex(documents)
print("Index created")

## 2. Evaluation

In [None]:
class HybridRetriever:
    """Fuse a BM25 retriever and a vector retriever by linear score combination.

    Each retriever's scores are min-max normalized independently and mixed as
    ``alpha * bm25 + (1 - alpha) * vector``. A node returned by only one of
    the retrievers contributes 0.0 for the other one.
    """

    def __init__(self, bm25_retriever, vector_retriever, alpha=0.5):
        """Store both retrievers and the BM25 weight ``alpha``."""
        self.bm25_retriever = bm25_retriever
        self.vector_retriever = vector_retriever
        self.alpha = alpha

    def process_nodes(self, nodes):
        """Min-max normalize ``node.score`` in place and index nodes by id.

        Args:
            nodes: sequence of objects exposing ``score`` and ``id_``.

        Returns:
            Dict mapping ``node.id_`` to the (mutated) node, in input order.
            An empty input yields an empty dict. When every score is equal,
            each node gets the score 1.
        """
        if not nodes:
            # A retriever may return nothing; max()/min() would raise on [].
            return {}

        scores: list[float] = [node.score for node in nodes]
        max_score = max(scores)
        min_score = min(scores)

        processed_nodes = {}
        for node in nodes:
            if min_score == max_score:
                node.score = 1
            else:
                node.score = (node.score - min_score) / (max_score - min_score)
            processed_nodes[node.id_] = node
        return processed_nodes

    def retrieve(self, query, top_k=10):
        """Return the ``top_k`` nodes with the highest combined score.

        Sets ``similarity_top_k`` on both retrievers, queries them, normalizes
        each result list and combines the scores linearly. Ties keep a
        deterministic order: BM25 results first, then vector-only results,
        each in the order their retriever returned them.
        """
        # 1. Change the top_k of the retrievers
        self.vector_retriever.similarity_top_k = top_k
        self.bm25_retriever.similarity_top_k = top_k

        # 2. Use both retrievers to get top-k results + normalization
        bm25_nodes = self.process_nodes(self.bm25_retriever.retrieve(query))
        vector_nodes = self.process_nodes(self.vector_retriever.retrieve(query))

        # 3. Linearly combine the scores of each node.
        # dict.fromkeys de-duplicates while preserving insertion order, unlike
        # a set, whose iteration order for string ids varies between runs.
        all_nodes = []
        for node_id in dict.fromkeys([*bm25_nodes, *vector_nodes]):
            bm25_node = bm25_nodes.get(node_id)
            vector_node = vector_nodes.get(node_id)
            bm25_score = bm25_node.score if bm25_node is not None else 0.0
            cosine_score = vector_node.score if vector_node is not None else 0.0

            node = bm25_node if bm25_node is not None else vector_node
            node.score = self.alpha * bm25_score + (1 - self.alpha) * cosine_score
            all_nodes.append(node)

        # list.sort is stable (also with reverse=True), so ties keep the order above.
        all_nodes.sort(key=lambda node: node.score, reverse=True)
        return all_nodes[:top_k]

In [None]:
def get_relevancy_prompt(metadata: str, query: str):
    """Return the yes/no relevance prompt for one metadata text and one query."""
    prompt_lines = (
        f'Metadata M:"{metadata}"',
        f'Query Q: "{query}"',
        "Metadata M is associated with a dataset that we can access. "
        "Is this dataset relevant to query Q? Begin your argument with yes/no.",
    )
    return "\n".join(prompt_lines)

def filter_retrieved_data(retrieved_data, query):
    """Reorder retrieved items so that those the LLM judges relevant come first.

    For every item, the model is prompted with the item's ``text`` and the
    query; a reply starting with "yes" or "**yes**" (case-insensitive, after
    stripping whitespace) counts as relevant. No item is dropped, and the
    original order is kept within the relevant and the non-relevant group.
    """
    accepted = []
    rejected = []
    for item in retrieved_data:
        chat = [{"role": "user", "content": get_relevancy_prompt(item.text, query)}]
        reply = prompt_pipeline(pipe, chat, max_new_tokens=4, context_length=32768)
        # The verdict is read from the content of the last returned message.
        verdict = reply[-1]["content"].lower().strip()
        if verdict.startswith(("yes", "**yes**")):
            accepted.append(item)
        else:
            rejected.append(item)
    return accepted + rejected

In [None]:
def convert_retrieved_data_to_tables_ranks(retrieved_data, query):
    """Convert retrieved items to ``(table, rank)`` pairs after LLM filtering.

    The items are first reordered by ``filter_retrieved_data``. Ranks start at
    1; consecutive items with the same score share a rank, and the rank
    advances whenever the score differs from the previous item's score.

    The comparison is ``!=`` rather than ``<`` because the LLM filter can move
    a lower-scored item ahead of a higher-scored one. With ``<`` the
    higher-scored item placed behind it would wrongly keep the same rank.

    Returns:
        List of ``(table, rank)`` tuples in filtered order, where ``table`` is
        the text between the first pair of single quotes in the item's id.
    """
    filtered_data = filter_retrieved_data(retrieved_data, query)  # Filter with LLM
    rank = 1
    prev_score = None  # None marks "no previous item yet"
    tables_ranks = []
    for data in filtered_data:
        score = data.get_score()
        if prev_score is not None and score != prev_score:
            rank += 1
        table = data.id_.split("'")[1]  # E.g., "chicago_open_data/22u3-xenr"
        tables_ranks.append((table, rank))
        prev_score = score
    return tables_ranks

In [None]:
import ast

def evaluate(retriever, benchmark_df, top_k=1):
    """Evaluate a retriever followed by the LLM filter on the benchmark.

    For each row, the ``question`` is sent to ``retriever.retrieve(query,
    top_k)`` and the result is converted to ``(table, rank)`` pairs. The
    expected tables come from the ``relevant_tables`` column parsed as a
    Python literal; if that column is missing or the value cannot be parsed,
    the row's ``table`` value is used as the single expected table.

    Args:
        retriever: object with a ``retrieve(query, top_k)`` method.
        benchmark_df: DataFrame with ``question`` and ``table`` columns and
            optionally ``relevant_tables``. Rows are read by position, so any
            index is accepted.
        top_k: number of results requested from the retriever.

    Returns:
        Dict with ``accuracy`` (share of queries with an expected table among
        the results), ``Mean Precision@1`` (share whose first matching table
        has rank 1) and ``MRR`` (mean of 1 / position of the first matching
        table). All three are 0.0 for an empty benchmark.
    """
    num_queries = len(benchmark_df)
    if num_queries == 0:
        # Nothing to evaluate: report zeros instead of dividing by zero.
        return {"accuracy": 0.0, "Mean Precision@1": 0.0, "MRR": 0.0}

    accuracy_sum = 0
    precision_at_1_sum = 0
    reciprocal_rank_sum = 0
    for i in range(num_queries):
        # i is a position, so use .iloc; label lookup fails on a non-default index.
        query = benchmark_df["question"].iloc[i]
        try:
            expected_tables = ast.literal_eval(benchmark_df["relevant_tables"].iloc[i])
        except (KeyError, ValueError, SyntaxError, TypeError):
            # Column absent, or value is not a parsable literal (e.g. NaN).
            expected_tables = [benchmark_df["table"].iloc[i]]
        retrieved_data = retriever.retrieve(query, top_k)  # Using hybrid retriever
        tables_ranks = convert_retrieved_data_to_tables_ranks(retrieved_data, query)

        # Only the first matching table counts towards the metrics.
        for position, (table, rank) in enumerate(tables_ranks, start=1):
            if table in expected_tables:
                accuracy_sum += 1
                if rank == 1:
                    precision_at_1_sum += 1
                reciprocal_rank_sum += 1 / position
                break
        else:
            print(f"Wrong answer at index {i}")

        if i % 25 == 0:  # Checkpointing
            print(f"i: {i}")
            print(f"accuracy_sum: {accuracy_sum}")
            print(f"precision_at_1_sum: {precision_at_1_sum}")
            print(f"reciprocal_rank_sum: {reciprocal_rank_sum}")
            print("=" * 100)
    return {
        "accuracy": accuracy_sum / num_queries,
        "Mean Precision@1": precision_at_1_sum / num_queries,
        "MRR": reciprocal_rank_sum / num_queries,
    }

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever
# Suppress all Python warnings from here on.
warnings.filterwarnings("ignore")

# Both retrievers are built from the same vector_index. HybridRetriever.retrieve sets
# their similarity_top_k on every call, and alpha keeps its default of 0.5.
vector_retriever = vector_index.as_retriever()
BM25_retriever = BM25Retriever.from_defaults(vector_index)
hybrid_retriever = HybridRetriever(BM25_retriever, vector_retriever)

In [None]:
# Run the evaluation. top_k is the number of hybrid results per query that are
# passed on to the LLM filter.
result = evaluate(hybrid_retriever, benchmark, top_k=1)  # Adjust k
print(result)