# Vector Search + LLM

> The LLM filters the rankings produced by a hybrid (BM25 + vector) retriever.

## 0. Create Interface to Interact with LLM

In [None]:
from transformers import pipeline
def initialize_pipeline(model_path: str, torch_dtype):
    """
    Build a text-generation pipeline for the given model.

    ### Parameters:
    - model_path (str): The path of a model and tokenizer's weights,
      forwarded as the pipeline's `model` argument.
    - torch_dtype: Forwarded unchanged as the pipeline's `torch_dtype`
      argument (this file passes `torch.bfloat16`).

    ### Returns:
    - pipe (TextGenerationPipeline): The pipeline for text generation.
    """
    # Collect the keyword arguments first so the call itself stays short.
    pipeline_kwargs = {
        "model": model_path,
        "device_map": "auto",
        "torch_dtype": torch_dtype,
    }
    return pipeline("text-generation", **pipeline_kwargs)

from transformers import set_seed

def is_within_context_length(tokenizer, conversation, context_length: int):
    """
    Tell whether a conversation fits in the context window once tokenized.

    ### Parameters:
    - tokenizer: Object exposing `apply_chat_template`.
    - conversation: The chat messages handed to `apply_chat_template`.
    - context_length (int): Maximum number of tokens allowed.

    ### Returns:
    - bool: True when the tokenized chat template (generation prompt
      included) is at most `context_length` tokens long.
    """
    token_ids = tokenizer.apply_chat_template(
        conversation, tokenize=True, add_generation_prompt=True
    )
    token_count = len(token_ids)
    return token_count <= context_length

def validate_generation_configs(generation_configs):
    """
    Remove sampling options that are set to their "disabled" value.

    The dictionary is modified in place. An option is dropped when it is
    present and equal to the value listed below; options that are absent are
    simply skipped, so a partially filled dictionary is accepted.

    ### Parameters:
    - generation_configs (dict): Generation keyword arguments. Recognised
      options are `top_k`, `top_p`, `penalty_alpha` and `temperature`.

    ### Returns:
    - None
    """
    # Option name -> value at which the option is treated as switched off.
    disabled_values = {
        "top_k": 0,
        "top_p": 1.0,
        "penalty_alpha": 0.0,
        "temperature": 0.0,
    }
    for option, disabled in disabled_values.items():
        # Membership test first: a missing option must not raise KeyError.
        if option in generation_configs and generation_configs[option] == disabled:
            del generation_configs[option]

def prompt_pipeline(
    pipe,
    conversation,
    context_length=8192,
    max_new_tokens=512,
    do_sample=False,
    top_k=0,
    top_p=1.0,
    penalty_alpha=0.0,
    temperature=0.0,
):
    """
    Prompt the pipeline with a conversation

    Generation is best-effort: when the conversation does not fit in the
    context window, or when anything raises while prompting, a placeholder
    conversation with an empty user message is returned instead of raising.

    ### Parameters:
    - pipe (TextGenerationPipeline): An initialized pipeline.
    - conversation (list[dict[str, str]]): The chat messages to send to the model
    - context_length (int): The LLM's context length
    - max_new_tokens (int): Max number of tokens generated for each prompt
    - do_sample (bool): Perform sampling or not
    - top_k (int): The number of tokens to consider when sampling
    - top_p (float): Minimum cumulative probability of tokens being considered
    - penalty_alpha (float): The amount of focus being put to ensure non-repetitiveness
    - temperature (float): Control how sharp the distribution (smaller means sharper)

    ### Returns:
    - conversation (list[dict[str, str]]): The conversation appended with the model's output,
      or `[{"role": "user", "content": ""}]` when generation was skipped or failed.
    """
    # Imported locally so the function does not rely on a module-level
    # `logger` being defined by the surrounding file.
    import logging

    logger = logging.getLogger(__name__)

    generation_configs = {
        "max_new_tokens": max_new_tokens,
        "top_k": top_k,
        "top_p": top_p,
        "do_sample": do_sample,
        "penalty_alpha": penalty_alpha,
        "temperature": temperature,
        "pad_token_id": pipe.tokenizer.eos_token_id
    }
    # Drops the sampling options that are left at their "disabled" value.
    validate_generation_configs(generation_configs)
    try:
        if not is_within_context_length(pipe.tokenizer, conversation, context_length):
            logger.warning(
                "The conversation is more than what the model can handle. Skip processing."
            )
            return [{"role": "user", "content": ""}]
        set_seed(42)  # Enhance reproducibility
        return pipe(conversation, **generation_configs)[0]["generated_text"]
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt/SystemExit
        # propagate. The traceback is logged so the real cause is visible.
        logger.exception("Prompting the pipeline failed. Skip processing.")
        return [{"role": "user", "content": ""}]

In [None]:
from llama_index.core import (
    Settings,
    VectorStoreIndex,
    Document,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# These packages are available in the benchmark_generator directory
from pipeline.pipeline_initializer import initialize_pipeline
from pipeline.prompting_interface import prompt_pipeline

import torch
import warnings
import pandas as pd

## 1. Initialize LLM and Index Documents

In [None]:
# Register the BAAI/bge-small-en-v1.5 HuggingFace model as llama_index's embedding model.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Text-generation pipeline (Mistral-7B-Instruct-v0.3, bfloat16); filter_retrieved_data reads it as the global `pipe`.
pipe = initialize_pipeline("mistralai/Mistral-7B-Instruct-v0.3", torch.bfloat16)

In [None]:
# Load the benchmark; the cells below read its "table", "summary"/"context",
# "question" and "relevant_tables" columns.
benchmark = pd.read_csv("BX1_chicago_corrected.csv")  # Adjust benchmark name
# Last expression of the cell: shows a preview of the first rows in the notebook.
benchmark.head()

In [None]:
def create_content_documents(df):
    """
    Turn every benchmark row into a Document built from its table summary.

    ### Parameters:
    - df (DataFrame): Must provide the "table" and "summary" columns.

    ### Returns:
    - documents (list[Document]): One document per row, in index order. The
      text is the row's summary, the metadata holds the table name, and the
      id has the form doc_'<table>'_<index label>.
    """

    def build_document(row_label):
        # Look the row up by its index label, table first, then summary.
        table_name = df["table"][row_label]
        return Document(
            text=df["summary"][row_label],
            metadata={"table": table_name},
            doc_id=f"doc_'{table_name}'_{row_label}",
        )

    return [build_document(row_label) for row_label in df.index]

In [None]:
def create_context_documents(df):
    """
    Turn every benchmark row into a Document built from its context text.

    ### Parameters:
    - df (DataFrame): Must provide the "table" and "context" columns.

    ### Returns:
    - documents (list[Document]): One document per row, in index order. The
      text is the row's context, the metadata holds the table name, and the
      id has the form doc_'<table>'_<index label>.
    """

    def build_document(row_label):
        # Look the row up by its index label, table first, then context.
        table_name = df["table"][row_label]
        return Document(
            text=df["context"][row_label],
            metadata={"table": table_name},
            doc_id=f"doc_'{table_name}'_{row_label}",
        )

    return [build_document(row_label) for row_label in df.index]
# Index the "context" column; create_content_documents is the "summary"-based alternative.
documents = create_context_documents(benchmark)

In [None]:
# Build the vector index over the documents created above.
# NOTE(review): presumably embeds them with Settings.embed_model — confirm.
vector_index = VectorStoreIndex(documents)
print("Index created")

## 2. Evaluation

In [None]:
# Combine BM25 & Vector Retrievers
class HybridRetriever:
    """
    Combine a BM25 retriever and a vector retriever into one ranking.

    Each retriever's scores are min-max normalised to [0, 1] and then mixed
    linearly: `alpha * bm25_score + (1 - alpha) * vector_score`. A node that
    only one retriever returned gets 0.0 for the other retriever's score.
    """

    def __init__(self, bm25_retriever, vector_retriever, alpha=0.5):
        """
        ### Parameters:
        - bm25_retriever: Object with a `retrieve(query)` method and a
          settable `similarity_top_k` attribute.
        - vector_retriever: Same interface as `bm25_retriever`.
        - alpha (float): Weight of the BM25 score; the vector score gets
          `1 - alpha`.
        """
        self.bm25_retriever = bm25_retriever
        self.vector_retriever = vector_retriever
        self.alpha = alpha

    def process_nodes(self, nodes):
        """
        Min-max normalise the nodes' scores in place.

        ### Parameters:
        - nodes: Iterable of objects with `score` and `id_` attributes.

        ### Returns:
        - processed_nodes (dict): Maps `node.id_` to the node (now carrying
          its normalised score), in the order the nodes were given. Empty
          when no nodes were given.
        """
        nodes = list(nodes)
        if not nodes:
            # A retriever may legitimately return nothing; min()/max() of an
            # empty list would raise.
            return {}

        # A missing score is treated as 0.0 so it can be compared.
        scores = [0.0 if node.score is None else node.score for node in nodes]
        lowest = min(scores)
        highest = max(scores)

        processed_nodes = {}
        for node, score in zip(nodes, scores):
            if lowest == highest:
                # All scores equal: avoid 0/0 and give every node full weight.
                node.score = 1.0
            else:
                node.score = (score - lowest) / (highest - lowest)
            processed_nodes[node.id_] = node
        return processed_nodes

    def retrieve(self, query, top_k=10):
        """
        Retrieve with both retrievers and return the best combined nodes.

        ### Parameters:
        - query: Passed unchanged to both retrievers' `retrieve`.
        - top_k (int): Set as `similarity_top_k` on both retrievers and used
          as the maximum number of nodes returned.

        ### Returns:
        - sorted_nodes (list): Up to `top_k` nodes sorted by combined score,
          highest first. Ties keep retrieval order (BM25 results first).
        """
        # 1. Change the top_k of the retrievers
        self.vector_retriever.similarity_top_k = top_k
        self.bm25_retriever.similarity_top_k = top_k

        # 2. Use both retrievers to get top-k results + normalization
        bm25_nodes = self.process_nodes(self.bm25_retriever.retrieve(query))
        vector_nodes = self.process_nodes(self.vector_retriever.retrieve(query))

        # 3. Linearly combine the scores of each node.
        # dict.fromkeys de-duplicates while keeping insertion order, so the
        # order of equally scored nodes is the same on every run.
        all_nodes = []
        for node_id in dict.fromkeys([*bm25_nodes, *vector_nodes]):
            bm25_node = bm25_nodes.get(node_id)
            vector_node = vector_nodes.get(node_id)
            bm25_score = 0.0 if bm25_node is None else bm25_node.score
            cosine_score = 0.0 if vector_node is None else vector_node.score

            # Prefer the BM25 node object when both retrievers returned it.
            node = vector_node if bm25_node is None else bm25_node
            node.score = self.alpha * bm25_score + (1 - self.alpha) * cosine_score
            all_nodes.append(node)

        # list.sort is stable, also with reverse=True.
        all_nodes.sort(key=lambda node: node.score, reverse=True)
        return all_nodes[:top_k]

In [None]:
def get_relevancy_prompt(metadata: str, query: str):
    """
    Compose the yes/no relevancy question for one dataset.

    ### Parameters:
    - metadata (str): Text describing the dataset, quoted as "Metadata M".
    - query (str): The user question, quoted as "Query Q".

    ### Returns:
    - prompt (str): Three-line prompt asking whether the dataset is relevant.
    """
    prompt_lines = (
        f'Metadata M:"{metadata}"',
        f'Query Q: "{query}"',
        "Metadata M is associated with a dataset that we can access. "
        "Is this dataset relevant to query Q? Begin your argument with yes/no.",
    )
    return "\n".join(prompt_lines)

def filter_retrieved_data(retrieved_data, query):
    """
    Keep only the retrieved items that the LLM judges relevant to the query.

    Each item's `text` is put into a relevancy prompt and sent to the global
    `pipe`. The item is kept when the last message of the returned
    conversation, lower-cased and stripped, starts with "yes" or "**yes**".

    ### Parameters:
    - retrieved_data: Iterable of items exposing a `text` attribute.
    - query (str): The question the items were retrieved for.

    ### Returns:
    - filtered_data (list): The kept items, in their original order.
    """
    accepted_prefixes = ("yes", "**yes**")
    kept = []
    for candidate in retrieved_data:
        question = get_relevancy_prompt(candidate.text, query)
        reply = prompt_pipeline(
            pipe,
            [{"role": "user", "content": question}],
            max_new_tokens=128,
            context_length=32768,
        )
        verdict = reply[-1]["content"].lower().strip()
        if verdict.startswith(accepted_prefixes):
            kept.append(candidate)
    return kept

In [None]:
def convert_retrieved_data_to_tables_ranks(retrieved_data, query):
    """
    Filter the retrieved items with the LLM and pair each table with a rank.

    Ranks start at 1 and only increase when an item's score is strictly
    lower than the previous item's, so equally scored neighbours share a rank.

    ### Parameters:
    - retrieved_data: Retrieved items exposing `text`, `id_` and `get_score()`.
    - query (str): The question the items were retrieved for.

    ### Returns:
    - tables_ranks (list[tuple[str, int]]): (table, rank) pairs in the order
      of the filtered items.
    """
    ranked_tables = []
    current_rank = 1
    last_score = -1000  # Sentinel below any expected score, so the first item stays at rank 1
    for item in filter_retrieved_data(retrieved_data, query):  # Filter with LLM
        score = item.get_score()
        if score < last_score:
            current_rank += 1
        # The table name sits between the single quotes of the id,
        # e.g., "chicago_open_data/22u3-xenr"
        table_name = item.id_.split("'")[1]
        ranked_tables.append((table_name, current_rank))
        last_score = score
    return ranked_tables

In [None]:
import ast

def evaluate(retriever, benchmark_df, top_k=1):
    """
    Evaluate a retriever (plus LLM filtering) on the benchmark questions.

    For every row, the question is retrieved with `retriever`, the results
    are filtered and ranked by `convert_retrieved_data_to_tables_ranks`, and
    the first returned table that is among the expected tables is scored.
    Rows are processed by position, so any DataFrame index is accepted.
    Progress is printed every 25 rows, and every miss is reported.

    ### Parameters:
    - retriever: Object with a `retrieve(query, top_k)` method.
    - benchmark_df (DataFrame): Must provide a "question" column and a
      "relevant_tables" column holding Python-literal strings (parsed with
      `ast.literal_eval`).
    - top_k (int): Number of results requested from the retriever.

    ### Returns:
    - dict: "accuracy" (share of rows with at least one expected table
      returned), "Mean Precision@1" (share of rows whose first hit has rank
      1) and "MRR" (mean of 1 / position of the first hit). All three are
      0.0 for an empty benchmark.
    """
    total = len(benchmark_df)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return {"accuracy": 0.0, "Mean Precision@1": 0.0, "MRR": 0.0}

    accuracy_sum = 0
    precision_at_1_sum = 0
    reciprocal_rank_sum = 0
    # Iterate the column values directly instead of looking rows up by the
    # label `i`, which only works with a default 0..n-1 index.
    rows = zip(benchmark_df["question"], benchmark_df["relevant_tables"])
    for i, (query, raw_expected_tables) in enumerate(rows):
        expected_tables = ast.literal_eval(raw_expected_tables)
        retrieved_data = retriever.retrieve(query, top_k)  # Using hybrid retriever
        tables_ranks = convert_retrieved_data_to_tables_ranks(retrieved_data, query)

        found = False
        for position, (table, rank) in enumerate(tables_ranks, start=1):
            if table in expected_tables:
                found = True
                accuracy_sum += 1
                if rank == 1:
                    precision_at_1_sum += 1
                reciprocal_rank_sum += 1 / position
                break  # Only the first expected table counts
        if not found:
            print(f"Wrong answer at index {i}")

        # Checkpointing
        if i % 25 == 0:
            print(f"i: {i}")
            print(f"accuracy_sum: {accuracy_sum}")
            print(f"precision_at_1_sum: {precision_at_1_sum}")
            print(f"reciprocal_rank_sum: {reciprocal_rank_sum}")
            print("=" * 100)
    return {
        "accuracy": accuracy_sum / total,
        "Mean Precision@1": precision_at_1_sum / total,
        "MRR": reciprocal_rank_sum / total,
    }

In [None]:
# Suppress every Python warning from here on (this also hides deprecation warnings).
warnings.filterwarnings("ignore")

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever

# Build both base retrievers from the same index and fuse them with the default alpha (0.5).
vector_retriever = vector_index.as_retriever()
BM25_retriever = BM25Retriever.from_defaults(vector_index)
hybrid_retriever = HybridRetriever(BM25_retriever, vector_retriever)

# top_k is forwarded to HybridRetriever.retrieve for every benchmark question.
result = evaluate(hybrid_retriever, benchmark, top_k=3)  # Adjust k
print(result)