# Vector Search

## 1. Load Embedding Model & Benchmarks

In [None]:
# Uncomment if needed to choose GPU
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '2'
# import setproctitle
# setproctitle.setproctitle("python")

In [None]:
import pandas as pd
from llama_index.core import (
    Settings,
    VectorStoreIndex,
    Document,
)

from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [None]:
# Register the HuggingFace embedding model on llama-index's Settings object.
# No embed_model is passed when the index is built further down, so the index
# presumably picks this one up -- confirm against the llama-index docs.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [None]:
# Load context benchmark
# Columns read later in this file: "table" and "context" (document creation),
# "question" and, when it can be parsed, "relevant_tables" (evaluation).
# The path is relative, so it is resolved against the current working directory.
benchmark = pd.read_csv("BX2_chicago.csv")

In [None]:
# Load content benchmark + table summaries
# benchmark = pd.read_csv("BC2_public_bi.csv")
# summaries = pd.read_csv("row_summaries_public_bi.csv")

## 2. Index Contexts/Summaries

In [None]:
def create_context_documents(df):
    """Build one ``Document`` per row of a context benchmark frame.

    Args:
        df: DataFrame with a "table" column and a "context" column. Rows are
            looked up by index label, so labels are assumed to be unique.

    Returns:
        A list of ``Document`` objects in ``df.index`` order. Each document
        has the row's context as its text, ``{"table": <table>}`` as metadata
        and the id ``doc_'<table>'_<index label>``. The single quotes around
        the table name are what the evaluation code splits on to recover the
        table from a retrieved id.
    """

    def _row_to_document(label):
        # Per-row lookups are kept inside the helper so an empty frame never
        # touches the columns at all.
        table_name = df["table"][label]
        context_text = df["context"][label]
        return Document(
            text=context_text,
            metadata={"table": table_name},
            doc_id=f"doc_'{table_name}'_{label}",
        )

    return [_row_to_document(label) for label in df.index]

In [None]:
def create_content_documents(df):
    """Build one ``Document`` per row of a table-summary frame.

    Args:
        df: DataFrame with a "table" column and a "summary" column. Rows are
            looked up by index label, so labels are assumed to be unique.

    Returns:
        A list of ``Document`` objects in ``df.index`` order. Each document
        has the row's summary as its text, ``{"table": <table>}`` as metadata
        and the id ``doc_'<table>'_<index label>``. The single quotes around
        the table name are what the evaluation code splits on to recover the
        table from a retrieved id.
    """

    def _row_to_document(label):
        # Per-row lookups are kept inside the helper so an empty frame never
        # touches the columns at all.
        table_name = df["table"][label]
        summary_text = df["summary"][label]
        return Document(
            text=summary_text,
            metadata={"table": table_name},
            doc_id=f"doc_'{table_name}'_{label}",
        )

    return [_row_to_document(label) for label in df.index]

In [None]:
import numpy as np

def get_sample_summaries(summaries: pd.DataFrame, sample_percentage=1):
    """
    This is to randomly sample certain percentage of the summaries.
    The return value is the summaries but only for the sampled rows.
    """
    # Prepare to sample summaries (category refers to the tables)
    category_counts = summaries["table"].value_counts()
    sample_sizes = np.ceil(category_counts * sample_percentage).astype(int)

    # Perform stratified sampling
    sampled_summaries = summaries.copy(deep=True)
    sampled_summaries["table_copy"] = sampled_summaries["table"]
    sampled_summaries = sampled_summaries.groupby("table_copy", group_keys=False).apply(
        lambda x: x.sample(n=sample_sizes[x.name], random_state=42),
        include_groups=False,
    )
    return sampled_summaries.reset_index(drop=True)

In [None]:
# Create document for context benchmark
# (one Document per benchmark row, text taken from the "context" column)
documents = create_context_documents(benchmark)

In [None]:
# Create document for content benchmark
# documents = create_content_documents(summaries)

In [None]:
# Build the vector index over the documents created above; the print is only
# a progress marker for when the constructor returns.
vector_index = VectorStoreIndex(documents)
print("Index created")

## 3. Evaluate System

In [None]:
from collections import defaultdict

def convert_retrieved_data_to_tables_ranks(retrieved_data):
    """Collapse retrieved results into de-duplicated ``(table, rank)`` pairs.

    Args:
        retrieved_data: Sequence of retrieval results. Each item must expose
            ``id_`` (a document id of the form ``doc_'<table>'_<idx>``) and
            ``get_score()``. Items are presumably ordered by descending
            score, as ranks only ever increase while walking the sequence.

    Returns:
        A list of ``(table, rank)`` tuples in retrieval order, one per
        distinct table (first occurrence wins). Ranks start at 1 and grow by
        one each time a newly seen table scores strictly lower than the
        previously accepted one, so tables with equal scores share a rank.
        An empty input yields an empty list.
    """
    # Nothing retrieved: there is no first score to seed the ranking with.
    if not retrieved_data:
        return []

    rank = 1
    prev_score = retrieved_data[0].get_score()
    seen_tables = set()
    tables_ranks = []
    for data in retrieved_data:
        # The table name sits between the single quotes of the document id,
        # e.g. "doc_'chicago_open_data/22u3-xenr'_17".
        table = data.id_.split("'")[1]
        if table in seen_tables:
            continue
        score = data.get_score()
        if score < prev_score:
            rank += 1
        tables_ranks.append((table, rank))
        prev_score = score
        seen_tables.add(table)
    return tables_ranks

In [None]:
import ast
def evaluate(retriever, benchmark_df):
    """Run every benchmark question through ``retriever`` and score the results.

    Args:
        retriever: Object with a ``retrieve(query)`` method returning a
            sequence of results accepted by
            ``convert_retrieved_data_to_tables_ranks``.
        benchmark_df: DataFrame with a "question" column and a "table"
            column. An optional "relevant_tables" column may hold a Python
            literal (e.g. a list of table names); when it is missing or
            cannot be parsed, the row's "table" value is the only expected
            table. Rows are read by position, so any index is accepted.

    Returns:
        A dict with three averages over all benchmark rows:
        "accuracy" (an expected table appears among the retrieved tables),
        "Mean Precision@1" (the first expected table found has rank 1) and
        "MRR" (reciprocal of the position of the first expected table in the
        de-duplicated result list). All three are 0.0 for an empty benchmark.
    """
    total = benchmark_df.shape[0]
    accuracy_sum = 0
    precision_at_1_sum = 0
    reciprocal_rank_sum = 0
    for i, query in enumerate(benchmark_df["question"]):
        try:
            expected_tables = ast.literal_eval(
                benchmark_df["relevant_tables"].iloc[i]
            )
        except (KeyError, ValueError, SyntaxError, TypeError):
            # Column absent (KeyError) or cell not a parseable literal, e.g.
            # NaN or a bare table name: fall back to the single gold table.
            expected_tables = [benchmark_df["table"].iloc[i]]
        retrieved_data = retriever.retrieve(query)
        # An empty retrieval counts as a miss instead of reaching the
        # ranking helper with nothing to rank.
        tables_ranks = (
            convert_retrieved_data_to_tables_ranks(retrieved_data)
            if retrieved_data
            else []
        )
        for j, (table, rank) in enumerate(tables_ranks):
            if table in expected_tables:
                accuracy_sum += 1
                if rank == 1:
                    precision_at_1_sum += 1
                reciprocal_rank_sum += 1 / (j + 1)
                break  # Only the first expected table counts for a query
        if i % 100 == 0:  # Checkpointing
            print(i)
            print("Accuracy:", accuracy_sum)
            print("Prec@1:", precision_at_1_sum)
            print("Reciprocal Rank:", reciprocal_rank_sum)
    if total == 0:
        # No questions to average over; avoid dividing by zero.
        return {"accuracy": 0.0, "Mean Precision@1": 0.0, "MRR": 0.0}
    return {
        "accuracy": accuracy_sum / total,
        "Mean Precision@1": precision_at_1_sum / total,
        "MRR": reciprocal_rank_sum / total,
    }

In [None]:
# evaluate() only scores what the retriever returns, so all three reported
# metrics depend on the similarity_top_k chosen here.
vector_retriever = vector_index.as_retriever(similarity_top_k=1)  # Adjust k
result = evaluate(vector_retriever, benchmark)
print(result)