# Vector Search

## 1. Load Embedding Model

In [1]:
import pandas as pd
from llama_index.core import (
    Settings,
    VectorStoreIndex,
    Document,
)

from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [2]:
# Set the embedding model on llama-index's global Settings object
# (presumably picked up later when the vector index embeds the documents).
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
Settings.embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

In [3]:
# Load the BX1 benchmark. Per the preview below, each row pairs a `context`
# passage and a `question` with the `table` they refer to; the extra
# "Unnamed: 0" column is presumably the index saved with the CSV — confirm.
bx1_df = pd.read_csv("bx/BX1_public_bi.csv")
bx1_df.head()

Unnamed: 0,context,question,table
0,The dataset was created to monitor and analyze...,Can you provide a dataset containing detailed ...,public_bi_benchmark/Arade_1
1,"I am the creator of this dataset, and I can pr...",What dataset was created by the WNET (Wind Net...,public_bi_benchmark/Arade_1
2,The dataset was funded by the WNET (Wireless N...,Can you provide the dataset funded by the WNET...,public_bi_benchmark/Arade_1
3,"No, there are no other comments in this dataset.",Provide a dataset that contains only one comme...,public_bi_benchmark/Arade_1
4,The instances in this dataset represent wind t...,Can you provide a dataset containing informati...,public_bi_benchmark/Arade_1


In [4]:
# Optional alternative corpus (currently disabled): a frame of table summaries.
# It must be loaded before create_content_documents / get_sample_summaries
# can be used, since both read from it.
# summaries = pd.read_csv("bc/row_summaries_chicago.csv")

## 2. Index Contexts/Contents

In [5]:
def create_context_documents(df):
    """Build one ``Document`` per row of ``df`` from its ``context`` column.

    Args:
        df: DataFrame with ``table`` and ``context`` columns.

    Returns:
        A list of ``Document`` objects in index order. Each carries the row's
        context as text, ``{"table": <table>}`` as metadata, and a ``doc_id``
        of the form ``doc_'<table>'_<index label>``.
    """

    def _to_document(row_label):
        table_name = df.loc[row_label, "table"]
        context_text = df.loc[row_label, "context"]
        return Document(
            text=context_text,
            metadata={"table": table_name},
            # The table name is wrapped in single quotes so it can be
            # recovered from the id by splitting on "'".
            doc_id=f"doc_'{table_name}'_{row_label}",
        )

    return [_to_document(row_label) for row_label in df.index]

In [6]:
def create_content_documents(df):
    """Build one ``Document`` per row of ``df`` from its ``summary`` column.

    Args:
        df: DataFrame with ``table`` and ``summary`` columns.

    Returns:
        A list of ``Document`` objects in index order. Each carries the row's
        summary as text, ``{"table": <table>}`` as metadata, and a ``doc_id``
        of the form ``doc_'<table>'_<index label>``.
    """

    def _to_document(row_label):
        table_name = df.loc[row_label, "table"]
        summary_text = df.loc[row_label, "summary"]
        return Document(
            text=summary_text,
            metadata={"table": table_name},
            # The table name is wrapped in single quotes so it can be
            # recovered from the id by splitting on "'".
            doc_id=f"doc_'{table_name}'_{row_label}",
        )

    return [_to_document(row_label) for row_label in df.index]

In [7]:
import numpy as np

def get_sample_summaries(summaries: pd.DataFrame, sample_percentage=1):
    """
    This is to randomly sample certain percentage of the summaries.
    The return value is the df itself, but only the sampled rows remained.
    """
    # Prepare to sample summaries (category refers to the tables)
    category_counts = summaries["table"].value_counts()
    sample_sizes = np.ceil(category_counts * sample_percentage).astype(int)

    # Perform stratified sampling
    sampled_summaries = summaries.copy(deep=True)
    sampled_summaries["table_copy"] = sampled_summaries["table"]
    sampled_summaries = sampled_summaries.groupby("table_copy", group_keys=False).apply(
        lambda x: x.sample(n=sample_sizes[x.name], random_state=42),
        include_groups=False,
    )
    return sampled_summaries.reset_index(drop=True)

In [8]:
# Alternative corpus (disabled): index the sampled table summaries instead.
# Requires `summaries` to have been loaded first.
# documents = create_content_documents(get_sample_summaries(summaries, 1))
# Active corpus: one document per benchmark context row.
documents = create_context_documents(bx1_df)

In [9]:
# Build the vector index over the documents.
# NOTE(review): the documents are passed straight to the constructor rather
# than through a document-parsing factory — presumably so each Document is
# indexed as-is and keeps its doc_id, which the evaluation code later parses
# to recover the table name. Confirm against the llama-index version in use.
vector_index = VectorStoreIndex(documents)
print("Index created")

Index created


## 3. Evaluate Vector Search

In [10]:
def convert_retrieved_data_to_tables_ranks(retrieved_data):
    """Convert retrieved items into ``(table, rank)`` pairs.

    Args:
        retrieved_data: Sequence of retrieved items, ordered by descending
            score. Each item must expose ``get_score()`` and an ``id_`` of
            the form ``doc_'<table>'_<n>``.

    Returns:
        A list of ``(table, rank)`` tuples in the same order as the input.
        Ranks start at 1 and increase only when the score drops, so items
        with equal scores share a rank. An empty input yields an empty list.

    Raises:
        IndexError: If an item's ``id_`` does not contain a quoted table name.
    """
    tables_ranks = []
    # A retriever may legitimately return nothing; there is no first score
    # to seed the ranking with in that case.
    if not retrieved_data:
        return tables_ranks

    rank = 1
    prev_score = retrieved_data[0].get_score()
    for data in retrieved_data:
        score = data.get_score()
        if score < prev_score:
            rank += 1
        # The table name sits between the single quotes of the id,
        # e.g. "chicago_open_data/22u3-xenr".
        table = data.id_.split("'")[1]
        tables_ranks.append((table, rank))
        prev_score = score
    return tables_ranks

In [11]:
def evaluate(retriever, benchmark_df, log_every=50):
    """Score a retriever against a question -> table benchmark.

    For every benchmark row the question is sent to ``retriever.retrieve``
    and the first retrieved item whose table matches the expected one is
    taken as the hit.

    Args:
        retriever: Object with a ``retrieve(query)`` method returning a
            score-ordered sequence of retrieved items.
        benchmark_df: DataFrame with ``question`` and ``table`` columns.
            Rows are read positionally, so any index is accepted.
        log_every: Print the running sums every ``log_every`` questions
            (starting with the first). Pass 0 or None to disable.

    Returns:
        A dict with:
          * ``"accuracy"`` - fraction of questions whose expected table
            appears anywhere in the retrieved results.
          * ``"Mean Precision@1"`` - fraction of questions whose first hit
            has tie-aware rank 1.
          * ``"MRR"`` - mean of 1 / (1-based list position of the first
            hit). This uses list position, not the tie-aware rank.
        All three are 0.0 for an empty benchmark.
    """
    total = len(benchmark_df)
    accuracy_sum = 0
    precision_at_1_sum = 0
    reciprocal_rank_sum = 0

    # Walk the column values directly instead of looking rows up by the
    # labels 0..n-1, which fails on a filtered or re-indexed frame.
    question_table_pairs = zip(benchmark_df["question"], benchmark_df["table"])
    for i, (query, expected_table) in enumerate(question_table_pairs):
        retrieved_data = retriever.retrieve(query)
        # An empty retrieval simply counts as a miss.
        if retrieved_data:
            tables_ranks = convert_retrieved_data_to_tables_ranks(retrieved_data)
        else:
            tables_ranks = []

        for position, (table, rank) in enumerate(tables_ranks, start=1):
            if table == expected_table:
                accuracy_sum += 1
                if rank == 1:
                    precision_at_1_sum += 1
                reciprocal_rank_sum += 1 / position
                break  # only the first hit counts

        if log_every and i % log_every == 0:
            print(i)
            print("Accuracy:", accuracy_sum)
            print("Prec@1:", precision_at_1_sum)
            print("Reciprocal Rank:", reciprocal_rank_sum)

    if total == 0:
        # Nothing to average over; avoid dividing by zero.
        return {"accuracy": 0.0, "Mean Precision@1": 0.0, "MRR": 0.0}

    return {
        "accuracy": accuracy_sum / total,
        "Mean Precision@1": precision_at_1_sum / total,
        "MRR": reciprocal_rank_sum / total,
    }

In [18]:
# Retrieve the TOP_K most similar documents for each benchmark question and
# score the retriever against the expected tables.
TOP_K = 10
vector_retriever = vector_index.as_retriever(similarity_top_k=TOP_K)
result = evaluate(vector_retriever, bx1_df)
print(result)

0
Accuracy: 1
Prec@1: 1
Reciprocal Rank: 1.0
50
Accuracy: 39
Prec@1: 30
Reciprocal Rank: 32.75396825396826
100
Accuracy: 71
Prec@1: 51
Reciprocal Rank: 57.44841269841271
150
Accuracy: 109
Prec@1: 75
Reciprocal Rank: 85.93452380952382
200
Accuracy: 145
Prec@1: 90
Reciprocal Rank: 108.03293650793653
250
Accuracy: 186
Prec@1: 102
Reciprocal Rank: 128.91031746031746
300
Accuracy: 228
Prec@1: 120
Reciprocal Rank: 153.27777777777774
350
Accuracy: 269
Prec@1: 137
Reciprocal Rank: 177.69563492063486
400
Accuracy: 312
Prec@1: 153
Reciprocal Rank: 201.90119047619035
450
Accuracy: 353
Prec@1: 171
Reciprocal Rank: 228.11071428571418
500
Accuracy: 397
Prec@1: 187
Reciprocal Rank: 252.33531746031738
550
Accuracy: 441
Prec@1: 203
Reciprocal Rank: 277.27499999999986
600
Accuracy: 484
Prec@1: 223
Reciprocal Rank: 303.5623015873014
650
Accuracy: 529
Prec@1: 256
Reciprocal Rank: 339.8579365079363
700
Accuracy: 570
Prec@1: 282
Reciprocal Rank: 371.3523809523807
750
Accuracy: 607
Prec@1: 311
Reciprocal Ran