# Vector Search

## 1. Load Embedding Model and Benchmark Data

In [2]:
import ast

import pandas as pd
from llama_index.core import (
    Settings,
    VectorStoreIndex,
    Document,
)

from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [4]:
# Set the embedding model on llama_index's Settings object to the
# "BAAI/bge-small-en-v1.5" HuggingFace model.
# NOTE(review): presumably the index built later picks this model up from
# Settings, since no embed model is passed to it explicitly — confirm.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [16]:
# Load the benchmark rows (context, question, table, relevant_tables).
# NOTE(review): the variable is called bx1_df but the file read is
# "BX2_chicago.csv" — confirm which benchmark is intended.
# NOTE(review): relevant_tables looks like a stringified Python list in the
# preview below, i.e. it is presumably loaded as a plain string — confirm.
bx1_df = pd.read_csv("BX2_chicago.csv")
bx1_df.head()

Unnamed: 0,context,question,table,relevant_tables
0,The dataset was created to track and manage co...,Can you furnish a comprehensive database detai...,chicago_open_data/22u3-xenr,"['chicago_open_data/22u3-xenr', 'chicago_open_..."
1,The dataset was created by the City of Chicago...,What municipal data collection was commissione...,chicago_open_data/22u3-xenr,"['chicago_open_data/22u3-xenr', 'chicago_open_..."
2,The dataset was funded by the City of Chicago'...,"What publicly accessible data collection, supp...",chicago_open_data/22u3-xenr,"['chicago_open_data/22u3-xenr', 'chicago_open_..."
3,"No, there are no other comments. The dataset o...",Can you furnish a compact relational database ...,chicago_open_data/22u3-xenr,"['chicago_open_data/22u3-xenr', 'chicago_open_..."
4,The instances in this dataset represent code v...,Can you furnish a comprehensive database compr...,chicago_open_data/22u3-xenr,"['chicago_open_data/22u3-xenr', 'chicago_open_..."


In [None]:
# Disabled: load the row summaries, which would be needed to index table
# contents (create_content_documents) instead of the benchmark contexts.
# summaries = pd.read_csv("bc/row_summaries_chicago.csv")

## 2. Index Contexts/Contents

In [17]:
def create_context_documents(df):
    """Build one Document per row of ``df`` from its "context" text.

    For every label in ``df.index`` the "context" value becomes the document
    text, the "table" value is stored under the "table" metadata key, and the
    document id is ``doc_'<table>'_<index label>``.

    Returns the documents as a list, in the order of ``df.index``.
    """

    def _build(row_label):
        table_name = df.at[row_label, "table"]
        return Document(
            text=df.at[row_label, "context"],
            metadata={"table": table_name},
            # The table name is wrapped in single quotes inside the id.
            doc_id=f"doc_'{table_name}'_{row_label}",
        )

    return [_build(row_label) for row_label in df.index]

In [18]:
def create_content_documents(df):
    """Build one Document per row of ``df`` from its "summary" text.

    Each row contributes a document whose text is the "summary" value, whose
    metadata carries the "table" value under the "table" key, and whose id is
    ``doc_'<table>'_<index label>``.

    Returns the documents as a list, in the order of ``df.index``.
    """
    rows = (
        (label, df.at[label, "table"], df.at[label, "summary"])
        for label in df.index
    )
    return [
        Document(
            text=summary_text,
            metadata={"table": table_name},
            doc_id=f"doc_'{table_name}'_{label}",
        )
        for label, table_name, summary_text in rows
    ]

In [None]:
import numpy as np

def get_sample_summaries(summaries: pd.DataFrame, sample_percentage=1, random_state=42):
    """
    Randomly sample a fraction of the summaries, stratified by table.

    Args:
        summaries: DataFrame with a "table" column; other columns are kept.
        sample_percentage: Fraction (0 to 1, not 0 to 100) of each table's
            rows to keep. The per-table row count is rounded up, so every
            table keeps at least one row whenever the fraction is positive.
        random_state: Seed passed to every per-table ``sample`` call. The
            default of 42 matches the previously hard-coded seed.

    Returns:
        A new DataFrame with the same columns as ``summaries`` containing only
        the sampled rows, grouped by table in sorted key order and with a
        fresh 0..n-1 index. ``summaries`` itself is not modified.

    Raises:
        ValueError: If ``sample_percentage`` is outside [0, 1].
    """
    if not 0 <= sample_percentage <= 1:
        raise ValueError(
            f"sample_percentage must be a fraction between 0 and 1, got {sample_percentage!r}"
        )

    # Prepare to sample summaries (category refers to the tables)
    category_counts = summaries["table"].value_counts()
    sample_sizes = np.ceil(category_counts * sample_percentage).astype(int)

    # Perform stratified sampling: sample each table's rows separately.
    # Iterating the groups directly keeps the "table" column in the result
    # without a helper copy column or version-specific groupby.apply options.
    sampled_groups = [
        group.sample(n=int(sample_sizes.loc[table]), random_state=random_state)
        for table, group in summaries.groupby("table")
    ]
    if not sampled_groups:
        # No rows to group (e.g. empty input): return an empty frame with the
        # same columns rather than failing on an empty concat.
        return summaries.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled_groups).reset_index(drop=True)

In [19]:
# Two alternative document sources; only one is active at a time.
# Disabled: index the table-content summaries (needs the `summaries` frame).
# documents = create_content_documents(get_sample_summaries(summaries, 1))
# Active: index the benchmark contexts.
documents = create_context_documents(bx1_df)

In [20]:
# Build the vector index over the documents created above.
vector_index = VectorStoreIndex(documents)
print("Index created")

Index created


## 3. Evaluate Vector Search

In [21]:
def convert_retrieved_data_to_tables_ranks(retrieved_data):
    """Convert retrieved results into a list of ``(table, rank)`` pairs.

    Args:
        retrieved_data: Iterable of retrieval results, in retrieval order.
            Each item must expose ``get_score()`` and an ``id_`` string that
            contains the table name wrapped in single quotes, e.g.
            ``doc_'chicago_open_data/22u3-xenr'_12``.

    Returns:
        One ``(table, rank)`` tuple per item, in input order. Ranks start at
        1 and increase by one only when an item's score is strictly lower
        than the previous item's score, so tied items share a rank. An empty
        input yields an empty list.

    Raises:
        IndexError: If an item's ``id_`` contains no single-quoted segment.
    """
    tables_ranks = []
    rank = 0
    prev_score = None
    for data in retrieved_data:
        score = data.get_score()
        # The first item always opens rank 1; later items only advance the
        # rank when their score drops below the previous one.
        if prev_score is None or score < prev_score:
            rank += 1
        table = data.id_.split("'")[1]  # E.g., "chicago_open_data/22u3-xenr"
        tables_ranks.append((table, rank))
        prev_score = score
    return tables_ranks

In [22]:
def _parse_expected_tables(raw_value):
    """Turn a stringified list of table names back into a real collection.

    Strings that hold a Python literal list/tuple/set are parsed; a literal
    string becomes a one-element list. Anything else (non-strings, or strings
    that are not such a literal) is returned unchanged, so the caller's
    ``in`` test keeps its previous meaning for those values.
    """
    if not isinstance(raw_value, str):
        return raw_value
    try:
        parsed = ast.literal_eval(raw_value)
    except (ValueError, SyntaxError):
        return raw_value
    if isinstance(parsed, str):
        return [parsed]
    if isinstance(parsed, (list, tuple, set, frozenset)):
        return parsed
    return raw_value


def evaluate(retriever, benchmark_df):
    """Score a retriever against the benchmark questions.

    For every row, the "question" is sent to ``retriever.retrieve`` and the
    retrieved tables are compared with the row's "relevant_tables". Only the
    first relevant table found in the retrieved list counts for that row.

    Args:
        retriever: Object with a ``retrieve(query)`` method whose result is
            accepted by ``convert_retrieved_data_to_tables_ranks``.
        benchmark_df: DataFrame with "question" and "relevant_tables"
            columns. "relevant_tables" may hold real collections or their
            string form (as produced by a CSV round trip).

    Returns:
        Dict with three averages over all rows:
        "accuracy" (share of rows with at least one relevant table among the
        retrieved results), "Mean Precision@1" (share of rows whose first
        relevant hit has rank 1, where tied scores share a rank) and "MRR"
        (mean of 1 / list position of the first relevant hit). All three are
        0.0 for an empty benchmark.

    Running totals are printed every 25 rows.
    """
    total = len(benchmark_df)
    accuracy_sum = 0
    precision_at_1_sum = 0
    reciprocal_rank_sum = 0
    # Iterate positionally so a non-default index (filtered or sampled
    # frame) does not break row access.
    rows = zip(benchmark_df["question"], benchmark_df["relevant_tables"])
    for i, (query, raw_expected) in enumerate(rows):
        # Parse the stringified list so membership is an exact table match
        # instead of a substring match against the raw text.
        expected_tables = _parse_expected_tables(raw_expected)
        retrieved_data = retriever.retrieve(query)
        # An empty retrieval is simply a miss for this row.
        tables_ranks = (
            convert_retrieved_data_to_tables_ranks(retrieved_data)
            if retrieved_data
            else []
        )
        for j, (table, rank) in enumerate(tables_ranks):
            if table in expected_tables:
                accuracy_sum += 1
                if rank == 1:
                    precision_at_1_sum += 1
                # Reciprocal rank uses the 1-based list position, not the
                # tie-aware rank used for Precision@1.
                reciprocal_rank_sum += 1 / (j + 1)
                break
        if i % 25 == 0:
            print(i)
            print("Accuracy:", accuracy_sum)
            print("Prec@1:", precision_at_1_sum)
            print("Reciprocal Rank:", reciprocal_rank_sum)
    if total == 0:
        # Nothing to average over; avoid dividing by zero.
        return {"accuracy": 0.0, "Mean Precision@1": 0.0, "MRR": 0.0}
    return {
        "accuracy": accuracy_sum / total,
        "Mean Precision@1": precision_at_1_sum / total,
        "MRR": reciprocal_rank_sum / total,
    }

In [27]:
# Evaluate the vector index on the same benchmark frame whose contexts were
# indexed above.
# NOTE(review): similarity_top_k=10 presumably caps the number of results
# returned per question — confirm against the llama_index documentation.
vector_retriever = vector_index.as_retriever(similarity_top_k=10)
result = evaluate(vector_retriever, bx1_df)
print(result)

0
Accuracy: 1
Prec@1: 0
Reciprocal Rank: 0.5
25
Accuracy: 26
Prec@1: 22
Reciprocal Rank: 23.416666666666664
50
Accuracy: 51
Prec@1: 44
Reciprocal Rank: 46.2
75
Accuracy: 75
Prec@1: 63
Reciprocal Rank: 67.09285714285716
100
Accuracy: 99
Prec@1: 79
Reciprocal Rank: 86.10238095238095
125
Accuracy: 124
Prec@1: 101
Reciprocal Rank: 109.60238095238095
150
Accuracy: 148
Prec@1: 122
Reciprocal Rank: 131.68571428571428
175
Accuracy: 173
Prec@1: 146
Reciprocal Rank: 155.93571428571428
200
Accuracy: 197
Prec@1: 167
Reciprocal Rank: 178.18571428571428
225
Accuracy: 222
Prec@1: 190
Reciprocal Rank: 201.57857142857142
250
Accuracy: 247
Prec@1: 214
Reciprocal Rank: 226.07857142857142
275
Accuracy: 272
Prec@1: 238
Reciprocal Rank: 250.57857142857142
300
Accuracy: 297
Prec@1: 261
Reciprocal Rank: 274.3285714285714
325
Accuracy: 320
Prec@1: 283
Reciprocal Rank: 296.8285714285714
350
Accuracy: 345
Prec@1: 306
Reciprocal Rank: 320.8285714285714
375
Accuracy: 370
Prec@1: 330
Reciprocal Rank: 345.1619047619