## 1. Summarizing Tables and Indexing Them for the Retrievers

In [None]:
# Environment and seeding for the notebook. The environment variables are set
# before `transformers` is imported, so keep this statement order.
import os
# NOTE(review): ':4096:8' is presumably the cuBLAS workspace setting required
# for deterministic CUDA execution — confirm against the PyTorch
# reproducibility notes.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
# Restrict this process to the GPU with index 0.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import setproctitle
# Give the process the generic title "python".
setproctitle.setproctitle("python")
from transformers import set_seed
# Fixed seed (42) with the deterministic flag switched on.
set_seed(42, deterministic=True)

In [None]:
import chromadb
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from llama_index.core import Document
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.node_parser import SentenceSplitter

In [None]:
# Embedding model shared by the whole notebook: it encodes the table summaries
# at indexing time and the benchmark questions at query time.
# trust_remote_code=True lets code shipped with the model repository run
# locally, so the repository must be trusted.
model = SentenceTransformer('dunzhang/stella_en_1.5B_v5', trust_remote_code=True)

In [None]:
# Chroma client persisted under "testing", and the "benchmark" collection that
# stores one embedding per table; the collection's HNSW space is set to cosine.
# NOTE(review): create_collection presumably raises if "benchmark" already
# exists in the persisted store, so a re-run may need the old collection (or
# the "testing" directory) removed first — confirm against the Chroma docs.
client = chromadb.PersistentClient("testing")
collection = client.create_collection(name="benchmark", metadata={"hnsw:space": "cosine"})

In [None]:
import duckdb
# DuckDB connection (default arguments) used to read the benchmark CSV files.
con = duckdb.connect()
path = "../data_src/tables/public_bi_benchmark"
# Table names are the CSV file names without their ".csv" suffix, sorted.
tables = sorted(
    entry[:-4] for entry in os.listdir(path) if entry.endswith(".csv")
)

In [None]:
# Table-summary strategy passed to both create_embed_meta_id and
# create_documents below. create_embed_meta_id has branches for
# "column_names", "row_values", "column_names_row_values" and
# "column_narration"; create_documents only has branches for "column_names"
# and "column_narration".
summarize_method = "column_names"

In [None]:
# Summarize for vector retriever
def create_embed_meta_id(scenario: str):
    """Embed one summary per benchmark table for the vector retriever.

    Args:
        scenario: How each table is summarized before encoding:
            "column_names"            - column names joined by " | ";
            "row_values"              - every row (values joined by " | "),
                                        rows joined by " || ";
            "column_names_row_values" - the column-name line followed by
                                        every row, joined by " || ";
            "column_narration"        - the table's entries in the
                                        "description" column of
                                        chicago_cols_descriptions.csv,
                                        joined by " | ".

    Returns:
        A tuple (embeddings, metadatas, ids) of parallel lists with one entry
        per name in the module-level `tables`: the embedding as a plain list,
        {"table": name}, and the name itself.

    Raises:
        ValueError: If `scenario` is not one of the values above.
        KeyError: If "column_narration" is requested for a table that has no
            row in the narration file.
    """
    supported = (
        "column_names",
        "row_values",
        "column_names_row_values",
        "column_narration",
    )
    # Fail before any expensive work: an unknown scenario would otherwise
    # leave `embedding` unbound (or silently reuse the previous table's).
    if scenario not in supported:
        raise ValueError(
            f"Unknown scenario {scenario!r}; expected one of {supported}"
        )

    # The narration file is only read by the scenario that needs it.
    narrations = None
    if scenario == "column_narration":
        narrations = pd.read_csv("chicago_cols_descriptions.csv")

    embeddings = []
    metadatas = []
    ids = []

    for table in tqdm(tables):
        if scenario == "column_narration":
            descriptions = narrations.loc[
                narrations["table"] == table, "description"
            ]
            if descriptions.empty:
                raise KeyError(
                    f"no column descriptions found for table {table!r}"
                )
            summary = " | ".join(str(text) for text in descriptions)
            embedding = model.encode(summary)

        else:
            df = con.sql(f"select * from '{path}/{table}.csv'").to_df()

            if scenario == "column_names":
                summary = " | ".join(df.columns)
                embedding = model.encode(summary)

            else:
                # Rows are rendered through df.loc[i] so every value is
                # stringified exactly as a row-wise Series would show it.
                # An empty table simply yields no row lines.
                row_lines = [
                    " | ".join(df.loc[i].astype(str)) for i in range(len(df))
                ]
                if scenario == "column_names_row_values":
                    row_lines.insert(0, " | ".join(df.columns))
                summary = " || ".join(row_lines)

                # NOTE(review): chunk_text is not defined in the visible part
                # of this notebook and must be in scope before these two
                # scenarios are used; the meaning/unit of 131072 depends on
                # that helper — confirm.
                chunks = chunk_text(summary, 131072)
                chunk_embeds = [model.encode(chunk) for chunk in chunks]
                # A long table is represented by the mean of its chunk
                # embeddings.
                embedding = np.mean(chunk_embeds, axis=0)

        embeddings.append(embedding.tolist())
        metadatas.append({"table": table})
        ids.append(table)
    return embeddings, metadatas, ids

In [None]:
# Indexing for vector retriever
# Embed every table with the configured summary method and store the vectors
# in the Chroma collection; each entry's id is its table name.
embeddings, metadatas, ids = create_embed_meta_id(summarize_method)
collection.add(
    embeddings=embeddings,
    metadatas=metadatas,
    ids=ids
)

In [None]:
# Summarize for keyword retriever
def create_documents(type_documents: str):
    """Build one llama-index Document per benchmark table for BM25 indexing.

    Args:
        type_documents: How each table is summarized:
            "column_names"     - column names joined by " | ";
            "column_narration" - the table's entries in the "description"
                                 column of chicago_cols_descriptions.csv,
                                 joined by " || ".

    Returns:
        A list with one Document per name in the module-level `tables`; each
        Document carries the summary as text, {"table": name} as metadata and
        the table name as doc_id.

    Raises:
        ValueError: If `type_documents` is not one of the values above.
        KeyError: If "column_narration" is requested for a table that has no
            row in the narration file.
    """
    supported = ("column_names", "column_narration")
    # Fail early: an unsupported value would otherwise leave `summary`
    # unbound (or silently reuse the previous table's summary).
    if type_documents not in supported:
        raise ValueError(
            f"Unknown type_documents {type_documents!r}; "
            f"expected one of {supported}"
        )

    # The narration file is only read by the mode that needs it.
    row_summaries = None
    if type_documents == "column_narration":
        row_summaries = pd.read_csv("chicago_cols_descriptions.csv")

    documents = []
    for table in tqdm(tables):
        if type_documents == "column_names":
            df = con.sql(f"select * from '{path}/{table}.csv'").to_df()
            summary = " | ".join(df.columns)
        else:
            descriptions = row_summaries.loc[
                row_summaries["table"] == table, "description"
            ]
            if descriptions.empty:
                raise KeyError(
                    f"no column descriptions found for table {table!r}"
                )
            summary = " || ".join(str(text) for text in descriptions)

        documents.append(
            Document(
                text=summary,
                metadata={"table": table},
                doc_id=table,
            )
        )
    return documents

In [None]:
# Build one document per table, split the documents into nodes and index the
# nodes with BM25.
documents = create_documents(summarize_method)
# " || " is the separator create_documents places between column narrations.
splitter = SentenceSplitter(paragraph_separator=" || ")
nodes = splitter.get_nodes_from_documents(documents)
# similarity_top_k=1 is only the initial value; evaluate_benchmark reassigns
# it before keyword retrieval.
BM25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=1)

## 2. Benchmarking Retrievers

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop words, loaded from the NLTK corpus on first use and then reused,
# so the corpus is not re-read for every benchmark question.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text: str) -> str:
    """Return `text` with English stop words removed.

    The text is tokenized with NLTK's word_tokenize; tokens whose lowercase
    form is an English stop word are dropped and the remaining tokens are
    joined with single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    kept_tokens = [
        token
        for token in word_tokenize(text)
        if token.lower() not in _ENGLISH_STOP_WORDS
    ]
    return ' '.join(kept_tokens)

In [None]:
import json


def read_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON value per line.

    Returns:
        The decoded values in file order. Blank or whitespace-only lines
        (for example stray empty lines at the end of the file) are skipped
        instead of being passed to the JSON decoder.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            records.append(json.loads(line))
    return records

In [None]:
def process_nodes_bm25(nodes):
    """Min-max normalize BM25 scores and map each table to its score.

    Args:
        nodes: Retrieved nodes; each must expose a numeric `score` and a
            `metadata` mapping with a "table" entry.

    Returns:
        A dict {table name: normalized score in [0, 1]}. When every node has
        the same score, all scores become 1. When several nodes belong to the
        same table, the table keeps its best (highest) normalized score
        rather than whichever node happened to come last. An empty input
        yields an empty dict.

    Side effects:
        Each node's `score` attribute is overwritten with its normalized
        value.
    """
    # max()/min() raise on an empty sequence, so handle "nothing retrieved".
    if not nodes:
        return {}

    scores: list[float] = [node.score for node in nodes]
    max_score = max(scores)
    min_score = min(scores)

    processed_nodes = {}
    for node in nodes:
        if min_score == max_score:
            node.score = 1
        else:
            node.score = (node.score - min_score) / (max_score - min_score)

        table = node.metadata["table"]
        if table not in processed_nodes or node.score > processed_nodes[table]:
            processed_nodes[table] = node.score
    return processed_nodes

In [None]:
def process_nodes_vec(items):
    """Min-max normalize vector-search similarities per retrieved id.

    Args:
        items: A query result whose "ids" and "distances" entries each hold
            one inner list for the (single) query, in matching order.

    Returns:
        A dict {id: normalized score in [0, 1]}, where the raw score of an
        entry is 1 - distance. When every entry has the same score, all
        scores become 1. A result without any ids yields an empty dict.
    """
    retrieved_ids = items['ids'][0]
    # max()/min() raise on an empty sequence, so handle "nothing retrieved".
    if not retrieved_ids:
        return {}

    # Turn distances into similarities.
    scores: list[float] = [1 - dist for dist in items["distances"][0]]
    max_score = max(scores)
    min_score = min(scores)

    processed_nodes = {}
    for idx, table in enumerate(retrieved_ids):
        if min_score == max_score:
            processed_nodes[table] = 1
        else:
            processed_nodes[table] = (
                (scores[idx] - min_score) / (max_score - min_score)
            )
    return processed_nodes

In [None]:
def process_hybrid_search(bm25_res, vec_res, *, bm25_weight: float = 0.5, top_k: int = 10):
    """Fuse BM25 and vector results into one ranked list of tables.

    Both result sets are normalized with process_nodes_bm25 and
    process_nodes_vec; a table missing from one of them contributes 0.0 for
    that retriever.

    Args:
        bm25_res: BM25 retrieval result, as accepted by process_nodes_bm25.
        vec_res: Vector query result, as accepted by process_nodes_vec.
        bm25_weight: Weight of the BM25 score in the combined score; the
            vector score receives 1 - bm25_weight. The default 0.5 weighs
            both retrievers equally.
        top_k: Maximum number of entries to return (default 10).

    Returns:
        A list of (table, combined_score) tuples sorted by descending score,
        ties broken by table name, truncated to `top_k` entries.
    """
    bm25_scores = process_nodes_bm25(bm25_res)
    vec_scores: dict = process_nodes_vec(vec_res)

    fused = [
        (
            table,
            bm25_weight * bm25_scores.get(table, 0.0)
            + (1 - bm25_weight) * vec_scores.get(table, 0.0),
        )
        for table in bm25_scores.keys() | vec_scores.keys()
    ]
    # Sorting on (-score, name) makes the order independent of set iteration.
    fused.sort(key=lambda entry: (-entry[1], entry[0]))
    return fused[:top_k]

In [None]:
import json


_RETRIEVAL_TYPES = ("keyword", "hybrid", "vector")


def _retrieve_tables(question: str, retrieval_type: str, k: int) -> list:
    """Return the table names retrieved for `question`, best match first."""
    if retrieval_type == "keyword":
        BM25_retriever.similarity_top_k = k
        nodes = BM25_retriever.retrieve(remove_stopwords(question))
        return [node.metadata["table"] for node in nodes]

    if retrieval_type == "vector":
        question_embedding = model.encode(question).tolist()
        hits = collection.query(
            query_embeddings=[question_embedding],
            n_results=k
        )
        return [meta["table"] for meta in hits["metadatas"][0]]

    # Hybrid: BM25 sees the stop-word-free question, the embedding model the
    # original one. Set the BM25 depth explicitly so the result does not
    # depend on whatever an earlier call left in similarity_top_k.
    BM25_retriever.similarity_top_k = k
    question_embedding = model.encode(question).tolist()
    bm25_res = BM25_retriever.retrieve(remove_stopwords(question))
    vec_res = collection.query(
        query_embeddings=[question_embedding],
        n_results=k
    )
    # NOTE(review): the fused list holds up to 10 candidates regardless of k,
    # so for small k the hybrid hit rate is computed over more candidates
    # than the keyword/vector ones — confirm this is intended.
    return [table for table, _ in process_hybrid_search(bm25_res, vec_res)]


def evaluate_benchmark(jsonl_data, retrieval_type: str, k: int):
    """Compute the hit rate of one retriever over the benchmark questions.

    A question counts as a hit when at least one retrieved table is listed
    in its "answer_tables".

    Args:
        jsonl_data: Sequence of records with "question" and "answer_tables".
        retrieval_type: "keyword" (BM25), "vector" (Chroma) or "hybrid".
        k: Number of results requested from each underlying retriever.

    Returns:
        The fraction of questions that were hits (0.0 for an empty dataset).

    Raises:
        ValueError: If `retrieval_type` is not one of the supported values.

    Side effects:
        Prints the hit rate and writes the indices of the missed questions,
        as a JSON list, to
        "{path}-{retrieval_type}-{summarize_method}-{k}-1.json".
    """
    # Validate before the (slow) loop rather than on the first record.
    if retrieval_type not in _RETRIEVAL_TYPES:
        raise ValueError(
            f"Unknown retrieval_type {retrieval_type!r}; "
            f"expected one of {_RETRIEVAL_TYPES}"
        )

    hitrate_sum = 0
    wrong_list = []
    for i, datum in enumerate(tqdm(jsonl_data)):
        answer_tables = datum["answer_tables"]
        retrieved = _retrieve_tables(datum["question"], retrieval_type, k)
        if any(table in answer_tables for table in retrieved):
            hitrate_sum += 1
        else:
            # Misses are recorded for every retrieval type, hybrid included.
            wrong_list.append(i)

    hit_rate = hitrate_sum / len(jsonl_data) if jsonl_data else 0.0
    print(f"Hit Rate: {hit_rate}")
    # The summary strategy in the file name comes from the notebook-level
    # `summarize_method`; no variable named `method` exists in this notebook.
    with open(f"{path}-{retrieval_type}-{summarize_method}-{k}-1.json", 'w') as file:
        json.dump(wrong_list, file)
    return hit_rate

In [None]:
# Load the annotated questions and evaluate the keyword (BM25) retriever
# with k=1.
jsonl_path = "public_bi_questions_annotated.jsonl"
jsonl_data = read_jsonl(jsonl_path)
# `results` holds the value returned by evaluate_benchmark, i.e. the hit rate
# (a single number), not the retrieved tables.
results = evaluate_benchmark(jsonl_data, "keyword", 1)