# 1. Preparation

In [1]:
import os
import sys
import chromadb
import numpy as np

sys.path.append("..")

from tqdm import tqdm
from transformers import set_seed
from chromadb.api.client import Client
from chromadb.api.models.Collection import Collection
from benchmark_generator.context.utils.jsonl import read_jsonl
from sentence_transformers import SentenceTransformer
from sentence_transformers.SentenceTransformer import SentenceTransformer

# Configure the cuBLAS workspace and restrict CUDA to device 0, then seed
# with deterministic mode requested so repeated runs are comparable.
os.environ.update(
    {
        "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)
set_seed(seed=42, deterministic=True)

In [2]:
# Load the embedding model from the local checkpoint directory only
# (local_files_only=True, so nothing is fetched remotely).
embedding_model = SentenceTransformer(
    model_name_or_path='../models/stella',
    local_files_only=True,
)

In [4]:
dataset = "fetaqa"

# Each supported dataset maps to the stem shared by its table directory and
# its content-benchmark file. Every other path is derived from the dataset
# name itself.
_TABLE_STEMS = {
    "chicago": "pneuma_chicago_10K",
    "public": "pneuma_public_bi",
    "chembl": "pneuma_chembl_10K",
    "adventure": "pneuma_adventure_works",
    "fetaqa": "pneuma_fetaqa",
}

# Fail fast on a typo instead of hitting a NameError in a later cell.
if dataset not in _TABLE_STEMS:
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of {sorted(_TABLE_STEMS)}"
    )
table_stem = _TABLE_STEMS[dataset]

# Table summaries (standard and narration variants) and table contexts.
std_contents = read_jsonl(
    f"../pneuma_summarizer/summaries/standard/{dataset}_standard.jsonl"
)
narration_contents = read_jsonl(
    f"../pneuma_summarizer/summaries/narrations/{dataset}_narrations.jsonl"
)
contexts = read_jsonl(
    f"../data_src/benchmarks/context/{dataset}/contexts_{dataset}.jsonl"
)

# Question benchmarks: content-based and context-based.
content_benchmark = read_jsonl(
    f"../data_src/benchmarks/content/{table_stem}_questions_annotated.jsonl"
)
context_benchmark = read_jsonl(
    f"../data_src/benchmarks/context/{dataset}/bx_{dataset}.jsonl"
)

# Directory holding the raw tables of the selected dataset.
path = f"../data_src/tables/{table_stem}"

# 2. Indexing

In [5]:
def indexing_vector(
    client: Client,
    embedding_model: SentenceTransformer,
    std_contents: list[dict[str, str]],
    contexts: list[dict[str, str]] = None,
    collection_name = "benchmark",
    reindex = False,
):
    """Build (or reuse) a cosine-similarity vector index of table documents.

    Each entry of ``std_contents`` contributes its ``"summary"`` as a document
    with id ``{table}_SEP_contents_{i}``; each entry of ``contexts`` (if
    given) contributes its ``"context"`` with id ``{table}_SEP_{i}``. Only
    tables that appear in ``std_contents`` are indexed. Tables are processed
    in sorted order so document positions, and therefore the per-chunk
    embedding cache files, are stable between runs.

    Embeddings are cached as text files under ``embeddings/``; the file name
    uses the module-level ``dataset`` variable.

    Args:
        client: Chroma client that owns the collection.
        embedding_model: Model used to encode documents on a cache miss.
        std_contents: Records with ``"table"`` and ``"summary"`` keys.
        contexts: Optional records with ``"table"`` and ``"context"`` keys.
        collection_name: Name of the collection to fetch or create.
        reindex: When true, always drop and rebuild the collection.

    Returns:
        The existing collection (when ``reindex`` is false and it can be
        fetched) or the newly populated one.
    """
    from collections import defaultdict

    chunk_size = 30000

    if not reindex:
        try:
            return client.get_collection(collection_name)
        except Exception:
            # Collection is missing (the exception type depends on the
            # chromadb version): fall through and build it.
            pass
    try:
        client.delete_collection(collection_name)
    except Exception:
        # Nothing to delete; best-effort cleanup before re-creating.
        pass
    collection = client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine", "hnsw:random_seed": 42}
    )

    # Group once per input list instead of rescanning both lists per table.
    summaries_by_table = defaultdict(list)
    for content in std_contents:
        summaries_by_table[content["table"]].append(content["summary"])

    contexts_by_table = defaultdict(list)
    if contexts is not None:
        for context in contexts:
            contexts_by_table[context["table"]].append(context["context"])

    documents = []
    metadatas = []
    ids = []
    for table in sorted(summaries_by_table):
        for content_idx, summary in enumerate(summaries_by_table[table]):
            doc_id = f"{table}_SEP_contents_{content_idx}"
            documents.append(summary)
            metadatas.append({"table": doc_id})
            ids.append(doc_id)

        for context_idx, context in enumerate(contexts_by_table.get(table, ())):
            doc_id = f"{table}_SEP_{context_idx}"
            documents.append(context)
            metadatas.append({"table": doc_id})
            ids.append(doc_id)

    # Ensure the cache directory exists so a freshly computed chunk is not
    # lost to a failing np.savetxt.
    os.makedirs("embeddings", exist_ok=True)

    for i in range(0, len(documents), chunk_size):
        chunk_documents = documents[i:i+chunk_size]
        cache_path = f"embeddings/embed-{dataset}-context-content-{i}.txt"

        try:
            # ndmin=2 keeps a single-row cache file as one vector rather
            # than a flat array of scalars.
            embeddings = np.loadtxt(cache_path, ndmin=2)
        except (OSError, ValueError):
            embeddings = None

        # A cache whose row count differs from the chunk is stale: recompute.
        if embeddings is None or embeddings.shape[0] != len(chunk_documents):
            embeddings = embedding_model.encode(
                chunk_documents,
                batch_size=64,
                show_progress_bar=True,
                device="cuda"
            )
            np.savetxt(cache_path, embeddings)

        collection.add(
            embeddings=[embed.tolist() for embed in embeddings],
            metadatas=metadatas[i:i+chunk_size],
            documents=chunk_documents,
            ids=ids[i:i+chunk_size],
        )
    return collection

In [6]:
import time

# Time how long it takes to open the persistent index and build (or fetch)
# the collection for the selected dataset.
start = time.time()
client = chromadb.PersistentClient(f"indices/index-{dataset}-context-content")
collection = indexing_vector(
    client=client,
    embedding_model=embedding_model,
    std_contents=std_contents,
    contexts=contexts,
)
end = time.time()
print(f"Indexing time: {end-start} seconds")

# 3. Benchmarking

In [7]:
def evaluate_benchmark(
    benchmark: list[dict[str,str]],
    benchmark_type: str,
    k: int,
    embedding_model: SentenceTransformer,
    collection: Collection,
    use_rephrased_questions=False
):
    """Print the hit rate at ``k`` of ``collection`` over ``benchmark``.

    A question counts as a hit when at least one of the ``k`` retrieved ids,
    cut at the first ``"_SEP_"``, is in the datum's ``"answer_tables"``.
    Question embeddings are cached as text under ``embeddings/``; the file
    name uses the module-level ``dataset`` variable.

    Args:
        benchmark: Records holding the question fields and ``"answer_tables"``.
        benchmark_type: ``"content"`` selects the content question fields;
            any other value selects the ``question_bx*`` fields.
        k: Number of results requested per query.
        embedding_model: Model used to encode questions on a cache miss.
        collection: Vector collection to query.
        use_rephrased_questions: Pick the rephrased question field instead of
            the original one.

    Returns:
        None. Hit rate, elapsed time and the indices of missed questions are
        printed.
    """
    start = time.time()

    # Guard: nothing to evaluate (also avoids dividing by zero below).
    if not benchmark:
        print("Hit Rate: n/a (empty benchmark)")
        return

    if benchmark_type == "content":
        question_key = "question" if use_rephrased_questions else "question_from_sql_1"
    else:
        question_key = "question_bx2" if use_rephrased_questions else "question_bx1"

    questions = [datum[question_key] for datum in benchmark]

    cache_path = (
        f"embeddings/embed-{dataset}-questions-{benchmark_type}-{use_rephrased_questions}.txt"
    )
    try:
        # ndmin=2 keeps a single cached question as one vector rather than a
        # flat array of scalars.
        embed_questions = np.loadtxt(cache_path, ndmin=2)
    except (OSError, ValueError):
        embed_questions = None

    # Recompute when the cache is missing or does not match the benchmark.
    if embed_questions is None or embed_questions.shape[0] != len(questions):
        embed_questions = embedding_model.encode(
            questions, batch_size=64, show_progress_bar=True
        )
        os.makedirs("embeddings", exist_ok=True)
        np.savetxt(cache_path, embed_questions)
    embed_questions = [embed.tolist() for embed in embed_questions]

    hitrate_sum = 0
    wrong_questions = []
    for idx, datum in enumerate(tqdm(benchmark)):
        answer_tables = datum["answer_tables"]
        vec_res = collection.query(
            query_embeddings=[embed_questions[idx]],
            n_results=k
        )
        # Ids look like "<table>_SEP_<suffix>"; compare on the table part.
        retrieved_tables = (res.split("_SEP_")[0] for res in vec_res['ids'][0])
        if any(table in answer_tables for table in retrieved_tables):
            hitrate_sum += 1
        else:
            wrong_questions.append(idx)

    end = time.time()

    print(f"Hit Rate: {hitrate_sum/len(benchmark) * 100}")
    print(f"Benchmarking Time: {end - start} seconds")
    print(f"Wrongly answered questions: {wrong_questions}")

In [8]:
# BC1: content benchmark, original (non-rephrased) questions, top-1 retrieval.
evaluate_benchmark(
    benchmark=content_benchmark,
    benchmark_type="content",
    k=1,
    embedding_model=embedding_model,
    collection=collection,
)

In [9]:
# BC2: content benchmark, rephrased questions, top-1 retrieval.
evaluate_benchmark(
    benchmark=content_benchmark,
    benchmark_type="content",
    k=1,
    embedding_model=embedding_model,
    collection=collection,
    use_rephrased_questions=True,
)

In [10]:
# BX1: context benchmark, original (non-rephrased) questions, top-1 retrieval.
evaluate_benchmark(
    benchmark=context_benchmark,
    benchmark_type="context",
    k=1,
    embedding_model=embedding_model,
    collection=collection,
)

In [11]:
# BX2: context benchmark, rephrased questions, top-1 retrieval.
evaluate_benchmark(
    benchmark=context_benchmark,
    benchmark_type="context",
    k=1,
    embedding_model=embedding_model,
    collection=collection,
    use_rephrased_questions=True,
)