# 1. Preparation

In [63]:
import sys
import chromadb
import numpy as np
import setproctitle

# Set the title this process reports to the OS to "python3.12".
setproctitle.setproctitle("python3.12")
# Add the directory two levels up to the import search path.
# NOTE(review): presumably this is what makes the `benchmark_generator`
# import below resolve, so it has to stay above that import - confirm.
sys.path.append("../..")

from tqdm import tqdm
from chromadb.api.models.Collection import Collection
from benchmark_generator.context.utils.jsonl import read_jsonl

In [64]:
# Name of the dataset to evaluate; must be one of the keys of BENCHMARK_PATHS.
dataset = "fetaqa"

# Benchmark files for every supported dataset, stored as
# (content benchmark path, context benchmark path).
BENCHMARK_PATHS = {
    "chicago": (
        "../../data_src/benchmarks/content/pneuma_chicago_10K_questions_annotated.jsonl",
        "../../data_src/benchmarks/context/chicago/bx_chicago.jsonl",
    ),
    "public": (
        "../../data_src/benchmarks/content/pneuma_public_bi_questions_annotated.jsonl",
        "../../data_src/benchmarks/context/public/bx_public.jsonl",
    ),
    "chembl": (
        "../../data_src/benchmarks/content/pneuma_chembl_10K_questions_annotated.jsonl",
        "../../data_src/benchmarks/context/chembl/bx_chembl.jsonl",
    ),
    "adventure": (
        "../../data_src/benchmarks/content/pneuma_adventure_works_questions_annotated.jsonl",
        "../../data_src/benchmarks/context/adventure/bx_adventure.jsonl",
    ),
    "fetaqa": (
        "../../data_src/benchmarks/content/pneuma_fetaqa_questions_annotated.jsonl",
        "../../data_src/benchmarks/context/fetaqa/bx_fetaqa.jsonl",
    ),
}

# Fail loudly on an unsupported name. Without this check the benchmark
# variables would stay undefined (or keep the values of a previous run of
# this cell) and the evaluation below would silently use the wrong data.
if dataset not in BENCHMARK_PATHS:
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of {sorted(BENCHMARK_PATHS)}"
    )

content_benchmark_path, context_benchmark_path = BENCHMARK_PATHS[dataset]
content_benchmark = read_jsonl(content_benchmark_path)
context_benchmark = read_jsonl(context_benchmark_path)

# 2. Indexing

In [None]:
import time

# Open the persistent Chroma store at the dataset-specific index path, fetch
# its "benchmark" collection, and report how long those two calls took.
index_path = f"../indices/index-{dataset}-pneuma-summarizer"

start = time.time()
client = chromadb.PersistentClient(index_path)
collection = client.get_collection("benchmark")
end = time.time()

elapsed = end - start
print(f"Indexing time: {elapsed} seconds")

# 3. Benchmarking

In [76]:
def evaluate_benchmark(
    benchmark: list[dict[str,str]],
    benchmark_type: str,
    k: int,
    collection: Collection,
    use_rephrased_questions: bool = False
) -> None:
    """Print the top-k hit rate of ``collection`` on ``benchmark``.

    Question embeddings are not computed here: they are loaded from
    ``../embeddings/embed-{dataset}-questions-{benchmark_type}-{use_rephrased_questions}.txt``
    (``dataset`` is the notebook-level variable), one row per benchmark entry
    in benchmark order. Each entry is a hit when, for at least one of the
    ``k`` ids returned by ``collection.query``, the part of the id before the
    first ``_SEP_`` is contained in the entry's ``"answer_tables"``.

    Args:
        benchmark: Benchmark entries; only ``"answer_tables"`` is read.
        benchmark_type: Benchmark label used in the embeddings file name
            (the notebook passes ``"content"`` or ``"context"``).
        k: Number of results requested from the collection per question.
        collection: Chroma collection to query.
        use_rephrased_questions: Selects which embeddings file is loaded
            (its ``True``/``False`` text is part of the file name).

    Raises:
        ValueError: If the embeddings file has fewer rows than ``benchmark``
            has entries.

    Prints the hit rate (in percent), the elapsed time, and the indices of
    the entries with no hit. Returns nothing.
    """
    # Imported locally so the function does not rely on another notebook cell
    # having imported ``time`` beforehand.
    import time

    start = time.time()

    # ndmin=2 keeps a single-row file as a list containing one vector instead
    # of collapsing it to a flat list of floats.
    embed_questions = np.loadtxt(
        f"../embeddings/embed-{dataset}-questions-{benchmark_type}-{use_rephrased_questions}.txt",
        ndmin=2,
    ).tolist()

    # Report a short embeddings file up front rather than failing with an
    # IndexError part-way through the (potentially long) query loop.
    if len(embed_questions) < len(benchmark):
        raise ValueError(
            f"Found {len(embed_questions)} question embeddings for "
            f"{len(benchmark)} benchmark entries"
        )

    hitrate_sum = 0
    wrong_questions = []
    for idx, datum in enumerate(tqdm(benchmark)):
        answer_tables = datum["answer_tables"]
        vec_res = collection.query(
            query_embeddings=[embed_questions[idx]],
            n_results=k
        )
        # The table name is whatever precedes the first "_SEP_" in each id.
        retrieved_tables = (res.split("_SEP_")[0] for res in vec_res["ids"][0])
        if any(table in answer_tables for table in retrieved_tables):
            hitrate_sum += 1
        else:
            wrong_questions.append(idx)

    end = time.time()

    # An empty benchmark has no hits to count; report 0 instead of dividing
    # by zero.
    hit_rate = hitrate_sum/len(benchmark) * 100 if benchmark else 0.0

    print(f"Hit Rate: {hit_rate}")
    print(f"Benchmarking Time: {end - start} seconds")
    print(f"Wrongly answered questions: {wrong_questions}")

In [None]:
# BC1: content benchmark, non-rephrased questions, top-1 retrieval
evaluate_benchmark(
    benchmark=content_benchmark,
    benchmark_type="content",
    k=1,
    collection=collection,
)

In [None]:
# BC2: content benchmark, rephrased questions, top-1 retrieval
evaluate_benchmark(
    benchmark=content_benchmark,
    benchmark_type="content",
    k=1,
    collection=collection,
    use_rephrased_questions=True,
)

In [None]:
# BX1: context benchmark, non-rephrased questions, top-1 retrieval
evaluate_benchmark(
    benchmark=context_benchmark,
    benchmark_type="context",
    k=1,
    collection=collection,
)

In [None]:
# BX2: context benchmark, rephrased questions, top-1 retrieval
evaluate_benchmark(
    benchmark=context_benchmark,
    benchmark_type="context",
    k=1,
    collection=collection,
    use_rephrased_questions=True,
)