# 1. Preparation

In [1]:
import os
import sys
import chromadb
import numpy as np
import setproctitle

# Set this kernel's process title to "python3.12".
setproctitle.setproctitle("python3.12")
# Make the parent directory importable; presumably that is where the
# `benchmark_generator` package imported below lives (verify repo layout).
sys.path.append("..")

from tqdm import tqdm
from transformers import set_seed
from chromadb.api.client import Client
from chromadb.api.models.Collection import Collection
from benchmark_generator.context.utils.jsonl import read_jsonl
# NOTE(review): the next two imports bind the same name `SentenceTransformer`;
# the later one is the binding that remains in scope.
from sentence_transformers import SentenceTransformer
from sentence_transformers.SentenceTransformer import SentenceTransformer

# cuBLAS workspace configuration; presumably required by the deterministic
# mode requested through set_seed(..., deterministic=True) below (confirm).
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# Restrict CUDA to the device with index 4 on this machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
# Fix the random seed (42) and ask for deterministic behaviour.
set_seed(42, deterministic=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the sentence-embedding model from the local "../models/stella" directory.
# local_files_only=True presumably keeps the loader from fetching anything
# remotely (confirm against the sentence-transformers version in use).
embedding_model = SentenceTransformer('../models/stella', local_files_only=True)

In [3]:
# Select which dataset to index/benchmark and load its summaries, contexts and
# benchmark questions. Every branch defines `std_contents`, `contexts`,
# `content_benchmark`, `context_benchmark` and `path`; the remaining variables
# differ per dataset.
# NOTE(review): only the "chembl" branch defines `combine_contents`, which the
# indexing cell further down reads -- the other datasets need their own
# definition before that cell can run.
dataset = "chembl"
if dataset == "chicago":
    std_contents = read_jsonl("../pneuma_summarizer/summaries/standard/chicago_standard.jsonl")
    narration_contents = read_jsonl("../pneuma_summarizer/summaries/narrations/chicago_narrations.jsonl")
    contexts = read_jsonl("../data_src/benchmarks/context/chicago/contexts_chicago.jsonl")

    content_benchmark = read_jsonl("../data_src/benchmarks/content/pneuma_chicago_10K_questions_annotated.jsonl")
    context_benchmark = read_jsonl("../data_src/benchmarks/context/chicago/bx_chicago.jsonl")
    path = "../data_src/tables/pneuma_chicago_10K"
elif dataset == "public":
    std_contents = read_jsonl("../pneuma_summarizer/summaries/standard/public_standard.jsonl")
    narration_contents = read_jsonl("../pneuma_summarizer/summaries/narrations/public_narrations.jsonl")
    contexts = read_jsonl("../data_src/benchmarks/context/public/contexts_public.jsonl")

    content_benchmark = read_jsonl("../data_src/benchmarks/content/pneuma_public_bi_questions_annotated.jsonl")
    context_benchmark = read_jsonl("../data_src/benchmarks/context/public/bx_public.jsonl")
    path = "../data_src/tables/pneuma_public_bi"
elif dataset == "chembl":
    row_contents = read_jsonl("../pneuma_summarizer/summaries/rows/chembl.jsonl")
    std_contents = read_jsonl("../pneuma_summarizer/summaries/standard/chembl_standard.jsonl")
    # Row-level and standard summaries merged and ordered by table name.
    combine_contents = sorted(row_contents + std_contents, key=lambda x: x["table"])
    contexts = read_jsonl("../data_src/benchmarks/context/chembl/contexts_chembl.jsonl")
    content_benchmark = read_jsonl("../data_src/benchmarks/content/pneuma_chembl_10K_questions_annotated.jsonl")
    context_benchmark = read_jsonl("../data_src/benchmarks/context/chembl/bx_chembl.jsonl")
    path = "../data_src/tables/pneuma_chembl_10K"
elif dataset == "adventure":
    overall_contents = read_jsonl("../pneuma_summarizer/summaries/overall/adventure_overall.jsonl")
    std_contents = read_jsonl("../pneuma_summarizer/summaries/standard/adventure_standard.jsonl")
    narration_contents = read_jsonl("../pneuma_summarizer/summaries/narrations/adventure_narrations.jsonl")
    contexts = read_jsonl("../data_src/benchmarks/context/adventure/contexts_adventure.jsonl")

    content_benchmark = read_jsonl("../data_src/benchmarks/content/pneuma_adventure_works_questions_annotated.jsonl")
    context_benchmark = read_jsonl("../data_src/benchmarks/context/adventure/bx_adventure.jsonl")
    path = "../data_src/tables/pneuma_adventure_works"
elif dataset == "fetaqa":
    row_contents = read_jsonl("../pneuma_summarizer/summaries/rows/fetaqa.jsonl")
    overall_contents = read_jsonl("../pneuma_summarizer/summaries/overall/fetaqa_overall.jsonl")
    # row_contents = read_jsonl("../pneuma_summarizer/summaries/rows-old/fetaqa_rows.jsonl")
    std_contents = read_jsonl("../pneuma_summarizer/summaries/standard/fetaqa_standard.jsonl")
    contexts = read_jsonl("../data_src/benchmarks/context/fetaqa/contexts_fetaqa.jsonl")
    content_benchmark = read_jsonl("../data_src/benchmarks/content/pneuma_fetaqa_questions_annotated.jsonl")
    context_benchmark = read_jsonl("../data_src/benchmarks/context/fetaqa/bx_fetaqa.jsonl")
    path = "../data_src/tables/pneuma_fetaqa"
else:
    # Fail loudly on a typo instead of leaving every variable undefined (or,
    # worse, stale from a previous run of this cell with another dataset).
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of "
        "'chicago', 'public', 'chembl', 'adventure', 'fetaqa'"
    )

# 2. Indexing

In [4]:
def indexing_vector(
    client: Client,
    embedding_model: SentenceTransformer,
    std_contents: list[dict[str, str]],
    contexts: list[dict[str, str]] = None,
    collection_name = "benchmark",
    reindex = False,
    chunk_size: int = 30000,
    device: str = "cuda",
):
    """Build (or reuse) a vector collection of table summaries and contexts.

    Documents are laid out table by table, in sorted table-name order. For
    each table, its summaries come first (ids ``{table}_SEP_contents_{i}``),
    followed by its contexts, if any (ids ``{table}_SEP_{i}``). Each
    document's metadata is ``{"table": <its id>}``. Contexts whose table does
    not appear in ``std_contents`` are not indexed.

    Args:
        client: Object providing ``get_collection``, ``delete_collection`` and
            ``create_collection``.
        embedding_model: Object whose ``encode`` turns a list of documents into
            one embedding per document (each embedding must offer ``tolist()``).
        std_contents: Records with at least the keys ``"table"`` and
            ``"summary"``.
        contexts: Optional records with at least the keys ``"table"`` and
            ``"context"``; ``None`` indexes summaries only.
        collection_name: Name of the collection to reuse or (re)create.
        reindex: When false and the collection can be fetched, it is returned
            untouched. Otherwise any existing collection of that name is
            dropped and rebuilt.
        chunk_size: Number of documents encoded and added per round.
        device: Device string forwarded to ``embedding_model.encode``.

    Returns:
        The existing or freshly populated collection.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    if not reindex:
        try:
            return client.get_collection(collection_name)
        except Exception:
            # Collection could not be fetched (e.g. it does not exist yet):
            # fall through and build it. `Exception` rather than a bare
            # `except` so Ctrl-C / SystemExit still propagate.
            pass
    try:
        client.delete_collection(collection_name)
    except Exception:
        # Best effort: there may simply be nothing to delete.
        pass
    collection = client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine", "hnsw:random_seed": 42}
    )

    # Group records per table in a single pass (input order is kept within a
    # table) instead of rescanning the full lists once per table.
    contents_by_table = {}
    for content in std_contents:
        contents_by_table.setdefault(content["table"], []).append(content)
    contexts_by_table = {}
    if contexts is not None:
        for context in contexts:
            contexts_by_table.setdefault(context["table"], []).append(context)

    documents = []
    ids = []
    for table in sorted(contents_by_table):
        for content_idx, content in enumerate(contents_by_table[table]):
            documents.append(content["summary"])
            ids.append(f"{table}_SEP_contents_{content_idx}")
        for context_idx, context in enumerate(contexts_by_table.get(table, [])):
            documents.append(context["context"])
            ids.append(f"{table}_SEP_{context_idx}")
    # The metadata of every document mirrors its id.
    metadatas = [{"table": doc_id} for doc_id in ids]

    # Encode and insert in chunks so a single round never handles more than
    # `chunk_size` documents.
    for start in range(0, len(documents), chunk_size):
        stop = start + chunk_size
        embeddings = embedding_model.encode(
            documents[start:stop],
            batch_size=100,
            show_progress_bar=True,
            device=device,
        )
        collection.add(
            embeddings=[embed.tolist() for embed in embeddings],
            metadatas=metadatas[start:stop],
            documents=documents[start:stop],
            ids=ids[start:stop],
        )
    return collection

In [5]:
# combine_contents = row_contents + overall_contents
# combine_contents.sort(key=lambda x: x["table"])

In [5]:
import shutil

# Remove any index directory left over from a previous run so the index is
# rebuilt from scratch. ignore_errors=True tolerates a missing directory (and
# other filesystem errors) without a bare `except:` that would also swallow
# KeyboardInterrupt.
shutil.rmtree(f"index-{dataset}-TEMP", ignore_errors=True)

In [6]:
import time
# Time the full index build: open the persistent store, then embed and insert.
start = time.time()
# On-disk Chroma store, one directory per dataset.
client = chromadb.PersistentClient(f"index-{dataset}-TEMP")
# Index `combine_contents` without contexts (4th argument None).
# NOTE(review): `combine_contents` is only assigned in the "chembl" branch of
# the dataset-loading cell.
collection = indexing_vector(client, embedding_model, combine_contents, None)
end = time.time()
print(f"Indexing time: {end-start} seconds")

Batches:   0%|          | 0/5 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
Batches: 100%|██████████| 5/5 [00:13<00:00,  2.60s/it]


Indexing time: 13.600780963897705 seconds


# 3. Benchmarking

In [7]:
def evaluate_benchmark(
    benchmark: list[dict[str,str]],
    benchmark_type: str,
    k: int,
    embedding_model: SentenceTransformer,
    collection: Collection,
    use_rephrased_questions=False
):
    """Print the hit rate of top-``k`` vector retrieval on a benchmark.

    For every benchmark entry the question embedding is used to query
    ``collection`` for ``k`` results. The entry counts as a hit when, for at
    least one returned id, the part before ``"_SEP_"`` is ``in`` the entry's
    ``"answer_tables"``. The hit rate (percent), the elapsed time and the
    indices of the missed questions are printed.

    Question embeddings are cached as text under ``embeddings/``; the file
    name is derived from the notebook-level ``dataset`` variable, the
    benchmark type and ``use_rephrased_questions``. A cache that is unreadable
    or whose row count does not match the benchmark is recomputed and
    overwritten.

    Args:
        benchmark: Entries holding the question text and ``"answer_tables"``.
        benchmark_type: ``"content"`` selects the keys ``"question_from_sql_1"``
            / ``"question"``; any other value selects ``"question_bx1"`` /
            ``"question_bx2"``.
        k: Number of results requested per query.
        embedding_model: Model used to encode questions on a cache miss.
        collection: Collection to query.
        use_rephrased_questions: Picks the second key of the pairs above.

    Raises:
        ValueError: If ``benchmark`` is empty.
    """
    if not benchmark:
        # Otherwise the hit-rate division below would fail with a much less
        # helpful ZeroDivisionError.
        raise ValueError("benchmark must contain at least one question")

    start = time.time()

    if benchmark_type == "content":
        question_key = "question" if use_rephrased_questions else "question_from_sql_1"
    else:
        question_key = "question_bx2" if use_rephrased_questions else "question_bx1"
    questions = [datum[question_key] for datum in benchmark]

    cache_path = f"embeddings/embed-{dataset}-questions-{benchmark_type}-{use_rephrased_questions}.txt"
    try:
        # ndmin=2 keeps a one-question cache as a (1, dim) matrix rather than
        # a flat vector.
        embed_questions = np.loadtxt(cache_path, ndmin=2)
    except (OSError, ValueError):
        # Missing or unparsable cache file.
        embed_questions = None
    if embed_questions is not None and (
        embed_questions.size == 0 or embed_questions.shape[0] != len(questions)
    ):
        # Stale cache produced for a different set of questions; using it
        # would pair questions with the wrong embeddings.
        embed_questions = None
    if embed_questions is None:
        embed_questions = embedding_model.encode(questions, batch_size=64, show_progress_bar=True)
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        np.savetxt(cache_path, embed_questions)
    embed_questions = [embed.tolist() for embed in embed_questions]

    hitrate_sum = 0
    wrong_questions = []
    for idx, datum in enumerate(tqdm(benchmark)):
        answer_tables = datum["answer_tables"]
        vec_res = collection.query(
            query_embeddings=[embed_questions[idx]],
            n_results=k
        )
        # Ids look like "<table>_SEP_..."; recover the table name of each hit.
        retrieved_tables = (res.split("_SEP_")[0] for res in vec_res['ids'][0])
        if any(table in answer_tables for table in retrieved_tables):
            hitrate_sum += 1
        else:
            wrong_questions.append(idx)

    end = time.time()

    print(f"Hit Rate: {hitrate_sum/len(benchmark) * 100}")
    print(f"Benchmarking Time: {end - start} seconds")
    print(f"Wrongly answered questions: {wrong_questions}")

In [8]:
# BC1: content benchmark, top-1 retrieval, default (non-rephrased) questions,
# i.e. the "question_from_sql_1" field.
evaluate_benchmark(
    content_benchmark, "content", 1, embedding_model, collection
)

100%|██████████| 1000/1000 [00:02<00:00, 351.99it/s]

Hit Rate: 42.199999999999996
Benchmarking Time: 3.2331340312957764 seconds
Wrongly answered questions: [1, 2, 4, 5, 6, 7, 8, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 46, 48, 52, 53, 55, 56, 57, 59, 60, 62, 65, 69, 75, 76, 82, 83, 84, 86, 87, 88, 90, 91, 92, 93, 94, 95, 96, 97, 101, 102, 103, 105, 107, 110, 114, 115, 116, 118, 120, 121, 122, 123, 125, 128, 129, 130, 131, 133, 134, 135, 136, 138, 141, 142, 143, 146, 147, 148, 149, 160, 161, 164, 165, 172, 173, 174, 177, 178, 180, 182, 184, 187, 190, 192, 194, 195, 196, 199, 200, 201, 210, 211, 212, 213, 215, 216, 217, 218, 220, 221, 222, 223, 230, 231, 232, 234, 238, 239, 240, 244, 245, 246, 247, 250, 251, 253, 254, 257, 258, 259, 262, 264, 265, 266, 267, 268, 270, 271, 272, 273, 276, 277, 280, 282, 284, 288, 290, 291, 293, 297, 298, 299, 300, 301, 303, 305, 306, 307, 309, 310, 311, 312, 315, 318, 320, 321, 322, 323, 325, 327, 328, 329, 331, 332, 333, 335, 336, 337, 338, 




In [9]:
# BC2: content benchmark, top-1 retrieval, rephrased questions,
# i.e. the "question" field.
evaluate_benchmark(
    content_benchmark, "content", 1, embedding_model, collection, use_rephrased_questions=True
)

100%|██████████| 1000/1000 [00:02<00:00, 348.15it/s]

Hit Rate: 40.1
Benchmarking Time: 3.256103515625 seconds
Wrongly answered questions: [1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 16, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 46, 48, 49, 51, 52, 53, 55, 56, 59, 62, 65, 66, 67, 69, 73, 74, 75, 76, 80, 81, 83, 84, 85, 86, 87, 88, 89, 90, 92, 94, 95, 96, 97, 98, 100, 102, 103, 105, 106, 107, 110, 111, 113, 115, 117, 119, 120, 122, 123, 124, 125, 127, 128, 129, 130, 131, 133, 135, 136, 142, 143, 144, 146, 147, 148, 154, 157, 158, 159, 160, 161, 162, 164, 168, 173, 174, 175, 176, 177, 178, 179, 180, 182, 184, 186, 187, 188, 189, 190, 194, 195, 196, 198, 199, 203, 204, 209, 210, 211, 212, 213, 217, 218, 220, 221, 222, 223, 228, 229, 230, 231, 234, 236, 237, 239, 247, 251, 253, 254, 255, 256, 257, 258, 259, 262, 264, 265, 266, 267, 268, 271, 272, 273, 276, 277, 279, 280, 282, 284, 288, 290, 291, 293, 294, 297, 298, 299, 302, 303, 307, 308, 310, 311, 312, 314, 315, 316, 318, 319, 320, 321, 322, 32




In [None]:
# BX1: context benchmark, top-1 retrieval, default (non-rephrased) questions,
# i.e. the "question_bx1" field.
evaluate_benchmark(
    context_benchmark, "context", 1, embedding_model, collection
)

In [None]:
# BX2: context benchmark, top-1 retrieval, rephrased questions,
# i.e. the "question_bx2" field.
evaluate_benchmark(
    context_benchmark, "context", 1, embedding_model, collection, use_rephrased_questions=True
)