# 1. Preparation

In [None]:
import os
import sys
import setproctitle
import chromadb
import bm25s
import Stemmer
import torch

sys.path.append("..")

from tqdm import tqdm
from transformers import set_seed
from sentence_transformers import SentenceTransformer
from sentence_transformers.SentenceTransformer import SentenceTransformer
from benchmark_generator.context.utils.jsonl import read_jsonl
from benchmark_generator.context.utils.pipeline_initializer import initialize_pipeline
from benchmark_generator.context.utils.prompting_interface import prompt_pipeline

# Process-wide runtime configuration. The environment variables are written
# before seeding, in the same order as before: cuBLAS workspace first, then
# the GPUs visible to this process.
os.environ.update(
    {
        "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
        "CUDA_VISIBLE_DEVICES": "0,1,2",
    }
)
# Fixed seed with deterministic mode requested from transformers.
set_seed(42, deterministic=True)
# Process title as shown by tools such as `ps`/`top`.
setproctitle.setproctitle("python")

In [None]:
# Text-generation pipeline used by the yes/no relevance prompts
# (`is_table_context_relevant` / `is_table_content_relevant`); the second
# argument requests bfloat16.
pipe = initialize_pipeline("meta-llama/Meta-Llama-3-8B-Instruct", torch.bfloat16)

# Specific setting for Llama-3-8B-Instruct for batching:
# reuse the model's EOS token id as the padding token and pad on the left.
# NOTE(review): presumably the tokenizer ships without a pad token - confirm.
pipe.tokenizer.pad_token_id = pipe.model.config.eos_token_id
pipe.tokenizer.padding_side = 'left'

In [None]:
# Dense encoder used for both documents (indexing) and questions (querying).
# `trust_remote_code=True` allows the model repository's own code to run.
embedding_model = SentenceTransformer('dunzhang/stella_en_1.5B_v5', trust_remote_code=True)
# English stemmer passed to `bm25s.tokenize` for corpus and query tokenization.
stemmer = Stemmer.Stemmer("english")

In [None]:
# Dataset to benchmark; must be one of the keys of `_TABLE_DIR_NAMES`.
dataset = "chicago"

# Maps each supported dataset to the name shared by its table directory
# (`../data_src/tables/<name>`) and its content-benchmark file
# (`<name>_questions_annotated.jsonl`). All other paths are derived from the
# dataset key itself.
_TABLE_DIR_NAMES = {
    "chicago": "pneuma_chicago_10K",
    "public": "pneuma_public_bi",
    "chembl": "pneuma_chembl_10K",
    "adventure": "pneuma_adventure_works",
}

# Fail here with a clear message instead of later with a NameError on
# `std_contents` & co. when `dataset` is misspelled.
if dataset not in _TABLE_DIR_NAMES:
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of {sorted(_TABLE_DIR_NAMES)}"
    )
_table_dir_name = _TABLE_DIR_NAMES[dataset]

# Documents to index: standard summaries (vector index), narrations
# (keyword index) and table contexts (both indices).
std_contents = read_jsonl(f"../pneuma_summarizer/summaries/standard/{dataset}_standard.jsonl")
narration_contents = read_jsonl(f"../pneuma_summarizer/summaries/narrations/{dataset}_narrations.jsonl")
contexts = read_jsonl(f"../data_src/benchmarks/context/{dataset}/contexts_{dataset}.jsonl")

# Question sets to evaluate and the directory holding the raw tables.
content_benchmark = read_jsonl(f"../data_src/benchmarks/content/{_table_dir_name}_questions_annotated.jsonl")
context_benchmark = read_jsonl(f"../data_src/benchmarks/context/{dataset}/bx_{dataset}.jsonl")
path = f"../data_src/tables/{_table_dir_name}"

# 2. Indexing

In [None]:
from chromadb.api.client import Client
def indexing_vector(
    client: Client,
    embedding_model: SentenceTransformer,
    std_contents: list[dict[str, str]],
    contexts: list[dict[str, str]] = None,
    collection_name = "benchmark",
    reindex = False,
):
    """Return a Chroma collection holding embedded summaries and contexts.

    When ``reindex`` is false and a collection named ``collection_name``
    already exists, it is returned as-is. Otherwise any existing collection
    with that name is dropped and a fresh one (cosine space) is built.

    Args:
        client: Chroma client used to get/delete/create the collection.
        embedding_model: Model whose ``encode`` produces the embeddings.
        std_contents: Records with ``"table"`` and ``"summary"`` keys.
        contexts: Optional records with ``"table"`` and ``"context"`` keys;
            only contexts of tables present in ``std_contents`` are indexed.
        collection_name: Name of the collection to reuse or create.
        reindex: Force a rebuild even if the collection exists.

    Returns:
        The existing or newly populated collection. Summary entries get ids
        ``"<table>_SEP_contents_<i>"`` and context entries ``"<table>_SEP_<i>"``;
        the same string is stored under the ``"table"`` metadata key.
    """
    from collections import defaultdict

    if not reindex:
        try:
            return client.get_collection(collection_name)
        except Exception:
            # Best effort: a missing collection simply means we build it.
            # `Exception` (not a bare except) so Ctrl-C still interrupts.
            pass
    try:
        client.delete_collection(collection_name)
    except Exception:
        # Nothing to delete; the exact exception type depends on the
        # chromadb version, hence the broad (but not bare) catch.
        pass
    collection = client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine", "hnsw:batch_size": 1}
    )

    # Group once by table instead of rescanning the full lists per table.
    # Insertion order within a table is preserved, so indices are unchanged.
    summaries_by_table = defaultdict(list)
    for content in std_contents:
        summaries_by_table[content["table"]].append(content["summary"])

    contexts_by_table = defaultdict(list)
    if contexts is not None:
        for context in contexts:
            contexts_by_table[context["table"]].append(context)

    documents = []
    ids = []
    for table in sorted(summaries_by_table):
        for content_idx, summary in enumerate(summaries_by_table[table]):
            documents.append(summary)
            ids.append(f"{table}_SEP_contents_{content_idx}")

        # `.get` avoids inserting empty entries into the defaultdict.
        for context_idx, context in enumerate(contexts_by_table.get(table, ())):
            documents.append(context["context"])
            ids.append(f"{table}_SEP_{context_idx}")

    # Each entry's metadata carries the same string as its id.
    metadatas = [{"table": doc_id} for doc_id in ids]

    # Embed and insert in fixed-size chunks rather than all at once.
    chunk_size = 30000
    for start in range(0, len(documents), chunk_size):
        end = start + chunk_size
        embeddings = embedding_model.encode(
            documents[start:end],
            batch_size=64,
            show_progress_bar=True,
            device="cuda"
        )

        collection.add(
            embeddings=[embed.tolist() for embed in embeddings],
            metadatas=metadatas[start:end],
            documents=documents[start:end],
            ids=ids[start:end],
        )
    return collection

def indexing_keyword(
    stemmer,
    narration_contents: list[dict[str, str]],
    contexts: list[dict[str, str]] = None,
):
    """Build a BM25 retriever over narrated summaries and table contexts.

    Args:
        stemmer: Stemmer handed to ``bm25s.tokenize``.
        narration_contents: Records with ``"table"`` and ``"summary"`` keys.
        contexts: Optional records with ``"table"`` and ``"context"`` keys;
            only contexts of tables present in ``narration_contents`` are
            indexed.

    Returns:
        An indexed ``bm25s.BM25`` whose corpus entries are
        ``{"text": ..., "metadata": {"table": ...}}``; the metadata value is
        ``"<table>_SEP_contents_<i>"`` for summaries and ``"<table>_SEP_<i>"``
        for contexts.
    """
    from collections import defaultdict

    # Group once by table instead of rescanning the full lists per table.
    # Insertion order within a table is preserved, so indices are unchanged.
    summaries_by_table = defaultdict(list)
    for content in narration_contents:
        summaries_by_table[content["table"]].append(content["summary"])

    contexts_by_table = defaultdict(list)
    if contexts is not None:
        for context in contexts:
            contexts_by_table[context["table"]].append(context)

    corpus_json = []
    for table in sorted(summaries_by_table):
        for content_idx, summary in enumerate(summaries_by_table[table]):
            corpus_json.append(
                {"text": summary, "metadata": {"table": f"{table}_SEP_contents_{content_idx}"}}
            )

        # `.get` avoids inserting empty entries into the defaultdict.
        for context_idx, context in enumerate(contexts_by_table.get(table, ())):
            corpus_json.append(
                {"text": context["context"], "metadata": {"table": f"{table}_SEP_{context_idx}"}}
            )

    corpus_text = [doc["text"] for doc in corpus_json]
    corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en", stemmer=stemmer, show_progress=False)

    retriever = bm25s.BM25(corpus=corpus_json)
    retriever.index(corpus_tokens, show_progress=False)
    return retriever

In [None]:
# Persistent Chroma client; the dataset name is part of the storage path
# passed to it.
client = chromadb.PersistentClient(f"experiment-{dataset}")
# Vector index over standard summaries + contexts. With the default
# `reindex=False`, an existing "benchmark" collection is reused.
collection = indexing_vector(client, embedding_model, std_contents, contexts)
# Keyword (BM25) index over narrated summaries + contexts; rebuilt in memory
# every time this cell runs.
retriever = indexing_keyword(stemmer, narration_contents, contexts)

# 3. Benchmarking

In [None]:
def process_nodes_bm25(items):
    """Min-max normalize BM25 scores and key the hits by their table id.

    Args:
        items: ``(results, scores)`` pair for a single query; ``results[0]``
            holds the hit documents (dicts with ``"text"`` and
            ``"metadata"["table"]``) and ``scores[0]`` their relevance scores.

    Returns:
        ``{table_id: (normalized_score, text)}``. Scores are scaled to
        ``[0, 1]``; when all scores are equal every hit gets ``1``. An empty
        result yields an empty dict.
    """
    results, scores = items
    hits = results[0]
    raw_scores = scores[0]

    # `len()` rather than truthiness so array-like inputs also work.
    if len(hits) == 0:
        return {}

    max_score = max(raw_scores)
    min_score = min(raw_scores)
    # Loop-invariant: a flat score list cannot be rescaled (zero range).
    all_equal = min_score == max_score

    processed_nodes: dict[str, tuple[float, str]] = {}
    for node, raw_score in zip(hits, raw_scores):
        if all_equal:
            score = 1
        else:
            score = (raw_score - min_score) / (max_score - min_score)
        processed_nodes[node["metadata"]["table"]] = (score, node["text"])
    return processed_nodes

In [None]:
def process_nodes_vec(items):
    """Min-max normalize vector-search similarities and key hits by id.

    Args:
        items: Query result for a single query, with ``"ids"``,
            ``"distances"`` and ``"documents"`` each holding one inner list.

    Returns:
        ``{id: (normalized_score, document)}`` where the raw score is
        ``1 - distance`` scaled to ``[0, 1]``; when all scores are equal
        every hit gets ``1``. An empty result yields an empty dict.
    """
    ids = items["ids"][0]
    documents = items["documents"][0]

    # Nothing retrieved: avoid calling max()/min() on an empty list.
    if len(ids) == 0:
        return {}

    # Turn distances into similarities so that larger means more relevant.
    scores: list[float] = [1 - dist for dist in items["distances"][0]]
    max_score = max(scores)
    min_score = min(scores)
    # Loop-invariant: a flat score list cannot be rescaled (zero range).
    all_equal = min_score == max_score

    processed_nodes: dict[str, tuple[float, str]] = {}
    for node_id, raw_score, document in zip(ids, scores, documents):
        if all_equal:
            score = 1
        else:
            score = (raw_score - min_score) / (max_score - min_score)
        processed_nodes[node_id] = (score, document)
    return processed_nodes

In [None]:
def hybrid_retriever(
    bm25_res,
    vec_res,
    k: int,
    question: str,
    embedding_model: SentenceTransformer,
    use_reranker=False,
):
    """Fuse keyword and vector hits into one ranked list of at most ``k`` nodes.

    Both result sets are normalized, then every node seen by either retriever
    gets the equally weighted mean of its two scores (a retriever that did
    not return the node contributes 0). Nodes are ordered by descending
    combined score, ties broken by node id. The keyword hit's text is used
    when available, otherwise the vector hit's.

    Returns:
        A list of ``(node_id, combined_score, text)`` tuples, passed through
        ``rerank`` when ``use_reranker`` is true.
    """
    keyword_hits = process_nodes_bm25(bm25_res)
    vector_hits = process_nodes_vec(vec_res)
    absent = (0.0, None)

    fused: list[tuple[str, float, str]] = []
    for node_id in sorted(keyword_hits.keys() | vector_hits.keys()):
        keyword_score, keyword_doc = keyword_hits.get(node_id, absent)
        vector_score, vector_doc = vector_hits.get(node_id, absent)
        text = vector_doc if keyword_doc is None else keyword_doc
        fused.append((node_id, 0.5 * keyword_score + 0.5 * vector_score, text))

    # Best score first; the id makes the order deterministic on ties.
    fused.sort(key=lambda node: (-node[1], node[0]))
    top_nodes = fused[:k]

    if not use_reranker:
        return top_nodes
    return rerank(top_nodes, question, embedding_model)

In [None]:
def is_table_context_relevant(context: str, question: str):
    """Ask the LLM whether a table, described by ``context``, fits ``question``.

    Sends a single-turn user prompt through the module-level ``pipe`` and
    returns True when the content of the last message of the first returned
    conversation starts with "yes" (case-insensitive), False otherwise.
    """
    # NOTE(review): the first delimiter below is "*/" where "/*" looks
    # intended; kept verbatim so benchmark results stay reproducible.
    prompt = f"""Given this context describing a table:
*/
{context}
*/
and this question:
/*
{question}
*/
Is the table relevant to answer the question? Begin your answer with yes/no."""

    conversation = [{"role": "user", "content": prompt}]
    outputs = prompt_pipeline(
        pipe,
        [conversation],
        context_length=8192,
        max_new_tokens=3,
        top_p=None,
        temperature=None,
    )
    reply: str = outputs[0][-1]["content"]
    return reply.lower().startswith("yes")

In [None]:
def is_table_content_relevant(content: str, question: str):
    """Ask the LLM whether a table with the given columns fits ``question``.

    Sends a single-turn user prompt through the module-level ``pipe`` and
    returns True when the content of the last message of the first returned
    conversation starts with "yes" (case-insensitive), False otherwise.
    """
    # NOTE(review): the first delimiter below is "*/" where "/*" looks
    # intended; kept verbatim so benchmark results stay reproducible.
    prompt = f"""Given a table with the following columns:
*/
{content}
*/
and this question:
/*
{question}
*/
Is the table relevant to answer the question? Begin your answer with yes/no."""

    conversation = [{"role": "user", "content": prompt}]
    outputs = prompt_pipeline(
        pipe,
        [conversation],
        context_length=8192,
        max_new_tokens=3,
        top_p=None,
        temperature=None,
    )
    reply: str = outputs[0][-1]["content"]
    return reply.lower().startswith("yes")

In [None]:
from collections import defaultdict
def rerank(nodes: list[tuple[str,float,str]], question: str, embedding_model: SentenceTransformer):
    """Move the nodes the LLM judges relevant to ``question`` to the front.

    Each node is a ``(node_id, score, doc)`` tuple. Node ids whose part after
    ``"_SEP_"`` starts with ``"contents"`` are judged with
    ``is_table_content_relevant``; all others with
    ``is_table_context_relevant``. The relative order inside the relevant
    group and inside the remaining group is kept.

    Args:
        nodes: Candidate nodes in their current ranking order.
        question: Question the nodes are judged against.
        embedding_model: Unused; kept so existing callers keep working.

    Returns:
        A new list with the relevant nodes first, followed by the rest.
    """
    relevant_ids = set()
    for node_id, _, doc in nodes:
        if node_id.split("_SEP_")[1].startswith("contents"):
            is_relevant = is_table_content_relevant(doc, question)
        else:
            is_relevant = is_table_context_relevant(doc, question)
        if is_relevant:
            relevant_ids.add(node_id)

    # Stable partition: relevant nodes first, everything else after.
    relevant = [(node_id, score, doc) for node_id, score, doc in nodes if node_id in relevant_ids]
    remaining = [(node_id, score, doc) for node_id, score, doc in nodes if node_id not in relevant_ids]
    return relevant + remaining

In [None]:
from chromadb.api.models.Collection import Collection
def evaluate_benchmark(
    benchmark: list[dict[str,str]],
    benchmark_type: str,
    k: int,
    embedding_model: SentenceTransformer,
    collection: Collection,
    retriever,
    stemmer,
    use_reranker=False,
    use_rephrased_questions=False
):
    """Print the hit rate at ``k`` of hybrid retrieval over a benchmark.

    A question counts as a hit when any of its top ``k`` retrieved nodes
    belongs to one of the question's ``"answer_tables"``.

    Args:
        benchmark: Question records; each needs ``"answer_tables"`` and the
            question field selected below.
        benchmark_type: ``"content"`` reads ``"question_from_sql_1"`` (or
            ``"question"`` when rephrased); any other value reads
            ``"question_bx1"`` (or ``"question_bx2"`` when rephrased).
        k: Cut-off used for the hit test.
        embedding_model: Encodes the questions for the vector query.
        collection: Vector index to query.
        retriever: BM25 retriever to query.
        stemmer: Stemmer used to tokenize the questions.
        use_reranker: Retrieve ``5 * k`` candidates and rerank them before
            applying the cut-off.
        use_rephrased_questions: Evaluate the rephrased question variant.

    Returns:
        None. The running hit count (every 5 questions), the final count, the
        hit rate and the indices of missed questions are printed. An empty
        benchmark reports a hit rate of 0.0 instead of dividing by zero.
    """
    hitrate_sum = 0
    wrong_list = []

    # Over-fetch when reranking so the reranker has candidates to promote.
    increased_k = k * 5 if use_reranker else k

    if benchmark_type == "content":
        question_key = "question" if use_rephrased_questions else "question_from_sql_1"
    else:
        question_key = "question_bx2" if use_rephrased_questions else "question_bx1"

    # Encode all questions up front in batches rather than one per iteration.
    questions = [data[question_key] for data in benchmark]
    embed_questions = embedding_model.encode(questions, batch_size=64, show_progress_bar=True)
    embed_questions = [embed.tolist() for embed in embed_questions]

    for idx, datum in enumerate(tqdm(benchmark)):
        answer_tables = datum["answer_tables"]
        question_embedding = embed_questions[idx]

        query_tokens = bm25s.tokenize(questions[idx], stemmer=stemmer, show_progress=False)
        results, scores = retriever.retrieve(query_tokens, k=increased_k, show_progress=False)
        bm25_res = (results, scores)

        vec_res = collection.query(
            query_embeddings=[question_embedding],
            n_results=increased_k
        )

        all_nodes = hybrid_retriever(bm25_res, vec_res, increased_k, questions[idx], embedding_model, use_reranker)

        # Node ids look like "<table>_SEP_...": compare on the table part.
        is_hit = any(
            node_id.split("_SEP_")[0] in answer_tables
            for node_id, _, _ in all_nodes[:k]
        )
        if is_hit:
            hitrate_sum += 1
        else:
            wrong_list.append(idx)

        # Checkpoint
        if idx % 5 == 0:
            print(f"Current Hit Rate Sum at index {idx}: {hitrate_sum}")

    hit_rate = hitrate_sum / len(benchmark) if len(benchmark) > 0 else 0.0
    print(f"Final Hit Rate Sum: {hitrate_sum}")
    print(f"Hit Rate: {hit_rate}")
    print(f"Wrong List: {wrong_list}")

In [None]:
# Cut-offs k at which the hit rate is reported by the cells below.
ks = [1,5,10,30]

In [None]:
# Content benchmark, original questions, no reranking.
for k in ks:
    print(k)
    evaluate_benchmark(
        benchmark=content_benchmark,
        benchmark_type="content",
        k=k,
        embedding_model=embedding_model,
        collection=collection,
        retriever=retriever,
        stemmer=stemmer,
        use_reranker=False,
    )
    print("=" * 50)

In [None]:
# Content benchmark, rephrased questions, no reranking.
for k in ks:
    print(k)
    evaluate_benchmark(
        benchmark=content_benchmark,
        benchmark_type="content",
        k=k,
        embedding_model=embedding_model,
        collection=collection,
        retriever=retriever,
        stemmer=stemmer,
        use_reranker=False,
        use_rephrased_questions=True,
    )
    print("=" * 50)

In [None]:
# Context benchmark, original questions, no reranking.
for k in ks:
    print(k)
    evaluate_benchmark(
        benchmark=context_benchmark,
        benchmark_type="context",
        k=k,
        embedding_model=embedding_model,
        collection=collection,
        retriever=retriever,
        stemmer=stemmer,
        use_reranker=False,
    )
    print("=" * 50)

In [None]:
# Context benchmark, rephrased questions, no reranking.
for k in ks:
    print(k)
    evaluate_benchmark(
        benchmark=context_benchmark,
        benchmark_type="context",
        k=k,
        embedding_model=embedding_model,
        collection=collection,
        retriever=retriever,
        stemmer=stemmer,
        use_reranker=False,
        use_rephrased_questions=True,
    )
    print("=" * 50)

# Backup: disabled row-level variant of `is_table_content_relevant`

In [None]:
# import duckdb
# con = duckdb.connect()
# def is_table_content_relevant(
#     table: str,
#     question: str,
#     max_tokens: int,
#     embedding_model: SentenceTransformer,
#     num_of_rows=10,
# ) -> bool:
#     def find_largest_smaller_or_equal(tokens_list: list[int], max_tokens: int, aggregate_substractor: int):
#         for idx in range(len(tokens_list) - 1, -1, -1):
#             if (tokens_list[idx] - aggregate_substractor) <= max_tokens:
#                 return idx
#         return -1

#     def get_processed_df(path: str, table: str) -> pd.DataFrame:
#         df = con.sql(f"from '{path}/{table}.csv'").to_df().drop_duplicates().reset_index(drop=True)
#         for col in df.columns:
#             if pd.api.types.is_datetime64_any_dtype(df[col]):
#                 df[col] = pd.to_datetime(df[col], errors='coerce')
#                 df[col] = df[col].apply(
#                     lambda x: x.strftime('%B ') + str(x.day).lstrip('0') + x.strftime(', %Y %H:%M:%S.%f')[:-3] if pd.notnull(x) else 'NaT'
#                 )
#         return df

#     def get_content_relevancy_prompt(table: str, question: str):
#         return f"""Given this table:
# */
# {table}
# */
# and this question:
# /*
# {question}
# */
# Is the table relevant to answer the question? Begin your answer with yes/no."""

#     df = get_processed_df(path, table.split("_SEP_")[0])
#     columns = "col: " + " | ".join(df.columns)
#     required_tokens = [len(pipe.tokenizer.tokenize(columns))] * min(num_of_rows, len(df))

#     def get_relevant_rows(df: pd.DataFrame, question: str, num_of_rows=10):
#         try:
#             client = chromadb.PersistentClient(f"experiment-{dataset}")
#             contents: list[dict[str,str]] = []
#             for row_idx, row in df.iterrows():
#                 contents.append({
#                     "table": table,
#                     "summary": f"row {row_idx+1}: {" | ".join(row.astype(str))}"
#                 })

#             # Step 1: Indexing
#             collection = indexing_vector(client, embedding_model, contents, collection_name="temporary")
#             retriever = indexing_keyword(stemmer, contents)

#             # Step 2: Querying
#             num_of_rows = min(num_of_rows, len(df))

#             query_tokens = bm25s.tokenize(question, stemmer=stemmer, show_progress=False)
#             results, scores = retriever.retrieve(query_tokens, k=num_of_rows, show_progress=False)
#             bm25_res = (results, scores)

#             vec_res = collection.query(
#                 query_embeddings=[embedding_model.encode(question).tolist()],
#                 n_results=num_of_rows
#             )

#             all_nodes = hybrid_retriever(
#                 bm25_res,
#                 vec_res,
#                 num_of_rows,
#                 question,
#                 embedding_model
#             )
#             rows = []
#             for node_idx, node in enumerate(all_nodes):
#                 rows.append(node[2])
#                 required_tokens[node_idx] += len(pipe.tokenizer.tokenize(node[2]))
#                 if node_idx > 0:
#                     required_tokens[node_idx] += required_tokens[node_idx-1]
#             return rows
#         except:
#             return []

#     rows = get_relevant_rows(df, question)

#     last_unprocessed_idx = 0
#     conversations: list[list[dict[str, str]]] = []
#     aggregate_substractor = 0

#     while last_unprocessed_idx < len(required_tokens):
#         to_process_idx = find_largest_smaller_or_equal(
#             required_tokens[last_unprocessed_idx:],
#             max_tokens,
#             aggregate_substractor
#         )
#         if to_process_idx == -1:
#             return False

#         to_process_idx += last_unprocessed_idx
#         prompt = get_content_relevancy_prompt(
#             columns + "\n" + "\n".join(rows[last_unprocessed_idx:to_process_idx+1]),
#             question
#         )
#         conversations.append([{"role": "user", "content": prompt}])

#         last_unprocessed_idx = to_process_idx + 1
#         aggregate_substractor += (required_tokens[to_process_idx] - len(pipe.tokenizer.tokenize(columns)))

#     for i in range(0, len(conversations), 2):
#         outputs = prompt_pipeline(
#             pipe, conversations[i:i+2], batch_size=2, context_length=8192, max_new_tokens=3, top_p=None, temperature=None
#         )
#         for output in outputs:
#             answer: str = output[-1]["content"]
#             if answer.lower().startswith("yes"):
#                 return True
#     return False