# 1. Preparation

In [1]:
import os
import sys
import json
import setproctitle
import chromadb
import pandas as pd
import bm25s
import Stemmer
import torch

sys.path.append("..")

from tqdm import tqdm
from transformers import set_seed
from sentence_transformers import SentenceTransformer
from benchmark_generator.context.utils.pipeline_initializer import initialize_pipeline
from benchmark_generator.context.utils.prompting_interface import prompt_pipeline

# Reproducibility setup: the cuBLAS workspace variable is exported before the
# seed is fixed with deterministic=True.
# NOTE(review): presumably this is the workspace setting PyTorch asks for when
# deterministic algorithms are enabled on CUDA - confirm against the PyTorch
# reproducibility notes.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
set_seed(42, deterministic=True)
# Restrict this process to GPU 0. Comment the next line out to keep the
# default device visibility, or change the index to select another GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Set the process title to "python".
setproctitle.setproctitle("python")

In [None]:
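# NOTE: `pipe` is read by the re-ranking helpers (`is_table_content_relevant` and
# `is_table_context_relevant`). Uncomment the three lines below before running any
# evaluation with `use_reranker=True`; otherwise those helpers raise NameError.
# pipe = initialize_pipeline("meta-llama/Meta-Llama-3-8B-Instruct", torch.bfloat16)
# pipe.tokenizer.pad_token_id = pipe.model.config.eos_token_id
# pipe.tokenizer.padding_side = 'left'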

In [2]:
def read_jsonl(file_path: str):
    """Read a JSON Lines file into a list of decoded records.

    Each non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing newline at the end of the
    file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        The decoded records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data: list[dict[str, str]] = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Tolerate empty lines rather than feeding "" to json.loads.
                continue
            data.append(json.loads(stripped))
    return data

In [3]:
# Name of the dataset to benchmark; one of the keys of _DATASET_TABLE_DIRS.
dataset = "chembl"

# The summaries and context files are named after the dataset itself, while
# the content benchmark and the table directory share a dataset-specific stem.
_DATASET_TABLE_DIRS = {
    "chicago": "pneuma_chicago_10K",
    "public": "pneuma_public_bi",
    "chembl": "pneuma_chembl_10K",
}
if dataset not in _DATASET_TABLE_DIRS:
    # Fail here instead of with a NameError in a later cell.
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of {sorted(_DATASET_TABLE_DIRS)}"
    )

std_contents = read_jsonl(f"../pneuma_summarizer/summaries/standard/{dataset}_standard.jsonl")
narration_contents = read_jsonl(f"../pneuma_summarizer/summaries/narrations/{dataset}_narrations.jsonl")
contexts = read_jsonl(f"../data_src/benchmarks/context/{dataset}/contexts_{dataset}.jsonl")

content_benchmark = read_jsonl(
    f"../data_src/benchmarks/content/{_DATASET_TABLE_DIRS[dataset]}_questions_annotated.jsonl"
)
context_benchmark = read_jsonl(f"../data_src/benchmarks/context/{dataset}/bx_{dataset}.jsonl")
# Directory holding one CSV per table (read by the content re-ranker).
path = f"../data_src/tables/{_DATASET_TABLE_DIRS[dataset]}"

In [4]:
# Merge the narration summaries of each table into a single " | "-separated
# summary. Summaries are bucketed per table in one pass; each bucket keeps the
# order in which the summaries appear in `narration_contents`.
summaries_by_table: dict[str, list[str]] = {}
for entry in narration_contents:
    summaries_by_table.setdefault(entry["table"], []).append(entry["summary"])

tables = sorted(summaries_by_table)
combined_narration_contents: list[dict[str,str]] = [
    {"table": name, "summary": " | ".join(summaries_by_table[name])}
    for name in tables
]

# 2. Indexing

In [5]:
# Lookup table from a context's index ID ("<table>_SEP_<n>") to its text;
# filled in by `indexing_keyword` and read back by `rerank`.
ids_contexts: dict[str, str] = {}

In [9]:
def indexing_vector(
    client,
    embedding_model,
    std_contents: list[dict[str, str]],
    contexts: list[dict[str, str]] = None,
    collection_name = "benchmark",
    *,
    chunk_size: int = 30000,
    encode_batch_size: int = 128,
    device: str = "cuda",
):
    """(Re)build the vector collection holding content summaries and contexts.

    Any existing collection called ``collection_name`` is dropped and a new
    cosine-space collection is created. For every table (in sorted order) its
    content summaries are added first, then its contexts. Only embeddings,
    metadata and IDs are stored; the document text itself is not passed to
    the collection.

    ID / metadata scheme (the ``table`` metadata value equals the ID):
        * content summaries: ``"<table>_SEP_contents_<idx>"``
        * contexts:          ``"<table>_SEP_<idx>"``

    Args:
        client: Chroma client used to delete/create the collection.
        embedding_model: Object exposing ``encode(texts, batch_size=...,
            show_progress_bar=..., device=...)``.
        std_contents: Records with ``"table"`` and ``"summary"`` keys.
        contexts: Optional records with ``"table"`` and ``"context"`` keys.
            Contexts of tables absent from ``std_contents`` are not indexed.
        collection_name: Name of the collection to (re)create.
        chunk_size: Number of documents encoded and added per ``add`` call.
        encode_batch_size: Batch size forwarded to ``embedding_model.encode``.
        device: Device forwarded to ``embedding_model.encode``.

    Returns:
        The newly created collection.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    documents = []
    metadatas = []
    ids = []

    # Best-effort removal of a stale collection: any failure here (presumably
    # the collection not existing yet) is reported and indexing continues.
    try:
        client.get_collection(collection_name)
        client.delete_collection(collection_name)
    except Exception as err:
        print(err)

    collection = client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine", "hnsw:M": 1024}
    )

    # Bucket the records per table once instead of rescanning every record
    # for every table.
    summaries_by_table: dict[str, list[str]] = {}
    for content in std_contents:
        summaries_by_table.setdefault(content["table"], []).append(content["summary"])

    contexts_by_table: dict[str, list[dict[str, str]]] = {}
    if contexts is not None:
        for context in contexts:
            contexts_by_table.setdefault(context["table"], []).append(context)

    for table in tqdm(sorted(summaries_by_table), "Adding contents"):
        cols = summaries_by_table[table]
        for content_idx, content in enumerate(tqdm(cols, f"Adding contents of {table} table", leave=False)):
            doc_id = f"{table}_SEP_contents_{content_idx}"
            documents.append(content)
            metadatas.append({"table": doc_id})
            ids.append(doc_id)

        if contexts is not None:
            filtered_contexts = [entry["context"] for entry in contexts_by_table.get(table, [])]
            for context_idx, context in enumerate(tqdm(filtered_contexts, f"Adding contexts of {table} table", leave=False)):
                doc_id = f"{table}_SEP_{context_idx}"
                documents.append(context)
                metadatas.append({"table": doc_id})
                ids.append(doc_id)

    # Encode and insert in chunks so a single `add` call never receives more
    # than `chunk_size` records.
    for start in range(0, len(documents), chunk_size):
        stop = start + chunk_size
        embeddings = embedding_model.encode(
            documents[start:stop],
            batch_size=encode_batch_size,
            show_progress_bar=True,
            device=device
        )

        collection.add(
            embeddings=[embed.tolist() for embed in embeddings],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop]
        )
    return collection

def indexing_keyword(
    stemmer,
    narration_contents: list[dict[str, str]],
    contexts: list[dict[str, str]] = None,
):
    """Build the BM25 keyword index over narration summaries and contexts.

    For every table (in sorted order) its narration summaries are added
    first, then its contexts. Each corpus entry carries a ``table`` metadata
    value that doubles as its ID:
        * narration summaries: ``"<table>_SEP_contents_<idx>"``
        * contexts:            ``"<table>_SEP_<idx>"``

    Side effect: every indexed context is also recorded in the module-level
    ``ids_contexts`` mapping (ID -> context text).

    Args:
        stemmer: Stemmer forwarded to ``bm25s.tokenize``.
        narration_contents: Records with ``"table"`` and ``"summary"`` keys.
        contexts: Optional records with ``"table"`` and ``"context"`` keys.
            Contexts of tables absent from ``narration_contents`` are not
            indexed.

    Returns:
        The indexed ``bm25s.BM25`` retriever.
    """
    # Bucket the records per table once instead of rescanning every record
    # for every table.
    summaries_by_table: dict[str, list[str]] = {}
    for content in narration_contents:
        summaries_by_table.setdefault(content["table"], []).append(content["summary"])

    contexts_by_table: dict[str, list[dict[str, str]]] = {}
    if contexts is not None:
        for context in contexts:
            contexts_by_table.setdefault(context["table"], []).append(context)

    corpus_json = []
    for table in tqdm(sorted(summaries_by_table), "Adding contents"):
        cols_descriptions = summaries_by_table[table]
        for content_idx, content in enumerate(tqdm(cols_descriptions, f"Adding contents of {table} table", leave=False)):
            corpus_json.append({"text": content, "metadata": {"table": f"{table}_SEP_contents_{content_idx}"}})

        if contexts is not None:
            filtered_contexts = [entry["context"] for entry in contexts_by_table.get(table, [])]
            for context_idx, context in enumerate(tqdm(filtered_contexts, f"Adding contexts of {table} table", leave=False)):
                context_id = f"{table}_SEP_{context_idx}"
                corpus_json.append({"text": context, "metadata": {"table": context_id}})
                # Remember the text so an ID can be mapped back to its context.
                ids_contexts[context_id] = context

    corpus_text = [doc["text"] for doc in corpus_json]
    corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en", stemmer=stemmer, show_progress=False)

    retriever = bm25s.BM25(corpus=corpus_json)
    retriever.index(corpus_tokens, show_progress=False)
    return retriever

In [7]:
# Embedding model used both to index documents and to encode questions.
# NOTE(review): trust_remote_code=True presumably lets the checkpoint's own
# modelling code run - confirm this is acceptable for the environment.
model = SentenceTransformer('dunzhang/stella_en_1.5B_v5', trust_remote_code=True)
# English stemmer shared by BM25 indexing and query tokenisation.
stemmer = Stemmer.Stemmer("english")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Persistent Chroma store, one per dataset ("experiment-<dataset>").
client = chromadb.PersistentClient(f"experiment-{dataset}")
# The vector index is NOT rebuilt here: the next line is commented out and the
# existing "benchmark" collection is loaded from the persistent store instead.
# Uncomment it (and drop the `get_collection` call) to re-embed from scratch.
# collection = indexing_vector(client, model, std_contents, contexts)
collection = client.get_collection("benchmark")
# The BM25 index is rebuilt on every run; this call also fills `ids_contexts`.
retriever = indexing_keyword(stemmer, combined_narration_contents, contexts)

Adding contents: 100%|██████████| 78/78 [00:00<00:00, 247.96it/s]
DEBUG:bm25s:Building index from IDs objects


# 3. Benchmarking

In [11]:
def process_nodes_bm25(results, scores):
    """Min-max normalise BM25 scores and key them by node ID.

    Only the first query's hits are used (``results[0]`` / ``scores[0]``).

    Args:
        results: Per-query lists of hits; each hit is a mapping with a
            ``["metadata"]["table"]`` entry holding the node ID.
        scores: Per-query sequences of raw scores, aligned with ``results``.

    Returns:
        A dict mapping node ID to its score rescaled to ``[0, 1]``. When all
        raw scores are equal every node gets ``1.0``. An empty hit list
        yields an empty dict instead of raising ``ValueError`` from
        ``max()``.
    """
    hits = results[0]
    raw_scores = scores[0]
    # len() rather than truthiness: raw_scores may be a NumPy array.
    if len(raw_scores) == 0:
        return {}

    max_score = max(raw_scores)
    min_score = min(raw_scores)
    all_equal = min_score == max_score

    processed_nodes: dict[str, float] = {}
    for i, node in enumerate(hits):
        if all_equal:
            # Avoid a 0/0 division; a lone or tied score counts as the best.
            score = 1.0
        else:
            score = (raw_scores[i] - min_score) / (max_score - min_score)
        processed_nodes[node["metadata"]["table"]] = score
    return processed_nodes

In [12]:
def process_nodes_vec(items):
    """Turn vector-search distances into min-max normalised similarity scores.

    Only the first query's hits are used (``items["distances"][0]`` and
    ``items["metadatas"][0]``). Each distance ``d`` is converted to the
    similarity ``1 - d`` before normalisation.

    Args:
        items: Query result mapping with ``"distances"`` and ``"metadatas"``
            entries; every metadata record has a ``"table"`` key holding the
            node ID.

    Returns:
        A dict mapping node ID to its similarity rescaled to ``[0, 1]``. When
        all similarities are equal every node gets ``1.0``. An empty hit list
        yields an empty dict instead of raising ``ValueError`` from
        ``max()``.
    """
    scores: list[float] = [1 - dist for dist in items["distances"][0]]
    if not scores:
        return {}

    max_score = max(scores)
    min_score = min(scores)
    all_equal = min_score == max_score

    processed_nodes: dict[str, float] = {}
    for idx, metadata in enumerate(items['metadatas'][0]):
        if all_equal:
            # Avoid a 0/0 division; a lone or tied score counts as the best.
            score = 1.0
        else:
            score = (scores[idx] - min_score) / (max_score - min_score)
        processed_nodes[metadata["table"]] = score
    return processed_nodes

In [13]:
def hybrid_retriever(bm25_res, bm25_sc, vec_res, k: int, question: str, use_reranker=False):
    """Fuse keyword and vector hits into one ranked list of ``(node_id, score)``.

    Both result sets are normalised, then every node seen by either retriever
    receives ``0.5 * bm25 + 0.5 * vector`` (a retriever that did not return
    the node contributes 0.0). Nodes are ordered by descending fused score,
    ties broken by node ID, and cut to the top ``k``. With
    ``use_reranker=True`` that top-``k`` list is passed through ``rerank``.
    """
    keyword_scores = process_nodes_bm25(bm25_res, bm25_sc)
    vector_scores: dict = process_nodes_vec(vec_res)

    fused = [
        (
            node_id,
            0.5 * keyword_scores.get(node_id, 0.0) + 0.5 * vector_scores.get(node_id, 0.0),
        )
        for node_id in keyword_scores.keys() | vector_scores.keys()
    ]
    # Node IDs are unique, so (-score, id) is a total order and the outcome
    # does not depend on the set's iteration order.
    fused.sort(key=lambda pair: (-pair[1], pair[0]))
    top_nodes = fused[:k]

    if not use_reranker:
        return top_nodes
    return rerank(top_nodes, question)

In [14]:
import duckdb
con = duckdb.connect()
def is_table_content_relevant(
    path: str,
    table: str,
    max_tokens: int,
    question: str
) -> bool:
    """Ask the LLM whether a table's rows are relevant to a question.

    The table's CSV is loaded and de-duplicated, serialised as one
    ``col: ...`` header plus ``row <n>: ...`` lines, and split into
    consecutive row chunks whose estimated token count fits ``max_tokens``.
    Each chunk is sent to the model in its own prompt (four prompts per
    call); the table counts as relevant as soon as one answer starts with
    "yes".

    Relies on the notebook-level ``pipe``, ``prompt_pipeline`` and ``con``.

    Args:
        path: Directory containing ``<table>.csv``.
        table: Node ID; everything before the first ``"_SEP_"`` is the table
            name.
        max_tokens: Token budget for the table part of a single prompt.
        question: Question to judge relevance against.

    Returns:
        True if any chunk is judged relevant. False otherwise, including
        when the table is empty or when some remaining row does not fit
        ``max_tokens`` by itself (no prompt is sent in that case).
    """
    def find_largest_smaller_or_equal(tokens_list: list[int], max_tokens: int, consumed: int):
        # `tokens_list` holds cumulative totals, so scanning from the end
        # returns the largest prefix that still fits after subtracting the
        # tokens consumed by earlier chunks. -1 means not even one row fits.
        for idx in range(len(tokens_list) - 1, -1, -1):
            if (tokens_list[idx] - consumed) <= max_tokens:
                return idx
        return -1

    def get_processed_df(path: str, table: str) -> pd.DataFrame:
        # Load the CSV through DuckDB, drop duplicate rows and render
        # datetime columns as text such as "January 5, 2020 13:45:10.123".
        df = con.sql(f"from '{path}/{table}.csv'").to_df().drop_duplicates().reset_index(drop=True)
        for col in df.columns:
            if pd.api.types.is_datetime64_any_dtype(df[col]):
                df[col] = pd.to_datetime(df[col], errors='coerce')
                df[col] = df[col].apply(
                    lambda x: x.strftime('%B ') + str(x.day).lstrip('0') + x.strftime(', %Y %H:%M:%S.%f')[:-3] if pd.notnull(x) else 'NaT'
                )
        return df

    def get_content_relevancy_prompt(table: str, question: str):
        # NOTE(review): the table section opens with "*/" while the question
        # section opens with "/*"; this looks like a typo but is kept as-is
        # because changing the prompt would change model behaviour.
        return f"""Given this table:
*/
{table}
*/
and this question:
/*
{question}
*/
Is the table relevant to answer the question? Begin your answer with yes/no."""

    df = get_processed_df(path, table.split("_SEP_")[0])
    columns = "col: " + " | ".join(df.columns)
    # Tokenise the header once; it is reused for every row and every chunk.
    header_tokens = len(pipe.tokenizer.tokenize(columns))
    rows = [""] * len(df)

    # required_tokens[i] is the running estimate for rows 0..i.
    # NOTE(review): the header cost is added once per row, so the estimate is
    # conservative (chunks may be smaller than strictly necessary).
    required_tokens = [header_tokens] * len(df)

    for row_idx, row in df.iterrows():
        rows[row_idx] = f"row {row_idx+1}: " + " | ".join(row.astype(str))
        required_tokens[row_idx] += len(pipe.tokenizer.tokenize(rows[row_idx]))
        if row_idx > 0:
            required_tokens[row_idx] += required_tokens[row_idx-1]

    last_processed_idx = 0
    conversations: list[list[dict[str, str]]] = []
    # Debug trace ("MASUK WHILE" is Indonesian for "entering while").
    print("MASUK WHILE")
    consumed_tokens = 0
    while last_processed_idx < len(required_tokens):
        to_process_idx = find_largest_smaller_or_equal(
            required_tokens[last_processed_idx:], max_tokens, consumed_tokens
        )
        if to_process_idx == -1:
            return False

        to_process_idx += last_processed_idx
        prompt = get_content_relevancy_prompt(
            columns + "\n" + "\n".join(rows[last_processed_idx:to_process_idx+1]),
            question
        )
        conversations.append([{"role": "user", "content": prompt}])

        last_processed_idx = to_process_idx + 1
        # required_tokens is already cumulative, so the offset for the next
        # chunk is simply the total up to the last emitted row (keeping one
        # header's worth of budget). Accumulating with += would subtract the
        # earlier chunks repeatedly and let later chunks exceed max_tokens.
        consumed_tokens = required_tokens[to_process_idx] - header_tokens

    print(f"DEBUG: {len(conversations)}", flush=True)
    for i in range(0, len(conversations), 4):
        outputs = prompt_pipeline(
            pipe, conversations[i:i+4], context_length=8192, max_new_tokens=3, top_p=None, temperature=None
        )
        for output in outputs:
            answer: str = output[-1]["content"]
            if answer.lower().startswith("yes"):
                return True
    return False

In [15]:
def is_table_context_relevant(context: str, question: str):
    """Ask the LLM whether a table described by ``context`` can answer ``question``.

    A single yes/no prompt is sent through ``prompt_pipeline`` using the
    notebook-level ``pipe``. Returns True when the reply, lower-cased, starts
    with "yes", and False otherwise.
    """
    prompt = f"""Given this context describing a table:
*/
{context}
*/
and this question:
/*
{question}
*/
Is the table relevant to answer the question? Begin your answer with yes/no."""

    conversation = [{"role": "user", "content": prompt}]
    outputs = prompt_pipeline(
        pipe, [conversation], context_length=8192, max_new_tokens=3, top_p=None, temperature=None
    )
    # First (only) conversation, last message: the model's reply.
    reply: str = outputs[0][-1]["content"]
    return reply.lower().startswith("yes")

In [16]:
from collections import defaultdict
def rerank(nodes: list[tuple[str, str]], question: str):
    """Move the nodes the LLM judges relevant to the front of the list.

    Each node is a ``(node_id, score)`` pair. The part of the ID after the
    first ``"_SEP_"`` decides how the node is judged: IDs whose suffix starts
    with ``"contents"`` are checked against the table's rows, every other ID
    is checked against its context text from ``ids_contexts``. Relative
    order inside the relevant group and inside the non-relevant group is
    unchanged.
    """
    # Context length of a model but reduced to account for other things
    # (chat template, output token, etc.)
    max_tokens = 7000
    relevant_ids: set[str] = set()

    for node_id, _ in tqdm(nodes, "Processing node..."):
        print(f"Re-ranking node {node_id}", flush=True)
        suffix = node_id.split("_SEP_")[1]
        if suffix.startswith("contents"):
            judged_relevant = is_table_content_relevant(path, node_id, max_tokens, question)
        else:
            judged_relevant = is_table_context_relevant(ids_contexts[node_id], question)
        if judged_relevant:
            relevant_ids.add(node_id)

    promoted = [(name, score) for name, score in nodes if name in relevant_ids]
    demoted = [(name, score) for name, score in nodes if name not in relevant_ids]
    return promoted + demoted

In [17]:
def evaluate_content_benchmark(
    benchmark: list[dict[str,str]],
    benchmark_type: str,
    k: int,
    model,
    collection,
    retriever,
    stemmer,
    use_reranker=False,
    use_rephrased_questions=False
):
    """Compute the hit rate @ k of the hybrid retriever on a benchmark.

    A question is a hit when at least one of its top-``k`` retrieved nodes
    belongs to a table listed in the record's ``"answer_tables"``. The hit
    count, the hit rate and the indices of the missed questions are printed.

    Args:
        benchmark: Benchmark records; each holds the question text and an
            ``"answer_tables"`` collection.
        benchmark_type: ``"content"`` selects the keys
            ``question_from_sql_1`` / ``question``; any other value selects
            ``question_bx1`` / ``question_bx2``.
        k: Number of top nodes inspected per question.
        model: Embedding model exposing ``encode``.
        collection: Vector collection exposing ``query``.
        retriever: BM25 retriever exposing ``retrieve``.
        stemmer: Stemmer forwarded to ``bm25s.tokenize``.
        use_reranker: When True, ``2 * k`` candidates are retrieved and
            re-ranked before the top ``k`` are inspected.
        use_rephrased_questions: Selects the second key of each pair above.

    Returns:
        The number of hits (0 for an empty benchmark).
    """
    if not benchmark:
        # Nothing to evaluate; avoid dividing by zero below.
        print("Final Hit Rate Sum: 0")
        print("Hit Rate: 0.0")
        print("Wrong List: []")
        return 0

    hitrate_sum = 0
    wrong_list = []

    # Re-ranking can only reorder, so give it a larger candidate pool.
    increased_k = k * 2 if use_reranker else k

    if benchmark_type == "content":
        question_key = "question" if use_rephrased_questions else "question_from_sql_1"
    else:
        question_key = "question_bx2" if use_rephrased_questions else "question_bx1"

    questions = [data[question_key] for data in benchmark]
    # Encode every question up front in batches instead of one by one.
    embed_questions = model.encode(questions, batch_size=128, show_progress_bar=True)
    embed_questions = [embed.tolist() for embed in embed_questions]

    for idx, datum in enumerate(tqdm(benchmark)):
        answer_tables = datum["answer_tables"]
        question_embedding = embed_questions[idx]

        query_tokens = bm25s.tokenize(questions[idx], stemmer=stemmer, show_progress=False)
        results, scores = retriever.retrieve(query_tokens, k=increased_k, show_progress=False)

        vec_res = collection.query(
            query_embeddings=[question_embedding],
            n_results=increased_k
        )

        all_nodes = hybrid_retriever(results, scores, vec_res, increased_k, questions[idx], use_reranker)
        # A node ID is "<table>_SEP_<suffix>"; compare on the table name only.
        is_hit = any(
            node_id.split("_SEP_")[0] in answer_tables
            for node_id, _ in all_nodes[:k]
        )
        if is_hit:
            hitrate_sum += 1
        else:
            wrong_list.append(idx)

    print(f"Final Hit Rate Sum: {hitrate_sum}")
    print(f"Hit Rate: {hitrate_sum/len(benchmark)}")
    print(f"Wrong List: {wrong_list}")
    return hitrate_sum

In [18]:
# BC1: content benchmark with the original questions (key
# `question_from_sql_1`), hit rate over the top 10 nodes, no re-ranking.
evaluate_content_benchmark(
    content_benchmark, "content", 10, model, collection, retriever, stemmer
)

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:03<00:00, 284.93it/s]

Final Hit Rate Sum: 993
Hit Rate: 0.993
Wrong List: [36, 148, 332, 594, 615, 898, 899]





993

In [None]:
# BC2: content benchmark with the rephrased questions (key `question`),
# hit rate over the top 1 node, no re-ranking.
evaluate_content_benchmark(
    content_benchmark, "content", 1, model, collection, retriever, stemmer, use_rephrased_questions=True
)

In [None]:
# BX1: context benchmark with the original questions (key `question_bx1`),
# hit rate over the top 1 node, no re-ranking.
evaluate_content_benchmark(
    context_benchmark, "context", 1, model, collection, retriever, stemmer
)

In [None]:
# BX2: context benchmark with the rephrased questions (key `question_bx2`),
# hit rate over the top 1 node, no re-ranking.
evaluate_content_benchmark(
    context_benchmark, "context", 1, model, collection, retriever, stemmer, use_rephrased_questions=True
)