# 1. Preparation

In [None]:
import sys
import bm25s
import Stemmer

sys.path.append("../..")

from tqdm import tqdm
from benchmark_generator.context.utils.jsonl import read_jsonl

In [11]:
# Name of the dataset to benchmark; it selects every input file loaded below.
dataset = "chicago"

# Each supported dataset mapped to the basename shared by its tables directory
# and its annotated content-benchmark file (the only part of the paths that
# does not follow directly from the dataset name).
dataset_table_dirs = {
    "chicago": "pneuma_chicago_10K",
    "public": "pneuma_public_bi",
    "chembl": "pneuma_chembl_10K",
    "adventure": "pneuma_adventure_works",
    "fetaqa": "pneuma_fetaqa",
}
if dataset not in dataset_table_dirs:
    # Fail here with a clear message rather than with a NameError further down.
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of {sorted(dataset_table_dirs)}"
    )

summaries_dir = "../../pneuma_summarizer/summaries"
benchmarks_dir = "../../data_src/benchmarks"
table_dir_name = dataset_table_dirs[dataset]

# Every dataset, including "fetaqa", reads its own narrations file.
contents = read_jsonl(f"{summaries_dir}/narrations/{dataset}_narrations.jsonl")
dbreader = read_jsonl(f"{summaries_dir}/dbreader/{dataset}_dbreader.jsonl")
contexts = read_jsonl(f"{benchmarks_dir}/context/{dataset}/contexts_{dataset}.jsonl")
content_benchmark = read_jsonl(
    f"{benchmarks_dir}/content/{table_dir_name}_questions_annotated.jsonl"
)
context_benchmark = read_jsonl(f"{benchmarks_dir}/context/{dataset}/bx_{dataset}.jsonl")
path = f"../../data_src/tables/{table_dir_name}"

# 2. Indexing

In [12]:
def indexing_keyword(
    stemmer,
    contents: list[dict[str, str]],
    contexts: list[dict[str, str]] = None,
):
    """Build a BM25 retriever over per-table summaries and, optionally, contexts.

    Tables are processed in sorted order. For each table, one document is
    added per summary (tagged ``{table}_SEP_contents_{i}``), followed by one
    document per context of that table (tagged ``{table}_SEP_{i}``). The tag
    is stored under ``metadata["table"]`` so the table name can be recovered
    by splitting on ``"_SEP_"``.

    Args:
        stemmer: Stemmer forwarded to ``bm25s.tokenize``.
        contents: Records providing the ``"table"`` and ``"summary"`` keys.
        contexts: Optional records providing the ``"table"`` and ``"context"``
            keys. Contexts of tables that do not appear in ``contents`` are
            not indexed.

    Returns:
        A ``bm25s.BM25`` retriever indexed over the generated documents.
    """
    # Group the texts by table in one pass instead of rescanning the complete
    # lists once per table.
    summaries_by_table = {}
    for content in contents:
        summaries_by_table.setdefault(content["table"], []).append(content["summary"])

    contexts_by_table = {}
    if contexts is not None:
        for context in contexts:
            table = context["table"]
            # Only tables that have at least one summary are indexed.
            if table in summaries_by_table:
                contexts_by_table.setdefault(table, []).append(context["context"])

    corpus_json = []
    for table in sorted(summaries_by_table):
        for content_idx, summary in enumerate(summaries_by_table[table]):
            corpus_json.append(
                {"text": summary, "metadata": {"table": f"{table}_SEP_contents_{content_idx}"}}
            )
        for context_idx, context_text in enumerate(contexts_by_table.get(table, [])):
            corpus_json.append(
                {"text": context_text, "metadata": {"table": f"{table}_SEP_{context_idx}"}}
            )

    corpus_text = [doc["text"] for doc in corpus_json]
    corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en", stemmer=stemmer, show_progress=False)

    # The full documents are handed to the retriever as its corpus; callers
    # read ``metadata`` from the retrieved results.
    retriever = bm25s.BM25(corpus=corpus_json)
    retriever.index(corpus_tokens, show_progress=False)
    return retriever

In [13]:
# English stemmer shared by corpus indexing and query tokenization.
stemmer = Stemmer.Stemmer("english")
# Index the DBReader summaries together with the table contexts.
retriever = indexing_keyword(stemmer=stemmer, contents=dbreader, contexts=contexts)

# 3. Benchmarking

In [14]:
def evaluate_benchmark(
    benchmark: list[dict[str,str]],
    benchmark_type: str,
    k: int,
    retriever,
    stemmer,
    use_rephrased_questions=False
):
    """Print and return the hit rate at ``k`` of ``retriever`` on ``benchmark``.

    A question counts as a hit when at least one of its top-``k`` retrieved
    documents belongs to a table listed in the question's ``"answer_tables"``.
    The table name is the part of ``metadata["table"]`` before ``"_SEP_"``.

    Args:
        benchmark: Records providing ``"answer_tables"`` and the question key
            selected by ``benchmark_type`` and ``use_rephrased_questions``.
        benchmark_type: ``"content"`` reads ``"question_from_sql_1"`` (or
            ``"question"`` when rephrased); any other value reads
            ``"question_bx1"`` (or ``"question_bx2"`` when rephrased).
        k: Number of documents retrieved per question.
        retriever: Object whose ``retrieve(tokens, k=..., show_progress=...)``
            returns ``(results, scores)``.
        stemmer: Stemmer forwarded to ``bm25s.tokenize`` for the questions.
        use_rephrased_questions: Select the rephrased question variant.

    Returns:
        The hit rate as an unrounded percentage in ``[0, 100]``; ``0.0`` for
        an empty benchmark. The printed value is rounded to one decimal.
    """
    if benchmark_type == "content":
        question_key = "question" if use_rephrased_questions else "question_from_sql_1"
    else:
        question_key = "question_bx2" if use_rephrased_questions else "question_bx1"

    # An empty benchmark has no questions to hit; avoid dividing by zero.
    if not benchmark:
        print("Hit Rate: 0.0")
        return 0.0

    hit_count = 0
    for datum in tqdm(benchmark):
        answer_tables = datum["answer_tables"]

        query_tokens = bm25s.tokenize(datum[question_key], stemmer=stemmer, show_progress=False)
        results, _ = retriever.retrieve(query_tokens, k=k, show_progress=False)
        # results[0] holds the documents retrieved for the single query.
        if any(
            result["metadata"]["table"].split("_SEP_")[0] in answer_tables
            for result in results[0]
        ):
            hit_count += 1

    hit_rate = hit_count / len(benchmark) * 100
    print(f"Hit Rate: {round(hit_rate, 1)}")
    return hit_rate

In [15]:
# Values of k (number of documents retrieved per question) evaluated by the
# benchmark cells below.
ks = [1]

In [None]:
# BC1: content benchmark using the "question_from_sql_1" questions.
for k in ks:
    print(f"k={k}")
    evaluate_benchmark(
        benchmark=content_benchmark,
        benchmark_type="content",
        k=k,
        retriever=retriever,
        stemmer=stemmer,
    )
    print("=" * 50)

In [None]:
# BC2: content benchmark using the rephrased ("question") questions.
for k in ks:
    print(f"k={k}")
    evaluate_benchmark(
        benchmark=content_benchmark,
        benchmark_type="content",
        k=k,
        retriever=retriever,
        stemmer=stemmer,
        use_rephrased_questions=True,
    )
    print("=" * 50)

In [None]:
# BX1: context benchmark using the "question_bx1" questions.
for k in ks:
    print(f"k={k}")
    evaluate_benchmark(
        benchmark=context_benchmark,
        benchmark_type="context",
        k=k,
        retriever=retriever,
        stemmer=stemmer,
    )
    print("=" * 50)

In [None]:
# BX2: context benchmark using the rephrased ("question_bx2") questions.
for k in ks:
    print(f"k={k}")
    evaluate_benchmark(
        benchmark=context_benchmark,
        benchmark_type="context",
        k=k,
        retriever=retriever,
        stemmer=stemmer,
        use_rephrased_questions=True,
    )
    print("=" * 50)