# 1. Preparation

In [1]:
import sys
import bm25s
import Stemmer

sys.path.append("..")

from tqdm import tqdm
from benchmark_generator.context.utils.jsonl import read_jsonl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the benchmark dataset to evaluate; must be a key of TABLE_DIR_NAMES.
dataset = "fetaqa"

# Table-directory name of every supported dataset. The content benchmark file
# reuses the same stem with a "_questions_annotated.jsonl" suffix.
TABLE_DIR_NAMES = {
    "chicago": "pneuma_chicago_10K",
    "public": "pneuma_public_bi",
    "chembl": "pneuma_chembl_10K",
    "adventure": "pneuma_adventure_works",
    "fetaqa": "pneuma_fetaqa",
}

# Fail fast on a typo: an unknown name used to fall through every branch and
# leave the variables below undefined (or stale from an earlier run).
if dataset not in TABLE_DIR_NAMES:
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of {sorted(TABLE_DIR_NAMES)}"
    )
table_dir_name = TABLE_DIR_NAMES[dataset]

# Per-table summaries (standard and narration variants) and per-table contexts.
std_contents = read_jsonl(f"../pneuma_summarizer/summaries/standard/{dataset}_standard.jsonl")
narration_contents = read_jsonl(f"../pneuma_summarizer/summaries/narrations/{dataset}_narrations.jsonl")
contexts = read_jsonl(f"../data_src/benchmarks/context/{dataset}/contexts_{dataset}.jsonl")

# Question sets used for evaluation.
content_benchmark = read_jsonl(f"../data_src/benchmarks/content/{table_dir_name}_questions_annotated.jsonl")
context_benchmark = read_jsonl(f"../data_src/benchmarks/context/{dataset}/bx_{dataset}.jsonl")

# Directory holding the dataset's tables.
path = f"../data_src/tables/{table_dir_name}"

# 2. Indexing

In [3]:
def indexing_keyword(
    stemmer,
    narration_contents: list[dict[str, str]],
    contexts: list[dict[str, str]] = None,
):
    """Build a BM25 keyword index over narration summaries and, optionally, contexts.

    Documents are emitted table by table in sorted table order. For each table
    its summaries come first (labelled ``<table>_SEP_contents_<i>``), followed
    by its contexts (labelled ``<table>_SEP_<i>``); within each group the input
    order is preserved. Contexts whose table has no narration are not indexed.

    Args:
        stemmer: Stemmer forwarded to ``bm25s.tokenize``.
        narration_contents: Records with at least the keys ``"table"`` and
            ``"summary"``.
        contexts: Optional records with at least the keys ``"table"`` and
            ``"context"``. ``None`` indexes the summaries only.

    Returns:
        A ``bm25s.BM25`` retriever indexed on the tokenized corpus, with the
        corpus documents (``{"text", "metadata": {"table"}}``) attached.
    """
    # Group each input in a single pass instead of rescanning the whole list
    # once per table.
    summaries_by_table = {}
    for content in narration_contents:
        summaries_by_table.setdefault(content["table"], []).append(content["summary"])

    contexts_by_table = {}
    if contexts is not None:
        for context in contexts:
            table = context["table"]
            # Only tables that have at least one narration are part of the index.
            if table in summaries_by_table:
                contexts_by_table.setdefault(table, []).append(context["context"])

    corpus_json = []
    for table in sorted(summaries_by_table):
        for content_idx, summary in enumerate(summaries_by_table[table]):
            corpus_json.append({"text": summary, "metadata": {"table": f"{table}_SEP_contents_{content_idx}"}})
        for context_idx, context_text in enumerate(contexts_by_table.get(table, [])):
            corpus_json.append({"text": context_text, "metadata": {"table": f"{table}_SEP_{context_idx}"}})

    corpus_text = [doc["text"] for doc in corpus_json]
    corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en", stemmer=stemmer, show_progress=False)

    retriever = bm25s.BM25(corpus=corpus_json)
    retriever.index(corpus_tokens, show_progress=False)
    return retriever

In [4]:
# English stemmer; the same instance is passed to indexing and to the
# benchmark runs below.
stemmer = Stemmer.Stemmer("english")

# Index narration summaries together with the table contexts.
retriever = indexing_keyword(
    stemmer=stemmer,
    narration_contents=narration_contents,
    contexts=contexts,
)

# 3. Benchmarking

In [None]:
def evaluate_benchmark(
    benchmark: list[dict[str,str]],
    benchmark_type: str,
    k: int,
    retriever,
    stemmer,
    use_rephrased_questions=False
):
    """Compute, print and return the hit rate at ``k`` of ``retriever`` on ``benchmark``.

    A question counts as a hit when at least one of its top-``k`` retrieved
    documents belongs to a table listed in the question's ``"answer_tables"``.
    The table name is the part of the document label before ``"_SEP_"``.

    Args:
        benchmark: Question records. Each needs ``"answer_tables"`` and the
            question key selected below.
        benchmark_type: ``"content"`` reads ``"question_from_sql_1"`` (or
            ``"question"`` when rephrased); any other value reads
            ``"question_bx1"`` (or ``"question_bx2"`` when rephrased).
        k: Number of documents to retrieve per question.
        retriever: Object exposing ``retrieve(query_tokens, k=..., show_progress=...)``
            that returns ``(results, scores)``, where ``results[0]`` holds the
            retrieved documents.
        stemmer: Stemmer forwarded to ``bm25s.tokenize`` for the queries.
        use_rephrased_questions: Select the rephrased question variant.

    Returns:
        The hit rate as a float in ``[0, 1]``; ``0.0`` for an empty benchmark.
    """
    if benchmark_type == "content":
        question_key = "question" if use_rephrased_questions else "question_from_sql_1"
    else:
        question_key = "question_bx2" if use_rephrased_questions else "question_bx1"

    hit_count = 0
    for datum in tqdm(benchmark):
        answer_tables = datum["answer_tables"]

        query_tokens = bm25s.tokenize(datum[question_key], stemmer=stemmer, show_progress=False)
        results, _scores = retriever.retrieve(query_tokens, k=k, show_progress=False)

        # Several documents can map to the same table; any match is a single hit.
        if any(
            result["metadata"]["table"].split("_SEP_")[0] in answer_tables
            for result in results[0]
        ):
            hit_count += 1

    # An empty benchmark used to raise ZeroDivisionError; report 0.0 instead.
    total = len(benchmark)
    hit_rate = hit_count / total if total else 0.0
    print(f"Hit Rate: {hit_rate}")
    return hit_rate

In [None]:
# BC1: content benchmark, default (non-rephrased) question key, top-1 retrieval.
evaluate_benchmark(
    benchmark=content_benchmark,
    benchmark_type="content",
    k=1,
    retriever=retriever,
    stemmer=stemmer,
)

100%|██████████| 1000/1000 [00:00<00:00, 5961.57it/s]

Hit Rate: 0.845





In [None]:
# BC2: content benchmark, rephrased question key, top-1 retrieval.
evaluate_benchmark(
    benchmark=content_benchmark,
    benchmark_type="content",
    k=1,
    retriever=retriever,
    stemmer=stemmer,
    use_rephrased_questions=True,
)

100%|██████████| 1000/1000 [00:00<00:00, 5946.11it/s]

Hit Rate: 0.765





In [None]:
# BX1: context benchmark, default (non-rephrased) question key, top-1 retrieval.
evaluate_benchmark(
    benchmark=context_benchmark,
    benchmark_type="context",
    k=1,
    retriever=retriever,
    stemmer=stemmer,
)

100%|██████████| 1020/1020 [00:00<00:00, 5118.62it/s]

Hit Rate: 0.7450980392156863





In [None]:
# BX2: context benchmark, rephrased question key, top-1 retrieval.
evaluate_benchmark(
    benchmark=context_benchmark,
    benchmark_type="context",
    k=1,
    retriever=retriever,
    stemmer=stemmer,
    use_rephrased_questions=True,
)

100%|██████████| 1020/1020 [00:00<00:00, 5235.10it/s]

Hit Rate: 0.6666666666666666



