# 1. Preparation

In [1]:
import sys
import bm25s
import Stemmer

sys.path.append("../..")

from tqdm import tqdm
from benchmark_generator.context.utils.jsonl import read_jsonl

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
dataset = "fetaqa"

# Maps each supported dataset key to the stem shared by its tables directory
# and its annotated content-benchmark file under ../../data_src.
DATASET_STEMS = {
    "chicago": "pneuma_chicago_10K",
    "public": "pneuma_public_bi",
    "chembl": "pneuma_chembl_10K",
    "adventure": "pneuma_adventure_works",
    "fetaqa": "pneuma_fetaqa",
}

# Fail fast on an unknown key: without this check the variables below would be
# left undefined (or keep values from an earlier run of this cell) and the
# error would only surface in a later cell.
if dataset not in DATASET_STEMS:
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of {sorted(DATASET_STEMS)}"
    )

# Summary files are named after the dataset key itself.
summaries_dir = "../../pneuma_summarizer/summaries"
std = read_jsonl(f"{summaries_dir}/standard/{dataset}_standard.jsonl")
narrations = read_jsonl(f"{summaries_dir}/narrations/{dataset}_narrations.jsonl")
rows = read_jsonl(f"{summaries_dir}/rows/{dataset}.jsonl")

# Benchmark file and tables directory are named after the dataset stem.
content_benchmark = read_jsonl(
    f"../../data_src/benchmarks/content/{DATASET_STEMS[dataset]}_questions_annotated.jsonl"
)
path = f"../../data_src/tables/{DATASET_STEMS[dataset]}"

# 2. Indexing

In [3]:
def indexing_keyword(
    stemmer,
    contents: list[dict[str, str]],
    contexts: list[dict[str, str]] = None,
):
    """Build a BM25 retriever over per-table content summaries and contexts.

    Documents are emitted table by table (tables in sorted order). For each
    table, its content summaries come first, tagged
    ``"<table>_SEP_contents_<i>"``, followed by its contexts (if any), tagged
    ``"<table>_SEP_<i>"``. ``<i>`` is the position of the entry among that
    table's entries, in input order.

    Args:
        stemmer: Stemmer object forwarded to ``bm25s.tokenize``.
        contents: Entries with a ``"table"`` and a ``"summary"`` key.
        contexts: Optional entries with a ``"table"`` and a ``"context"``
            key. Contexts whose table never appears in ``contents`` are
            ignored.

    Returns:
        The ``bm25s.BM25`` retriever, created with the documents as its
        corpus and indexed on their tokenized text.
    """
    # Group entries by table in a single pass instead of re-scanning the full
    # lists once per table (which is quadratic in the number of entries).
    contents_by_table = {}
    for content in contents:
        contents_by_table.setdefault(content["table"], []).append(content)

    contexts_by_table = {}
    if contexts is not None:
        for context in contexts:
            contexts_by_table.setdefault(context["table"], []).append(context)

    corpus_json = []
    for table in sorted(contents_by_table):
        for content_idx, content in enumerate(contents_by_table[table]):
            corpus_json.append(
                {
                    "text": content["summary"],
                    "metadata": {"table": f"{table}_SEP_contents_{content_idx}"},
                }
            )

        for context_idx, context in enumerate(contexts_by_table.get(table, [])):
            corpus_json.append(
                {
                    "text": context["context"],
                    "metadata": {"table": f"{table}_SEP_{context_idx}"},
                }
            )

    corpus_text = [doc["text"] for doc in corpus_json]
    corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en", stemmer=stemmer, show_progress=False)

    retriever = bm25s.BM25(corpus=corpus_json)
    retriever.index(corpus_tokens, show_progress=False)
    return retriever

In [29]:
# English stemmer, used both when indexing here and when tokenizing queries later.
stemmer = Stemmer.Stemmer("english")

# Index the row summaries together with the narrations; no contexts are supplied.
retriever = indexing_keyword(
    stemmer=stemmer,
    contents=rows + narrations,
    contexts=None,
)

# 3. Benchmarking

In [30]:
def evaluate_benchmark(
    benchmark: list[dict[str,str]],
    benchmark_type: str,
    k: int,
    retriever,
    stemmer,
    use_rephrased_questions=False
):
    """Print and return the top-k hit rate of ``retriever`` on ``benchmark``.

    A benchmark entry counts as a hit when at least one of the ``k`` retrieved
    documents belongs to a table listed in the entry's ``"answer_tables"``.

    Args:
        benchmark: Entries holding the question under the selected key and the
            expected tables under ``"answer_tables"``.
        benchmark_type: ``"content"`` reads ``"question_from_sql_1"`` (or
            ``"question"`` when rephrased); any other value reads
            ``"question_bx1"`` (or ``"question_bx2"`` when rephrased).
        k: Number of documents to retrieve per question.
        retriever: Object whose ``retrieve(tokens, k=..., show_progress=...)``
            returns ``(results, scores)``; ``results[0]`` is iterated as
            documents carrying ``["metadata"]["table"]``.
        stemmer: Stemmer object forwarded to ``bm25s.tokenize``.
        use_rephrased_questions: Selects the rephrased question key.

    Returns:
        The hit rate as a percentage (unrounded). The value printed is rounded
        to two decimals.

    Raises:
        ValueError: If ``benchmark`` is empty (the hit rate is undefined).
        KeyError: If an entry lacks the selected question key.
    """
    if not benchmark:
        # Previously this surfaced as a bare ZeroDivisionError at the very end.
        raise ValueError("benchmark is empty; hit rate is undefined")

    if benchmark_type == "content":
        question_key = "question" if use_rephrased_questions else "question_from_sql_1"
    else:
        question_key = "question_bx2" if use_rephrased_questions else "question_bx1"

    # Resolve all questions up front so a missing key fails before any retrieval runs.
    questions = [datum[question_key] for datum in benchmark]

    hit_count = 0
    # tqdm comes first in the zip so it is the iterator that hits exhaustion
    # and can close its progress bar.
    for datum, question in zip(tqdm(benchmark), questions):
        answer_tables = datum["answer_tables"]

        query_tokens = bm25s.tokenize(question, stemmer=stemmer, show_progress=False)
        results, _ = retriever.retrieve(query_tokens, k=k, show_progress=False)

        # The indexer appends a single "_SEP_..." suffix to the table name, so
        # strip only the last one; splitting from the left would truncate table
        # names that themselves contain "_SEP_".
        if any(
            result["metadata"]["table"].rsplit("_SEP_", 1)[0] in answer_tables
            for result in results[0]
        ):
            hit_count += 1

    hit_rate = hit_count / len(benchmark) * 100
    print(f"Hit Rate: {round(hit_rate, 2)}")
    return hit_rate

In [31]:
# Retrieval cutoffs (top-k) at which the hit rate is reported.
ks = [1, 5, 10]

In [32]:
# BC1: content benchmark with the default (non-rephrased) questions,
# evaluated once per cutoff in `ks`.
for k in ks:
    print(f"k={k}")
    evaluate_benchmark(
        benchmark=content_benchmark,
        benchmark_type="content",
        k=k,
        retriever=retriever,
        stemmer=stemmer,
    )
    print("=" * 50)

k=1


100%|██████████| 1001/1001 [00:00<00:00, 3201.05it/s]


Hit Rate: 59.84
k=5


100%|██████████| 1001/1001 [00:00<00:00, 1768.37it/s]


Hit Rate: 74.03
k=10


100%|██████████| 1001/1001 [00:00<00:00, 1760.90it/s]

Hit Rate: 78.22





In [33]:
# BC2: content benchmark with the rephrased questions,
# evaluated once per cutoff in `ks`.
for k in ks:
    print(f"k={k}")
    evaluate_benchmark(
        benchmark=content_benchmark,
        benchmark_type="content",
        k=k,
        retriever=retriever,
        stemmer=stemmer,
        use_rephrased_questions=True,
    )
    print("=" * 50)

k=1


100%|██████████| 1001/1001 [00:00<00:00, 3253.71it/s]


Hit Rate: 51.95
k=5


100%|██████████| 1001/1001 [00:00<00:00, 1726.58it/s]


Hit Rate: 66.83
k=10


100%|██████████| 1001/1001 [00:00<00:00, 1683.28it/s]

Hit Rate: 72.43



