In [1]:
import sys
import bm25s
import Stemmer

sys.path.append("..")

from benchmark_generator.context.utils.jsonl import read_jsonl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# English stemmer instance shared by all indexing calls in this notebook; it is
# handed to `bm25s.tokenize` through `indexing_keyword`.
stemmer = Stemmer.Stemmer("english")

In [3]:
def indexing_keyword(
    stemmer,
    narration_contents: list[dict[str, str]],
    contexts: list[dict[str, str]] = None,
):
    """Build a BM25 keyword index over per-table summaries and optional contexts.

    Documents are emitted table by table, in sorted table order. For each
    table, its summaries come first (in input order), followed by its contexts
    (in input order). Contexts whose table has no summary are not indexed.

    Args:
        stemmer: Stemmer object forwarded to ``bm25s.tokenize``.
        narration_contents: Records providing a ``"table"`` and a ``"summary"``
            key. Summary documents get the metadata id
            ``"{table}_SEP_contents_{i}"``, where ``i`` counts summaries within
            that table.
        contexts: Optional records providing a ``"table"`` and a ``"context"``
            key. Context documents get the metadata id ``"{table}_SEP_{i}"``,
            where ``i`` counts contexts within that table. ``None`` skips
            contexts entirely.

    Returns:
        The ``bm25s.BM25`` retriever, created with the document list as its
        corpus and indexed on the tokenized document texts.
    """
    # Group both inputs by table in a single pass each, instead of rescanning
    # the full lists once per distinct table.
    summaries_by_table = {}
    for content in narration_contents:
        summaries_by_table.setdefault(content["table"], []).append(content["summary"])

    contexts_by_table = {}
    if contexts is not None:
        for context in contexts:
            # Keep the whole record: "context" is only read for tables that
            # actually appear in `narration_contents`.
            contexts_by_table.setdefault(context["table"], []).append(context)

    corpus_json = []
    for table in sorted(summaries_by_table):
        for content_idx, summary in enumerate(summaries_by_table[table]):
            corpus_json.append({"text": summary, "metadata": {"table": f"{table}_SEP_contents_{content_idx}"}})

        for context_idx, context in enumerate(contexts_by_table.get(table, [])):
            corpus_json.append({"text": context["context"], "metadata": {"table": f"{table}_SEP_{context_idx}"}})

    corpus_text = [doc["text"] for doc in corpus_json]
    corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en", stemmer=stemmer, show_progress=False)

    retriever = bm25s.BM25(corpus=corpus_json)
    retriever.index(corpus_tokens, show_progress=False)
    return retriever

# Pneuma-Summarizer-Schema-Only

In [4]:
# Build one "schema only" keyword index per dataset: narration summaries plus
# the benchmark contexts. The five datasets differ only by name, so they are
# processed by a single loop instead of five copy-pasted stanzas.
#
# `path` (the dataset's table directory) is not consumed by the indexing calls
# in this cell; it is still bound so the cell leaves the same variables defined.
for dataset, path in [
    ("public", "../data_src/tables/pneuma_public_bi"),
    ("chembl", "../data_src/tables/pneuma_chembl_10K"),
    ("adventure", "../data_src/tables/pneuma_adventure_works"),
    ("chicago", "../data_src/tables/pneuma_chicago_10K"),
    ("fetaqa", "../data_src/tables/pneuma_fetaqa"),
]:
    print(f"Indexing dataset: {dataset}")
    contents = read_jsonl(
        f"../pneuma_summarizer/summaries/narrations/{dataset}_narrations.jsonl"
    )
    contexts = read_jsonl(
        f"../data_src/benchmarks/context/{dataset}/contexts_{dataset}.jsonl"
    )
    retriever = indexing_keyword(stemmer, contents, contexts)
    retriever.save(f"index-keyword-{dataset}-pneuma_summarizer_schema_only")

Indexing dataset: public


Finding newlines for mmindex: 100%|██████████| 8.92M/8.92M [00:00<00:00, 185MB/s]


Indexing dataset: chembl


Finding newlines for mmindex: 100%|██████████| 2.21M/2.21M [00:00<00:00, 128MB/s]


Indexing dataset: adventure


Finding newlines for mmindex: 100%|██████████| 2.52M/2.52M [00:00<00:00, 127MB/s]


Indexing dataset: chicago


Finding newlines for mmindex: 100%|██████████| 25.3M/25.3M [00:00<00:00, 139MB/s]


Indexing dataset: fetaqa


Finding newlines for mmindex: 100%|██████████| 263M/263M [00:02<00:00, 115MB/s] 


# Pneuma-Summarizer

In [5]:
# Build one full Pneuma-Summarizer keyword index per dataset: narration
# summaries concatenated with row summaries, plus the benchmark contexts. The
# five datasets differ only by name, so they are processed by a single loop
# instead of five copy-pasted stanzas.
#
# `path` (the dataset's table directory) is not consumed by the indexing calls
# in this cell; it is still bound so the cell leaves the same variables defined.
for dataset, path in [
    ("public", "../data_src/tables/pneuma_public_bi"),
    ("chembl", "../data_src/tables/pneuma_chembl_10K"),
    ("adventure", "../data_src/tables/pneuma_adventure_works"),
    ("chicago", "../data_src/tables/pneuma_chicago_10K"),
    ("fetaqa", "../data_src/tables/pneuma_fetaqa"),
]:
    print(f"Indexing dataset: {dataset}")
    contents = read_jsonl(
        f"../pneuma_summarizer/summaries/narrations/{dataset}_narrations.jsonl"
    ) + read_jsonl(f"../pneuma_summarizer/summaries/rows/{dataset}.jsonl")
    contexts = read_jsonl(
        f"../data_src/benchmarks/context/{dataset}/contexts_{dataset}.jsonl"
    )
    retriever = indexing_keyword(stemmer, contents, contexts)
    retriever.save(f"index-keyword-{dataset}-pneuma_summarizer")

Indexing dataset: public


Finding newlines for mmindex: 100%|██████████| 10.4M/10.4M [00:00<00:00, 196MB/s]


Indexing dataset: chembl


Finding newlines for mmindex: 100%|██████████| 2.33M/2.33M [00:00<00:00, 108MB/s]


Indexing dataset: adventure


Finding newlines for mmindex: 100%|██████████| 3.92M/3.92M [00:00<00:00, 169MB/s]


Indexing dataset: chicago


Finding newlines for mmindex: 100%|██████████| 43.3M/43.3M [00:00<00:00, 201MB/s]


Indexing dataset: fetaqa


Finding newlines for mmindex: 100%|██████████| 273M/273M [00:02<00:00, 107MB/s]  


# DBReader

In [6]:
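# NOTE: DBReader indexing is disabled — every statement in this cell is commented out.
# Uncomment the cell to build the "pneuma_dbreader" keyword indices.

# dataset = "public"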
# print(f"Indexing dataset: {dataset}")
# contents = read_jsonl(
#     "../pneuma_summarizer/summaries/dbreader/public_dbreader.jsonl"
# )
# contexts = read_jsonl("../data_src/benchmarks/context/public/contexts_public.jsonl")
# path = "../data_src/tables/pneuma_public_bi"
# retriever = indexing_keyword(stemmer, contents, contexts)
# retriever.save(f"index-keyword-{dataset}-pneuma_dbreader")

# dataset = "chembl"
# print(f"Indexing dataset: {dataset}")
# contents = read_jsonl(
#     "../pneuma_summarizer/summaries/dbreader/chembl_dbreader.jsonl"
# )
# contexts = read_jsonl("../data_src/benchmarks/context/chembl/contexts_chembl.jsonl")
# path = "../data_src/tables/pneuma_chembl_10K"
# retriever = indexing_keyword(stemmer, contents, contexts)
# retriever.save(f"index-keyword-{dataset}-pneuma_dbreader")

# dataset = "adventure"
# print(f"Indexing dataset: {dataset}")
# contents = read_jsonl(
#     "../pneuma_summarizer/summaries/dbreader/adventure_dbreader.jsonl"
# )
# contexts = read_jsonl(
#     "../data_src/benchmarks/context/adventure/contexts_adventure.jsonl"
# )
# path = "../data_src/tables/pneuma_adventure_works"
# retriever = indexing_keyword(stemmer, contents, contexts)
# retriever.save(f"index-keyword-{dataset}-pneuma_dbreader")

# dataset = "chicago"
# print(f"Indexing dataset: {dataset}")
# contents = read_jsonl(
#     "../pneuma_summarizer/summaries/dbreader/chicago_dbreader.jsonl"
# )
# contexts = read_jsonl(
#     "../data_src/benchmarks/context/chicago/contexts_chicago.jsonl"
# )
# path = "../data_src/tables/pneuma_chicago_10K"
# retriever = indexing_keyword(stemmer, contents, contexts)
# retriever.save(f"index-keyword-{dataset}-pneuma_dbreader")

# dataset = "fetaqa"
# print(f"Indexing dataset: {dataset}")
# contents = read_jsonl(
#     "../pneuma_summarizer/summaries/dbreader/fetaqa_dbreader.jsonl"
# )
# contexts = read_jsonl("../data_src/benchmarks/context/fetaqa/contexts_fetaqa.jsonl")
# path = "../data_src/tables/pneuma_fetaqa"
# retriever = indexing_keyword(stemmer, contents, contexts)
# retriever.save(f"index-keyword-{dataset}-pneuma_dbreader")