In [None]:
import os
import sys

import setproctitle

# Overrides the process title reported for this kernel (e.g. in ps/top).
setproctitle.setproctitle("python3.12")

# GPU selection and cuBLAS determinism settings are exported BEFORE any
# CUDA-backed library is imported, so they are guaranteed to be in place
# whenever the CUDA runtime gets initialised.
# NOTE(review): the GPU index is machine-specific — adjust for your host.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

# Put the parent directory on the import path; presumably this is what makes
# the `benchmark_generator` package below importable — confirm repo layout.
sys.path.append("..")

from transformers import set_seed
# The class was previously imported twice (package root and defining module);
# the package-root import is kept, both bind the same name.
from sentence_transformers import SentenceTransformer
from benchmark_generator.context.utils.jsonl import read_jsonl, write_jsonl
from tqdm import tqdm

# Seed the RNGs and request deterministic algorithms for reproducible runs.
set_seed(42, deterministic=True)

# Load the embedding model from a local directory only (no hub download).
# The notebook uses just its tokenizer to measure chunk lengths.
embedding_model = SentenceTransformer("../models/stella", local_files_only=True)

In [2]:
# Exclusive upper bound on the token count of a packed chunk: the loops in this
# notebook only extend a chunk while len(tokenizer.encode(chunk)) stays
# strictly below this value.
# NOTE(review): presumably chosen to match the embedding model's maximum
# sequence length — confirm against the model config.
EMBEDDING_MAX_TOKENS = 512

# Splitter for Schema Summaries

In [22]:
# Select and load the schema-summary file to split. `type` and `name` are also
# read by the cell that writes the "_splitted" output into the same directory.
# NOTE(review): `type` shadows the Python builtin for the rest of the kernel
# session; renaming it requires the same rename in the write cell.
type = "standard"
name = f"adventure_{type}"
contents = read_jsonl(f"summaries/{type}/{name}.jsonl")

In [None]:
# Split every table's schema summary into chunks that stay under the token limit.
#
# A schema summary is a " | "-separated sequence of column summaries.
# Consecutive column summaries are packed greedily into one chunk while the
# tokenized chunk stays strictly below EMBEDDING_MAX_TOKENS. As soon as adding
# the next column would reach the limit, the current chunk is emitted and a new
# chunk starts with that column. A single column summary that is over the limit
# on its own is emitted unchanged (columns are never cut in the middle).
tokenizer = embedding_model.tokenizer

# Index the first summary of each table in ONE pass over `contents`
# (the list used to be re-scanned from scratch for every table).
# If a table occurs more than once, only its first summary is used.
first_summary_by_table = {}
for record in contents:
    if record["table"] not in first_summary_by_table:
        first_summary_by_table[record["table"]] = record["summary"]

unique_tables = sorted(first_summary_by_table)
processed_contents = []
for table in tqdm(unique_tables):
    # str.split always returns at least one element, so index 0 is safe.
    column_summaries = first_summary_by_table[table].split(" | ")

    chunk = column_summaries[0]
    for column_summary in column_summaries[1:]:
        candidate = chunk + " | " + column_summary
        if len(tokenizer.encode(candidate)) < EMBEDDING_MAX_TOKENS:
            # Still fits: keep growing the current chunk.
            chunk = candidate
        else:
            # Limit reached: flush the chunk and start a new one here.
            processed_contents.append({"table": table, "summary": chunk})
            chunk = column_summary

    # Flush the last (possibly only) chunk of this table.
    processed_contents.append({"table": table, "summary": chunk})

In [None]:
# Sanity check: number of loaded summary records vs. number of chunks produced
# by the splitting cell.
print(f"Num of content summaries (BEFORE): {len(contents)}")
print(f"Num of content summaries (AFTER): {len(processed_contents)}")

In [25]:
# Save the chunks in the same directory as the input file, using the input
# name plus a "_splitted" suffix.
write_jsonl(processed_contents, f"summaries/{type}/{name}_splitted.jsonl")

# Merger for Row Summaries

In [20]:
# Load the row summaries to merge. `name` is also read by the cell that writes
# the "_merged" output. (Plain string: the f-prefix had no placeholders.)
name = "fetaqa"
rows = read_jsonl(f"summaries/rows/{name}.jsonl")

In [None]:
# Merge consecutive row summaries of the same table into larger chunks.
#
# Rows of one table are packed greedily, in file order and joined with " || ",
# while the tokenized chunk stays strictly below EMBEDDING_MAX_TOKENS. When the
# next row would reach the limit, the current chunk is emitted and a new chunk
# starts with that row. A single row summary that is over the limit on its own
# is emitted unchanged.
tokenizer = embedding_model.tokenizer

# Group rows by table in ONE pass, preserving their original order
# (the list used to be re-filtered from scratch for every table).
rows_by_table = {}
for record in rows:
    rows_by_table.setdefault(record["table"], []).append(record)

unique_tables = sorted(rows_by_table)
processed_rows = []
for table in tqdm(unique_tables):
    # Every group holds at least one row, so index 0 is safe.
    table_rows = rows_by_table[table]

    chunk = table_rows[0]["summary"]
    for next_row in table_rows[1:]:
        candidate = chunk + " || " + next_row["summary"]
        if len(tokenizer.encode(candidate)) < EMBEDDING_MAX_TOKENS:
            # Still fits: keep growing the current chunk.
            chunk = candidate
        else:
            # Limit reached: flush the chunk and start a new one here.
            processed_rows.append({"table": table, "summary": chunk})
            chunk = next_row["summary"]

    # Flush the last (possibly only) chunk of this table.
    processed_rows.append({"table": table, "summary": chunk})

In [None]:
# Sanity check: number of loaded row summaries vs. number of merged chunks.
print(f"Num of rows (BEFORE): {len(rows)}")
print(f"Num of rows (AFTER): {len(processed_rows)}")

In [23]:
# Save the merged chunks in the same directory as the input file, using the
# input name plus a "_merged" suffix.
write_jsonl(processed_rows, f"summaries/rows/{name}_merged.jsonl")

# Merger for Contexts

In [27]:
# Load the benchmark contexts to merge. `name` selects both the sub-directory
# and the file name, and is also read by the cell that writes the "_merged"
# output.
name = "fetaqa"
contexts = read_jsonl(f"../data_src/benchmarks/context/{name}/contexts_{name}.jsonl")

In [None]:
# Merge consecutive contexts of the same table into larger chunks.
#
# Contexts of one table are packed greedily, in file order and joined with
# " || ", while the tokenized chunk stays strictly below EMBEDDING_MAX_TOKENS.
# When the next context would reach the limit, the current chunk is emitted and
# a new chunk starts with that context. A single context that is over the limit
# on its own is emitted unchanged.
tokenizer = embedding_model.tokenizer

# Group contexts by table in ONE pass, preserving their original order
# (the list used to be re-filtered from scratch for every table).
contexts_by_table = {}
for record in contexts:
    contexts_by_table.setdefault(record["table"], []).append(record)

unique_tables = sorted(contexts_by_table)
processed_contexts = []
for table in tqdm(unique_tables):
    # Every group holds at least one context, so index 0 is safe.
    table_contexts = contexts_by_table[table]

    chunk = table_contexts[0]["context"]
    for next_context in table_contexts[1:]:
        candidate = chunk + " || " + next_context["context"]
        if len(tokenizer.encode(candidate)) < EMBEDDING_MAX_TOKENS:
            # Still fits: keep growing the current chunk.
            chunk = candidate
        else:
            # Limit reached: flush the chunk and start a new one here.
            processed_contexts.append({"table": table, "context": chunk})
            chunk = next_context["context"]

    # Flush the last (possibly only) chunk of this table.
    processed_contexts.append({"table": table, "context": chunk})

In [None]:
# Sanity check: number of loaded contexts vs. number of merged chunks.
print(f"Num of contexts (BEFORE): {len(contexts)}")
print(f"Num of contexts (AFTER): {len(processed_contexts)}")

In [30]:
# Save the merged chunks in the same directory as the input file, using the
# input name plus a "_merged" suffix.
write_jsonl(processed_contexts, f"../data_src/benchmarks/context/{name}/contexts_{name}_merged.jsonl")