# Pneuma-Summarizer

## 0. Load Packages and LLM

In [None]:
# Select GPU (if necessary)
import os
# Expose only CUDA device 0 to this process.
# NOTE(review): presumably this has to run before torch initializes CUDA — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import setproctitle
# Set this process's title to the plain string "python".
setproctitle.setproctitle("python")

In [None]:
import pandas as pd
import torch
import warnings
import random
import math
import sys
sys.path.append('..')

from benchmark_generator.context.utils.pipeline_initializer import initialize_pipeline
from benchmark_generator.context.utils.prompting_interface import prompt_pipeline

In [None]:
# Suppress every Python warning from this point on.
warnings.filterwarnings("ignore")
# Module-level pipeline object; the functions below read it as a global and
# pass it to prompt_pipeline.
# NOTE(review): presumably this loads the Llama-3-8B-Instruct weights in
# bfloat16 — confirm against initialize_pipeline.
pipe = initialize_pipeline("meta-llama/Meta-Llama-3-8B-Instruct", torch.bfloat16)

## 1. Define Prompts

In [None]:
def get_row_info_from_df(df: pd.DataFrame, row_idx=0):
    """Render one row of ``df`` as a two-line ``col: ...`` / ``row: ...`` string.

    Args:
        df: Table to read from.
        row_idx: Positional index (as used by ``iloc``) of the row to render.

    Returns:
        A string of the form ``"col: c1 | c2\\nrow: v1 | v2"``. Column labels
        and values are converted to ``str``; values are additionally stripped
        of surrounding whitespace.

    Raises:
        IndexError: If ``row_idx`` is out of bounds (raised by ``iloc``).
    """
    # Column labels are not guaranteed to be strings (a header-less table has
    # integer labels) and str.join rejects non-str items, so convert first.
    header = "col: " + " | ".join(map(str, df.columns))
    values = df.iloc[row_idx].astype(str).str.strip()
    return header + "\n" + "row: " + " | ".join(values)

In [None]:
def get_table_summary_prompt(selected_summaries: list[str]):
    """Build the prompt that asks the LLM to guess what the dataset is about.

    The given row summaries are joined with "; " and placed between the
    ``/*`` and ``*/`` marker lines of the prompt.
    """
    evidence = "; ".join(selected_summaries)
    prompt_lines = [
        "Given these pieces of information regarding some row(s) of a dataset:",
        "/*",
        evidence,
        "*/",
        "Guess reasonably what this dataset is about. Respond briefly.",
    ]
    return "\n".join(prompt_lines)

In [None]:
def get_dtype_check_prompt(col_name: str, stats: str):
    """Build the yes/no question asking whether a column is an ID/categorical one.

    ``stats`` is inserted verbatim as the example values of the column.
    """
    question = (
        f"Do you think a column named {col_name}, "
        f"which has values such as {stats}, "
        "is an identifier or categorical column?"
    )
    answer_format = "Begin your argument with yes/no."
    return question + " " + answer_format

In [None]:
def get_col_summary_prompt(dataset_info: str, col_name: str, col_stats: str):
    """Build the prompt asking the LLM to describe one column's statistics.

    Args:
        dataset_info: Free-text description of the dataset.
        col_name: Name of the column being described.
        col_stats: Pre-formatted statistics of that column.

    Returns:
        A three-line prompt: the instruction, the quoted dataset description,
        and the quoted column statistics.
    """
    # Each line is its own single-quoted f-string so the closing '"' after the
    # statistics survives. Inside a triple-quoted literal, a '"' placed right
    # before the terminator is swallowed by it, which left the value unquoted.
    return "\n".join(
        (
            f"Given the following description of a dataset and statistics about column {col_name} of the dataset, generate a short paragraph about the column statistics while considering the description:",
            f'Dataset description = "{dataset_info}"',
            f'Column statistics = "{col_stats}"',
        )
    )

In [None]:
def get_row_summary_prompt(row_info: str):
    """Build the prompt asking the LLM to summarize a single rendered row.

    ``row_info`` is placed between the ``/*`` and ``*/`` marker lines.
    """
    opening = "Given this row of a dataset:\n"
    quoted_row = "/*\n{}\n*/\n".format(row_info)
    instruction = (
        "Summarize it comprehensively into a single paragraph "
        "without adding any external information."
    )
    return opening + quoted_row + instruction

## 2. Define Helper Functions

In [None]:
def row_similarity(row1: pd.Series, row2: pd.Series):
    """Return the fraction of columns whose values are equal in both rows.

    The result is the mean of the element-wise ``==`` comparison, so it lies
    in ``[0, 1]``. Missing values (NaN) never compare equal, even to each
    other, and therefore count as differing columns.
    """
    return (row1 == row2).mean()


def remove_similar_rows(df: pd.DataFrame, threshold=0.9):
    """Drop every row that is too similar to an earlier row that was kept.

    Rows are visited in order; a later row is discarded when its
    ``row_similarity`` to a kept earlier row is ``>= threshold``.

    Args:
        df: Table to de-duplicate. It is not modified.
        threshold: Minimum similarity (fraction of equal columns) at which a
            later row is considered a near-duplicate.

    Returns:
        A new DataFrame holding the kept rows in their original order, with a
        fresh 0..n-1 index.
    """
    n_rows = len(df)
    dropped: set[int] = set()  # positional indices of rows to discard
    for i in range(n_rows):
        if i in dropped:
            continue
        anchor = df.iloc[i]  # hoisted: invariant across the inner loop
        for j in range(i + 1, n_rows):
            if j in dropped:
                continue
            if row_similarity(anchor, df.iloc[j]) >= threshold:
                dropped.add(j)
    # The collected indices are positions, not labels, so select by position.
    # ``df.drop`` works on labels and would fail (or drop the wrong rows)
    # whenever ``df`` does not carry the default RangeIndex.
    kept_positions = [pos for pos in range(n_rows) if pos not in dropped]
    return df.iloc[kept_positions].reset_index(drop=True)

In [None]:
def get_categorical_numerical_cols(df: pd.DataFrame, s=5):
    """
    Split the columns of ``df`` into numerical and categorical ones.

    ``float64`` columns are numerical. ``int64`` columns are shown to the LLM,
    which decides whether they are really identifiers/categories; if not, they
    are numerical. Columns of any other dtype are categorical.

    Args:
        df: Table whose columns are classified.
        s: Maximum number of example values of an integer column shown to the
            LLM. Longer columns are sampled down to ``s`` values.

    Returns:
        A tuple ``(num_cols, cat_cols)`` of column-name lists.

    Side effect: int columns that are actually categorical will be converted to object data type
    """
    int_cols_to_check: list[str] = []
    num_cols: list[str] = []
    cat_cols: list[str] = []
    for col in df.columns:
        dtype = df[col].dtype
        if dtype == "int64":
            # Decided below: an integer column may actually be categorical.
            int_cols_to_check.append(col)
        elif dtype == "float64":
            num_cols.append(col)
        else:
            cat_cols.append(col)

    for col in int_cols_to_check:
        column = df[col]
        # Show all values only when the column has at most ``s`` of them;
        # otherwise draw a reproducible sample. (Sampling ``s`` values from a
        # shorter column raises ValueError, and listing a long column in full
        # would put the entire column into the prompt.)
        if len(column) <= s:
            examples = column
        else:
            examples = column.sample(s, random_state=42)
        col_stats = f"{list(examples)} ({len(column.unique())}/{len(column)} unique values)"
        prompt = get_dtype_check_prompt(col, col_stats)
        dtype_ans = prompt_pipeline(
            pipe,
            [{"role": "user", "content": prompt}],
            temperature=None,
            top_p=None,
            max_new_tokens=5
        )[-1]["content"]
        # The prompt asks for an answer starting with yes/no; also accept a
        # bold-markdown "**yes".
        if dtype_ans.strip().lower().startswith(("yes", "**yes")):
            cat_cols.append(col)
            df[col] = column.astype(object)
        else:
            # Integer columns not judged categorical stay numerical instead of
            # being left out of both lists.
            num_cols.append(col)
    return (num_cols, cat_cols)

## 3. Produce Summaries

In [None]:
def get_row_summaries(sampled_df: pd.DataFrame):
    """Ask the LLM for a one-paragraph summary of every row of ``sampled_df``.

    Rows are processed in positional order and a progress line is printed
    before each one. Returns the list of summaries, one per row; each is the
    ``content`` of the last message returned by ``prompt_pipeline``.
    """

    def summarize(position: int) -> str:
        print(f"Summarizing row {position} of sampled_df")
        rendered_row = get_row_info_from_df(sampled_df, position)
        messages = [
            {"role": "user", "content": get_row_summary_prompt(rendered_row)}
        ]
        conversation = prompt_pipeline(
            pipe,
            messages,
            temperature=None,
            top_p=None,
            max_new_tokens=400,
        )
        return conversation[-1]["content"]

    return [summarize(position) for position in range(len(sampled_df))]

In [None]:
def get_table_summary(row_summaries: list[str]) -> str:
    """Summarize overall meaning of a table

    Up to three of the given row summaries are picked with a fixed seed and
    sent to the LLM, which is asked to guess what the dataset is about.

    Args:
        row_summaries: Summaries of individual rows of the table.

    Returns:
        The ``content`` of the last message returned by ``prompt_pipeline``.
    """
    # A private generator seeded with 42 draws exactly the same sample as
    # ``random.seed(42); random.sample(...)`` but leaves the process-wide
    # ``random`` state untouched for other code.
    rng = random.Random(42)
    sample_size = min(3, len(row_summaries))
    selected_summaries = rng.sample(row_summaries, sample_size)

    summary_prompt = get_table_summary_prompt(selected_summaries)
    table_summary = prompt_pipeline(
        pipe,
        [{"role": "user", "content": summary_prompt}],
        temperature=None,
        top_p=None,
        max_new_tokens=150,
    )[-1]["content"]
    return table_summary

In [None]:
def get_num_columns_summaries(
    table_summary,
    df: pd.DataFrame,
    num_cols: list[str],
):
    """Ask the LLM to describe the statistics of each numerical column.

    For every name in ``num_cols`` the output of ``describe()`` is flattened
    into ``"name: value"`` pairs joined by "; " and sent to the LLM together
    with ``table_summary``. Returns one summary per column, in the order of
    ``num_cols``.
    """

    def describe_column(column_name):
        print(f"==> Col {column_name}")
        stat_pairs = [
            f"{stat_name}: {stat_value}"
            for stat_name, stat_value in df[column_name].describe().items()
        ]
        prompt = get_col_summary_prompt(
            table_summary, column_name, "; ".join(stat_pairs)
        )
        conversation = prompt_pipeline(
            pipe,
            [{"role": "user", "content": prompt}],
            temperature=None,
            top_p=None,
            max_new_tokens=200,
        )
        return conversation[-1]["content"]

    return [describe_column(column_name) for column_name in num_cols]

In [None]:
def get_cat_columns_summaries(
    table_summary: str,
    df: pd.DataFrame,
    cat_cols: list[str],
    show_unique_cat_threshold=10
):
    """Ask the LLM to describe the statistics of each categorical column.

    For every name in ``cat_cols`` the output of ``describe()`` is flattened
    into ``"name: value"`` pairs joined by "; ". When the column has at most
    ``show_unique_cat_threshold`` distinct values, those values are appended
    as well. Returns one summary per column, in the order of ``cat_cols``.
    """
    summaries: list[str] = []
    for column_name in cat_cols:
        print(f"==> Col {column_name}", flush=True)
        column = df[column_name]
        stat_pairs = [
            f"{stat_name}: {stat_value}"
            for stat_name, stat_value in column.describe().items()
        ]
        col_stats = "; ".join(stat_pairs)

        # Few enough distinct values: list them explicitly for the LLM.
        distinct_values = column.unique()
        if not len(distinct_values) > show_unique_cat_threshold:
            col_stats = col_stats + f"; categories: {distinct_values}"

        conversation = prompt_pipeline(
            pipe,
            [
                {
                    "role": "user",
                    "content": get_col_summary_prompt(
                        table_summary, column_name, col_stats
                    ),
                }
            ],
            temperature=None,
            top_p=None,
            max_new_tokens=200,
        )
        summaries.append(conversation[-1]["content"])
    return summaries

In [None]:
def produce_summaries(
    df: pd.DataFrame,
    row_summaries_percentage=0.05,
):
    """Produce every summary of a table: rows, overall table, then columns.

    Steps, in order:
      1. Remove near-duplicate rows (similarity threshold 0.9), sample
         ``ceil(row_summaries_percentage * n)`` of the remaining rows with a
         fixed random state, and summarize each sampled row.
      2. Summarize the table as a whole from the row summaries.
      3. Classify the columns of the original ``df`` (not the de-duplicated
         copy) as numerical or categorical, then summarize each column.

    Returns:
        One flat list: row summaries, the table summary, numerical-column
        summaries, and categorical-column summaries, in that order.
    """
    print("Start summarizing table")

    print("Summarizing some rows")
    deduplicated = remove_similar_rows(df, threshold=0.9)
    n_sampled_rows = math.ceil(row_summaries_percentage * len(deduplicated))
    sampled_df = deduplicated.sample(n_sampled_rows, random_state=42)
    sampled_df = sampled_df.reset_index(drop=True)
    row_summaries = get_row_summaries(sampled_df)

    print("Summarizing the overall table")
    table_summary = get_table_summary(row_summaries)

    num_cols, cat_cols = get_categorical_numerical_cols(df)

    print("Summarizing the numerical cols")
    numerical_part = get_num_columns_summaries(table_summary, df, num_cols)

    print("Summarizing the categorical cols")
    categorical_part = get_cat_columns_summaries(table_summary, df, cat_cols)

    return row_summaries + [table_summary] + numerical_part + categorical_part