In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.transforms import Bbox

from fainder.utils import load_input, configure_run
from utils.plotting_defaults import parse_logs_wide, plot_legend, set_style

configure_run("ERROR")
set_style()
Path("plots/runtime_benchmark").mkdir(parents=True, exist_ok=True)

## Number of unique histogram bins

In [2]:
def print_uniques(dataset: str) -> None:
    hists = load_input(f"../data/{dataset}/histograms.zst")

    bin_edges = []
    for _, hist in hists:
        bin_edges += hist[1].tolist()
    print("Total bins:", len(bin_edges))
    print("Unique bins:", len(np.unique(bin_edges)))
    print("Unique rounded bins:", len(np.unique(np.round(bin_edges, 2))))

In [3]:
print_uniques("sportstables")

Total bins: 284592
Unique bins: 104064
Unique rounded bins: 53205


In [4]:
print_uniques("open_data_usa")

Total bins: 923910
Unique bins: 498999
Unique rounded bins: 355816


In [5]:
print_uniques("gittables")

Total bins: 55764293
Unique bins: 18169986
Unique rounded bins: 13619798


## Impact of the number of source histograms

In [6]:
runtime_list = []
construction_list = []
for logfile in Path("../logs/runtime_benchmark/binsort/").iterdir():
    config = logfile.stem.split("-")
    try:
        data = parse_logs_wide(logfile)
    except Exception as e:
        print(f"Failed to parse {logfile}: {e}")
        break
    data["dataset"] = config[0]
    data["approach"] = config[1]
    data["n_bins"] = int(config[2][1:])
    if config[1] == "construction":
        construction_list.append(data)
    else:
        runtime_list.append(data)

runtime = pd.DataFrame(
    runtime_list, columns=["dataset", "approach", "n_bins", "query_collection_time"]
).convert_dtypes()
construction = pd.DataFrame(
    construction_list,
    columns=["dataset", "approach", "n_bins", "histogram_count", "bin_count", "construction_time"],
).convert_dtypes()
runtime = pd.merge(
    runtime, construction[["dataset", "n_bins", "bin_count"]], on=["dataset", "n_bins"]
)

In [7]:
construction

Unnamed: 0,dataset,approach,n_bins,histogram_count,bin_count,construction_time
0,gittables,construction,5000,5017619,174839512,301.162769
1,open_data_usa,construction,100,68313,4053282,21.091483
2,sportstables,construction,500,19862,988037,3.953246
3,gittables,construction,100,5017619,138003232,281.254283
4,open_data_usa,construction,5000,68313,28356830,23.910763
5,gittables,construction,10,5017619,38087256,264.9893
6,open_data_usa,construction,1000,68313,15810134,23.418911
7,sportstables,construction,1000,19862,1030657,3.756182
8,open_data_usa,construction,500,68313,11509189,23.688649
9,open_data_usa,construction,50,68313,2393908,20.6937


In [8]:
runtime.groupby(["dataset", "approach", "n_bins"]).agg(
    {"query_collection_time": "mean", "bin_count": "mean"},
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,query_collection_time,bin_count
dataset,approach,n_bins,Unnamed: 3_level_1,Unnamed: 4_level_1
gittables,binsort,10,6318.515152,38087256.0
gittables,binsort,50,14562.045691,104583830.0
gittables,binsort,100,18601.426289,138003232.0
gittables,binsort,500,21942.250489,166813725.0
gittables,binsort,1000,23141.506456,173037340.0
gittables,binsort,5000,23406.01974,174839512.0
gittables,iterative,10,48052.367034,38087256.0
gittables,iterative,50,48126.377946,104583830.0
gittables,iterative,100,48526.780963,138003232.0
gittables,iterative,500,48175.698742,166813725.0


In [9]:
for i, dataset in enumerate(["sportstables", "open_data_usa", "gittables"]):
    fig, ax = plt.subplots(1, 1, figsize=(1.2, 1.4))
    data = (
        runtime.query(f"dataset == '{dataset}'")
        .groupby(["approach", "n_bins"])
        .agg({"query_collection_time": "mean", "bin_count": "mean"})
        .values
    )

    ax.plot(
        data[6:, 1] / 100000,
        data[6:, 0],
        marker="o",
        ms=1.5,
        label=r"\pscan{}",
        c=sns.color_palette()[0],
    )
    ax.plot(
        data[:6, 1] / 100000,
        data[:6, 0],
        marker="o",
        ms=1.5,
        label=r"\binsort{}",
        c=sns.color_palette()[1],
    )

    ax.set_xlabel(r"\# bins ($\times 10^5$)")
    sns.despine()

    plt.tight_layout(pad=1.02)
    plt.savefig(
        f"plots/runtime_benchmark/binsort_{dataset}.pdf",
        bbox_inches="tight",
        pad_inches=0.01,
    )

    ax.set_ylabel("Time (s)")
    bbox = fig.get_tightbbox()
    label_bbox = Bbox(((bbox.x0, bbox.y0), (0.07, bbox.y1)))
    plt.savefig(f"plots/runtime_benchmark/binsort_ylabel.pdf", bbox_inches=label_bbox)
    plt.close()

handles, labels = ax.get_legend_handles_labels()
plot_legend(
    f"plots/runtime_benchmark/binsort_legend.pdf",
    handles=handles,
    labels=labels,
    ncol=2,
)