# Runtime Benchmark

In [1]:
import copy
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.transforms import Bbox
from utils.plotting_defaults import (
    autolabel_bars,
    delete_tex_cache,
    parse_logs_long,
    parse_logs_wide,
    plot_legend,
    plot_ylabel,
    set_style,
)

# Apply the plotting defaults from utils.plotting_defaults before any figure is created.
set_style()

# Every figure of this notebook is written below this directory; create it if it is missing.
figure_dir = Path("plots/runtime_benchmark")
figure_dir.mkdir(parents=True, exist_ok=True)

In [2]:
# NOTE(review): presumably clears matplotlib's cached LaTeX renderings so that labels using
# custom macros (e.g. \system{}) are re-rendered — confirm in utils.plotting_defaults.
delete_tex_cache()

## Data Loading

In [3]:
# Build one record per execution log. The run configuration is taken from the first four
# dash-separated components of the file name: dataset, query set, approach, execution mode.
execution_list = []
for logfile in Path("../logs/runtime_benchmark/execution/").iterdir():
    name_parts = logfile.stem.split("-")
    record = parse_logs_wide(logfile)

    for position, column in enumerate(("dataset", "query_set", "approach", "execution")):
        record[column] = name_parts[position]

    execution_list.append(record)

execution_columns = [
    "dataset",
    "query_set",
    "approach",
    "execution",
    "query_collection_time",
]
execution = pd.DataFrame(execution_list, columns=execution_columns)

# Only the "single" and "single_suppressed" execution modes are analysed below.
execution = execution[execution["execution"].isin(["single", "single_suppressed"])]

In [4]:
# Build one record per low-selectivity log. The run configuration is taken from the first
# four dash-separated components of the file name: dataset, query set, approach, execution mode.
low_selectivity_list = []
for logfile in Path("../logs/runtime_benchmark/low_selectivity/").iterdir():
    name_parts = logfile.stem.split("-")
    record = parse_logs_wide(logfile)

    for position, column in enumerate(("dataset", "query_set", "approach", "execution")):
        record[column] = name_parts[position]

    low_selectivity_list.append(record)

low_selectivity_columns = [
    "dataset",
    "query_set",
    "approach",
    "execution",
    "query_collection_time",
]
low_selectivity = pd.DataFrame(low_selectivity_list, columns=low_selectivity_columns)

# Only the "single" and "single_suppressed" execution modes are analysed below.
low_selectivity = low_selectivity[
    low_selectivity["execution"].isin(["single", "single_suppressed"])
]

In [5]:
# Index trace logs are parsed in long format: each log yields a sequence of entries, and
# every entry is tagged with the dataset and index type taken from the file name.
index_trace_list = []
for logfile in Path("../logs/runtime_benchmark/index_trace/").iterdir():
    name_parts = logfile.stem.split("-")
    entries = parse_logs_long(logfile)

    for entry in entries:
        entry["dataset"] = name_parts[0]
        entry["index_type"] = name_parts[1]

    index_trace_list.extend(entries)

index_trace_columns = ["dataset", "index_type", "metric", "value"]
index_trace = pd.DataFrame(index_trace_list, columns=index_trace_columns)

In [6]:
# Indexing logs are only used when their file name has exactly four dash-separated
# components; all other files in the directory are ignored.
indexing_list = []
for logfile in Path("../logs/runtime_benchmark/indexing/").iterdir():
    name_parts = logfile.stem.split("-")
    if len(name_parts) == 4:
        record = parse_logs_wide(logfile)

        # The third component fuses the parameter and its value: the first character is
        # the parameter name, the remainder its integer value.
        parameter_spec = name_parts[2]
        record["dataset"] = name_parts[0]
        record["phase"] = name_parts[1]
        record["parameter"] = parameter_spec[0]
        record["parameter_value"] = int(parameter_spec[1:])

        indexing_list.append(record)

indexing_columns = [
    "dataset",
    "phase",
    "parameter",
    "parameter_value",
    "total_time",
]
indexing = pd.DataFrame(indexing_list, columns=indexing_columns)

## Analysis

### Approach Comparison

In [7]:
# Mean and standard deviation of the query collection time for every
# (query set, dataset, approach, execution mode) combination.
execution_group_keys = ["query_set", "dataset", "approach", "execution"]
execution.groupby(by=execution_group_keys).agg({"query_collection_time": ["mean", "std"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,query_collection_time,query_collection_time
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std
query_set,dataset,approach,execution,Unnamed: 4_level_2,Unnamed: 5_level_2
collection,gittables,binsort,single,7905.69251,26.340778
collection,gittables,conversion,single,366.622109,1.667505
collection,gittables,conversion,single_suppressed,0.286408,0.003362
collection,gittables,iterative,single,48309.523395,1074.502817
collection,gittables,rebinning,single,283.915788,2.610721
collection,gittables,rebinning,single_suppressed,0.28587,0.001682
collection,open_data_usa,binsort,single,108.8449,5.007806
collection,open_data_usa,conversion,single,3.911224,0.570307
collection,open_data_usa,conversion,single_suppressed,0.673464,0.010704
collection,open_data_usa,iterative,single,654.482206,26.154671


In [8]:
height = 0.8

# One entry per bar: the (approach, execution) group whose mean runtime the bar shows,
# followed by the bar's position and styling. Groups are looked up by label instead of by
# row position in the aggregated table, so a missing or additional group raises a KeyError
# rather than silently shifting every bar to a different approach.
palette = sns.color_palette()
collection_bars = [
    (("iterative", "single"), {"x": 0, "color": palette[0], "label": r"\pscan{}"}),
    (("binsort", "single"), {"x": 0.75, "color": palette[1], "label": r"\binsort{}"}),
    (
        ("rebinning", "single"),
        {"x": 1.5, "color": palette[2], "hatch": "////", "label": r"\system{} w/ results"},
    ),
    (
        ("rebinning", "single_suppressed"),
        {"x": 2.05, "color": palette[2], "hatch": "oooo", "label": r"\system{} w/o results"},
    ),
]

for dataset in ["sportstables", "open_data_usa", "gittables"]:
    fig, ax = plt.subplots(1, 1, figsize=(1.2, height))
    mean_time = (
        execution[(execution["dataset"] == dataset) & (execution["query_set"] == "collection")]
        .groupby(["approach", "execution"])["query_collection_time"]
        .mean()
    )

    # Only the mean runtimes are drawn; no error bars are plotted.
    for group, bar_style in collection_bars:
        ax.bar(height=mean_time.loc[group], width=0.5, edgecolor="black", **bar_style)

    ax.set_xticks([])
    ax.set_yscale("log")
    # Add headroom above the tallest bar for the value labels placed by autolabel_bars.
    if dataset == "gittables":
        ax.set_ylim(top=ax.get_ylim()[1] * 4)
    else:
        ax.set_ylim(top=ax.get_ylim()[1] * 2.5)
    autolabel_bars(ax, precision=3, decimal_precision=2)

    plt.tight_layout(pad=1.02)
    plt.savefig(
        f"plots/runtime_benchmark/query_collection_{dataset}.pdf",
        bbox_inches="tight",
        pad_inches=0.01,
    )

    # The y-label is exported as its own file: it is added only after the plot itself has
    # been saved, and the figure is then saved again cropped to the strip ending at x = 0.07.
    # The same file name is used for every dataset, so the last iteration's version remains.
    ax.set_ylabel("Time (s)")
    bbox = fig.get_tightbbox()
    label_bbox = Bbox(((bbox.x0, bbox.y0), (0.07, bbox.y1)))
    plt.savefig("plots/runtime_benchmark/query_collection_ylabel.pdf", bbox_inches=label_bbox)
    plt.close(fig)

# Build the shared legend from the bars of the last figure. The two \system{} bars are
# turned into three entries: a plain colour swatch for \system{} and two white, hatched
# swatches that explain the "w/ results" (////) and "w/o results" (oooo) hatching.
handles, labels = ax.get_legend_handles_labels()  # type: ignore
handles.append(copy.deepcopy(handles[-1]))
handles[2].patches[0].set_hatch("")  # type: ignore
handles[3].patches[0].set_facecolor("white")  # type: ignore
handles[3].patches[0].set_hatch("////")  # type: ignore
handles[4].patches[0].set_facecolor("white")  # type: ignore
labels = labels[:2] + [r"\system{}", "w/ results", "w/o results"]

plot_legend(
    "plots/runtime_benchmark/query_collection_legend.pdf", handles=handles, labels=labels, ncol=5
)

### Low Selectivity Comparison

In [9]:
height = 0.8
colors = sns.color_palette()

# One entry per bar: row of the aggregated (approach, execution) table, then the keyword
# arguments that position and style the bar.
# NOTE(review): the row numbers assume the groups sort as binsort/single (0),
# conversion/single (1), conversion/single_suppressed (2), iterative/single (3),
# rebinning/single (4), rebinning/single_suppressed (5) — TODO confirm for these logs.
bar_layout = [
    (3, {"x": 0, "color": colors[0], "label": r"\pscan{}"}),
    (0, {"x": 0.75, "color": colors[1], "label": r"\binsort{}"}),
    (4, {"x": 1.5, "color": colors[2], "hatch": "////", "label": r"\system{} w/ results"}),
    (5, {"x": 2.05, "color": colors[2], "hatch": "oooo", "label": r"\system{} w/o results"}),
]

for dataset in ["sportstables", "open_data_usa", "gittables"]:
    fig, ax = plt.subplots(1, 1, figsize=(1.2, height))

    is_dataset = low_selectivity["dataset"] == dataset
    is_collection = low_selectivity["query_set"] == "collection"
    runtime_stats = (
        low_selectivity[is_dataset & is_collection]
        .groupby(["approach", "execution"])
        .agg({"query_collection_time": ["mean", "std"]})
        .values
    )

    # Column 0 of each row holds the mean; the standard deviation (column 1) is not drawn.
    for row, bar_style in bar_layout:
        ax.bar(height=runtime_stats[row][0], width=0.5, edgecolor="black", **bar_style)

    ax.set_xticks([])
    ax.set_yscale("log")
    # Extend the upper limit to leave room for the value labels above the bars.
    headroom = 4 if dataset == "gittables" else 2.5
    ax.set_ylim(top=ax.get_ylim()[1] * headroom)
    autolabel_bars(ax, precision=3, decimal_precision=2)

    plt.tight_layout(pad=1.02)
    plt.savefig(
        f"plots/runtime_benchmark/low_selectivity_{dataset}.pdf",
        bbox_inches="tight",
        pad_inches=0.01,
    )

    # Export the y-label separately: add it after the plot was saved and store the figure
    # again, cropped to the strip that ends at x = 0.07.
    ax.set_ylabel("Time (s)")
    tight_bbox = fig.get_tightbbox()
    ylabel_strip = Bbox(((tight_bbox.x0, tight_bbox.y0), (0.07, tight_bbox.y1)))
    plt.savefig("plots/runtime_benchmark/low_selectivity_ylabel.pdf", bbox_inches=ylabel_strip)
    plt.close()

# Legend entries are derived from the last figure's bars: a plain swatch for \system{} plus
# two white swatches carrying the "w/ results" and "w/o results" hatch patterns.
handles, labels = ax.get_legend_handles_labels()  # type: ignore
handles.append(copy.deepcopy(handles[-1]))
system_swatch = handles[2].patches[0]  # type: ignore
with_results_swatch = handles[3].patches[0]  # type: ignore
without_results_swatch = handles[4].patches[0]  # type: ignore
system_swatch.set_hatch("")
with_results_swatch.set_facecolor("white")
with_results_swatch.set_hatch("////")
without_results_swatch.set_facecolor("white")
labels = labels[:2] + [r"\system{}", "w/ results", "w/o results"]

plot_legend(
    "plots/runtime_benchmark/low_selectivity_legend.pdf", handles=handles, labels=labels, ncol=5
)

### Query Runtime Breakdown

In [10]:
# Mean and standard deviation of every traced metric, per dataset.
trace_groups = index_trace.groupby(by=["dataset", "metric"])
trace_groups.agg({"value": ["mean", "std"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
dataset,metric,Unnamed: 2_level_2,Unnamed: 3_level_2
gittables,execution_time,0.390522,0.005917
gittables,query_bin_search_time,2.1e-05,1.6e-05
gittables,query_boostrap_time,5e-06,1e-06
gittables,query_cluster_skip_time,0.003754,0.010311
gittables,query_hist_search_time,7.8e-05,4e-05
gittables,query_result_update_time,0.002609,0.003235
gittables,query_time,0.390405,0.005918
gittables,total_time,17.58367,3.628835
open_data_usa,execution_time,0.033587,0.005685
open_data_usa,query_bin_search_time,1.8e-05,1e-05


In [11]:
height = 1.2
fig, ax = plt.subplots(1, 1, figsize=(1.79, height))

# Components of the stacked bar, bottom to top: (metric name in the logs, legend label).
# Metrics are looked up by name instead of by row position in the aggregated table, so a
# missing metric raises a KeyError rather than silently stacking the wrong values.
# "query_boostrap_time" is spelled exactly as the metric appears in the logs.
breakdown_components = [
    ("query_boostrap_time", "Bootstrap"),
    ("query_bin_search_time", "Bin search"),
    ("query_hist_search_time", "Histogram search"),
    ("query_result_update_time", "Result update"),
    ("query_cluster_skip_time", "Cluster skip"),
]
palette = sns.color_palette()

for i, dataset in enumerate(["sportstables", "open_data_usa", "gittables"]):
    metric_means = index_trace[index_trace["dataset"] == dataset].groupby("metric")["value"].mean()

    bottom = 0
    for j, (metric, component_label) in enumerate(breakdown_components):
        component_time = metric_means.loc[metric]
        ax.bar(
            x=i * 0.75,
            height=component_time,
            width=0.5,
            bottom=bottom,
            color=palette[j],
            edgecolor="black",
            # Label only the first dataset's segments so each component appears once
            # in the legend.
            label=component_label if i == 0 else "",
        )
        bottom += component_time

    # NOTE: The gap between the stacked components and query_time (overhead) is not
    # plotted because it is caused by logging and not the actual computation.

    # Print the summed component time above each stacked bar.
    ax.annotate(
        f"{bottom:.4f}",
        xy=(i * 0.75, bottom),  # type: ignore
        xytext=(0, 1),  # 1 point vertical offset
        textcoords="offset points",
        fontsize=6 * 0.8,
        ha="center",
        va="bottom",
    )

ax.set_xticks([0, 0.75, 1.5], ["ST", "OD", "GT"])
# Lower limit: half of the smallest per-(dataset, metric) mean; upper limit: extra room
# above the bars for the legend.
ax.set_ylim(
    (index_trace.groupby(["dataset", "metric"]).mean(numeric_only=True).min()).item() / 2,
    ax.get_ylim()[1] * 5,
)
ax.set_yscale("log")
# Reverse the legend so its entries read top-down in the same order as the stacked segments.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], loc="upper left", fontsize="x-small")
sns.despine()
# The bounding box is captured before tight_layout and handed to plot_ylabel below.
bbox = fig.get_tightbbox()

plt.tight_layout(pad=1.02)
plt.savefig("plots/runtime_benchmark/runtime_breakdown.pdf", bbox_inches="tight", pad_inches=0.01)
plt.close(fig)

# The y-label is exported as a separate file.
plot_ylabel(
    "plots/runtime_benchmark/runtime_breakdown_ylabel.pdf", "Time (s)", bbox, height, width=0.17
)

### Index Construction Time

In [12]:
# Limit how many rows pandas renders (this option stays in effect for later cells as well).
pd.set_option("display.max_rows", 20)

# Mean and standard deviation of the total time for the runs that vary parameter "k",
# per dataset, phase and parameter value.
cluster_sweep = indexing[indexing["parameter"] == "k"]
cluster_sweep.groupby(["dataset", "phase", "parameter_value"]).agg(
    {"total_time": ["mean", "std"]}
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_time,total_time
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std
dataset,phase,parameter_value,Unnamed: 3_level_2,Unnamed: 4_level_2
gittables,clustering,50,102.437872,2.187090
gittables,clustering,60,101.176613,0.946150
gittables,clustering,70,102.382302,1.965855
gittables,clustering,80,101.825904,2.216701
gittables,clustering,90,103.552862,2.861398
...,...,...,...,...
sportstables,rebinning,160,7.542296,0.224309
sportstables,rebinning,170,7.325124,0.163796
sportstables,rebinning,180,7.526771,0.296109
sportstables,rebinning,190,7.113499,0.212321


In [13]:
height = 1.1

# x values per dataset and parameter ("k": number of clusters, "b": bin budget).
# Keying the grids by dataset name replaces the former positional `(...)[i]` selection,
# whose index `i` was also reused (and overwritten) by the inner phase loop.
# NOTE(review): the plotted values are the first len(x) rows of the aggregate sorted by
# parameter_value; this assumes those rows correspond exactly to the values listed
# here — TODO confirm against the logs.
cluster_grid = [1, 2, 5, *range(10, 201, 10)]
budget_grid = [1000, 5000, 10000, 50000, 100000, 500000, 1000000]
parameter_grids = {
    "sportstables": {"k": cluster_grid, "b": budget_grid},
    "open_data_usa": {"k": cluster_grid, "b": budget_grid},
    "gittables": {
        "k": [*range(50, 250, 10), *range(250, 1001, 50)],
        "b": [10000, 50000, 100000, 500000, 1000000],
    },
}

for dataset, grids in parameter_grids.items():
    for param, xlabel in [("k", "Number of clusters"), ("b", "Bin budget")]:
        x = grids[param]
        fig, ax = plt.subplots(1, 1, figsize=(1.77, height))

        # One line (mean) with a shaded band (mean ± std) per construction phase.
        for phase_idx, (phase, label) in enumerate(
            [
                ("clustering", "Clustering"),
                ("rebinning", "Rebinning"),
                ("conversion", "Conversion"),
            ]
        ):
            stats = (
                indexing[
                    (indexing["dataset"] == dataset)
                    & (indexing["parameter"] == param)
                    & (indexing["phase"] == phase)
                ]
                .groupby(["parameter_value"])
                .agg({"total_time": ["mean", "std"]})
                .values
            )
            mean = stats[: len(x), 0]
            std = stats[: len(x), 1]
            color = sns.color_palette()[phase_idx]

            ax.plot(x, mean, color=color, label=label)
            ax.fill_between(
                x,
                mean - std,
                mean + std,
                alpha=0.1,
                edgecolor="white",
                color=color,
            )

        if param == "b":
            ax.set_xscale("log")
            ax.set_yscale("log")
        if param == "k":
            ax.set_xticks([50, 500, 1000])
        ax.set_xlabel(xlabel)
        ax.set_xlim(min(x), max(x))
        # Kept for the shared legend that is exported after the loops.
        handles, labels = ax.get_legend_handles_labels()

        sns.despine()
        plt.tight_layout(pad=1.02)
        # Kept for the separate y-label export after the loops.
        bbox = fig.get_tightbbox()
        plt.savefig(
            f"plots/runtime_benchmark/construction_{dataset}_{param}.pdf",
            bbox_inches="tight",
            pad_inches=0.01,
        )
        plt.close(fig)

# Legend and y-label are exported once, using the handles, labels and bounding box of the
# last figure produced above.
plot_legend(
    "plots/runtime_benchmark/construction_legend.pdf",
    handles=handles,  # type: ignore
    labels=labels,  # type: ignore
    ncol=5,
)
plot_ylabel(
    "plots/runtime_benchmark/construction_ylabel.pdf",
    "Time (s)",
    bbox,
    height,
    width=0.12,
    xlabel="Number of cluster",
)  # type: ignore

**NOTE:** Bin alignment is incremental, clustering is not.