# Microbenchmarks

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from utils.plotting_defaults import delete_tex_cache, parse_logs_wide, set_style

from fainder.utils import configure_run, load_input

# NOTE(review): "WARNING" is presumably the log level for this run — confirm in fainder.utils.
configure_run("WARNING")
# Plot styling comes from the shared helper in utils.plotting_defaults.
set_style()

# Make sure the output directory for the figures exists before anything is saved.
plot_dir = Path("plots/microbenchmarks")
plot_dir.mkdir(parents=True, exist_ok=True)

In [2]:
# NOTE(review): presumably removes cached LaTeX rendering artifacts so that the
# figures below are typeset from scratch — confirm in utils.plotting_defaults.
delete_tex_cache()

## Data Loading

In [3]:
# Load the query runtime logs: one record per log file, afterwards averaged over
# all files that share the same configuration.
runtime_keys = ["dataset", "index_type", "parameter", "parameter_value", "execution"]

runtime_list = []
for log_path in Path("../logs/microbenchmarks/runtime/").iterdir():
    # The configuration is encoded in the file name, separated by "-":
    # <dataset>-<index_type>-<parameter letter><parameter value>-<execution>
    name_parts = log_path.stem.split("-")
    # NOTE(review): parse_logs_wide presumably returns one dict-like record with the
    # measured columns (e.g. query_collection_time) — confirm in utils.plotting_defaults.
    record = parse_logs_wide(log_path)

    param_token = name_parts[2]
    record["dataset"] = name_parts[0]
    record["index_type"] = name_parts[1]
    record["parameter"] = param_token[0]
    record["parameter_value"] = int(param_token[1:])
    record["execution"] = name_parts[3]

    runtime_list.append(record)

# Keep only the configuration keys and the two measurements that are analysed below.
runtime_raw = pd.DataFrame(
    runtime_list,
    columns=[*runtime_keys, "query_collection_time", "avg_result_size"],
)
runtime = runtime_raw.groupby(runtime_keys).mean().reset_index()

In [4]:
# Load the index sizes logged during indexing; only logs of the "rebinning" phase
# are used, and files with the same configuration are averaged.
size_keys = ["dataset", "phase", "parameter", "parameter_value"]

size_list = []
for log_path in Path("../logs/microbenchmarks/indexing/").iterdir():
    # File name layout: <dataset>-<phase>-<parameter letter><parameter value>[-...]
    name_parts = log_path.stem.split("-")
    phase = name_parts[1]
    if phase != "rebinning":
        # Skip before parsing so that unrelated logs are never read.
        continue

    record = parse_logs_wide(log_path)

    param_token = name_parts[2]
    record["dataset"] = name_parts[0]
    record["phase"] = phase
    record["parameter"] = param_token[0]
    record["parameter_value"] = int(param_token[1:])

    size_list.append(record)

# Restrict the frame to the configuration keys plus the measured index size.
index_size_raw = pd.DataFrame(size_list, columns=[*size_keys, "index_size"])
index_size = index_size_raw.groupby(size_keys).mean().reset_index()

In [None]:
# Load the accuracy logs into long format (one row per recorded value) and average
# the values per configuration, index mode and metric.
#
# The i-th entry of each "*_mode_metrics" list is matched with metrics[i]; a log with
# more entries than names in `metrics` fails loudly with an IndexError.
metrics = ["precision", "recall", "f1", "pruning_factor"]
accuracy_list = []

for logfile in Path("../logs/microbenchmarks/accuracy/").iterdir():
    logs = load_input(logfile)
    config = logfile.stem.split("-")

    # The configuration only depends on the file name, so parse it once per file
    # instead of once for every single recorded value.
    run_config = {
        "dataset": config[0],
        "index_type": config[1],
        "parameter": config[2][0],
        "parameter_value": int(config[2][1:]),
    }

    for mode, mode_data in [
        ("recall", logs["recall_mode_metrics"]),
        ("precision", logs["precision_mode_metrics"]),
    ]:
        for i, values in enumerate(mode_data):
            metric = metrics[i]
            accuracy_list.extend(
                {**run_config, "index_mode": mode, "metric": metric, "value": value}
                for value in values
            )

accuracy = pd.DataFrame(accuracy_list)
accuracy = (
    accuracy.groupby(
        ["dataset", "index_type", "index_mode", "parameter", "parameter_value", "metric"]
    )
    .agg({"value": "mean"})
    .reset_index()
)

## Analysis

In [6]:
def prepare_plot(dataset, param, runtime, index_size, accuracy, size_scale=1.0):
    """Plot runtime, index size and F1 score against one parameter on three y-axes.

    The three input frames are filtered to ``dataset`` and ``param``. The index
    size is additionally restricted to the "rebinning" phase and the accuracy to
    the F1 metric in "recall" index mode.

    Args:
        dataset: Value of the "dataset" column to select.
        param: Value of the "parameter" column to select.
        runtime: Frame with "execution", "parameter_value" and
            "query_collection_time" columns.
        index_size: Frame with "phase", "parameter_value" and "index_size" columns.
        accuracy: Frame with "index_type", "index_mode", "metric",
            "parameter_value" and "value" columns.
        size_scale: Factor the index size is multiplied with before plotting.

    Returns:
        ``(fig, (ax1, ax2, ax3))`` where ``ax1`` holds the runtime lines, ``ax2``
        the index size and ``ax3`` the F1 score (scaled by 100). Labels, limits
        and the legend are left to the caller.
    """
    # NOTE(review): runtime rows are not filtered by "index_type"; if runtime logs
    # for several index types exist they end up in the same line — confirm.
    runtime_rows = runtime[(runtime["dataset"] == dataset) & (runtime["parameter"] == param)]

    size_mask = (
        (index_size["dataset"] == dataset)
        & (index_size["parameter"] == param)
        & (index_size["phase"] == "rebinning")
    )
    size_rows = index_size[size_mask]

    f1_mask = (
        (accuracy["dataset"] == dataset)
        & (accuracy["parameter"] == param)
        & (accuracy["index_mode"] == "recall")
        & (accuracy["metric"] == "f1")
    )
    f1_rows = accuracy[f1_mask]

    fig, ax1 = plt.subplots(figsize=(3.5, 1.2), layout="constrained")
    ax2 = ax1.twinx()
    ax3 = ax1.twinx()

    # Push the right spine of ax3 outwards so it does not overlap the one of ax2.
    ax3.spines.right.set_position(("axes", 1.2))

    # One colour per quantity; line styles distinguish the variants of a quantity.
    palette = sns.color_palette()
    runtime_color, size_color, f1_color = palette[0], palette[1], palette[2]

    # Runtime: solid line with result collection, dashed line with suppressed results.
    for execution, label, line_kwargs in (
        ("single", "w/ results", {}),
        ("single_suppressed", "w/o results", {"linestyle": "--"}),
    ):
        subset = runtime_rows[(runtime_rows["execution"] == execution)]
        ax1.plot(
            subset["parameter_value"],
            subset["query_collection_time"],
            color=runtime_color,
            label=label,
            **line_kwargs,
        )

    # Index size
    ax2.plot(
        size_rows["parameter_value"],
        size_rows["index_size"] * size_scale,
        color=size_color,
        label="Index size",
    )

    # Accuracy: F1 as a percentage for both index types.
    for index_type, label, line_kwargs in (
        ("rebinning", r"Low mem.", {}),
        ("conversion", r"Full rec.", {"linestyle": "--"}),
    ):
        subset = f1_rows[(f1_rows["index_type"] == index_type)]
        ax3.plot(
            subset["parameter_value"],
            subset["value"] * 100,
            color=f1_color,
            label=label,
            **line_kwargs,
        )

    return fig, (ax1, ax2, ax3)

In [7]:
# Open Data USA, parameter "k" (number of clusters).
fig, axes = prepare_plot("open_data_usa", "k", runtime, index_size, accuracy)
ax1, ax2, ax3 = axes

# Runtime axis (left)
ax1.set_xlabel("Number of clusters")
ax1.set_xlim((1, 1000))
ax1.set_ylabel("Time (s)")
ax1.set_yscale("log")

# Index size axis (first right axis)
ax2.set_ylabel("Index size (MB)")
ax2.set_yscale("log")

# Accuracy axis (second right axis)
ax3.set_ylabel(r"$F_1$ score (\%)")
ax3.set_ylim((0, 100))

ax1.set_xticks([1, 100, 200, 400, 600, 800, 1000])
ax1.set_yticks([0.1, 1], ["0.1", "1"])

# Colour each y-axis label like the lines drawn on that axis.
palette = sns.color_palette()
for position, axis in enumerate((ax1, ax2, ax3)):
    axis.yaxis.label.set_color(palette[position])

fig.legend(loc="upper center", bbox_to_anchor=(0.5, 1.1), ncol=5)

fig.savefig("plots/microbenchmarks/open_data_usa_k.pdf", bbox_inches="tight", pad_inches=0.01)
plt.close(fig)

In [8]:
# GitTables, parameter "k" (number of clusters).
# size_scale=0.001 rescales the logged index sizes to fit the "GB" axis label
# (presumably the logs are in MB — confirm).
fig, (ax1, ax2, ax3) = prepare_plot(
    "gittables", "k", runtime, index_size, accuracy, size_scale=0.001
)

# The x-axis covers 100 to 1000 for this dataset. The limit is set exactly once;
# previously a (1, 1000) limit was set first and then immediately overridden.
ax1.set(xlabel="Number of clusters", xlim=(100, 1000), ylabel="Time (s)", yscale="log")
ax2.set(ylabel="Index size (GB)")
ax3.set(ylabel=r"$F_1$ score (\%)", ylim=(0, 100))

# Use a smaller offset for the third y-axis than the 1.2 chosen in prepare_plot.
ax3.spines.right.set_position(("axes", 1.14))

# Colour each y-axis label like the lines drawn on that axis.
ax1.yaxis.label.set_color(sns.color_palette()[0])
ax2.yaxis.label.set_color(sns.color_palette()[1])
ax3.yaxis.label.set_color(sns.color_palette()[2])

fig.legend(loc="upper center", bbox_to_anchor=(0.5, 1.1), ncol=5)

# Save and close the figure explicitly instead of relying on the "current figure".
fig.savefig("plots/microbenchmarks/gittables_k.pdf", bbox_inches="tight", pad_inches=0.01)
plt.close(fig)

In [9]:
# Open Data USA, parameter "b" (bin budget), plotted on a logarithmic x-axis.
fig, axes = prepare_plot("open_data_usa", "b", runtime, index_size, accuracy)
ax1, ax2, ax3 = axes

# Runtime axis (left); same order of property changes as a single ax.set(...) call.
ax1.set_xlabel("Bin Budget")
ax1.set_xlim((100, 1000000))
ax1.set_ylabel("Time (s)")
ax1.set_xscale("log")
ax1.set_yscale("log")

# Index size axis (first right axis)
ax2.set_ylabel("Index size (MB)")
ax2.set_yscale("log")

# Accuracy axis (second right axis)
ax3.set_ylabel(r"$F_1$ score (\%)")
ax3.set_ylim((0, 100))

ax1.set_yticks([1], ["1"])

# Colour each y-axis label like the lines drawn on that axis.
palette = sns.color_palette()
for position, axis in enumerate((ax1, ax2, ax3)):
    axis.yaxis.label.set_color(palette[position])

fig.legend(loc="upper center", bbox_to_anchor=(0.5, 1.1), ncol=5)

fig.savefig("plots/microbenchmarks/open_data_usa_b.pdf", bbox_inches="tight", pad_inches=0.01)
plt.close(fig)