# Plots for the Talk

**Note:** This notebook only contains copied and slightly adjusted code from the original analyses.

In [1]:
import shutil
from itertools import chain
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import markers
from matplotlib.axes import Axes

from fainder.utils import load_input, configure_run
from utils.plotting_defaults import autolabel_bars, parse_logs_wide, parse_logs_special

# Configure the run through the project helper (invoked with "WARNING").
configure_run("WARNING")

# Every figure in this notebook is saved below ./talk, so make sure it exists.
talk_dir = Path("talk")
talk_dir.mkdir(parents=True, exist_ok=True)

In [2]:
# Remove matplotlib's tex.cache directory from the user's cache folder.
tex_cache_dir = Path.home() / ".cache" / "matplotlib" / "tex.cache"
try:
    shutil.rmtree(tex_cache_dir)
except FileNotFoundError:
    # No cache directory present, so there is nothing to clean up.
    pass

In [3]:
# Global seaborn/matplotlib theme shared by every figure in this notebook.
# The trailing "# <number>" comments record reference values for the settings being
# overridden; commented-out entries are left at those values.
# NOTE(review): the reference numbers are presumably the defaults of the "talk" context;
# confirm before relying on them.
sns.set_theme(
    context="talk",
    style="ticks",
    palette="colorblind",
    color_codes=True,
    font_scale=1.0,
    rc={
        "axes.linewidth": 1.5,  # 1.875
        "font.family": "Carlito",  # free alternative to Calibri
        # "grid.linewidth": 1.5,  # 1.5
        # "lines.linewidth": 9,  # 9
        # "lines.markersize": 12,  # 12
        # "patch.linewidth": 1.5,  # 1.5
        "savefig.pad_inches": 0.01,
        "savefig.transparent": True,
        "svg.fonttype": "none",
        # "xtick.major.pad": 2.0,  # 3.5
        "xtick.major.size": 10,  # 9
        "xtick.major.width": 2.0,  # 1.875
        # "xtick.minor.pad": 2.0,  # 3.4
        "xtick.minor.size": 6,  # 6
        "xtick.minor.width": 1.5,  # 1.5
        # "ytick.major.pad": 2.0,  # 3.5
        "ytick.major.size": 10,  # 9
        "ytick.major.width": 2.0,  # 1.875
        # "ytick.minor.pad": 2.0,  # 3.4
        "ytick.minor.size": 6,  # 6
        "ytick.minor.width": 1.5,  # 1.5
    },
)

## Runtime Comparison

In [4]:
# Collect one record per execution log. The file stem is split on "-" and its first four
# tokens are stored as dataset, query set, approach and execution mode.
execution_list = []
for logfile in Path("../logs/runtime_benchmark/execution/").iterdir():
    name_parts = logfile.stem.split("-")
    record = parse_logs_wide(logfile)
    for position, field in enumerate(("dataset", "query_set", "approach", "execution")):
        record[field] = name_parts[position]
    execution_list.append(record)

execution = pd.DataFrame(
    execution_list,
    columns=["dataset", "query_set", "approach", "execution", "query_collection_time"],
)

# Only the "single" and "single_suppressed" execution modes are plotted.
is_single = execution["execution"] == "single"
is_single_suppressed = execution["execution"] == "single_suppressed"
execution = execution[is_single | is_single_suppressed]

In [5]:
# Bars drawn in every runtime-comparison figure, in drawing order:
# (row of the aggregated table, x position, palette index, extra bar styling).
# The row numbers are positional indices into the (approach, execution) groups produced by
# the groupby below.
# NOTE(review): they silently pick different groups if the set of logged approaches or
# execution modes changes; verify against the log directory.
runtime_bars = [
    (3, 0, 0, {"label": r"\pscan{}"}),
    (0, 0.75, 1, {"label": r"\binsort{}"}),
    (4, 1.5, 2, {"hatch": "////", "label": r"\system{} w/ results"}),
    (5, 2.05, 2, {"hatch": "oooo", "label": r"\system{} w/o results"}),
]

# One figure per dataset, saved as talk/runtime_comparison_<dataset>.svg.
for dataset in ["sportstables", "open_data_usa", "gittables"]:
    _, ax = plt.subplots(1, 1, figsize=(2.4, 2.5), layout="constrained")

    # Column 0 holds the mean and column 1 the std of query_collection_time per
    # (approach, execution) group; only the mean is drawn.
    data = (
        execution[(execution["dataset"] == dataset) & (execution["query_set"] == "collection")]
        .groupby(["approach", "execution"])
        .agg({"query_collection_time": ["mean", "std"]})
        .values
    )

    palette = sns.color_palette()
    for row, x_pos, color_idx, bar_style in runtime_bars:
        ax.bar(
            x=x_pos,
            height=data[row][0],
            width=0.5,
            color=palette[color_idx],
            edgecolor="black",
            **bar_style,
        )

    ax.set_xticks([])
    ax.set_yscale("log")
    # Extend the upper limit so the bar labels added below fit; gittables gets a larger factor.
    headroom = 2.25 if dataset == "gittables" else 1.75
    ax.set_ylim(top=ax.get_ylim()[1] * headroom)
    autolabel_bars(ax, precision=3, decimal_precision=2)

    plt.savefig(f"talk/runtime_comparison_{dataset}.svg", bbox_inches="tight")
    plt.close()

## Skyline Analysis

In [6]:
datasets = ["sportstables", "open_data_usa", "gittables"]
queries = ["low_selectivity", "mid_selectivity", "high_selectivity"]

# Each row: dataset, approach, query set, precise time, total time, then the means of the
# first four entries of the logged metrics (stored below as precision, recall, f1 and
# pruning_factor).
baseline_list = []
for dataset in datasets:
    for query_set in queries:
        # Baselines: the key prefix inside the accuracy log differs from the approach name.
        for approach, metric_name in [
            ("pscan", "hist"),
            ("ndist", "dist"),
            ("binsort", "binsort"),
        ]:
            log_stem = f"../logs/accuracy_benchmark/baseline_comp/{dataset}-{approach}-{query_set}"
            acc_logs = load_input(f"{log_stem}.zst")
            perf_logs = parse_logs_wide(f"{log_stem}.log")
            approach_metrics = acc_logs[f"{metric_name}_metrics"]
            baseline_list.append(
                [
                    dataset,
                    approach,
                    query_set,
                    perf_logs["query_collection_time"],
                    acc_logs[f"{metric_name}_time"],
                    *(np.mean(approach_metrics[k]) for k in range(4)),
                ]
            )

        # Fainder rebinning and conversion: one row per index mode and query mode.
        for index_mode in ["rebinning", "conversion"]:
            log_stem = (
                f"../logs/accuracy_benchmark/baseline_comp/{dataset}-{index_mode}-{query_set}"
            )
            acc_logs = load_input(f"{log_stem}.zst")
            runtimes = parse_logs_special(f"{log_stem}.log")
            # runtimes[0] is paired with the precision mode and runtimes[1] with the recall mode.
            for mode_idx, mode in enumerate(["precision", "recall"]):
                mode_metrics = acc_logs[f"{mode}_mode_metrics"]
                baseline_list.append(
                    [
                        dataset,
                        f"{index_mode}-{mode}",
                        query_set,
                        runtimes[mode_idx],
                        acc_logs[f"{mode}_mode_time"],
                        *(np.mean(mode_metrics[k]) for k in range(4)),
                    ]
                )

        # Fainder exact: the summed phase times serve as both the precise and the total time.
        acc_logs = load_input(
            f"../logs/accuracy_benchmark/baseline_comp/{dataset}-exact-{query_set}.zst"
        )
        exact_time = (
            acc_logs["precision_time"] + acc_logs["recall_time"] + acc_logs["iterative_time"]
        )
        baseline_list.append(
            [
                dataset,
                "exact",
                query_set,
                exact_time,
                exact_time,
                1,  # Metrics not logged because approach is exact
                1,
                1,
                None,
            ]
        )

baseline_comp = pd.DataFrame(
    baseline_list,
    columns=[
        "dataset",
        "approach",
        "queries",
        "precise_time",
        "total_time",
        "precision",
        "recall",
        "f1",
        "pruning_factor",
    ],
)

In [7]:
def plot_baseline_scatter(dataset: str, metric: str, queries: str) -> None:
    """Save a runtime-versus-metric scatter plot of the approaches for one dataset.

    Reads the module-level ``baseline_comp`` frame, drops the two ``*-precision`` index
    variants, averages ``precise_time`` and ``metric`` per approach, and writes the figure
    to ``talk/scatter_{metric}_{dataset}_{queries}.svg``.

    Args:
        dataset: Value of the ``dataset`` column to plot.
        metric: Column of ``baseline_comp`` shown on the y axis (multiplied by 100).
        queries: Value of the ``queries`` column to restrict to, or ``"all"`` to average
            over every query set.
    """
    _, ax = plt.subplots(1, 1, figsize=(2.7, 2.8), layout="constrained")

    # Build the row filter step by step; the query-set restriction is optional.
    hidden_approaches = ["conversion-precision", "rebinning-precision"]
    selected = (baseline_comp["dataset"] == dataset) & ~baseline_comp["approach"].isin(
        hidden_approaches
    )
    if queries != "all":
        selected = selected & (baseline_comp["queries"] == queries)

    # The reindex fixes the row order so that points and colors line up below.
    plot_order = ["pscan", "ndist", "binsort", "exact", "rebinning-recall", "conversion-recall"]
    data = (
        baseline_comp[selected]
        .groupby(["approach"])
        .agg({"precise_time": "mean", metric: "mean"})
        .reindex(plot_order)
    )
    runtimes = data["precise_time"]
    scores = data[metric] * 100

    palette = sns.color_palette()
    leading_colors = palette[:5]
    last_color = palette[8]

    if dataset == "gittables":
        # For gittables the sixth approach is drawn separately with a left-pointing caret.
        ax.scatter(
            runtimes.iloc[:5],
            scores.iloc[:5],
            c=leading_colors,
            clip_on=False,
            marker=markers.CARETRIGHT,  # type: ignore
        )
        ax.scatter(
            runtimes.iloc[5],
            scores.iloc[5],
            color=last_color,
            clip_on=False,
            marker=markers.CARETLEFT,  # type: ignore
        )
    else:
        ax.scatter(
            runtimes,
            scores,
            c=leading_colors + [last_color],
            clip_on=False,
            marker=markers.CARETRIGHT,  # type: ignore
        )

    ax.grid(True, which="major", axis="y", linestyle="--", linewidth=0.5, alpha=0.3, color="gray")
    ax.set_xlabel("Time (s)")
    ax.set_xscale("log")
    ax.set_ylim(0, 100)
    sns.despine()

    plt.savefig(f"talk/scatter_{metric}_{dataset}_{queries}.svg", bbox_inches="tight")
    plt.close()

In [8]:
# F1 scatter plot over all query sets, one figure per dataset.
for scatter_dataset in ("sportstables", "open_data_usa", "gittables"):
    plot_baseline_scatter(scatter_dataset, "f1", "all")

## Fainder Exact

Binsort yields the same results as profile-scan, so we reuse the runtime measurements from the runtime benchmark for this plot.

In [9]:
# Baseline runtimes come from the runtime-benchmark logs whose names match the binsort and
# iterative "collection ... single" patterns.
path = Path("../logs/runtime_benchmark/execution/")
baseline_logs = chain(
    path.glob("*collection-binsort-single*"),
    path.glob("*collection-iterative-single*"),
)

baseline_list = []
for logfile in baseline_logs:
    name_parts = logfile.stem.split("-")
    record = parse_logs_wide(logfile)
    # The "-"-separated stem tokens are stored in this order.
    for position, field in enumerate(
        ("dataset", "query_set", "approach", "execution", "iteration")
    ):
        record[field] = name_parts[position]
    baseline_list.append(record)

# Keep only the merge keys and the runtime, then rename the runtime column and relabel the
# "iterative" approach as "pscan".
baselines = (
    pd.DataFrame(
        baseline_list,
        columns=["dataset", "approach", "iteration", "query_collection_time"],
    )
    .rename(columns={"query_collection_time": "baseline_time"})
    .replace({"approach": {"iterative": "pscan"}})
)

In [10]:
# Load the exact-results logs; each file stem must consist of exactly three "-"-separated
# tokens, stored as dataset, approach and iteration.
exact_list = []
for logfile in Path("../logs/exact_results/").glob("*.zst"):
    name_parts = logfile.stem.split("-")
    record = load_input(logfile)

    assert len(name_parts) == 3
    record["dataset"], record["approach"], record["iteration"] = (
        name_parts[0],
        name_parts[1],
        name_parts[2],
    )
    exact_list.append(record)

# Attach the matching baseline runtime and add the summed runtime of the three phases.
exact = (
    pd.DataFrame(
        exact_list,
        columns=[
            "dataset",
            "approach",
            "iteration",
            "precision_time",
            "recall_time",
            "iterative_time",
            "avg_reduction",
        ],
    )
    .merge(baselines, on=["dataset", "approach", "iteration"])
    .assign(
        exact_time=lambda df: df["precision_time"] + df["recall_time"] + df["iterative_time"]
    )
)

In [11]:
# Mean of every numeric column per (dataset, approach), plus the speedup of the exact
# runtime over the baseline runtime.
analysis = (
    exact.groupby(["dataset", "approach"])
    .mean(numeric_only=True)
    .assign(speedup=lambda means: means["baseline_time"] / means["exact_time"])
)

In [12]:
# One figure per dataset: the two baseline runtimes next to the stacked runtime of the exact
# approach, saved as talk/exact_<dataset>.svg.
# `handles` accumulates whatever `ax.bar` returns across all figures; nothing in this cell
# reads it afterwards.
handles = []
for dataset in ["sportstables", "open_data_usa", "gittables"]:
    _, ax = plt.subplots(figsize=(2.7, 2.8), layout="constrained")
    # First four colors of the active palette.
    colors = [sns.color_palette()[i] for i in range(4)]

    for i, baseline in enumerate(["pscan", "binsort"]):
        # Aggregated entry of `analysis` for this dataset/baseline. The `.item()` calls below
        # require it to contain exactly one row.
        data = analysis.query(f"dataset == '{dataset}' & approach == '{baseline}'")
        # Plain baseline bar at x = 0 (pscan) or x = 0.75 (binsort).
        handles += ax.bar(
            i * 0.75,
            data["baseline_time"],
            width=0.5,
            color=colors[i],
            edgecolor="black",
        )

        # Stacked bar at x = 1.5 (pscan) or x = 2.05 (binsort): recall time at the bottom,
        # then precision time, then the iterative time drawn in the baseline's own color.
        bottom = 0
        for j, time in [
            (3, data["recall_time"]),
            (2, data["precision_time"]),
            (i, data["iterative_time"]),
        ]:
            handles += ax.bar(
                1.5 + i * 0.55,
                time,
                bottom=bottom,
                width=0.5,
                color=colors[j],
                edgecolor="black",
            )
            bottom += time.item()

        # Write the total height above both bars; values above 100 are shown without decimals.
        for time, x in [
            (data["baseline_time"].item(), i * 0.75),
            (bottom, 1.5 + i * 0.55),
        ]:
            label = f"{time:.0f}" if time > 100 else f"{time:.1f}"
            ax.annotate(
                label,
                xy=(x, time),  # type: ignore
                xytext=(0, 1),
                fontsize=mpl.rcParams["font.size"] * 0.9,
                textcoords="offset points",
                ha="center",
                va="bottom",
            )

    # The third tick (1.775) is the midpoint of the two stacked bars at 1.5 and 2.05.
    ax.set_xticks([0, 0.75, 1.775])
    ax.set_xticklabels(["Full\nscan", "bin\nsort", "Fainder\nExact"])
    if dataset == "gittables":
        ax.set_ylim(200, 60000)
    else:
        # NOTE(review): `/ 2 % 10` evaluates as `(recall_time / 2) % 10`, i.e. the lower limit
        # is the remainder modulo 10 of half the pscan recall time. The intent is not evident
        # from the code; confirm it. A remainder of 0 would be a non-positive bound on the
        # log axis configured below.
        ax.set_ylim(
            analysis.query(f"dataset == '{dataset}' & approach == 'pscan'")["recall_time"].item()
            / 2
            % 10,
            ax.get_ylim()[1] * 1.19,
        )
    # The limits above are set before the axis is switched to log scale.
    ax.set_yscale("log")
    sns.despine()

    plt.savefig(f"talk/exact_{dataset}.svg", bbox_inches="tight")
    plt.close()

## Clustering Microbenchmark

In [13]:
# Load the runtime microbenchmark logs. The file stem is split on "-": dataset, index type,
# a parameter token (one-letter name followed by an integer value), execution mode.
runtime_list = []
for logfile in Path("../logs/microbenchmarks/runtime/").iterdir():
    config = logfile.stem.split("-")
    data = parse_logs_wide(logfile)

    data["dataset"] = config[0]
    data["index_type"] = config[1]
    data["parameter"] = config[2][0]
    data["parameter_value"] = int(config[2][1:])
    data["execution"] = config[3]

    runtime_list.append(data)

# The configuration columns double as the grouping keys; the measured columns are averaged
# over all logs that share a configuration.
runtime_keys = ["dataset", "index_type", "parameter", "parameter_value", "execution"]
runtime = pd.DataFrame(
    runtime_list,
    columns=[*runtime_keys, "query_collection_time", "avg_result_size"],
)
runtime = runtime.groupby(runtime_keys).mean().reset_index()

In [14]:
# Load the indexing microbenchmark logs, keeping only files whose second stem token is
# "rebinning". The third token is a one-letter parameter name followed by an integer value.
size_list = []
for logfile in Path("../logs/microbenchmarks/indexing/").iterdir():
    name_parts = logfile.stem.split("-")
    if name_parts[1] == "rebinning":
        record = parse_logs_wide(logfile)

        record["dataset"] = name_parts[0]
        record["phase"] = name_parts[1]
        record["parameter"] = name_parts[2][0]
        record["parameter_value"] = int(name_parts[2][1:])

        size_list.append(record)

# Average the index size over all logs that share a configuration.
index_size = (
    pd.DataFrame(
        size_list,
        columns=["dataset", "phase", "parameter", "parameter_value", "index_size"],
    )
    .groupby(["dataset", "phase", "parameter", "parameter_value"])
    .mean()
    .reset_index()
)

In [15]:
# Names assigned, by position, to the entries of each logged metrics sequence.
metrics = ["precision", "recall", "f1", "pruning_factor"]
accuracy_list = []

for logfile in Path("../logs/microbenchmarks/accuracy/").iterdir():
    logs = load_input(logfile)
    config = logfile.stem.split("-")

    # Fields derived from the file name are the same for every value in this file, so they
    # are parsed once here instead of once per value.
    file_fields = {
        "dataset": config[0],
        "index_type": config[1],
        "parameter": config[2][0],
        "parameter_value": int(config[2][1:]),
    }

    # Long format: one row per individual metric value, tagged with query mode and metric.
    for mode in ("recall", "precision"):
        for metric_idx, values in enumerate(logs[f"{mode}_mode_metrics"]):
            accuracy_list.extend(
                {
                    **file_fields,
                    "index_mode": mode,
                    "metric": metrics[metric_idx],
                    "value": value,
                }
                for value in values
            )

# Average the values per configuration, query mode and metric.
accuracy = pd.DataFrame(accuracy_list)
accuracy = (
    accuracy.groupby(
        ["dataset", "index_type", "index_mode", "parameter", "parameter_value", "metric"]
    )
    .agg({"value": "mean"})
    .reset_index()
)

In [16]:
# Restrict the three measurement tables to the open_data_usa sweep over parameter "k".
target_dataset, target_parameter = "open_data_usa", "k"
r = runtime[(runtime["dataset"] == target_dataset) & (runtime["parameter"] == target_parameter)]
i = index_size[
    (index_size["dataset"] == target_dataset)
    & (index_size["parameter"] == target_parameter)
    & (index_size["phase"] == "rebinning")
]
a = accuracy[
    (accuracy["dataset"] == target_dataset)
    & (accuracy["parameter"] == target_parameter)
    & (accuracy["index_mode"] == "recall")
    & (accuracy["metric"] == "f1")
]

# Pre-filtered series for the individual curves.
with_results = r[r["execution"] == "single"]
without_results = r[r["execution"] == "single_suppressed"]
rebinning_f1 = a[a["index_type"] == "rebinning"]
conversion_f1 = a[a["index_type"] == "conversion"]

# One palette color per y axis: runtime, index size, accuracy.
runtime_color, size_color, accuracy_color = sns.color_palette()[:3]

_, ax1 = plt.subplots(figsize=(7.2, 2.8), layout="constrained")

# Two additional y axes sharing the x axis of ax1.
ax2: Axes = ax1.twinx()  # type: ignore
ax3: Axes = ax1.twinx()  # type: ignore

# Offset the right spine of ax3 to not collide with ax2
ax3.spines.right.set_position(("axes", 1.2))

# Runtime (solid: with results, dashed: without results)
ax1.plot(
    with_results["parameter_value"],
    with_results["query_collection_time"],
    color=runtime_color,
    label="w/ results",
)
ax1.plot(
    without_results["parameter_value"],
    without_results["query_collection_time"],
    color=runtime_color,
    label="w/o results",
    linestyle="--",
)

# Index size
ax2.plot(i["parameter_value"], i["index_size"], color=size_color, label="Index size")

# Accuracy in percent (solid: rebinning index, dashed: conversion index)
ax3.plot(
    rebinning_f1["parameter_value"],
    rebinning_f1["value"] * 100,
    color=accuracy_color,
    label="Low mem.",
)
ax3.plot(
    conversion_f1["parameter_value"],
    conversion_f1["value"] * 100,
    color=accuracy_color,
    label="Full rec.",
    linestyle="--",
)

ax1.set(xlabel="Number of clusters", xlim=(1, 1000), ylabel="Time (s)", yscale="log")
ax2.set(ylabel="Index size (MB)", yscale="log")
ax3.set(ylabel="$F_1$ score (%)", ylim=(0, 100))

ax1.set_xticks([1, 100, 200, 400, 600, 800, 1000])
ax1.set_yticks([0.1, 1])

# Color each y-axis label like the curves drawn on that axis.
for axis, label_color in ((ax1, runtime_color), (ax2, size_color), (ax3, accuracy_color)):
    axis.yaxis.label.set_color(label_color)

plt.savefig("talk/microbenchmarks_open_data_usa_k.svg", bbox_inches="tight")
plt.close()