# Exact Results

In [1]:
import shutil
from itertools import chain
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.transforms import Bbox

from fainder.utils import load_input, configure_run
from utils.plotting_defaults import set_style, plot_legend, parse_logs_wide

# Run-wide setup: presumably sets the log level to WARNING and applies the
# shared plotting style (both helpers are project-local; verify there).
configure_run("WARNING")
set_style()

# All figures of this notebook are written below this directory.
output_dir = Path("plots/exact_results")
output_dir.mkdir(parents=True, exist_ok=True)

In [2]:
# Drop matplotlib's cached TeX output in the user's home directory.
tex_cache_dir = Path.home() / ".cache" / "matplotlib" / "tex.cache"
try:
    shutil.rmtree(tex_cache_dir)
except FileNotFoundError:
    # No cache directory exists, so there is nothing to remove.
    pass

## Data Loading

Binsort yields the same results as profile-scan, so we reuse the runtime measurements from the runtime benchmark for this plot.

In [3]:
# Collect the baseline runtimes from the runtime-benchmark logs: every log file
# matching the binsort or iterative "collection ... single" pattern becomes one record.
baseline_list = []
path = Path("../logs/runtime_benchmark/execution/")
for logfile in chain(
    path.glob("*collection-binsort-single*"), path.glob("*collection-iterative-single*")
):
    # The dash-separated file stem carries the run configuration in a fixed order.
    config = logfile.stem.split("-")
    # presumably a dict-like record of the parsed timings — verify in parse_logs_wide
    data = parse_logs_wide(logfile)

    data["dataset"] = config[0]
    data["query_set"] = config[1]
    data["approach"] = config[2]
    data["execution"] = config[3]
    data["iteration"] = config[4]

    baseline_list.append(data)

# Keep only the columns needed for the comparison; "query_set" and "execution"
# are parsed above but not included in the frame.
baselines = pd.DataFrame(
    baseline_list,
    columns=["dataset", "approach", "iteration", "query_collection_time"],
).rename(columns={"query_collection_time": "baseline_time"})

# Assign the replaced column back instead of calling an inplace method on the
# column selection: the chained inplace form may act on a temporary copy (it
# warns on pandas 2.x and has no effect under Copy-on-Write), which would leave
# "iterative" unrenamed and break the later merge on "approach".
baselines["approach"] = baselines["approach"].replace("iterative", "pscan")

In [4]:
# Load the measurements of the exact-results experiment, one record per file.
# The file stem is expected to be "<dataset>-<approach>-<iteration>".
run_keys = ("dataset", "approach", "iteration")
exact_list = []
for logfile in Path("../logs/exact_results/").glob("*.zst"):
    config = logfile.stem.split("-")
    data = load_input(logfile)  # presumably a dict-like record — verify in load_input

    assert len(config) == len(run_keys)
    for key, value in zip(run_keys, config):
        data[key] = value
    exact_list.append(data)

# Build the frame, attach the matching baseline runtime to every run (inner join
# on the run keys), and add the total time of the exact approach.
exact = (
    pd.DataFrame(
        exact_list,
        columns=[
            *run_keys,
            "precision_time",
            "recall_time",
            "iterative_time",
            "avg_reduction",
        ],
    )
    .merge(baselines, on=list(run_keys))
    .assign(
        exact_time=lambda runs: runs["precision_time"]
        + runs["recall_time"]
        + runs["iterative_time"]
    )
)

In [5]:
# Average the numeric measurements per (dataset, approach) pair, then derive the
# speedup of the exact approach over its baseline from those averaged times.
analysis = (
    exact.groupby(["dataset", "approach"])
    .mean(numeric_only=True)
    .assign(speedup=lambda means: means["baseline_time"] / means["exact_time"])
)
analysis

Unnamed: 0_level_0,Unnamed: 1_level_0,precision_time,recall_time,iterative_time,avg_reduction,baseline_time,exact_time,speedup
dataset,approach,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gittables,binsort,324.623288,404.754112,379.318834,0.982878,7905.69251,1108.696233,7.130621
gittables,pscan,319.460597,415.601123,1179.027464,0.982878,48309.523395,1914.089183,25.238909
open_data_usa,binsort,3.817181,4.554819,12.214554,0.93012,108.8449,20.586554,5.287184
open_data_usa,pscan,4.459656,4.601708,55.646694,0.93012,654.482206,64.708057,10.114385
sportstables,binsort,2.122183,1.489935,1.893361,0.980537,36.343399,5.505478,6.601315
sportstables,pscan,2.068838,1.518462,4.96417,0.980537,187.799319,8.55147,21.961056


## Plotting

In [6]:
# One small bar chart per dataset. Each chart shows the two baseline runtimes
# ("pscan" and "binsort") as single bars and, next to them, one stacked bar per
# baseline made of recall_time, precision_time and iterative_time.
height = 1.1
handles = []
# The palette and hatch patterns are the same for every figure, so they are
# looked up once instead of once per dataset.
colors = [sns.color_palette()[i] for i in range(4)]
hatches = ["xxx", "ooo", "///", "\\\\\\"]  # type: ignore

for dataset in ["sportstables", "open_data_usa", "gittables"]:
    fig, ax = plt.subplots(figsize=(1.2, height))

    for i, baseline in enumerate(["pscan", "binsort"]):
        # Single-row slice of the aggregated results for this dataset/baseline.
        data = analysis.query(f"dataset == '{dataset}' & approach == '{baseline}'")

        # Baseline bar, styled with index i (0 = pscan, 1 = binsort).
        handles += ax.bar(
            i * 0.75,
            data["baseline_time"],
            width=0.5,
            color=colors[i],
            edgecolor="black",
            hatch=hatches[i],
        )

        # Stacked bar built bottom-up. The iterative segment reuses style index i,
        # i.e. the style of the corresponding baseline bar.
        bottom = 0
        for j, time in [
            (3, data["recall_time"]),
            (2, data["precision_time"]),
            (i, data["iterative_time"]),
        ]:
            handles += ax.bar(
                1.5 + i * 0.55,
                time,
                bottom=bottom,
                width=0.5,
                color=colors[j],
                edgecolor="black",
                hatch=hatches[j],
            )
            bottom += time.item()

        # Write the total on top of the baseline bar and of the stacked bar;
        # values above 100 are shown without decimals.
        for time, x in [
            (data["baseline_time"].item(), i * 0.75),
            (bottom, 1.5 + i * 0.55),
        ]:
            label = f"{time:.0f}" if time > 100 else f"{time:.1f}"
            ax.annotate(
                label,
                xy=(x, time),  # type: ignore
                xytext=(0, 1),
                fontsize=mpl.rcParams["font.size"] * 0.8,
                textcoords="offset points",
                ha="center",
                va="bottom",
            )

    # One tick per baseline bar and one centered between the two stacked bars
    # (1.775 is the midpoint of 1.5 and 2.05).
    ax.set_xticks([0, 0.75, 1.775])
    ax.set_xticklabels(
        [
            "Full\nscan",
            r"\texttt{bin}-" "\n" r"\texttt{sort}",
            r"\textsc{Fainder}" "\n" r"\textsc{ Exact}",
        ]
    )
    if dataset == "gittables":
        ax.set_ylim(200, 90000)
    else:
        # NOTE(review): `/ 2 % 10` evaluates as `(recall_time / 2) % 10`; confirm
        # that the modulo is intended for the lower axis limit.
        ax.set_ylim(
            analysis.query(f"dataset == '{dataset}' & approach == 'pscan'")["recall_time"].item()
            / 2
            % 10,
            ax.get_ylim()[1] * 2,
        )
    ax.set_yscale("log")

    sns.despine()

    fig.tight_layout(pad=1.02)
    fig.savefig(
        f"plots/exact_results/{dataset}.pdf",
        bbox_inches="tight",
        pad_inches=0.01,
    )

    # The y-axis label is exported as its own file: it is added only after the
    # main figure was saved, and the figure is then cropped to a narrow strip at
    # the left edge. The file is rewritten for every dataset, so the version from
    # the last dataset is the one that remains.
    ax.set_ylabel("Time (s)")
    bbox = fig.get_tightbbox()
    label_bbox = Bbox(((bbox.x0, bbox.y0), (0.07, bbox.y1)))
    fig.savefig("plots/exact_results/ylabel.pdf", bbox_inches=label_bbox)
    plt.close(fig)

# Legend entries are taken from the first figure, where handles are appended in
# this order: 0 = pscan baseline, 1 = recall_time segment, 2 = precision_time
# segment, 3 = iterative segment (pscan), 4 = binsort baseline.
plot_legend(
    "plots/exact_results/legend.pdf",
    [handles[0], handles[4], handles[2], handles[1]],
    [
        r"\pscan{}",
        r"\binsort{}",
        r"\textsc{F. Approx} full prec.",
        r"\textsc{F. Approx} full rec.",
    ],
    ncol=4,
)