# Scalability Benchmark

In [1]:
import shutil
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from utils.plotting_defaults import (
    set_style,
    parse_logs_wide,
)

# Apply the project's shared plotting style, then make sure the directory
# that the figures of this notebook are written to exists.
set_style()
plot_dir = Path("plots/scalability_benchmark")
plot_dir.mkdir(parents=True, exist_ok=True)

In [2]:
# Remove matplotlib's TeX cache below the user's home directory so it is
# rebuilt on the next render; a cache that does not exist yet is not an error.
tex_cache_dir = Path.home().joinpath(".cache", "matplotlib", "tex.cache")
try:
    shutil.rmtree(tex_cache_dir)
except FileNotFoundError:
    pass

## Data Loading

In [3]:
# Build one record per execution log file. The run configuration is encoded
# in the file name as "-"-separated fields:
#   <dataset>-<query_set>-<index_type>-<execution>-<scale token>
execution_log_dir = Path("../logs/scalability_benchmark/execution/")
name_fields = ("dataset", "query_set", "index_type", "execution")

execution_list = []
for log_path in execution_log_dir.iterdir():
    name_parts = log_path.stem.split("-")
    # parse_logs_wide is a project helper; presumably it returns a dict-like
    # record of the measurements in the log -- confirm in utils.plotting_defaults.
    record = parse_logs_wide(log_path)

    for position, field in enumerate(name_fields):
        record[field] = name_parts[position]
    # The fifth field carries a two-character prefix followed by an integer;
    # dividing by 100 turns that integer into the fractional scaling factor.
    record["scaling_factor"] = int(name_parts[4][2:]) / 100

    execution_list.append(record)

execution_columns = [
    "dataset",
    "query_set",
    "index_type",
    "execution",
    "scaling_factor",
    "query_collection_time",
    "query_times",
    "avg_result_size",
]
execution = pd.DataFrame(execution_list, columns=execution_columns)
# Collapse the per-query timings of every run into a single total.
execution["query_times_sum"] = execution["query_times"].apply(np.sum)

In [4]:
# Build one record per indexing log file. The file name encodes
#   <dataset>-<phase>-<scale token>
# as "-"-separated fields.
indexing_log_dir = Path("../logs/scalability_benchmark/indexing/")

indexing_list = []
for log_path in indexing_log_dir.iterdir():
    name_parts = log_path.stem.split("-")
    # parse_logs_wide is a project helper; presumably it returns a dict-like
    # record of the measurements in the log -- confirm in utils.plotting_defaults.
    record = parse_logs_wide(log_path)

    for position, field in enumerate(("dataset", "phase")):
        record[field] = name_parts[position]
    # The third field carries a two-character prefix followed by an integer;
    # dividing by 100 turns that integer into the fractional scaling factor.
    record["scaling_factor"] = int(name_parts[2][2:]) / 100

    indexing_list.append(record)

indexing_columns = [
    "dataset",
    "phase",
    "scaling_factor",
    "total_time",
    "index_size",
]
indexing = pd.DataFrame(indexing_list, columns=indexing_columns)

## Analysis

### Execution

In [5]:
# Summarise the repeated runs of every configuration: mean and standard
# deviation of both timing columns, and the mean of the average result size.
config_keys = ["dataset", "query_set", "index_type", "execution", "scaling_factor"]
summary_spec = {
    "query_collection_time": ["mean", "std"],
    "query_times_sum": ["mean", "std"],
    "avg_result_size": ["mean"],
}
execution.groupby(config_keys).agg(summary_spec)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,query_collection_time,query_collection_time,query_times_sum,query_times_sum,avg_result_size
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,mean,std,mean
dataset,query_set,index_type,execution,scaling_factor,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
gittables,collection,conversion,single,0.25,63.467779,0.346856,53.490379,0.323898,641780.4
gittables,collection,conversion,single,0.5,145.634594,1.14306,123.329871,1.148826,1293699.0
gittables,collection,conversion,single,1.0,352.906263,1.828558,270.235004,1.320238,2585439.0
gittables,collection,conversion,single,2.0,752.376484,3.422099,602.277972,2.463442,5160141.0
gittables,collection,conversion,single_suppressed,0.25,0.292523,0.003555,0.233128,0.003361,1.0
gittables,collection,conversion,single_suppressed,0.5,0.306344,0.015042,0.238758,0.004957,1.0
gittables,collection,conversion,single_suppressed,1.0,0.324416,0.006346,0.266214,0.006289,1.0
gittables,collection,conversion,single_suppressed,2.0,0.340009,0.00306,0.281049,0.002715,1.0
gittables,collection,rebinning,single,0.25,54.78954,0.300687,52.525611,0.291151,640542.0
gittables,collection,rebinning,single,0.5,126.356023,0.719538,122.286001,0.71383,1291107.0


In [6]:
# One figure per index type: mean query-collection time over the scaling
# factor, with a +/- one standard deviation band, for both execution modes.
height = 1.5
x = [0.25, 0.5, 1, 2]

# (value of the "execution" column, legend label, colour) for each curve.
# Curves are selected by execution mode name instead of by row position, so
# the labels cannot end up on the wrong curve if the group order changes.
palette = sns.color_palette()
curve_specs = [
    ("single", "w/ results", palette[0]),
    ("single_suppressed", "w/o results", palette[1]),
]

for index_type in ["rebinning", "conversion"]:
    fig, ax = plt.subplots(1, 1, figsize=(1.72, height))
    stats = (
        execution[
            (execution["index_type"] == index_type) & (execution["query_set"] == "collection")
        ]
        .groupby(["execution", "scaling_factor"])["query_collection_time"]
        .agg(["mean", "std"])
    )

    for execution_mode, label, color in curve_specs:
        # Align the statistics to the plotted scaling factors; a scaling
        # factor without measurements becomes NaN (a gap in the curve)
        # rather than shifting the remaining points.
        mode_stats = stats.xs(execution_mode, level="execution").reindex(x)
        mean = mode_stats["mean"].to_numpy()
        std = mode_stats["std"].to_numpy()

        ax.plot(x, mean, color=color, label=label)
        ax.fill_between(
            x,
            mean - std,
            mean + std,
            alpha=0.1,
            edgecolor="white",
            color=color,
        )

    ax.set_xticks(x, labels=[".25", ".5", "1", "2"])
    ax.set_ylabel("Time (s)")
    ax.set_yscale("log")
    # Extend the upper limit so the legend does not overlap the curves.
    ax.set_ylim(top=ax.get_ylim()[1] * 3)
    ax.legend(loc="upper left")
    sns.despine()
    fig.tight_layout(pad=1.02)
    fig.savefig(
        f"plots/scalability_benchmark/{index_type}.pdf", bbox_inches="tight", pad_inches=0.01
    )
    plt.close(fig)

# Legend entries of the last figure, kept available after the loop.
handles, labels = ax.get_legend_handles_labels()  # type: ignore

### Index Construction

In [7]:
# Average every measured column over the repeated runs of each
# (dataset, phase, scaling factor) combination.
indexing_keys = ["dataset", "phase", "scaling_factor"]
indexing.groupby(indexing_keys).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_time,index_size
dataset,phase,scaling_factor,Unnamed: 3_level_1,Unnamed: 4_level_1
gittables,clustering,0.25,1587.253769,
gittables,clustering,0.5,3368.694606,
gittables,clustering,1.0,6933.406293,
gittables,clustering,2.0,4893.088123,
gittables,conversion,0.25,286.60451,8556.341976
gittables,conversion,0.5,605.006456,17552.750676
gittables,conversion,1.0,1049.702077,29508.362532
gittables,conversion,2.0,2062.994869,58435.706196
gittables,rebinning,0.25,111.063035,4285.651122
gittables,rebinning,0.5,225.043177,8791.45563


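**Finding:** The index construction time and size scale linearly with the dataset size, which is consistent with our conceptual discussion. The one exception in the table above is the clustering phase at scaling factor 2, whose mean total time (about 4,893) is lower than at scaling factor 1 (about 6,933).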