In [None]:
import pandas as pd
import json

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.colors import LogNorm, Normalize

In [None]:
# Shared text sizes for all figures; the legend and tick sizes are derived
# from the base font size so they scale together.
fontsize = 18
legend_fontsize = fontsize - 2
tick_fontsize = legend_fontsize - 3

# Passed as `s=` to every `Axes.scatter` call in this notebook.
marker_size = 80

# Used as `figsize=(width, height)` for the single-panel figures.
single_fig_width, single_fig_height = 6, 3

In [None]:
def to_index_type(index):
    """Classify an index label into one of the known index families.

    The match is a case-insensitive substring test, tried in the order
    "lsi", "btree", "art", "robinhash"; the first family found wins.

    Args:
        index: Index label string.

    Returns:
        One of "lsi", "btree", "art" or "robinhash".

    Raises:
        ValueError: If no known family name occurs in ``index``.
    """
    lowered = index.lower()
    # Order matters: it mirrors the precedence of the original if-chain.
    for family in ("lsi", "btree", "art", "robinhash"):
        if family in lowered:
            return family

    raise ValueError(f"unknown index type: {index}")

def to_model_name(index):
    """Map a model identifier to the short model name used in the plots.

    The match is a case-insensitive substring test: anything containing
    "cht" maps to "cht", anything containing "trie" maps to "plex". "cht" is
    checked first.

    Args:
        index: Model identifier string.

    Returns:
        "cht" or "plex".

    Raises:
        ValueError: If neither "cht" nor "trie" occurs in ``index``.
    """
    lowered = index.lower()
    for needle, model_name in (("cht", "cht"), ("trie", "plex")):
        if needle in lowered:
            return model_name

    # The message used to say "index type", copied from to_index_type.
    raise ValueError(f"unknown model type: {index}")

# Lookup from dataset id (as a string "0".."7") to the short dataset name.
# The position of a name in the list is its id.
ds_name_table = {
    str(dataset_id): dataset_name
    for dataset_id, dataset_name in enumerate(
        ["seq", "gap_10", "uniform", "fb", "osm", "wiki", "normal", "amzn"]
    )
}

# Number of distinct keys that have been handed a colour so far.
color_i = 0
# Cache: lower-cased key -> colour name chosen for it.
colors = {}


def get_color(fun):
    """Return the colour assigned to ``fun``, assigning one on first use.

    Keys are compared case-insensitively. A new key takes the next entry of
    the palette (the Tableau colours followed by the base colours with the
    last base colour left out), wrapping around once the palette is used up.
    Later calls with the same key return the cached colour.

    Args:
        fun: String key to colour.

    Returns:
        A matplotlib colour name.
    """
    global color_i

    name = fun.lower()
    if name in colors:
        return colors[name]

    palette = [*mcolors.TABLEAU_COLORS, *list(mcolors.BASE_COLORS)[:-1]]
    colors[name] = palette[color_i % len(palette)]
    color_i += 1
    return colors[name]

In [None]:
results_file = "results/results.json"

# Parse the JSON report first, then flatten its "benchmarks" records into a
# DataFrame with one row per record.
with open(results_file) as file:
    data = json.load(file)
df = pd.json_normalize(data, "benchmarks")

In [None]:
def _model_from_index(index_label):
    """Return the model name encoded in an index label, or None.

    Looks at the text before the first "," and, if it contains a "<", passes
    whatever follows the first "<" (up to the next "<", if any) to
    ``to_model_name``. Labels without a "<" in that part have no model.
    """
    head_parts = index_label.split(",")[0].split("<")
    return to_model_name(head_parts[1]) if len(head_parts) > 1 else None


# Fields parsed out of the benchmark label and name strings.
df["index"] = df["label"].apply(lambda label: label.split(":")[0])
df["model"] = df["index"].apply(_model_from_index)
df["index_type"] = df["index"].apply(to_index_type)
df["dataset_size"] = df["name"].apply(lambda name: int(name.split("/")[1]))

# Unit conversions, done column-wise instead of with a row-wise apply.
df["MiB"] = df["bytes"].astype(float) / 1024**2
df["GiB"] = df["bytes"].astype(float) / 1024**3
df["model_MiB"] = df["model_bytes"].astype(float) / 1024**2
df["perm_MiB"] = df["perm_bytes"].astype(float) / 1024**2
df["dataset_name"] = df["name"].apply(lambda name: ds_name_table[name.split("/")[2]])
# Divides by 1e9; presumably build_time is in nanoseconds - TODO confirm.
df["build_time_seconds"] = df["build_time"].astype(float) / 10**9

# Experiment 1: rows whose benchmark name contains "lowerboundlookup<l"
# (case-insensitive).
df_exp1 = (
    df[df["name"].str.lower().str.contains("lowerboundlookup<l")]
    .copy(deep=True)
    .reset_index()
)

# For LSI rows the error is the number formed by the digits in the last
# "_"-separated token of the text before the first ","; other rows get 0.
df_exp1["error"] = df_exp1.apply(
    lambda row: (
        int("".join(ch for ch in row["index"].split(",")[0].split("_")[-1] if ch.isdigit()))
        if row["index_type"] == "lsi"
        else 0
    ),
    axis=1,
)

# For btree rows: does the second ","-separated field of the benchmark name
# (up to the first ">") contain "true"? Always False for other index types.
df_exp1["bulk_loading"] = df_exp1.apply(
    lambda row: row["index_type"] == "btree" and "true" in row["name"].split(",")[1].split(">")[0],
    axis=1,
)

# Drop btree rows that are not flagged as bulk loaded.
df_exp1 = df_exp1[(df_exp1["index_type"] != "btree") | df_exp1["bulk_loading"].eq(True)]

# Experiment 2: rows whose benchmark name contains "equalityprobe"
# (case-insensitive).
df_exp2 = (
    df[df["name"].str.lower().str.contains("equalityprobe")]
    .copy(deep=True)
    .reset_index()
)

# Labels with more than one ","-separated field carry two numbers: the digits
# of the first field form the error, the digits of the second field form the
# fingerprint size. Labels with a single field get 0 for both.
df_exp2["error"] = df_exp2["index"].apply(
    lambda label: int("".join(filter(str.isdigit, label.split(",")[0]))) if len(label.split(",")) > 1 else 0
)
df_exp2["fingerprint_size"] = df_exp2["index"].apply(
    lambda label: int("".join(filter(str.isdigit, label.split(",")[1]))) if len(label.split(",")) > 1 else 0
)

# Keep every non-LSI row; of the LSI rows keep only error in {1, 4, 8} with an
# 8-bit fingerprint. Then take the row with the lowest cpu_time per
# configuration and dataset, and finally restrict to the "amzn" dataset.
df_exp2 = (
    df_exp2[
        (df_exp2["index_type"] != "lsi")
        | (df_exp2["error"].isin([1, 4, 8]) & df_exp2["fingerprint_size"].isin([8]))
    ]
    .sort_values(["cpu_time"])
    .groupby(["index_type", "error", "fingerprint_size", "dataset_name"])
    .head(1)
)
df_exp2 = df_exp2[df_exp2["dataset_name"] == "amzn"]

# Experiment 3: LSI rows whose benchmark name contains "equalityprobe"
# (case-insensitive).
df_exp3 = df[df["name"].str.lower().str.contains("equalityprobe") & (df["index_type"] == "lsi")].copy(deep=True).reset_index()

# The digits of the first ","-separated label field form the error, the digits
# of the second field form the fingerprint size.
df_exp3["error"] = df_exp3["index"].apply(lambda x: int("".join([s for s in x.split(",")[0] if s.isdigit()])))
df_exp3["fingerprint_size"] = df_exp3["index"].apply(lambda x: int("".join([s for s in x.split(",")[1] if s.isdigit()])))

# Only the configurations shown in the figure.
df_exp3 = df_exp3[df_exp3["error"].isin([4, 16, 64, 256])]
df_exp3 = df_exp3[df_exp3["fingerprint_size"].isin([0, 1, 4, 16])]

# A single sort is enough: ordering by (fingerprint_size, error, cpu_time) and
# taking the first row per group keeps the lowest-cpu_time row of each
# configuration. The earlier extra sort on (fingerprint_size, error) was
# immediately superseded by this one.
# NOTE(review): unlike experiment 2, the grouping does not include
# dataset_name, so the minimum is taken across datasets - confirm intended.
df_exp3 = (
    df_exp3.sort_values(["fingerprint_size", "error", "cpu_time"])
    .groupby(["index_type", "error", "fingerprint_size"])
    .head(1)
)

# Experiment 4: LSI rows on the "amzn" dataset whose benchmark name contains
# "lowerboundlookup" (case-insensitive).
df_exp4 = (
    df[
        df["name"].str.lower().str.contains("lowerboundlookup")
        & (df["index_type"] == "lsi")
        & (df["dataset_name"] == "amzn")
    ]
    .copy(deep=True)
    .reset_index()
)

# Error = number formed by the digits in the last "_"-separated token of the
# text before the first ",".
df_exp4["error"] = df_exp4["index"].apply(
    lambda label: int("".join(filter(str.isdigit, label.split(",")[0].split("_")[-1])))
)
# Optional extra filter, currently disabled:
# df_exp4 = df_exp4[(df_exp4["error"] != 1)]

# Build-time comparison: for every (dataset, index type, error) keep the run
# with the smallest build time.
top_bt = df.copy(deep=True).reset_index()

# For LSI rows the error is the number formed by the digits in the last
# "_"-separated token of the text before the first ","; other rows get 0.
top_bt["error"] = top_bt.apply(lambda x: (int("".join([s for s in x["index"].split(",")[0].split("_")[-1] if s.isdigit()]))) if x["index_type"] == "lsi" else 0, axis=1)

# idxmin() returns index *labels*, so select with label-based .loc. The
# previous positional .iloc only worked because reset_index() had just made
# labels and positions coincide.
top_bt = top_bt.loc[top_bt.groupby(["dataset_name", "index_type", "error"])["build_time_seconds"].idxmin()]

# Only the error settings shown in the figure (0 marks the non-LSI indexes).
top_bt = top_bt[top_bt["error"].isin([0, 1, 4, 8])]

# Experiment 5: LSI rows whose benchmark name contains "equalityprobe"
# (case-insensitive). Columns already derived on `df` (model, GiB, ...) are
# carried over by the copy, so they are not recomputed here.
df_exp5 = df[df["name"].str.lower().str.contains("equalityprobe") & (df["index_type"] == "lsi")].copy(deep=True).reset_index()

# The digits of the first ","-separated label field form the error, the digits
# of the second field form the fingerprint width.
df_exp5["error"] = df_exp5["index"].apply(lambda x: int("".join([s for s in x.split(",")[0] if s.isdigit()])))
df_exp5["fingerprint"] = df_exp5["index"].apply(lambda x: int("".join([s for s in x.split(",")[1] if s.isdigit()])))

# Access counters divided by the iteration count, computed column-wise
# instead of with a row-wise apply.
df_exp5["normalized_base"] = df_exp5["base_data_accesses"] / df_exp5["iterations"]
df_exp5["normalized_fp"] = df_exp5["false_positive_accesses"] / df_exp5["iterations"]

df_exp5 = df_exp5[df_exp5["error"] <= 128]

In [None]:
# Show the experiment-2 rows with only the columns of interest.
df_exp2.loc[:, ["index_type", "fingerprint_size", "error", "cpu_time", "bytes"]]

In [None]:
def plt_exp_1():
    """Plot lookup latency against index size, one panel per dataset.

    Reads the notebook-level ``df_exp1`` frame and draws a 2x2 grid of scatter
    plots (panels are filled column by column in dataset-name order), with one
    marker series per index type. LSI points are annotated with their error
    value. The figure is written to ``results/lowerbounds.pdf``.
    """
    fig, axs = plt.subplots(2, 2, figsize=(8, 3.75), sharex=True, sharey=True)

    for panel, (dataset, dataset_rows) in enumerate(df_exp1.groupby("dataset_name")):
        grid_col, grid_row = divmod(panel, 2)
        ax = axs[grid_row][grid_col]

        ax.set_title(dataset, y=0.975, fontsize=fontsize)
        ax.set_xlim(0, 7)
        ax.set_ylim(0, 1800)
        ax.set_xticks(list(range(8)))
        ax.set_yticks([0, 500, 1000, 1500])
        ax.grid(linestyle='--')
        ax.tick_params(labelsize=tick_fontsize)

        for index_type, rows in dataset_rows.groupby("index_type"):
            ax.scatter(data=rows, x="GiB", y="cpu_time", label=index_type, marker='x', s=marker_size, color=get_color(index_type))

            # Only the LSI points carry an error annotation.
            if index_type != "lsi":
                continue
            for _, row in rows.iterrows():
                # The error == 8 label is nudged up-left, all others down-right.
                dx, dy = (-0.2, 100) if row["error"] == 8 else (0.1, -250)
                ax.annotate(f"{row.error}", (row["GiB"] + dx, row["cpu_time"] + dy), fontsize=legend_fontsize)

    axs[0][1].legend(
        fontsize=legend_fontsize,
        loc="upper right",
        bbox_to_anchor=(1.025, 1.05),
        ncol=2,
        labelspacing=0.2,
        borderpad=0.3,
        columnspacing=0.0,
        handlelength=0.4,
        handletextpad=0.2,
    )

    fig.supxlabel("Size (GiB)", fontsize=fontsize, y=-0.025)
    fig.supylabel("Latency (ns)", fontsize=fontsize, x=0.00)
    fig.subplots_adjust(hspace=0.3, wspace=0.125)

    fig.savefig("results/lowerbounds.pdf", bbox_inches="tight", dpi=300)

plt_exp_1()

In [None]:
def plt_exp_2():
    """Plot equality-probe latency against index size for each index type.

    Reads the notebook-level ``df_exp2`` frame, draws one scatter series per
    index type, annotates the LSI points with their error value, and writes
    the figure to ``results/equality.pdf``.
    """
    fig, ax = plt.subplots(figsize=(single_fig_width, single_fig_height))
    ax.set_xlim(0, 8.5)
    ax.set_ylim(0, 760)
    ax.grid(linestyle="--")
    ax.tick_params(labelsize=tick_fontsize)

    # Group by the scalar column name, not a one-element list: with a list,
    # pandas >= 2.0 yields 1-tuple keys, which breaks get_color() (it calls
    # .lower() on the key) and turns the legend label into a tuple.
    for name, group in df_exp2.groupby("index_type"):
        ax.scatter(data=group, x="GiB", y="cpu_time", label=name, marker='x', s=marker_size, color=get_color(name))

        for _, row in group.iterrows():
            if row["index_type"] == "lsi":
                # Labels sit below the marker; errors >= 8 are also shifted left.
                ax.annotate(f"{row.error}", (row["GiB"] + (0.0 if row.error < 8 else -0.5), row["cpu_time"] - 150), fontsize=legend_fontsize)

    ax.set_xlabel("Size (GiB)", fontsize=fontsize)
    ax.set_ylabel("Latency (ns)", fontsize=fontsize, x=0.05)

    ax.legend(fontsize=legend_fontsize, loc="lower left", labelspacing=0.1, borderpad=0.2, handlelength=1)

    fig.savefig("results/equality.pdf", bbox_inches="tight", dpi=300)

plt_exp_2()

In [None]:
def plt_exp_3():
    """Plot latency against model error, one line per fingerprint size.

    Reads the notebook-level ``df_exp3`` frame. The series with fingerprint
    size 0 is labelled "binary", every other one "linear (<size>)". The x axis
    is logarithmic with ticks at the error values present in the data. The
    figure is written to ``results/binary_vs_linear.pdf``.
    """
    fig, ax = plt.subplots(figsize=(single_fig_width, single_fig_height))

    line_kwargs = dict(x="error", y="cpu_time", logx=True, ax=ax, linewidth=2, marker='.', markersize=10)
    for fp_size, rows in df_exp3.groupby("fingerprint_size"):
        if fp_size == 0:
            series_label = "binary"
        else:
            series_label = f"linear ({fp_size})"
        rows.plot.line(label=series_label, **line_kwargs)

    # Axes: fixed latency range, plain-number ticks at the measured errors.
    ax.set_ylim([0, 2000])
    ax.minorticks_off()
    ax.set_xlabel("Model error", fontsize=fontsize)
    ax.set_xticks(df_exp3.error.unique())
    ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
    ax.set_ylabel("Latency (ns)", fontsize=fontsize)
    ax.grid(linestyle="--")
    ax.tick_params(labelsize=tick_fontsize)

    ax.legend(
        fontsize=legend_fontsize - 1,
        loc="lower center",
        ncol=4,
        labelspacing=0.1,
        borderpad=0.3,
        columnspacing=0.4,
        handlelength=0.4,
        handletextpad=0.2,
    )

    fig.savefig("results/binary_vs_linear.pdf", bbox_inches="tight", dpi=300)

plt_exp_3()

In [None]:
def plt_exp_4():
    """Plot lookup latency against index size, one line per model.

    Reads the notebook-level ``df_exp4`` frame, draws a connected line plus
    'x' markers per model, annotates every point with its error value, and
    writes the figure to ``results/cht_vs_plex.pdf``.
    """
    fig, ax = plt.subplots(figsize=(single_fig_width, single_fig_height))

    # Lookup latency.
    # Group by the scalar column name, not a one-element list: with a list,
    # pandas >= 2.0 yields 1-tuple keys, which breaks get_color() (it calls
    # .lower() on the key) and turns the legend label into a tuple.
    for model, group in df_exp4.groupby("model"):
        ax.plot(group["GiB"], group["cpu_time"], color=get_color(model), linewidth=2)
        ax.scatter(data=group, x="GiB", y="cpu_time", label=model, marker='x', s=marker_size, color=get_color(model))

    for _, row in df_exp4.iterrows():
        # Hand-tuned label offsets so annotations do not overlap the markers.
        dx = 0.4 if row.error == 8 and row.model == "cht" else 0.1
        dy = 10 if row.model == "cht" else (-30 if row.error == 1 else 0)
        ax.annotate(f"{row.error}", (row["GiB"] + dx, row["cpu_time"] + dy), fontsize=fontsize-2)

    ax.minorticks_off()
    ax.set_xlabel("Size (GiB)", fontsize=fontsize)
    ax.set_xlim(0, 10.5)
    ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
    ax.set_ylabel("Latency (ns)", fontsize=fontsize)
    ax.set_ylim(620, 850)
    ax.set_yticks(range(600, 851, 50))
    ax.grid(linestyle="--")
    ax.tick_params(labelsize=tick_fontsize)

    ax.legend(fontsize=legend_fontsize, labelspacing=0.1, borderpad=0.5, handlelength=0.5)

    fig.savefig("results/cht_vs_plex.pdf", bbox_inches="tight", dpi=300)

plt_exp_4()

In [None]:
def plt_build_times():
    """Plot build times as bar charts, one panel per dataset.

    Reads the notebook-level ``top_bt`` frame and draws a 2x2 grid (panels are
    filled column by column in dataset-name order) with one bar per
    (index type, error) pair. Bars with a non-zero error are annotated with
    that error. A single figure-level legend lists every index type that
    appears in any panel. The figure is written to
    ``results/build_throughput.pdf``.
    """
    fig, axs = plt.subplots(2, 2, figsize=(8, 3.75), sharex=True, sharey=True)
    fig.subplots_adjust(wspace=0.125, hspace=0.3)

    for i, (ds, g) in enumerate(top_bt.groupby("dataset_name")):
        ax = axs[i % 2][i // 2]
        ax.set_title(ds, y=0.975, fontsize=fontsize)
        ax.set_axisbelow(True)
        ax.grid(linestyle="--")
        # Bars are identified by colour and annotation, so hide the x ticks.
        ax.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
        ax.tick_params(labelsize=tick_fontsize)
        ax.set_yticks(range(0, 70, 20))

        labelled = set()
        for j, ((name, err), group) in enumerate(g.groupby(["index_type", "error"])):
            # Label only the first bar of each index type within a panel so
            # the panel contributes one legend entry per type.
            ax.bar(data=group, x=j, height="build_time_seconds", label=name if name not in labelled else None, color=get_color(name))
            labelled.add(name)

            for _, row in group.iterrows():
                if err > 0:
                    ax.annotate(f"{row.error}", (j - 0.125, row["build_time_seconds"] + 3), fontsize=fontsize-2)

    # Collect legend entries from every panel, de-duplicated by label. Before,
    # the legend was read from the loop variable `ax`, i.e. only from the last
    # panel drawn, and raised NameError when there was no panel at all.
    legend_entries = {}
    for panel_ax in axs.flat:
        for handle, label in zip(*panel_ax.get_legend_handles_labels()):
            legend_entries.setdefault(label, handle)
    fig.legend(list(legend_entries.values()), list(legend_entries.keys()), loc='lower center', bbox_to_anchor=(0.51, -0.04), ncol=4, fontsize=legend_fontsize)

    fig.supylabel("Build time (s)", fontsize=fontsize, x=0.025)

    fig.savefig("results/build_throughput.pdf", bbox_inches="tight", dpi=300)

plt_build_times()

In [None]:
# Simulation results, restricted to the two methods shown in the figure.
sim = pd.read_csv('simulation/simulate.csv')
sim = sim[sim['method'].isin(['optimal', 'bitpacking'])]

fig, ax = plt.subplots()

# One line per method: bits per key against the number of keys (log x axis).
line_style = dict(
    x='num_keys',
    y='bits_per_key',
    logx=True,
    logy=False,
    fontsize=fontsize,
    figsize=(single_fig_width, single_fig_height),
    legend=True,
)
for method, method_rows in sim.groupby('method'):
    method_rows.plot.line(label=method, ax=ax, **line_style)

ax.minorticks_off()
ax.set_xlabel("Number of keys", fontsize=fontsize)
ax.set_xticks(10**np.arange(start=2, stop=10, step=2))
ax.set_ylabel("Bits per key", fontsize=fontsize)
ax.set_yticks(np.arange(stop=32 + 1, step=8))
ax.tick_params(labelsize=tick_fontsize)
ax.grid(linestyle="--")

ax.legend(fontsize=legend_fontsize, loc="upper left", labelspacing=0.1, borderpad=0.2, handlelength=1)

fig.savefig("results/bit-compression.pdf", bbox_inches="tight", dpi=300)

In [None]:
def plt_exp_5():
    """Draw error-vs-fingerprint heat maps for three metrics of ``df_exp5``.

    One figure is produced per metric ("cpu_time", "normalized_fp",
    "normalized_base"). Each has a grid of non-zero fingerprint widths (rows,
    largest first) against model errors (columns). For "cpu_time" and
    "normalized_base" a second one-row strip below the grid shows the
    fingerprint == 0 rows, labelled "binary". For "normalized_fp" the values
    are additionally divided by ``2 * error`` and the colour scale is fixed to
    [0, 1]. Every cell is annotated with its value. Figures are written to
    ``results/error_fingerprint_study_<metric>.pdf``.

    Raises:
        IndexError: If ``df_exp5`` has no row for some (error, fingerprint)
            combination that the grid needs.
    """
    errors = sorted(set(df_exp5["error"]))
    fp_fingerprints = sorted(set(df_exp5[df_exp5["fingerprint"] != 0]["fingerprint"]), reverse=True)
    color_col_to_hr = {"cpu_time": "Latency (ns)", "normalized_fp": "False Positives", "normalized_base": "Base Data Accesses"}

    def _cell(err, fp, color_col):
        """Value of ``color_col`` in the first row matching (err, fp)."""
        return df_exp5[(df_exp5['error'] == err) & (df_exp5['fingerprint'] == fp)][color_col].values[0]

    def make_heat_data(color_col="cpu_time"):
        """Return [fingerprint grid, 1-row fingerprint==0 strip] as arrays."""
        fp_dat = np.array([
            [_cell(err, fp, color_col) / (2.0 * err if color_col == "normalized_fp" else 1) for err in errors]
            for fp in fp_fingerprints
        ])
        bs_dat = np.array([[_cell(err, 0, color_col) for err in errors]])
        return [fp_dat, bs_dat]

    def _annotate_cells(ax, dat, color_col, peak):
        """Write each value of ``dat`` into its heat-map cell on ``ax``."""
        for (row_idx, col_idx), value in np.ndenumerate(dat):
            text = int(round(value)) if color_col == "cpu_time" else f"{value:.1f}"
            # White text on cells at >= 80% of the peak, black otherwise. The
            # peak > 0 guard avoids 0/0 -> NaN, which used to select white
            # text on the lightest cells when all values were zero.
            on_dark_cell = peak > 0 and value / peak >= 0.8
            ax.text(col_idx, row_idx, text, ha="center", va="center",
                    color="white" if on_dark_cell else "black", fontsize=tick_fontsize)

    # could use LogNorm (?)
    for color_col in ["cpu_time", "normalized_fp", "normalized_base"]:
        has_binary_row = color_col != "normalized_fp"
        if has_binary_row:
            fig, [ax_fp, ax_bs] = plt.subplots(2, 1, figsize=(8, 4), gridspec_kw={'height_ratios': [5, 1]}, sharex=True)
        else:
            fig, ax_fp = plt.subplots(figsize=(7.5, 3.2))
            ax_bs = None

        fp_dat, bs_dat = make_heat_data(color_col)

        # heat map
        if has_binary_row:
            vmin = min(np.amin(fp_dat), np.amin(bs_dat))
            vmax = max(np.amax(fp_dat), np.amax(bs_dat))
        else:
            vmin, vmax = 0.0, 1.0
        cmap = "Blues"
        im_fp = ax_fp.imshow(fp_dat, vmin=vmin, vmax=vmax, cmap=cmap, aspect='auto')
        if has_binary_row:
            ax_bs.imshow(bs_dat, vmin=vmin, vmax=vmax, cmap=cmap, aspect='auto')

        # ticks
        ax_fp.tick_params(labelsize=tick_fontsize)
        ax_fp.set_yticks(np.arange(len(fp_fingerprints)), labels=fp_fingerprints, fontsize=tick_fontsize)
        if has_binary_row:
            ax_bs.tick_params(labelsize=tick_fontsize)
            ax_bs.set_xticks(np.arange(len(errors)), labels=errors, fontsize=tick_fontsize)
            ax_bs.set_yticks([0], labels=["binary"], fontsize=tick_fontsize)
            ax_fp.get_xaxis().set_visible(False)
        else:
            ax_fp.set_xticks(np.arange(len(errors)), labels=errors, fontsize=tick_fontsize)

        # axis labels
        ax_fp.set_ylabel("Fingerprint Bits", fontsize=legend_fontsize)
        (ax_bs if has_binary_row else ax_fp).set_xlabel("Model Error", fontsize=legend_fontsize)

        # numbers in heat map; the binary strip uses the same reference peak
        # as the fingerprint grid, as it did before.
        peak = np.amax(fp_dat)
        _annotate_cells(ax_fp, fp_dat, color_col, peak)
        if has_binary_row:
            _annotate_cells(ax_bs, bs_dat, color_col, peak)

        fig.tight_layout()

        # color bar (shared by both axes when the binary strip is present)
        cbar = fig.colorbar(im_fp, ax=[ax_fp, ax_bs] if has_binary_row else ax_fp, location="right")
        cbar.ax.tick_params(labelsize=tick_fontsize)
        cbar.ax.set_ylabel(color_col_to_hr[color_col], rotation=-90, va="bottom", fontsize=legend_fontsize)

        fig.savefig(f"results/error_fingerprint_study_{color_col}.pdf", bbox_inches="tight", dpi=300)

plt_exp_5()