In [None]:
import pandas as pd
import seaborn as sns
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Folder of the benchmark run to visualise; "<library>.csv" files are read
# from it for every entry of LIBRARY_NAMES.
BENCHMARK_RUN_FOLDER = Path("../output_saved/output_22-03-03_23:22:25_1ef313d_vm-midea03_max-level-3")

# Identifiers of the benchmarked configurations (each doubles as a CSV file stem).
LIBRARY_NAMES = [
    # third-party libraries
    "pcalg", "bnlearn", "cupc",
    # gpucsl variants
    "gpucsl", "gpucsl_multi_4gpu",
    "gpucsl_incl_compilation", "gpucsl_incl_compilation_multi_4gpu",
]

# Default figure size applied through seaborn's rc settings.
_figure_rc = {"figure.figsize": (20, 10)}
sns.set(rc=_figure_rc)

# Adapted from
# https://stackoverflow.com/questions/43214978/seaborn-barplot-displaying-values
def show_values_on_bars(axs):
    """Write each bar's height (two decimals) as a text label on top of the bar.

    Parameters
    ----------
    axs : axes object or numpy.ndarray of axes objects
        Every axes object must expose ``patches`` (objects providing
        ``get_x``/``get_y``/``get_width``/``get_height``) and a ``text``
        method. An ndarray of any shape is walked element by element.

    Bars whose height is not finite (NaN/inf) are skipped: there is no
    meaningful value to print and the label position would not be finite
    either.
    """

    def _show_on_single_plot(ax):
        for p in ax.patches:
            height = p.get_height()
            if not np.isfinite(height):
                # e.g. a bar without data; nothing sensible to annotate
                continue
            # horizontally centred on the bar, anchored at the bar's top edge
            _x = p.get_x() + p.get_width() / 2
            _y = p.get_y() + height
            value = "{:.2f}".format(height)
            ax.text(_x, _y, value, ha="center")

    if isinstance(axs, np.ndarray):
        for _, ax in np.ndenumerate(axs):
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)

In [None]:
# Load one result CSV per library. Loading is best-effort: a library whose
# file cannot be read or processed is reported and skipped so that the
# remaining libraries can still be plotted.
dfs = []
for library in LIBRARY_NAMES:
    try:
        library_df = pd.read_csv(BENCHMARK_RUN_FOLDER / f"{library}.csv")
        library_df["name"] = library
        # "dataset" becomes the index below; keep a copy as a regular column.
        library_df["benchmark_dataset"] = library_df["dataset"]
        library_df = library_df.set_index("dataset")
        dfs.append(library_df)
    except Exception as e:
        # Name the library so a skipped result set is traceable.
        print(f"Skipping {library}: {e!r}")

if not dfs:
    raise ValueError(
        f"No benchmark results could be loaded from {BENCHMARK_RUN_FOLDER}"
    )

all_benchmarks = pd.concat(dfs)

is_pcalg = all_benchmarks["name"] == "pcalg"
is_cupc = all_benchmarks["name"] == "cupc"

# The pcalg rows take over cupc's edge orientation time (assignment aligns on
# the dataset index), and that time is then subtracted from pcalg's skeleton
# time. NOTE(review): presumably pcalg reports both phases as one number and
# shares the orientation step with cupc -- confirm. If no cupc rows were
# loaded, the pcalg values become NaN.
all_benchmarks.loc[is_pcalg, "edge_orientation_time"] = all_benchmarks.loc[
    is_cupc, "edge_orientation_time"
]
all_benchmarks.loc[is_pcalg, "discover_skeleton_time"] = (
    all_benchmarks.loc[is_pcalg, "discover_skeleton_time"]
    - all_benchmarks.loc[is_pcalg, "edge_orientation_time"]
)

# Human-readable dataset labels. Both substitutions are literal substring
# replacements, so a dataset name that merely contains one of the organism
# names is relabelled as well.
all_benchmarks["display_name_dataset"] = (
    all_benchmarks["benchmark_dataset"]
    .str.replace("Saureus", "S.aureus", regex=False)
    .str.replace("Scerevisiae", "S.cerevisiae", regex=False)
)

# Human-readable library labels; whole-value replacement, names that are not
# listed are kept unchanged.
library_display_names = {
    "gpucsl_incl_compilation_multi_4gpu": "GPUCSL (4 GPUs) (incl. JIT compilation time)",
    "gpucsl_incl_compilation": "GPUCSL (1 GPU) (incl. JIT compilation time)",
    "gpucsl_multi_4gpu": "GPUCSL (4 GPUs)",
    "gpucsl": "GPUCSL (1 GPU)",
    "bnlearn": "bnlearn (8 cores)",
    "pcalg": "pcalg stable.fast (8 cores)",
    "cupc": "cuPC-S (1 GPU)",
}
all_benchmarks["display_name_library"] = all_benchmarks["name"].replace(
    library_display_names
)
all_benchmarks

In [None]:
# Full runtime on the gaussian datasets, leaving out every library whose
# name contains "multi".
d = (
    all_benchmarks
    .loc[lambda frame: frame["distribution"] == "gaussian"]
    .loc[lambda frame: ~frame["name"].str.match(".*multi.*")]
)
ax = sns.barplot(
    x="display_name_dataset",
    y="full_runtime",
    hue="display_name_library",
    data=d,
)
ax.set(
    yscale="log",
    xlabel="Dataset (multivariate normal distr.)",
    ylabel="Runtime (wall clock time, without correlation matrix computation, in seconds)",
    title="PC library runtime comparision for multivariate normal distributed datasets (maximum CI test level 3)",
)
show_values_on_bars(ax)
ax.legend(title="PC Library")

In [None]:
# Skeleton discovery time on the gaussian datasets. Libraries whose name
# starts with "gpucsl_incl_compilation" or contains "multi" are left out.
d = (
    all_benchmarks
    .loc[lambda frame: frame["distribution"] == "gaussian"]
    .loc[lambda frame: ~frame["name"].str.match("gpucsl_incl_compilation|.*multi.*")]
)
ax = sns.barplot(
    x="display_name_dataset",
    y="discover_skeleton_time",
    hue="display_name_library",
    data=d,
)
ax.set(
    yscale="log",
    xlabel="Dataset (multivariate normal distr.)",
    ylabel="Skeleton Computation Runtime (wall clock time, in seconds)",
    title="Skeleton function runtime comparision for multivariate normal distributed datasets (maximum CI test level 3)",
)
show_values_on_bars(ax)
ax.legend(title="PC Library")

In [None]:
# Kernel time on the gaussian datasets. Libraries whose name starts with
# "gpucsl_incl_compilation" or "pcalg", or contains "multi", are left out.
d = (
    all_benchmarks
    .loc[lambda frame: frame["distribution"] == "gaussian"]
    .loc[
        lambda frame: ~frame["name"].str.match(
            "gpucsl_incl_compilation|.*multi.*|pcalg"
        )
    ]
)
ax = sns.barplot(
    x="display_name_dataset",
    y="kernel_time",
    hue="display_name_library",
    data=d,
)
ax.set(
    yscale="log",
    xlabel="Dataset (multivariate normal distr.)",
    ylabel="Skeleton Kernel Computation Runtime (wall clock time, in seconds)",
    title="Skeleton kernel function runtime (loop through all levels) comparision for multivariate normal distributed datasets (maximum CI test level 3)",
)
show_values_on_bars(ax)
ax.legend(title="PC Library")

In [None]:
# Edge orientation time on the gaussian datasets. Libraries whose name
# starts with "gpucsl_incl_compilation" or contains "multi" are left out.
d = (
    all_benchmarks
    .loc[lambda frame: frame["distribution"] == "gaussian"]
    .loc[lambda frame: ~frame["name"].str.match("gpucsl_incl_compilation|.*multi.*")]
)
ax = sns.barplot(
    x="display_name_dataset",
    y="edge_orientation_time",
    hue="display_name_library",
    data=d,
)
ax.set(
    yscale="log",
    xlabel="Dataset (multivariate normal distr.)",
    ylabel="Edge Orientation Runtime (wall clock time, in seconds)",
    title="Edge Orientation runtime comparision for multivariate normal distributed datasets (maximum CI test level 3)",
)
show_values_on_bars(ax)
ax.legend(title="PC Library")

In [None]:
# Full runtime on the discrete datasets, leaving out every library whose
# name contains "multi".
d = (
    all_benchmarks
    .loc[lambda frame: frame["distribution"] == "discrete"]
    .loc[lambda frame: ~frame["name"].str.match(".*multi.*")]
)

ax = sns.barplot(
    x="display_name_dataset",
    y="full_runtime",
    hue="display_name_library",
    data=d,
)
ax.set(
    yscale="log",
    xlabel="Dataset (discrete)",
    ylabel="Runtime (wall clock time, in seconds)",
    title="PC library runtime comparision for discrete datasets (maximum CI test level 3)",
)
show_values_on_bars(ax)
ax.legend(title="PC Library")

In [None]:
# TODO: the numbers look a bit odd, re-run on delos
# TODO: peter improvements of sepset merging?
# Full runtime on the gaussian datasets, restricted to the libraries whose
# name starts with "gpucsl".
d = (
    all_benchmarks
    .loc[lambda frame: frame["distribution"] == "gaussian"]
    .loc[lambda frame: frame["name"].str.match("gpucsl")]
)

ax = sns.barplot(
    x="display_name_dataset",
    y="full_runtime",
    hue="display_name_library",
    data=d,
)
ax.set(
    yscale="log",
    xlabel="Dataset",
    ylabel="Runtime (wall clock time, in seconds)",
    title="GPUCSL runtime comparision between single and multi GPU (maximum CI test level 3)",
)
show_values_on_bars(ax)
ax.legend(title="PC Library")

In [None]:
def plot_clustered_stacked(
    dfall, labels=None, title="multiple stacked bar plot", H="/", **kwargs
):
    """Draw several dataframes as clusters of stacked bars on one log-scaled axes.

    Every dataframe is drawn as a stacked bar plot (one stack per index
    entry, one segment per column). The stacks of the i-th dataframe are then
    shifted sideways and hatched with ``H * i`` so that the dataframes sit
    next to each other inside each cluster.

    Parameters
    ----------
    dfall : list of pandas.DataFrame
        Dataframes with identical columns and index; must not be empty.
    labels : list of str, optional
        One name per dataframe, shown in a second "Library" legend. When
        omitted only the "Runtime" (column) legend is drawn.
    title : str
        Title of the plot.
    H : str
        Hatch pattern; it is repeated i times for the i-th dataframe, so the
        first dataframe is drawn without hatching.
    **kwargs
        Forwarded to ``DataFrame.plot``.

    Returns
    -------
    The axes the bars were drawn on. The x/y axis labels are fixed by this
    function.
    """

    n_df = len(dfall)
    n_col = len(dfall[0].columns)
    n_ind = len(dfall[0].index)
    bar_width = 1 / float(n_df + 1)
    axe = plt.subplot(111)
    axe.set_yscale("log")

    for df in dfall:  # one stacked bar plot per dataframe, all on the same axes
        axe = df.plot(
            kind="bar",
            linewidth=0,
            stacked=True,
            ax=axe,
            legend=False,
            grid=False,
            **kwargs
        )

    # The handles come in groups of n_col per dataframe: len(h) = n_col * n_df
    h, l = axe.get_legend_handles_labels()
    for df_pos in range(n_df):
        for pa in h[df_pos * n_col : (df_pos + 1) * n_col]:
            for rect in pa.patches:  # one rectangle per index entry
                rect.set_x(rect.get_x() + bar_width * df_pos)
                rect.set_hatch(H * df_pos)
                rect.set_width(bar_width)

    axe.set_xticks((np.arange(0, 2 * n_ind, 2) + bar_width) / 2.0)
    # All dataframes are expected to share the index; the last one provides
    # the tick labels.
    axe.set_xticklabels(dfall[-1].index, rotation=0)
    axe.set_title(title)
    axe.set_xlabel("Dataset")
    axe.set_ylabel("Cumulated Runtime (seconds, without compilation time)")

    # Zero-sized bars that only serve as handles for the hatch legend
    hatch_handles = [
        axe.bar(0, 0, color="gray", hatch=H * df_pos) for df_pos in range(n_df)
    ]

    runtime_legend = axe.legend(
        h[:n_col], l[:n_col], title="Runtime", loc=[1.01, 0.5]
    )
    if labels is not None:
        # Creating a second legend replaces the first one on the axes, so the
        # first is re-added as a plain artist. This is only done here: without
        # the second legend the first is still attached and would otherwise
        # be drawn twice.
        axe.legend(hatch_handles, labels, title="Library", loc=[1.01, 0.1])
        axe.add_artist(runtime_legend)
    return axe


# Clustered stacked bar plot, see
# https://stackoverflow.com/questions/22787209/how-to-have-clusters-of-stacked-bars-with-python-pandas
# NOTE(review): `dfs` holds the per-library frames as loaded; the pcalg timing
# adjustments applied to `all_benchmarks` are not part of them -- confirm that
# this is intended.
plot_clustered_stacked(
    [df[["discover_skeleton_time", "edge_orientation_time"]] for df in dfs],
    # The frames are indexed by dataset name, so the first row has to be
    # selected by position explicitly.
    [df["name"].iloc[0] for df in dfs],
    title="GPUCSL benchmark runtimes",
)