In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from scip_workflows.common import *


In [None]:
# Resolve the input and output locations. If a `snakemake` object exists in
# the namespace its declared inputs/outputs are used; otherwise the lookup
# raises NameError and the hard-coded development paths below are used.
try:
    hpo_scip_path = snakemake.input.hpo_scip
    hpo_ideas_path = snakemake.input.hpo_ideas
    output_metrics = snakemake.output.metrics
    output_table = snakemake.output.table
except NameError:
    # Fallback for running the notebook by hand.
    # NOTE(review): machine-specific absolute path; adjust for other hosts.
    data_dir = Path("/home/maximl/scratch/data/vsc/datasets/wbc/scip/20220713131400/")
    hpo_scip_path = data_dir.joinpath("hpo", "WBC_rsh_scip_cyto_li_xgboost.pickle")
    # The IDEAS results live two directory levels above the SCIP run directory.
    hpo_ideas_path = data_dir.parents[1].joinpath(
        "ideas", "hpo", "WBC_rsh_ideas_cyto_li_xgboost.pickle"
    )
    output_metrics = data_dir.joinpath("figures", "classification_comparison.png")
    output_table = data_dir.joinpath("tables", "WBC_classification_comparison.tex")


In [None]:
# Deserialize the stored results for both software packages.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# point these paths at files produced by a trusted pipeline.
with open(hpo_scip_path, "rb") as scip_fh, open(hpo_ideas_path, "rb") as ideas_fh:
    hpo_scip = pickle.load(scip_fh)
    hpo_ideas = pickle.load(ideas_fh)


In [None]:
# Columns removed from each result table before melting; every remaining
# column is treated as a "<phase>_<metric>" score column.
_DROPPED_COLUMNS = [
    "score_time",
    "fit_time",
    "estimator",
    "train_recall_macro",
    "test_recall_macro",
]


def _to_long_format(results, software):
    """Melt one result table into long format and tag it with its software.

    Parameters
    ----------
    results
        Anything ``pandas.DataFrame`` accepts that yields the columns listed
        in ``_DROPPED_COLUMNS`` (presumably scikit-learn cross-validation
        output -- TODO confirm against the producing workflow step).
    software : str
        Label stored in the ``software`` column of the returned frame.

    Returns
    -------
    pandas.DataFrame
        Frame with ``variable``, ``value`` and ``software`` columns.
    """
    long_df = pandas.DataFrame(results).drop(columns=_DROPPED_COLUMNS).melt()
    long_df["software"] = software
    return long_df


scip_df = _to_long_format(hpo_scip, "SCIP")
ideas_df = _to_long_format(hpo_ideas, "IDEAS")

df = pandas.concat([scip_df, ideas_df])

# Split each column name on underscores: the first token is the phase, the
# remaining tokens (joined with spaces) form the readable metric name.
split_names = [name.split("_") for name in df["variable"]]
df["phase"] = [tokens[0] for tokens in split_names]
df["metric"] = [" ".join(tokens[1:]) for tokens in split_names]


In [None]:
# Strip plot of every individual score: one panel per metric, one row per
# software package, with the phases dodged and distinguished by hue.
g = seaborn.catplot(
    data=df,
    kind="strip",
    col="metric",
    x="value",
    y="software",
    hue="phase",
    dodge=True,
)

# Apply the same box aspect (height/width = 0.4) to every panel.
for ax in g.axes.flat:
    ax.set_box_aspect(0.4)

# Save the current figure with a tight bounding box.
plt.savefig(output_metrics, bbox_inches="tight")


In [None]:
# Summarize the individual scores as mean and standard error of the mean for
# every (metric, phase, software) combination.
scores_df = df.groupby(["metric", "phase", "software"])["value"].agg(
    mean="mean", sem=scipy.stats.sem
)

# Capitalize the index level names so they read well as table headers.
scores_df.index = scores_df.index.set_names(
    [level.capitalize() for level in scores_df.index.names]
)

# Pre-render each cell as "mean (sem)" with three decimals.
scores_df["Mean (std. error)"] = [
    "%.3f (%.3f)" % (mean, sem)
    for mean, sem in zip(scores_df["mean"], scores_df["sem"])
]


In [None]:
# Reshape to one row per (Metric, Software) pair and one column per phase,
# keeping only the pre-formatted "mean (sem)" strings as cell values.
flat_scores = scores_df.reset_index()
scores = flat_scores.pivot(
    index=["Metric", "Software"], columns="Phase", values="Mean (std. error)"
)


In [None]:
# Blank out the name of the column index left behind by the pivot.
scores.columns.name = ""
# Capitalize the column labels for display.
# NOTE(review): assigning a plain list builds a fresh column index, which
# appears to discard the name set on the previous line -- confirm whether
# that reset is still needed.
scores.columns = list(map(str.capitalize, scores.columns))


In [None]:
# Show the transposed table (the former columns become rows).
scores.transpose()


In [None]:
# Render the transposed table as LaTeX (with horizontal rules and
# left-aligned multicolumn headers) and echo the source for inspection.
latex_table = scores.T.style.to_latex(hrules=True, multicol_align="l")
print(latex_table)


In [None]:
# Write the transposed score table as LaTeX (horizontal rules, left-aligned
# multicolumn headers) to the configured output path.
with open(output_table, "w") as fh:
    fh.write(scores.T.style.to_latex(hrules=True, multicol_align="l"))