### Evaluation of the CI test comparison

In [None]:
### imports
import pandas as pd
import os
import seaborn as sns
import warnings
import math
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics

In [None]:
### parameters
# Significance level: a CI test decision rejects H_0 when pvalue <= alpha.
alpha = 0.05
# Result files to evaluate: every CSV in the current working directory.
# Match on the file extension rather than a substring (which would also pick
# up names such as 'results.csv.bak'), and sort because os.listdir returns
# entries in arbitrary order.
files = sorted(f for f in os.listdir('./') if f.endswith('.csv'))

def calc_auc(y,pred):
    """Return the area under the ROC curve of ``pred`` scored against ``y``.

    The curve is computed with ``metrics.roc_curve`` and integrated with
    ``metrics.auc``; the thresholds returned by ``roc_curve`` are not needed.
    """
    roc_points = metrics.roc_curve(y, pred)
    false_positive_rate, true_positive_rate = roc_points[0], roc_points[1]
    return metrics.auc(false_positive_rate, true_positive_rate)

def calc_typeIrate(y,pred):
    """Return the type I error rate FP / (FP + TN) of ``pred`` against ``y``.

    Both arguments are expected to use the labels 0 (negative) and 1
    (positive); the confusion matrix is always built for exactly these two
    labels so that it unpacks into four cells.

    Returns NaN when ``y`` contains no sample labeled 0, because the rate is
    undefined in that case.
    """
    tn, fp, fn, tp = metrics.confusion_matrix(y, pred, labels=[0, 1]).ravel()
    negatives = fp + tn
    if negatives == 0:
        # Undefined rate: return NaN explicitly instead of relying on a 0/0
        # NumPy division, which yields NaN only after emitting a RuntimeWarning.
        return float("nan")
    return fp / negatives

def calc_typeIIrate(y,pred):
    """Return the type II error rate FN / (TP + FN) of ``pred`` against ``y``.

    Both arguments are expected to use the labels 0 (negative) and 1
    (positive); the confusion matrix is always built for exactly these two
    labels so that it unpacks into four cells.

    Returns NaN when ``y`` contains no sample labeled 1, because the rate is
    undefined in that case.
    """
    tn, fp, fn, tp = metrics.confusion_matrix(y, pred, labels=[0, 1]).ravel()
    positives = tp + fn
    if positives == 0:
        # Undefined rate: return NaN explicitly instead of relying on a 0/0
        # NumPy division, which yields NaN only after emitting a RuntimeWarning.
        return float("nan")
    return fn / positives

In [None]:
### preparation
# Load every result file and stack the rows into a single frame.
dfs = [pd.read_csv('./' + file) for file in files]
df = pd.concat(dfs, ignore_index=True)

### Optional: penalize every entry where a method reports pvalue == -1.0, i.e., it failed
### to compute, by assigning the p-value that leads to the wrong decision, given that
# CI test decisions are, i.e.,
# reject H_0 for pvalue <= alpha, i.e., we assume H_1 is true -> edge (1)
# cannot reject for pvalue > alpha i.e., we assume H_0 is true -> no edge (0)
failed = df["pvalue"] == -1.0
# no edge in the ground truth -> pvalue 0.0 forces the (wrong) decision "edge"
df['pvalue'] = np.where(failed & (df['hasedge'] == False), 0.0, df['pvalue'])
# edge in the ground truth -> pvalue 1.0 forces the (wrong) decision "no edge"
df['pvalue'] = np.where(failed & (df['hasedge'] == True), 1.0, df['pvalue'])


### transform data
# Ground-truth label stored in column "H_0": 0 = no edge, 1 = edge.
df.loc[df["hasedge"] == False, "H_0"] = 0    #H_0 true is 0 (independent, no edge)
df.loc[df["hasedge"] == True, "H_0"] = 1    #H_0 false is 1 (dependent, edge)
# CI test decisions, i.e.,
# reject H_0 for pvalue <= alpha, i.e., we assume H_1 is true -> edge (1)
# cannot reject for pvalue > alpha i.e., we assume H_0 is true -> no edge (0)
df["CItest"] = df["pvalue"].le(alpha).astype(int)

## rename
# Map the method identifiers found in the result files to the mathtext labels
# used in the figures. Raw strings keep the backslash of "\chi" literal; in a
# normal string "\c" is an invalid escape sequence that Python warns about.
method_labels = {
    "discretized-x2": r"$disc\chi^2$",
    "KCIT": r"$KCIT$",
    "MICD": r"$CG$",
    "cmipchisq95": r"$aHist\chi^2$",
    "mCMIkNN": r"$mCMIkNN$",
}
for raw_name, label in method_labels.items():
    df.loc[df["method"] == raw_name, "method"] = label
# select
# Keep only the rows whose separation-set size is one of 1, 3, 5, 7.
df = df[df.sepsetsize.isin([1,3,5,7])]
# Fixed legend / box order shared by all figures.
hue_order = [r"$mCMIkNN$", r"$CG$", r"$KCIT$", r"$disc\chi^2$", r"$aHist\chi^2$"]

### Figure 2 in paper

In [None]:
# Figure 2: box plots of the ROC AUC score per method, one panel per factor.
plt.rcParams.update({'font.size': 12})
subset = df[["cgmid","samples", "sepsetsize", "method", "discretenoderatio", "pvalue", "H_0", "CItest"]]
fig, axs = plt.subplots(ncols=3, sharey=True, figsize=(16, 3.6))

# One panel per factor: (grouping column, x-axis label, y-axis label, show legend).
# Only the middle panel carries the legend, placed above the axes.
panels = [
    ("samples", "sample sizes $n$", "ROC AUC score", False),
    ("sepsetsize", "$d_Z$", "", True),
    ("discretenoderatio", "ratio of discrete variables", "", False),
]

drawn = []
for ax, (factor, xlabel, ylabel, with_legend) in zip(axs, panels):
    # One AUC value per (cgmid, factor level, method) group. Selecting the two
    # columns the metric reads before apply() keeps the grouping columns out of
    # the applied frames, which newer pandas versions warn about.
    grouped = (
        subset.groupby(["cgmid", factor, "method"])[["H_0", "CItest"]]
        .apply(lambda x: calc_auc(x[["H_0"]], x[["CItest"]]))
        .reset_index(name='AUC')
    )
    panel = sns.boxplot(x=factor, y="AUC", hue="method", palette="colorblind",
                        data=grouped, hue_order=hue_order, ax=ax)
    panel.set_ylabel(ylabel)
    panel.set_xlabel(xlabel)
    panel.spines["top"].set_visible(False)
    panel.spines["right"].set_visible(False)
    if with_legend:
        panel.legend(loc='upper center', bbox_to_anchor=(0.5, 1.25), frameon=False, ncol=6)
    else:
        # An empty legend suppresses the per-panel legend seaborn would draw.
        panel.legend([], [], frameon=False)
    drawn.append(panel)

# Keep the per-panel axes available under their original names.
f1, f2, f3 = drawn

fig.savefig('./case2ROC.pdf',bbox_inches="tight")



### Figure A.2 in appendix

In [None]:
# Figure A.2: box plots of the type I error rate per method, one panel per factor.
plt.rcParams.update({'font.size': 12})
subset = df[["cgmid","samples", "sepsetsize", "method", "discretenoderatio", "pvalue", "H_0", "CItest"]]
fig, axs = plt.subplots(ncols=3, sharey=True, figsize=(16, 3.6))

# One panel per factor: (grouping column, x-axis label, y-axis label, show legend).
# Only the middle panel carries the legend, placed above the axes.
panels = [
    ("samples", "sample sizes $n$", "type I error rate", False),
    ("sepsetsize", "$d_Z$", "", True),
    ("discretenoderatio", "ratio of discrete variables", "", False),
]

drawn = []
for ax, (factor, xlabel, ylabel, with_legend) in zip(axs, panels):
    # One type I error rate per (cgmid, factor level, method) group. The value
    # column is named for what it holds ("typeI") rather than "AUC". Selecting
    # the two columns the metric reads before apply() keeps the grouping
    # columns out of the applied frames, which newer pandas versions warn about.
    grouped = (
        subset.groupby(["cgmid", factor, "method"])[["H_0", "CItest"]]
        .apply(lambda x: calc_typeIrate(x[["H_0"]], x[["CItest"]]))
        .reset_index(name='typeI')
    )
    panel = sns.boxplot(x=factor, y="typeI", hue="method", palette="colorblind",
                        data=grouped, hue_order=hue_order, ax=ax)
    panel.set_ylabel(ylabel)
    panel.set_xlabel(xlabel)
    panel.spines["top"].set_visible(False)
    panel.spines["right"].set_visible(False)
    if with_legend:
        panel.legend(loc='upper center', bbox_to_anchor=(0.5, 1.25), frameon=False, ncol=6)
    else:
        # An empty legend suppresses the per-panel legend seaborn would draw.
        panel.legend([], [], frameon=False)
    drawn.append(panel)

# Keep the per-panel axes available under their original names.
f1, f2, f3 = drawn

fig.savefig('./case2typeI.pdf',bbox_inches="tight")



### Figure A.3 in appendix

In [None]:
# Figure A.3: box plots of the type II error rate per method, one panel per factor.
plt.rcParams.update({'font.size': 12})
subset = df[["cgmid","samples", "sepsetsize", "method", "discretenoderatio", "pvalue", "H_0", "CItest"]]
fig, axs = plt.subplots(ncols=3, sharey=True, figsize=(16, 3.6))

# One panel per factor: (grouping column, x-axis label, y-axis label, show legend).
# Only the middle panel carries the legend, placed above the axes.
panels = [
    ("samples", "sample sizes $n$", "type II error rate", False),
    ("sepsetsize", "$d_Z$", "", True),
    ("discretenoderatio", "ratio of discrete variables", "", False),
]

drawn = []
for ax, (factor, xlabel, ylabel, with_legend) in zip(axs, panels):
    # One type II error rate per (cgmid, factor level, method) group. The value
    # column is named for what it holds ("typeII") rather than "AUC". Selecting
    # the two columns the metric reads before apply() keeps the grouping
    # columns out of the applied frames, which newer pandas versions warn about.
    grouped = (
        subset.groupby(["cgmid", factor, "method"])[["H_0", "CItest"]]
        .apply(lambda x: calc_typeIIrate(x[["H_0"]], x[["CItest"]]))
        .reset_index(name='typeII')
    )
    panel = sns.boxplot(x=factor, y="typeII", hue="method", palette="colorblind",
                        data=grouped, hue_order=hue_order, ax=ax)
    panel.set_ylabel(ylabel)
    panel.set_xlabel(xlabel)
    panel.spines["top"].set_visible(False)
    panel.spines["right"].set_visible(False)
    if with_legend:
        panel.legend(loc='upper center', bbox_to_anchor=(0.5, 1.25), frameon=False, ncol=6)
    else:
        # An empty legend suppresses the per-panel legend seaborn would draw.
        panel.legend([], [], frameon=False)
    drawn.append(panel)

# Keep the per-panel axes available under their original names.
f1, f2, f3 = drawn

fig.savefig('./case2TypeII.pdf',bbox_inches="tight")
