### Evaluation of the comparison of CI tests applied within CSL

In [None]:
### imports
import csv
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import networkx as nx
import igraph as ig

In [None]:
### parameters
# Maps each CI-test method (its LaTeX-formatted label is also used as the
# `method` value / legend entry in the plots) to the directory that holds the
# graphs learned with that method.
methodLocDict = {
    '$KCIT$': './results_KCIT/',
    '$mCMIkNN$': './results_mCMIkNN/',
    # Raw string: '\c' is not a valid escape sequence, so the non-raw literal
    # raises a DeprecationWarning/SyntaxWarning. The runtime value is unchanged.
    r'$disc\chi^2$': './results_discretized/',
    '$CG$': './results_MICD/',
}

# Directory with the ground-truth graph files that are read via nx.read_gml.
groundTruthLoc = '../data_generation/csl_graph/'

In [None]:
### compute metrics and store results
# For every ground-truth graph, sample size and CI-test method, compare the
# skeleton of the learned graph with the ground-truth skeleton and collect the
# confusion counts plus FPR, TPR, precision and F1 as one row in `entry`.
entry = []
for graphFile in os.listdir(groundTruthLoc):
    # The file name is split on '_' into cgmid, number of variables, edge
    # density and discrete node ratio; 'c' is replaced by '.' in the last two,
    # and the last four characters (the file extension) are dropped.
    splitted = graphFile.split('_')
    cgmid = splitted[0]
    nvars = splitted[1]
    ed = splitted[2].replace('c', '.')
    dnr = splitted[3][:-4].replace('c', '.')
    graphStem = graphFile[:-4]

    gt_graph = nx.read_gml(groundTruthLoc + graphFile, destringizer=int)
    # Skeleton as a directed graph: every adjacency is present in both
    # orientations, which is why all counts below are halved.
    gt_graph_skel = gt_graph.to_undirected().to_directed()
    gt_edges = gt_graph_skel.edges()
    gt_non_edges = set(nx.non_edges(gt_graph_skel))

    for n in [50, 100, 250, 500, 1000]:
        for method, resultLoc in methodLocDict.items():
            learned_path = resultLoc + graphStem + '_' + str(n) + '.gml'
            try:
                # The mCMIkNN results are read with networkx directly; all
                # other methods are read through igraph and converted.
                if method == '$mCMIkNN$':
                    learned_graph = nx.read_gml(learned_path, destringizer=int)
                else:
                    learned_graph = ig.Graph.Read_GML(learned_path).to_networkx()
                learned_graph_skel = learned_graph.to_undirected().to_directed()
                learned_edges = learned_graph_skel.edges()
                learned_non_edges = set(nx.non_edges(learned_graph_skel))

                tp = len(gt_edges & learned_edges) // 2
                tn = len(gt_non_edges & learned_non_edges) // 2
                fp = len(gt_non_edges & learned_edges) // 2
                fn = len(gt_edges & learned_non_edges) // 2

                # A zero denominator (e.g. no negatives at all) raises
                # ZeroDivisionError; that combination is reported and skipped.
                fpr = fp / (fp + tn)
                tpr = tp / (tp + fn)
                precision = tp / (fp + tp)
                f1 = 2 * tp / (fp + fn + 2 * tp)
                entry.append([method, cgmid, nvars, ed, dnr, n, tp, tn, fp, fn, fpr, tpr, precision, f1])
            except Exception as err:
                # Best effort: keep going with the remaining combinations, but
                # report the cause. `Exception` (instead of a bare except)
                # lets KeyboardInterrupt/SystemExit stop the loop.
                print("Error in", n, method, graphFile, "-", repr(err))

columns = ['method','cgmid','nvars','edgeDensity','discreteNodeRatio','samples','truePositives','trueNegatives','falsePositives','falseNegatives','fpr','tpr','Precision','F1']

df = pd.DataFrame(entry, columns=columns)

df.to_csv('./results.csv')


### generate plots

In [None]:
### read from csv
# Reload the stored metrics so the plotting cells below work on the CSV
# contents rather than on the in-memory frame.
df = pd.read_csv(filepath_or_buffer='./results.csv')

### Plot for Figure D.6 in Appendix

In [None]:
plt.rcParams.update({'font.size': 12})
unique = df["method"].unique()
palette = 'colorblind'
# Raw string for the chi^2 label: '\c' is not a valid escape sequence and the
# non-raw literal raises a DeprecationWarning/SyntaxWarning. Same runtime value.
hue_order = ["$mCMIkNN$", "$CG$", "$KCIT$", r"$disc\chi^2$"]


# Figure and axes of the combined plot; they are drawn on and saved in the
# "Figure 3" cell further down.
# NOTE(review): four axes are created, but the visible cells only draw on
# axs[0], axs[1] and axs[2] - confirm whether the fourth panel is intended.
fig, axs = plt.subplots(ncols=4, sharey=True, figsize=(16, 3.0))

# Normalise the column dtypes (the values may arrive as strings).
df = df.astype({
    'samples': int,
    'nvars': int,
    'discreteNodeRatio': float,
    'edgeDensity': float,
})


# Variant 1: N over samples (3)
# F1 - Boxplot

# .copy() yields an independent frame, so adding the facet column below does
# not trigger pandas' SettingWithCopyWarning.
subset = df[["samples", "method", "F1", "nvars"]].copy()

# Duplicate of `nvars` under a display name that is used for the facet titles.
subset['number of variables $N$'] = subset["nvars"]

g = sns.FacetGrid(subset, height=4, aspect=1, col='number of variables $N$')
g.map_dataframe(sns.boxplot, x="samples", y="F1", hue="method", palette=palette, hue_order=hue_order)
g.add_legend()
# Label every facet of the (single) row instead of assuming exactly three.
for ax in g.axes[0]:
    ax.set_xlabel('sample sizes $n$')
g.axes[0, 0].set_ylabel('F1 score')
g.savefig('./case_3_f1_nodes_over_samples.pdf')

In [None]:
### limit the remaining figures to nvar == 30
# .copy(): the following cells assign columns on `df`; doing that on a
# boolean-mask slice would trigger pandas' SettingWithCopyWarning.
df = df[df.nvars == 30].copy()

### Plot for Figure 3 in paper

In [None]:
def _f1_panel(column, axis, xlabel, ylabel=""):
    """Draw the F1 box plot grouped by `column` on `axis` and return the axes.

    Sets the axis labels and hides the top and right spines; the legend is
    left to the caller.
    """
    panel = sns.boxplot(data=df, x=column, y="F1", hue="method",
                        hue_order=hue_order, palette=palette, ax=axis)
    panel.set_ylabel(ylabel)
    panel.set_xlabel(xlabel)
    for side in ("top", "right"):
        panel.spines[side].set_visible(False)
    return panel


# Panel 1: F1 over sample sizes (legend suppressed).
f1 = _f1_panel("samples", axs[0], "sample sizes $n$", ylabel="F1 score")
f1.legend([], [], frameon=False)

# Panel 2: F1 over the ratio of discrete variables (legend suppressed).
f2 = _f1_panel("discreteNodeRatio", axs[1], 'ratio of discrete variables')
f2.legend([], [], frameon=False)

# Panel 3: F1 over the density of the CGM; this panel carries the legend,
# placed above the axes.
f3 = _f1_panel("edgeDensity", axs[2], 'density of the CGM')
f3.legend(loc='upper center', bbox_to_anchor=(-0.1, 1.15), frameon=False, ncol=4)

fig.savefig('./case_3_combined.pdf', bbox_inches="tight")


### Plot for Figure D.4 in Appendix

In [None]:
# F1 box plots over sample sizes, one facet per edge density.
# Rebinding `df` via assign (instead of assigning a column in place) avoids
# SettingWithCopyWarning when `df` is a filtered slice.
df = df.assign(samples=df['samples'].astype(int))
# .copy() so that the facet column can be added without a warning.
subset = df[["samples", "method", "F1", "edgeDensity"]].copy()
# Duplicate of `edgeDensity` under a display name used for the facet titles.
subset['density of the CGM'] = subset["edgeDensity"]

g = sns.FacetGrid(subset, height=4, aspect=1, col='density of the CGM')
g.map_dataframe(sns.boxplot, x="samples", y="F1", hue="method", palette=palette, hue_order=hue_order)
g.add_legend()
# Label every facet of the (single) row instead of assuming exactly four.
for ax in g.axes[0]:
    ax.set_xlabel('sample sizes $n$')
g.axes[0, 0].set_ylabel('F1 score')
g.savefig('./case_3_f1_density_over_samples.pdf')

### Plot for Figure D4 in Appendix

In [None]:
# F1 box plots over sample sizes, one facet per ratio of discrete variables.
# Rebinding `df` via assign (instead of assigning a column in place) avoids
# SettingWithCopyWarning when `df` is a filtered slice.
df = df.assign(samples=df['samples'].astype(int))
# .copy() so that the facet column can be added without a warning.
subset = df[["samples", "method", "F1", "discreteNodeRatio"]].copy()

# Duplicate of `discreteNodeRatio` under a display name used for the facet titles.
subset['ratio of discrete variables'] = subset["discreteNodeRatio"]

g = sns.FacetGrid(subset, height=4, aspect=1, col='ratio of discrete variables')
g.map_dataframe(sns.boxplot, x="samples", y="F1", hue="method", palette=palette, hue_order=hue_order)
g.add_legend()
# Label every facet of the (single) row instead of assuming exactly three.
for ax in g.axes[0]:
    ax.set_xlabel('sample sizes $n$')
g.axes[0, 0].set_ylabel('F1 score')
g.savefig('./case_3_f1_dnr_over_samples.pdf')