In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import tree
from scipy import interp
import os

In [13]:
# Locations used by the notebook: raw datasets, intermediate result listings,
# and the folder the generated figures are written to.
datasetPath = "../datasets/"
dataPath = "./data3/"
resultsPath = "./results3/"


# Create the results folder if it does not exist yet.
# exist_ok=True turns "already exists" into a no-op (this also covers another
# process creating the folder concurrently), so no errno inspection is needed;
# any other OSError still propagates.
os.makedirs(resultsPath, exist_ok=True)

In [8]:
# Index of the result files produced earlier; each row's 'results' entry is
# the path of one per-experiment CSV.
resultsList = pd.read_csv(dataPath + "results-list.csv")

# The experiment type is the third dash-separated token of the result path,
# with everything from the first dot onwards removed.
serie = pd.Series(
    [path.split("-")[2].split(".")[0] for path in resultsList['results']]
)
resultsList['type'] = serie

# One summary accuracy per result file: average the confusion-matrix counts
# within each 'sr' group (the 't' column is dropped), turn them into an
# accuracy per group, then average those accuracies.
accuracy = []
for path in resultsList['results']:
    per_sr = pd.read_csv(path).groupby(['sr']).mean().drop(['t'], axis=1)
    correct = per_sr['TP'] + per_sr['TN']
    total = correct + per_sr['FN'] + per_sr['FP']
    per_sr['accuracy'] = correct / total
    accuracy.append(per_sr['accuracy'].mean())
resultsList['accuracy'] = pd.Series(accuracy)

In [9]:
# Row keys that describe the experiment itself rather than an estimator
# hyper-parameter; they are left out of the configuration dicts.
NON_PARAM_KEYS = ["results", "type", "accuracy", "file"]

configSet = []

# NOTE(review): this cell used to start from `resultsList[idx]`, but no visible
# cell defines `idx`, so it raised NameError on a fresh kernel. If `idx` was a
# meaningful row filter, re-introduce it explicitly here.
is_tree_task = resultsList['type'].isin(["classification", "regression"])

# Ask pandas to replace missing values with None so that unset
# hyper-parameters are passed to the estimators as None rather than NaN.
resultsList2 = resultsList[is_tree_task].where(pd.notnull(resultsList), None)

for f in resultsList2["file"].unique():
    for t in resultsList2["type"].unique():
        # Both conditions are taken from resultsList2 so the boolean mask has
        # exactly the index of the frame it filters.
        subset = resultsList2.loc[(resultsList2["file"] == f) & (resultsList2["type"] == t)]

        # Keep the 10 most accurate configurations per (file, type) pair,
        # best first.
        for index, r in subset.sort_values("accuracy", ascending=False)[:10].iterrows():
            # class_weight is skipped for regression rows (presumably because
            # the regressor does not accept it -- confirm against sklearn).
            configuration = {
                key: r[key]
                for key in r.keys()
                if key not in NON_PARAM_KEYS
                and not (r["type"] == "regression" and key == "class_weight")
            }
            configSet.append({"file": r["file"], "type": r["type"], "configuration": configuration})

  


In [24]:
NSUBS = 10                          # shuffle-split repetitions behind each mean ROC curve
RANDOM_STATE = 42                   # fixed seed so the figures are reproducible on re-run
MEAN_FPR = np.linspace(0, 1, 100)   # common FPR grid every fold is interpolated onto


def _first_configuration(configs, filename, task):
    """Return the first stored hyper-parameter dict for `filename` and `task`.

    `configs` is the frame built from `configSet`; `task` is "classification"
    or "regression". Raises IndexError when no matching row exists.
    """
    selected = configs[(configs['file'] == filename) & (configs['type'] == task)]
    return selected.configuration.values[0]


def _fold_roc(y_true, scores):
    """Return (TPR interpolated onto MEAN_FPR, AUC) for one train/test split."""
    fpr, tpr, _ = roc_curve(y_true, scores)
    # np.interp is the function the deprecated scipy.interp alias pointed to.
    tpr_on_grid = np.interp(MEAN_FPR, fpr, tpr)
    tpr_on_grid[0] = 0.0  # force the curve through the origin
    return tpr_on_grid, auc(fpr, tpr)


def _plot_mean_roc(ax, tprs, aucs, color, name):
    """Draw the mean ROC curve and its +/- 1 std band (dashed) on `ax`.

    `tprs` is a list of per-fold TPR arrays on MEAN_FPR, `aucs` the matching
    per-fold AUC values, `name` the model family shown in the legend.
    """
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # force the curve to end at (1, 1)
    mean_auc = auc(MEAN_FPR, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(MEAN_FPR, mean_tpr, color=color,
            label=r'Mean ROC %s (AUC = %0.2f $\pm$ %0.2f)' % (name, mean_auc, std_auc),
            lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.plot(MEAN_FPR, tprs_lower, '--', color=color,
            label=r'Mean ROC %s STD' % name,
            lw=1, alpha=.8)
    ax.plot(MEAN_FPR, tprs_upper, '--', color=color, lw=1, alpha=.8)


dfConfig = pd.DataFrame(configSet)
for filename in dfConfig.file.unique():

    d = pd.read_csv(datasetPath + filename + ".csv")
    d = d.sort_values(by="perf")

    # Features exclude the regression target and any label derived from it.
    # Previously the regressor was fit on a frame that still contained the
    # thresholded `label` column, i.e. it could read the answer directly.
    features = d.drop(["perf", "label"], axis=1, errors="ignore")
    y_reg = d["perf"]
    n = features.shape[1]

    # The configuration lookup does not depend on threshold or sample size.
    clf_params = _first_configuration(dfConfig, filename, "classification")
    reg_params = _first_configuration(dfConfig, filename, "regression")

    # Thresholds are the values found at 1/4, 2/4 and 3/4 of the sorted `perf` column.
    for threshold in [d["perf"].iloc[i * d.shape[0]//4] for i in range(1, 4)]:

        # Binary label: 1 when perf is strictly above the threshold, else 0.
        y_classif = (y_reg > threshold).astype(int)

        for sr in [n, 2*n, 3*n]:
            shuffle_split = StratifiedShuffleSplit(train_size=sr, n_splits=NSUBS,
                                                   random_state=RANDOM_STATE)
            # Materialise the splits once so both models are scored on the
            # very same train/test partitions.
            splits = list(shuffle_split.split(features, y_classif))

            # The size must be given when the figure is created; setting
            # rcParams after the first plot call had no effect on the output.
            fig, ax = plt.subplots(figsize=(15, 9))

            # --- classification tree: positive-class probability as score ---
            classifier = tree.DecisionTreeClassifier(**clf_params)
            tprs, aucs = [], []
            for train, test in splits:
                probas_ = classifier.fit(features.iloc[train], y_classif.iloc[train]).predict_proba(features.iloc[test])
                fold_tpr, fold_auc = _fold_roc(y_classif.iloc[test], probas_[:, 1])
                tprs.append(fold_tpr)
                aucs.append(fold_auc)
            _plot_mean_roc(ax, tprs, aucs, 'b', 'Classification')

            # --- regression tree: prediction compared to the threshold ---
            regressor = tree.DecisionTreeRegressor(**reg_params)
            tprs, aucs = [], []
            for train, test in splits:
                predicted = regressor.fit(features.iloc[train], y_reg.iloc[train]).predict(features.iloc[test])
                # The hard 0/1 decision is used as the score, as before.
                decisions = (predicted > threshold).astype(float)
                fold_tpr, fold_auc = _fold_roc(y_classif.iloc[test], decisions)
                tprs.append(fold_tpr)
                aucs.append(fold_auc)
            _plot_mean_roc(ax, tprs, aucs, 'r', 'Regression')

            ax.set_xlim([-0.05, 1.05])
            ax.set_ylim([-0.05, 1.05])
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title('ROC Threshold='+str(threshold)+"/Sample="+str(sr))
            ax.legend(loc="lower right")
            fig.savefig(resultsPath+'ROC_'+filename+'_'+str(threshold)+'_'+str(sr)+'.png')
            # Release the figure; many are created in this loop.
            plt.close(fig)



<Figure size 432x288 with 0 Axes>