In [None]:
import subprocess

#Params
datasetPath = "../datasets/"
dataPath = "./data/"
resultPath = "./results/"
#filenames = ["BerkeleyC","BerkeleyJ","Dune","HIPAcc","HSMGP","JavaGC"]
filenames = ["Apache"]
perf="perf"


#Params for sensitivity
NBINS = 40 # Number of vertical bins for threshold
NSUBS = 10 # Number of training sets to average on
srm = 1 # Minimum sampling size

def _countLines(path):
    """Return the number of newline characters in the file at `path`.

    This is the same quantity `wc -l` reports, computed without spawning a
    shell so it works on any platform and never interpolates the path into
    a command line. Raises the usual OSError if the file cannot be opened.
    """
    with open(path, "rb") as fh:
        # Read in 1 MiB chunks so large datasets are not loaded at once
        return sum(chunk.count(b"\n") for chunk in iter(lambda: fh.read(1 << 20), b""))

srM=[] # Maximum sampling size, one entry per dataset (line count of its CSV, header included)
srs = [] # Sampling step between two iterations, one entry per dataset
for k,filename in enumerate(filenames):
    srM.append(_countLines(datasetPath+filename+".csv"))
    # Clamp to 1: a file shorter than 100 lines would otherwise give a step
    # of 0, which range() rejects
    srs.append(max(1, srM[k]//100))

#Params for Decision Tree
#Default params
# NOTE(review): these dicts are passed verbatim to the scikit-learn tree
# constructors. Keys such as "min_impurity_split" and "presort", and the
# "mse" criterion, are only accepted by older scikit-learn releases --
# confirm against the installed version.
classParamsDefault = {
    "criterion":"gini",
    "splitter":"best",
    "max_features":None,
    "max_depth":None,
    "min_samples_split":2,
    "min_samples_leaf":1,
    "min_weight_fraction_leaf":0.,
    "max_leaf_nodes":None,
    "class_weight":None,
    "random_state":None,
    "min_impurity_split":1e-7,
    "presort":False
}
regParamsDefault = {
    "criterion":"mse",
    "splitter":"best",
    "max_depth":None,
    "min_samples_split":2,
    "min_samples_leaf":1,
    "min_weight_fraction_leaf":0.,
    "max_features":None,
    "random_state":None,
    "max_leaf_nodes":None,
    "min_impurity_split":1e-7,
    "presort":False
}

In [None]:
import sys, os
import pandas as pd
from sklearn import tree
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

from os import listdir
from os.path import isfile, join

def newVersionFilename(path, filename):
    """Return the next free versioned base name for `filename` inside `path`.

    Existing versions are regular files in `path` named
    "<filename>-<digits>[.<anything>]". The result is
    path + filename + "-" + the next version number, zero-padded to at least
    four digits, without any extension. When no version exists yet the
    number is 1.

    `path` is concatenated as-is, so it must end with a path separator.

    The highest version is found numerically, so numbers past 9999 keep
    increasing. Files that merely share the prefix but have no purely
    numeric version field (e.g. "<filename>-notes.txt" or
    "<filename>-other-0001.csv") are ignored.

    Raises whatever os.listdir raises when `path` cannot be listed.
    """
    prefix = filename+"-"
    versions = []
    for f in listdir(path):
        if not (isfile(join(path, f)) and f.startswith(prefix)):
            continue
        # The version field is whatever sits between the prefix and the first dot
        stem = f[len(prefix):].split(".")[0]
        if stem.isdigit():
            versions.append(int(stem))
    nextVersion = max(versions)+1 if versions else 1
    return path+filename+"-"+str(nextVersion).zfill(4)

In [None]:
def _quantileThresholds(d, perf, NBINS):
    # Pick the NBINS-1 perf values sitting at equal-count positions of the perf-sorted frame `d`.
    nRows = d.shape[0]
    return [d[perf].iloc[i * nRows//NBINS] for i in range(1, NBINS)]


def _saveSensitivityResults(res, dataPath, filename, params):
    # Write `res` to a new versioned CSV and append the params used to dataPath+"results-list.csv".
    #Getting a new filename
    newFilename = newVersionFilename(dataPath,filename)
    #Saving the data
    pd.DataFrame(res).to_csv(newFilename+".csv", index=False)

    #Saving the used params (copied so the caller's dict is not modified)
    paramsUsed = dict(params)
    paramsUsed['file']=filename
    paramsUsed['results']=newFilename+".csv"
    dfParamsUsed = pd.DataFrame([paramsUsed])

    listPath = dataPath+"results-list.csv"
    # If the list already exists, add the params used; otherwise this row starts it
    if os.path.exists(listPath):
        dfParamsUsed = pd.concat([pd.read_csv(listPath), dfParamsUsed])
    dfParamsUsed.to_csv(listPath, index=False)


def _runSensitivity(makeEstimator, datasetPath, dataPath, filename, perf, NBINS, NSUBS, srm, srM, srs, params, thresholds):
    # Shared sweep over training sizes and thresholds; `makeEstimator()` must return a fresh, unfitted tree.
    d = pd.read_csv(datasetPath+filename+".csv") # Open dataset
    d = d.sort_values(by=perf) # Sort it by perf to get threshold values

    # Honour caller-supplied thresholds; derive them from the data otherwise
    if thresholds is False or thresholds is None:
        thresholds = _quantileThresholds(d, perf, NBINS)
    else:
        thresholds = list(thresholds) # re-iterated for every training size

    # Features never change across iterations: everything except the
    # measured performance (and a pre-existing "label" column, if any)
    X = d.drop([perf, "label"], axis=1, errors="ignore")

    res = {"sr":[],"t":[],"TN":[],"TP":[],"FN":[],"FP":[]}

    #Put limit so that the size of training set is not too small and not too big
    for sr in range(srm+1,int(0.9*srM),srs):
        for t in thresholds:
            print("Computing for sr=%d and t=%.3f..." % (sr, t))
            #Label is 1 if perf above the threshold, 0 otherwise
            y = (d[perf] > t).astype(int)

            TN = TP = FN = FP = 0 # Counters for classification results

            #Prepare NSUBS stratified training sets and test sets
            shuffle_split = StratifiedShuffleSplit(train_size=sr, n_splits=NSUBS)

            #Prepare the decision tree
            c = makeEstimator()

            try:
                for train_index, test_index in shuffle_split.split(X, y):
                    #training the tree
                    c.fit(X.iloc[train_index], y.iloc[train_index])

                    #extracting prediction on the test set
                    pred = c.predict(X.iloc[test_index])
                    truth = y.iloc[test_index].values

                    #calculating true and false negatives and positives
                    TN += int(((truth == 0) & (pred == 0)).sum())
                    TP += int(((truth == 1) & (pred == 1)).sum())
                    FN += int(((truth == 1) & (pred == 0)).sum())
                    FP += int(((truth == 0) & (pred == 1)).sum())
            except Exception as e:
                # Report the failure and give up on the remaining thresholds
                # for this training size; the next training size is still tried.
                # Nothing is recorded for the failing (sr, t) pair.
                print(e)
                break

            # Average of the counters over the NSUBS splits
            res["sr"].append(sr)
            res["t"].append(t)
            res["TN"].append(TN/NSUBS)
            res["TP"].append(TP/NSUBS)
            res["FN"].append(FN/NSUBS)
            res["FP"].append(FP/NSUBS)

    _saveSensitivityResults(res, dataPath, filename, params)


def sensitivityClassification(datasetPath, dataPath, filename, perf, NBINS, NSUBS, srm, srM, srs, params, thresholds=False):
    """Sweep training-set sizes and perf thresholds with a DecisionTreeClassifier.

    For every training size `sr` in range(srm+1, int(0.9*srM), srs) and every
    threshold `t`, rows of datasetPath+filename+".csv" are labelled 1 when
    their `perf` column exceeds `t` and 0 otherwise. A tree built from
    `params` is trained on NSUBS stratified random splits and the
    TN/TP/FN/FP counts on the test parts are averaged over NSUBS.

    Parameters:
        datasetPath, filename: the dataset read is datasetPath+filename+".csv".
        dataPath: output directory prefix (concatenated as-is).
        perf: name of the performance column; it is excluded from the features.
        NBINS: number of equal-count bins used to derive default thresholds.
        NSUBS: number of stratified splits per (sr, t) pair.
        srm, srM, srs: lower bound, upper bound and step of the training sizes.
        params: keyword arguments for tree.DecisionTreeClassifier.
        thresholds: iterable of perf cut-offs; when False or None, the NBINS-1
            equal-count cut-offs of the sorted perf column are used.

    Side effects: prints progress, writes a new versioned result CSV in
    dataPath and appends the parameters used to dataPath+"results-list.csv".
    Returns None.
    """
    _runSensitivity(lambda: tree.DecisionTreeClassifier(**params),
                    datasetPath, dataPath, filename, perf, NBINS, NSUBS,
                    srm, srM, srs, params, thresholds)


def sensitivityRegression(datasetPath, dataPath, filename, perf, NBINS, NSUBS, srm, srM, srs, params, thresholds=False):
    """Sweep training-set sizes and perf thresholds with a DecisionTreeRegressor.

    Same procedure, parameters and side effects as the classification sweep:
    for every training size in range(srm+1, int(0.9*srM), srs) and every
    threshold, rows are labelled 1 when `perf` exceeds the threshold, a tree
    built from `params` (keyword arguments for tree.DecisionTreeRegressor) is
    trained on NSUBS stratified splits, and averaged TN/TP/FN/FP counts are
    written to a new versioned CSV in dataPath, with the parameters appended
    to dataPath+"results-list.csv". Returns None.

    NOTE(review): the regressor is fitted on the 0/1 label, not on the raw
    perf value, and its predictions are compared to 0 and 1 with exact
    equality. Any prediction that is not exactly 0 or 1 is counted in none
    of the four counters -- confirm that this is intended.
    """
    _runSensitivity(lambda: tree.DecisionTreeRegressor(**params),
                    datasetPath, dataPath, filename, perf, NBINS, NSUBS,
                    srm, srM, srs, params, thresholds)

In [None]:
#clone the default params so the defaults themselves stay untouched
params = dict(classParamsDefault)

#set some params
params['criterion']="entropy"
params['min_samples_leaf']=2

#set multiple params configurations
for max_leaf_nodes in [20,30,40,50]:
    # Each pass must use its own leaf cap; a constant here would just
    # repeat the same configuration once per list entry
    params['max_leaf_nodes']=max_leaf_nodes

    # One full sensitivity sweep per dataset, with that dataset's own
    # maximum sampling size and step
    for k,filename in enumerate(filenames):
        sensitivityClassification(datasetPath = datasetPath, dataPath = dataPath, filename = filename, perf = perf,
                    NBINS = NBINS, NSUBS = NSUBS, srm = srm, srM = srM[k], srs = srs[k], params=params)