In [50]:
import subprocess

# Params
datasetPath = "../datasets/"  # Prefix of the input datasets (<datasetPath><name>.csv)
dataPath = "./dataTest/"      # Prefix under which the sensitivity results are written
resultPath = "./results3/"    # Not referenced by the cells visible in this notebook
#filenames = ["BerkeleyC","BerkeleyJ","Dune","HIPAcc","HSMGP","JavaGC"]
filenames = ["Apache","BerkeleyC","BerkeleyJ","Dune","HIPAcc"]
perf="perf"                   # Name of the column holding the measured performance


# Params for sensitivity
NBINS = 10 # Number of vertical bins for threshold
NSUBS = 10 # Number of training sets to average on
srm = 1 # Minimum sampling size


def _countNewlines(csvPath, chunkSize=1 << 20):
    """Return the number of newline characters in ``csvPath``.

    This is the same quantity ``wc -l`` reports (so the CSV header line is
    included in the count), computed without spawning a shell so it also works
    on platforms without ``wc`` and cannot be affected by shell metacharacters
    in the path.
    """
    newlines = 0
    with open(csvPath, "rb") as handle:
        # Read in fixed-size chunks so large datasets are never fully loaded.
        for chunk in iter(lambda: handle.read(chunkSize), b""):
            newlines += chunk.count(b"\n")
    return newlines


srM={}
srs = {}
for filename in filenames:
    srM[filename] = _countNewlines(datasetPath+filename+".csv") # Maximum sampling size
    # Sampling step between two iterations; never below 1 because it is used
    # as the step of a range(), which rejects 0 (files with < 100 lines).
    srs[filename] = max(1, srM[filename]//100)


# Params for Decision Tree
# Default keyword arguments passed to tree.DecisionTreeClassifier
classParamsDefault = {
    "criterion":"gini",
    "splitter":"best",
    "max_features":None,
    "max_depth":None,
    "min_samples_split":2,
    "min_samples_leaf":1,
    "min_weight_fraction_leaf":0.,
    "max_leaf_nodes":None,
    "class_weight":None,
    "random_state":None,
    "min_impurity_decrease":1e-7,
    "presort":False
}
# Default keyword arguments passed to tree.DecisionTreeRegressor
regParamsDefault = {
    "criterion":"mse",
    "splitter":"best",
    "max_depth":None,
    "min_samples_split":2,
    "min_samples_leaf":1,
    "min_weight_fraction_leaf":0.,
    "max_features":None,
    "random_state":None,
    "max_leaf_nodes":None,
    "min_impurity_decrease":1e-7,
    "presort":False
}

In [9]:
import sys, os
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse

from os import listdir
from os.path import isfile, join

def newVersionFilename(path, filename):
    """Return ``path + filename + "-NNNN"`` for the next unused version number.

    Files in ``path`` whose name starts with ``filename + "-"`` are inspected;
    the version is read from the second-to-last dash-separated token of the
    name (``<filename>-<version>-<suffix>``). The result uses the highest
    version found plus one, zero-padded to 4 digits, or ``0001`` when no
    versioned file exists yet.

    Parameters:
        path: directory prefix; it is concatenated as-is, so it must end with
            a path separator.
        filename: base name of the result files (without version or suffix).

    Returns:
        The new base name (no suffix, no extension) as a string.
    """
    prefix = filename + "-"
    latest = 0
    for entry in listdir(path):
        if not entry.startswith(prefix) or not isfile(join(path, entry)):
            continue
        tokens = entry.split("-")
        try:
            version = int(tokens[-2].split(".")[0])
        except ValueError:
            # Unrelated file that merely shares the prefix: ignore it rather
            # than fail the whole run.
            continue
        # Compare numerically: a lexicographic sort would rank "9999" above
        # "10000" and hand out an already-used version.
        latest = max(latest, version)
    return path + prefix + str(latest + 1).zfill(4)


def sensitivityClassification(datasetPath, dataPath, filename, perf, NBINS, NSUBS, srm, srM, srs, classifParams, thresholds=False):
    """Measure how a binary decision-tree classifier reacts to training size and threshold.

    The dataset ``datasetPath + filename + ".csv"`` is sorted by the ``perf``
    column and ``NBINS - 1`` thresholds are taken at the positions
    ``i * n_rows // NBINS``. For every training size ``sr`` in
    ``range(srm + 1, int(0.9 * srM), srs)`` and every threshold ``t``, rows are
    labelled 1 when ``perf > t`` (0 otherwise), a ``DecisionTreeClassifier`` is
    fitted on ``NSUBS`` stratified random splits and the confusion-matrix
    counts on the test parts are summed, then divided by ``NSUBS``.

    Parameters:
        datasetPath: prefix of the input CSV (concatenated as-is).
        dataPath: output directory prefix (concatenated as-is); created if missing.
        filename: dataset name without extension.
        perf: name of the performance column.
        NBINS: number of bins used to derive the thresholds.
        NSUBS: number of random splits per (sr, t) pair.
        srm, srM, srs: bounds and step of the training-size range (see above).
        classifParams: keyword arguments for ``tree.DecisionTreeClassifier``.
        thresholds: ignored; the thresholds are always recomputed from the data.

    Side effects:
        Writes ``<new version>-classification.csv`` in ``dataPath`` and appends
        the parameters used to ``dataPath + "results-list.csv"``.
    """
    # Create the output folder if needed. exist_ok covers the creation race the
    # previous errno-based guard aimed at (errno itself was never imported).
    os.makedirs(dataPath, exist_ok=True)

    d = pd.read_csv(datasetPath+filename+".csv") # Open dataset
    d = d.sort_values(by=perf) # Sort it by perf to get threshold values
    thresholds = [d[perf].iloc[i * d.shape[0]//NBINS] for i in range(1, NBINS)]

    # Feature matrix: everything except the target column and the derived
    # label. The column named by `perf` is dropped (not the literal "perf"),
    # otherwise the target leaks into the features when the names differ.
    features = d.drop([perf, "label"], axis=1, errors="ignore")

    resClassification = {"sr":[],"t":[],"TN":[],"TP":[],"FN":[],"FP":[]}
    for sr in range(srm+1,int(0.9*srM),srs):
        for t in thresholds:
            shuffle_split = StratifiedShuffleSplit(train_size=sr, n_splits=NSUBS)

            d["label"] = 0
            d.loc[d[perf] > t, "label"] = 1

            TN = TP = FN = FP = 0 # Counters for classification results

            c = tree.DecisionTreeClassifier(**classifParams)

            try:
                for train_index, test_index in shuffle_split.split(d, d["label"]):
                    c.fit(features.iloc[train_index], d["label"].iloc[train_index])
                    pred = c.predict(features.iloc[test_index])
                    truth = d["label"].iloc[test_index].values

                    TN += int(((truth == 0) & (pred == 0)).sum())
                    TP += int(((truth == 1) & (pred == 1)).sum())
                    FN += int(((truth == 1) & (pred == 0)).sum())
                    FP += int(((truth == 0) & (pred == 1)).sum())
            except Exception as e:
                # Report the failure and give up on the remaining thresholds
                # for this training size; the next size is still attempted.
                print(e)
                break

            resClassification["sr"].append(sr)
            resClassification["t"].append(t)
            resClassification["TN"].append(TN/NSUBS)
            resClassification["TP"].append(TP/NSUBS)
            resClassification["FN"].append(FN/NSUBS)
            resClassification["FP"].append(FP/NSUBS)

    newFilename = newVersionFilename(dataPath,filename)
    resultsFile = newFilename+"-classification.csv"
    pd.DataFrame(resClassification).to_csv(resultsFile, index=False)

    # Record which parameters produced which results file
    classifParamsUsed = dict(classifParams)
    classifParamsUsed['file']=filename
    classifParamsUsed['results']=resultsFile
    dfParamsUsed = pd.DataFrame([classifParamsUsed])

    # Append to the params list if it already exists, create it otherwise
    listFile = dataPath+"results-list.csv"
    if os.path.exists(listFile):
        dfParamsUsed = pd.concat([pd.read_csv(listFile), dfParamsUsed])
    dfParamsUsed.to_csv(listFile, index=False)

In [10]:
def sensitivityRegression(datasetPath, dataPath, filename, perf, NBINS, NSUBS, srm, srM, srs, regParams, thresholds=False):
    """Measure how a decision-tree regressor reacts to training size and threshold.

    The dataset ``datasetPath + filename + ".csv"`` is sorted by the ``perf``
    column and ``NBINS - 1`` thresholds are taken at the positions
    ``i * n_rows // NBINS``. For every training size ``sr`` in
    ``range(srm + 1, int(0.9 * srM), srs)`` and every threshold ``t``, a
    ``DecisionTreeRegressor`` is fitted on ``NSUBS`` random splits stratified
    on the binary label ``perf > t``. The predicted value is turned into a
    predicted label with the same threshold; confusion-matrix counts and the
    mean squared error on the test parts are summed, then divided by ``NSUBS``.

    Parameters:
        datasetPath: prefix of the input CSV (concatenated as-is).
        dataPath: output directory prefix (concatenated as-is); created if missing.
        filename: dataset name without extension.
        perf: name of the performance column (regression target).
        NBINS: number of bins used to derive the thresholds.
        NSUBS: number of random splits per (sr, t) pair.
        srm, srM, srs: bounds and step of the training-size range (see above).
        regParams: keyword arguments for ``tree.DecisionTreeRegressor``.
        thresholds: ignored; the thresholds are always recomputed from the data.

    Side effects:
        Writes ``<new version>-regression.csv`` in ``dataPath`` and appends the
        parameters used to ``dataPath + "results-list.csv"``.
    """
    # Create the output folder if needed. exist_ok covers the creation race the
    # previous errno-based guard aimed at (errno itself was never imported).
    os.makedirs(dataPath, exist_ok=True)

    d = pd.read_csv(datasetPath+filename+".csv") # Open dataset
    d = d.sort_values(by=perf) # Sort it by perf to get threshold values
    thresholds = [d[perf].iloc[i * d.shape[0]//NBINS] for i in range(1, NBINS)]

    # The features do not depend on sr or t: build them once.
    features = d.drop([perf, "label"], axis=1, errors="ignore")

    resRegression = {"sr":[],"t":[],"TN":[],"TP":[],"FN":[],"FP":[],"MSE":[]}

    for sr in range(srm+1,int(0.9*srM),srs):
        for t in thresholds:
            shuffle_split = StratifiedShuffleSplit(train_size=sr, n_splits=NSUBS)

            d["label"] = 0
            d.loc[d[perf] > t, "label"] = 1

            TN = TP = FN = FP = MSE = 0 # Counters for regression results

            c = tree.DecisionTreeRegressor(**regParams)

            for train_index, test_index in shuffle_split.split(d, d["label"]):
                c.fit(features.iloc[train_index], d[perf].iloc[train_index])
                pred = c.predict(features.iloc[test_index])

                truePerf = d[perf].iloc[test_index].values
                trueLabel = d["label"].iloc[test_index].values
                predLabel = (pred > t).astype(int)

                # Accumulate: the sum is divided by NSUBS below, so assigning
                # here would only report the last split's error / NSUBS.
                MSE += mse(truePerf, pred)

                TN += int(((trueLabel == 0) & (predLabel == 0)).sum())
                TP += int(((trueLabel == 1) & (predLabel == 1)).sum())
                FN += int(((trueLabel == 1) & (predLabel == 0)).sum())
                FP += int(((trueLabel == 0) & (predLabel == 1)).sum())

            resRegression["sr"].append(sr)
            resRegression["t"].append(t)
            resRegression["MSE"].append(MSE/NSUBS)
            resRegression["TN"].append(TN/NSUBS)
            resRegression["TP"].append(TP/NSUBS)
            resRegression["FN"].append(FN/NSUBS)
            resRegression["FP"].append(FP/NSUBS)

    newFilename = newVersionFilename(dataPath,filename)
    resultsFile = newFilename+"-regression.csv"
    pd.DataFrame(resRegression).to_csv(resultsFile, index=False)

    # Record which parameters produced which results file
    regParamsUsed = dict(regParams)
    regParamsUsed['file']=filename
    regParamsUsed['results']=resultsFile
    dfParamsUsed = pd.DataFrame([regParamsUsed])

    # Append to the params list if it already exists, create it otherwise
    listFile = dataPath+"results-list.csv"
    if os.path.exists(listFile):
        dfParamsUsed = pd.concat([pd.read_csv(listFile), dfParamsUsed])
    dfParamsUsed.to_csv(listFile, index=False)

In [11]:

def sensitivityClassificationStrat6(datasetPath, dataPath, filename, perf, NBINS, NSUBS, srm, srM, srs, classifParams, thresholds=False):
    """Sensitivity study using a 6-class decision tree scored as a binary classifier.

    Same protocol as the binary classification study (thresholds at the
    ``i * n_rows // NBINS`` positions of the sorted ``perf`` column, training
    sizes in ``range(srm + 1, int(0.9 * srM), srs)``, ``NSUBS`` stratified
    splits per pair), except that the tree is trained on 6 classes obtained by
    cutting ``perf`` at ``t * 0.5, 0.75, 1, 1.25, 1.5``. A prediction counts as
    positive when the predicted class lies above the ``t * 1`` cut (class >= 3)
    and is compared with the binary label ``perf > t``. Training sizes below 6
    are skipped.

    Parameters:
        datasetPath: prefix of the input CSV (concatenated as-is).
        dataPath: output directory prefix (concatenated as-is); created if missing.
        filename: dataset name without extension.
        perf: name of the performance column.
        NBINS: number of bins used to derive the thresholds.
        NSUBS: number of random splits per (sr, t) pair.
        srm, srM, srs: bounds and step of the training-size range (see above).
        classifParams: keyword arguments for ``tree.DecisionTreeClassifier``.
        thresholds: ignored; the thresholds are always recomputed from the data.

    Side effects:
        Writes ``<new version>-classification_strat_6.csv`` in ``dataPath`` and
        appends the parameters used to ``dataPath + "results-list.csv"``.
    """
    # Create the output folder if needed. exist_ok covers the creation race the
    # previous errno-based guard aimed at (errno itself was never imported).
    os.makedirs(dataPath, exist_ok=True)

    d = pd.read_csv(datasetPath+filename+".csv") # Open dataset
    d = d.sort_values(by=perf) # Sort it by perf to get threshold values
    thresholds = [d[perf].iloc[i * d.shape[0]//NBINS] for i in range(1, NBINS)]

    # Feature matrix: everything except the target and the derived labels. The
    # column named by `perf` is dropped (not the literal "perf"), otherwise the
    # target leaks into the features when the names differ.
    features = d.drop([perf, "label", "label_strat"], axis=1, errors="ignore")

    # Class k (k >= 1) holds rows with perf > t * multipliers[k-1]; class 0 the rest.
    multipliers = [0.5, 0.75, 1, 1.25, 1.5]
    # First class lying strictly above the binary threshold t.
    firstPositiveClass = multipliers.index(1) + 1

    resClassificationStrat = {"sr":[],"t":[],"TN":[],"TP":[],"FN":[],"FP":[]}

    for sr in range(srm+1,int(0.9*srM),srs):
        # Training sets smaller than the number of classes are not evaluated
        if sr < 6:
            continue
        for t in thresholds:
            shuffle_split = StratifiedShuffleSplit(train_size=sr, n_splits=NSUBS)

            d["label"] = 0
            d.loc[d[perf] > t, "label"] = 1

            # For 6 classes
            d["label_strat"] = 0
            for classIndex, factor in enumerate(multipliers, start=1):
                d.loc[d[perf] > t * factor, "label_strat"] = classIndex

            TN = TP = FN = FP = 0 # Counters for classification results

            c = tree.DecisionTreeClassifier(**classifParams)

            try:
                for train_index, test_index in shuffle_split.split(d, d["label_strat"]):
                    c.fit(features.iloc[train_index], d["label_strat"].iloc[train_index])
                    pred = c.predict(features.iloc[test_index])

                    truth = d["label"].iloc[test_index].values
                    predPositive = pred >= firstPositiveClass

                    TN += int(((truth == 0) & ~predPositive).sum())
                    TP += int(((truth == 1) & predPositive).sum())
                    FN += int(((truth == 1) & ~predPositive).sum())
                    FP += int(((truth == 0) & predPositive).sum())
            except Exception as e:
                # Report the failure and give up on the remaining thresholds
                # for this training size; the next size is still attempted.
                print(e)
                break

            resClassificationStrat["sr"].append(sr)
            resClassificationStrat["t"].append(t)
            resClassificationStrat["TN"].append(TN/NSUBS)
            resClassificationStrat["TP"].append(TP/NSUBS)
            resClassificationStrat["FN"].append(FN/NSUBS)
            resClassificationStrat["FP"].append(FP/NSUBS)

    newFilename = newVersionFilename(dataPath,filename)
    resultsFile = newFilename+"-classification_strat_6.csv"
    pd.DataFrame(resClassificationStrat).to_csv(resultsFile, index=False)

    # Record which parameters produced which results file
    classifStratParamsUsed = dict(classifParams)
    classifStratParamsUsed['file']=filename
    classifStratParamsUsed['results']=resultsFile
    dfParamsUsed = pd.DataFrame([classifStratParamsUsed])

    # Append to the params list if it already exists, create it otherwise
    listFile = dataPath+"results-list.csv"
    if os.path.exists(listFile):
        dfParamsUsed = pd.concat([pd.read_csv(listFile), dfParamsUsed])
    dfParamsUsed.to_csv(listFile, index=False)

In [12]:

def sensitivityClassificationStrat8(datasetPath, dataPath, filename, perf, NBINS, NSUBS, srm, srM, srs, classifParams, thresholds=False):
    """Sensitivity study using an 8-class decision tree scored as a binary classifier.

    Same protocol as the binary classification study (thresholds at the
    ``i * n_rows // NBINS`` positions of the sorted ``perf`` column, training
    sizes in ``range(srm + 1, int(0.9 * srM), srs)``, ``NSUBS`` stratified
    splits per pair), except that the tree is trained on 8 classes obtained by
    cutting ``perf`` at ``t * 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75``. A
    prediction counts as positive when the predicted class lies above the
    ``t * 1`` cut (class >= 4) and is compared with the binary label
    ``perf > t``. Training sizes below 8 are skipped.

    Parameters:
        datasetPath: prefix of the input CSV (concatenated as-is).
        dataPath: output directory prefix (concatenated as-is); created if missing.
        filename: dataset name without extension.
        perf: name of the performance column.
        NBINS: number of bins used to derive the thresholds.
        NSUBS: number of random splits per (sr, t) pair.
        srm, srM, srs: bounds and step of the training-size range (see above).
        classifParams: keyword arguments for ``tree.DecisionTreeClassifier``.
        thresholds: ignored; the thresholds are always recomputed from the data.

    Side effects:
        Writes ``<new version>-classification_strat_8.csv`` in ``dataPath`` and
        appends the parameters used to ``dataPath + "results-list.csv"``.
    """
    # Create the output folder if needed. exist_ok covers the creation race the
    # previous errno-based guard aimed at (errno itself was never imported).
    os.makedirs(dataPath, exist_ok=True)

    d = pd.read_csv(datasetPath+filename+".csv") # Open dataset
    d = d.sort_values(by=perf) # Sort it by perf to get threshold values
    thresholds = [d[perf].iloc[i * d.shape[0]//NBINS] for i in range(1, NBINS)]

    # Feature matrix: everything except the target and the derived labels. The
    # column named by `perf` is dropped (not the literal "perf"), otherwise the
    # target leaks into the features when the names differ.
    features = d.drop([perf, "label", "label_strat"], axis=1, errors="ignore")

    # Class k (k >= 1) holds rows with perf > t * multipliers[k-1]; class 0 the rest.
    multipliers = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75]
    # First class lying strictly above the binary threshold t. With 8 classes
    # this is class 4: classes 0..3 all satisfy perf <= t, so class 3 must be
    # scored as negative (the former "pred > 2" cut counted it as positive).
    firstPositiveClass = multipliers.index(1) + 1

    resClassificationStrat = {"sr":[],"t":[],"TN":[],"TP":[],"FN":[],"FP":[]}

    for sr in range(srm+1,int(0.9*srM),srs):
        # Training sets smaller than the number of classes are not evaluated
        if sr < 8:
            continue
        for t in thresholds:
            shuffle_split = StratifiedShuffleSplit(train_size=sr, n_splits=NSUBS)

            d["label"] = 0
            d.loc[d[perf] > t, "label"] = 1

            # For 8 classes
            d["label_strat"] = 0
            for classIndex, factor in enumerate(multipliers, start=1):
                d.loc[d[perf] > t * factor, "label_strat"] = classIndex

            TN = TP = FN = FP = 0 # Counters for classification results

            c = tree.DecisionTreeClassifier(**classifParams)

            try:
                for train_index, test_index in shuffle_split.split(d, d["label_strat"]):
                    c.fit(features.iloc[train_index], d["label_strat"].iloc[train_index])
                    pred = c.predict(features.iloc[test_index])

                    truth = d["label"].iloc[test_index].values
                    predPositive = pred >= firstPositiveClass

                    TN += int(((truth == 0) & ~predPositive).sum())
                    TP += int(((truth == 1) & predPositive).sum())
                    FN += int(((truth == 1) & ~predPositive).sum())
                    FP += int(((truth == 0) & predPositive).sum())
            except Exception as e:
                # Report the failure and give up on the remaining thresholds
                # for this training size; the next size is still attempted.
                print(e)
                break

            resClassificationStrat["sr"].append(sr)
            resClassificationStrat["t"].append(t)
            resClassificationStrat["TN"].append(TN/NSUBS)
            resClassificationStrat["TP"].append(TP/NSUBS)
            resClassificationStrat["FN"].append(FN/NSUBS)
            resClassificationStrat["FP"].append(FP/NSUBS)

    newFilename = newVersionFilename(dataPath,filename)
    resultsFile = newFilename+"-classification_strat_8.csv"
    pd.DataFrame(resClassificationStrat).to_csv(resultsFile, index=False)

    # Record which parameters produced which results file
    classifStratParamsUsed = dict(classifParams)
    classifStratParamsUsed['file']=filename
    classifStratParamsUsed['results']=resultsFile
    dfParamsUsed = pd.DataFrame([classifStratParamsUsed])

    # Append to the params list if it already exists, create it otherwise
    listFile = dataPath+"results-list.csv"
    if os.path.exists(listFile):
        dfParamsUsed = pd.concat([pd.read_csv(listFile), dfParamsUsed])
    dfParamsUsed.to_csv(listFile, index=False)

In [13]:
def sensitivity(datasetPath, dataPath, filename, perf, NBINS,NSUBS, srm, srM, srs, classifParams, regParams):
    """Run the classification study, then the regression study, on one dataset.

    Every argument except the two parameter dicts is forwarded unchanged to
    both ``sensitivityClassification`` (with ``classifParams``) and
    ``sensitivityRegression`` (with ``regParams``).
    """
    # Arguments common to both studies
    common = {
        "datasetPath": datasetPath,
        "dataPath": dataPath,
        "filename": filename,
        "perf": perf,
        "NBINS": NBINS,
        "NSUBS": NSUBS,
        "srm": srm,
        "srM": srM,
        "srs": srs,
    }
    sensitivityClassification(classifParams=classifParams, **common)
    sensitivityRegression(regParams=regParams, **common)

Use grid search to create different configurations

In [14]:
# Start from the default parameter sets
classifParams = dict(classParamsDefault)
regParams = dict(regParamsDefault)

# Reduced grid over (max_depth, max_leaf_nodes, min_samples_leaf); the wider
# grids that were tried are kept as comments next to each loop.
for k,filename in enumerate(filenames):
    for max_depth in [None,10,20]:  # full grid: [None,10,20,30,40,50,60,70,80,90]
        for max_leaf_nodes in [10,20]:  # full grid: [10,20,30,40,50,60,70,80,90]
            for min_samples_leaf in [1,2]:  # full grid: [1,2,4,6,8,10]

                # Defaults overridden by the current grid point; the
                # classifier additionally uses the entropy criterion.
                classifParams = dict(
                    classParamsDefault,
                    criterion="entropy",
                    max_leaf_nodes=max_leaf_nodes,
                    min_samples_leaf=min_samples_leaf,
                    max_depth=max_depth,
                )
                regParams = dict(
                    regParamsDefault,
                    min_samples_leaf=min_samples_leaf,
                    max_leaf_nodes=max_leaf_nodes,
                    max_depth=max_depth,
                )

                sensitivity(
                    datasetPath, dataPath, filename, perf, NBINS, NSUBS,
                    srm, srM[filename], srs[filename], classifParams, regParams,
                )



Use the previously created results (here in ./dataTest) to select the 3 best configurations, then re-run them to get more precise information, at the cost of a longer computation.

In [42]:
# Load the list of runs: one row per results file with the parameters used
resultsList = pd.read_csv("./dataTest/results-list.csv")

# The model type is the last dash-separated token of the results path, without
# its extension (".../<name>-<version>-<type>.csv"). Taking the last token
# instead of a fixed position keeps this correct when the directory or dataset
# name itself contains a dash.
serie = pd.Series([
    resultsFile.split("-")[-1].split(".")[0]
    for resultsFile in resultsList['results']
])
resultsList['type'] = serie

# Accuracy of a run: (TP+TN)/(TP+TN+FN+FP) computed on the per-training-size
# means, then averaged over all training sizes.
accuracy = []
for resultsFile in resultsList['results']:
    perSize = pd.read_csv(resultsFile).groupby(['sr']).mean().drop(['t'],axis=1)
    correct = perSize['TP'] + perSize['TN']
    total = correct + perSize['FN'] + perSize['FP']
    accuracy.append((correct / total).mean())
resultsList['accuracy'] = pd.Series(accuracy)

numberOfBestConfig = 3
metric = "accuracy"

# Keep, for every (dataset, model type) pair, the parameter sets of the
# `numberOfBestConfig` runs with the highest `metric`.
configSet = []
# Missing values (NaN after the CSV round trip) are turned back into None
resultsList2 = resultsList.where(pd.notnull(resultsList), None)
nonParamColumns = ["results","type","accuracy","file"]
for f in resultsList2["file"].unique():
    for t in resultsList2["type"].unique():
        selection = resultsList2.loc[(resultsList2["file"]==f) & (resultsList2["type"]==t)]
        best = selection.sort_values(metric, ascending=False)[:numberOfBestConfig]
        for index, r in best.iterrows():
            configuration = {key: r[key] for key in r.keys() if key not in nonParamColumns}
            configSet.append({"file":r["file"],"type":r["type"],"configuration":configuration})

dfConfig = pd.DataFrame(configSet)

In [None]:
# Re-run every selected configuration, writing the results to a new folder
dataPath = "./dataTest2/"

# Result type -> (function to run, name of its parameter-dict argument)
runners = {
    "classification": (sensitivityClassification, "classifParams"),
    "classification_strat_6": (sensitivityClassificationStrat6, "classifParams"),
    "classification_strat_8": (sensitivityClassificationStrat8, "classifParams"),
    "regression": (sensitivityRegression, "regParams"),
}

for index, row in dfConfig.iterrows():
    if row['type'] not in runners:
        continue
    runner, paramsArgument = runners[row['type']]

    # Work on a copy so the dicts stored in dfConfig are left untouched and
    # this cell can be executed again without a KeyError.
    configuration = dict(row['configuration'])
    if row['type']=="regression":
        # The results list mixes classifier and regressor rows, so regression
        # configurations carry a class_weight entry; it is removed before the
        # parameters are handed to the regressor.
        configuration.pop('class_weight', None)

    runner(datasetPath = datasetPath, dataPath = dataPath, filename = row['file'], perf = perf, NBINS = NBINS,
           NSUBS = NSUBS, srm = srm, srM = srM[row['file']], srs = srs[row['file']], **{paramsArgument: configuration})

configuration    {'class_weight': None, 'criterion': 'entropy',...
file                                                        Apache
type                                                classification
Name: 0, dtype: object


