In [18]:
#Params for Decision Tree
#Default params
classParamsDefault = {
    "criterion":"gini",
    "splitter":"best",
    "max_features":None,
    "max_depth":None,
    "min_samples_split":2,
    "min_samples_leaf":1,
    "min_weight_fraction_leaf":0.,
    "max_leaf_nodes":None,
    "class_weight":None,
    "random_state":None,
    "min_impurity_decrease":1e-7,
    "presort":False
}
regParamsDefault = {
    "criterion":"mse",
    "splitter":"best",
    "max_depth":None,
    "min_samples_split":2,
    "min_samples_leaf":1,
    "min_weight_fraction_leaf":0.,
    "max_features":None,
    "random_state":None,
    "max_leaf_nodes":None,
    "min_impurity_decrease":1e-7,
    "presort":False
}

In [33]:
import sys, os
import pandas as pd
from sklearn import tree
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse

from os import listdir
from os.path import isfile, join

def newVersionFilename(path, filename):
    # Get all the files in the {path} directory starting with {filename}
    files = [f for f in listdir(path) if isfile(join(path, f)) and f.startswith(filename+"-")]
    files.sort(reverse=True)
    # If no file yet
    if len(files)==0:
        return path+filename+"-"+str(1).zfill(4)
    # Split the last one
    splitted = files[0].split("-")
    # Get the last version
    num = int(splitted[len(splitted)-2].split(".")[0])
    # Return the full name with new version
    return path+filename+"-"+str(num+1).zfill(4)


def sensitivity(dataPath, classifParams, regParams):

    #If data fodler does not exists
    if not os.path.exists(dataPath):
        try:
            os.makedirs(dataPath)
        except OSError as exc: # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    
    d = pd.read_csv("dataset.csv") # Open dataset

    resClassification = {"sr":[],"TN":[],"TP":[],"FN":[],"FP":[]}
    resRegression = {"sr":[],"TN":[],"TP":[],"FN":[],"FP":[],"MSE":[]}
    
    perf="value that led to label"
    label="label (0 if usable and 1 if not)"
    
    srm = 100
    srM = d.shape[0]
    srs = 100
    NSUBS = 10
    
    t=0.5
    
    #for sr in range(1,99):
    for sr in range(srm+1,int(0.9*srM),srs):

        shuffle_split = StratifiedShuffleSplit(train_size=sr, n_splits=NSUBS)

        TN = TP = FN = FP = MSE = 0 # Counters for regression results

        c = tree.DecisionTreeRegressor(**regParams)

        for train_index, test_index in shuffle_split.split(d,d[label]):
            c.fit(d.drop([perf,label],axis=1).iloc[train_index], d[perf].iloc[train_index])
            pred = c.predict(d.drop([perf,label],axis=1).iloc[test_index])
            #print(list(pred))
            #print(list(clean.label.iloc[test_index]))
            #print()
            dfTest = pd.DataFrame()
            dfTest["perf"] = d[perf].iloc[test_index]
            dfTest["pred"] = pred
            dfTest["label"] = d[label].iloc[test_index]
            dfTest["label_pred"] = 0
            dfTest.loc[dfTest["pred"] >= t, "label_pred"] = 1

            MSE = mse(dfTest["perf"],dfTest["pred"])

            TN += dfTest[(dfTest.label == 0) & (dfTest.label_pred == 0)].shape[0]
            TP += dfTest[(dfTest.label == 1) & (dfTest.label_pred == 1)].shape[0]
            FN += dfTest[(dfTest.label == 1) & (dfTest.label_pred == 0)].shape[0]
            FP += dfTest[(dfTest.label == 0) & (dfTest.label_pred == 1)].shape[0]

        resRegression["sr"].append(sr)
        resRegression["MSE"].append(MSE/NSUBS)
        resRegression["TN"].append(TN/NSUBS)
        resRegression["TP"].append(TP/NSUBS)
        resRegression["FN"].append(FN/NSUBS)
        resRegression["FP"].append(FP/NSUBS)


        TN = TP = FN = FP = 0 # Counters for classification results

        clean = d.drop([perf],axis=1,errors="ignore")

        c = tree.DecisionTreeClassifier(**classifParams)

        try:

            for train_index, test_index in shuffle_split.split(clean,clean[label]):
                c.fit(clean.drop([label],axis=1).iloc[train_index], clean[label].iloc[train_index])
                pred = c.predict(clean.drop([label],axis=1).iloc[test_index])
                #print(list(pred))
                #print(list(clean.label.iloc[test_index]))
                #print()
                dfTest = pd.DataFrame()
                dfTest["label"] = clean[label].iloc[test_index]
                dfTest["pred"] = pred

                TN += dfTest[(dfTest.label == 0) & (dfTest.pred == 0)].shape[0]
                TP += dfTest[(dfTest.label == 1) & (dfTest.pred == 1)].shape[0]
                FN += dfTest[(dfTest.label == 1) & (dfTest.pred == 0)].shape[0]
                FP += dfTest[(dfTest.label == 0) & (dfTest.pred == 1)].shape[0]


        except Exception as e:
            print(e)
            break
            break

        resClassification["sr"].append(sr)
        resClassification["TN"].append(TN/NSUBS)
        resClassification["TP"].append(TP/NSUBS)
        resClassification["FN"].append(FN/NSUBS)
        resClassification["FP"].append(FP/NSUBS)

        
    newFilename = newVersionFilename(dataPath,"video")
    pd.DataFrame(resClassification).to_csv(newFilename+"-classification.csv", index=False)
    pd.DataFrame(resRegression).to_csv(newFilename+"-regression.csv", index=False)
    
    classifParamsUsed = dict(classifParams)
    classifParamsUsed['file']="dataset"
    classifParamsUsed['results']=newFilename+"-classification.csv"
    
    regParamsUsed = dict(regParams)
    regParamsUsed['file']="dataset"
    regParamsUsed['results']=newFilename+"-regression.csv"

    dfParamsUsed = pd.DataFrame.from_dict([classifParamsUsed,regParamsUsed])
    
    # If params list does not exists, create it
    if not os.path.exists(dataPath+"results-list.csv"):
        dfParamsUsed.to_csv(dataPath+"results-list.csv", index=False)
    # If the list already exists, add the params used
    else:
        paramList = pd.read_csv(dataPath+"results-list.csv")
        frames = [paramList, dfParamsUsed]
        paramList = pd.concat(frames)
        pd.DataFrame(paramList).to_csv(dataPath+"results-list.csv", index=False)     

In [27]:
sensitivity("./data/",classParamsDefault,regParamsDefault)



In [34]:
classifParams = dict(classParamsDefault)
regParams = dict(regParamsDefault)

for max_depth in [None,10,20,30,40,50,60,70,80,90]:
#for max_depth in [None,20,40,60]:
    #for max_leaf_nodes in [None,20,40,60]:
    #for max_leaf_nodes in [10,30,50,70]:
    for max_leaf_nodes in [10,20,30,40,50,60,70,80,90]:
        for min_samples_leaf in [1,2,4,6,8,10]:

            classifParams = dict(classParamsDefault)
            regParams = dict(regParamsDefault)

            classifParams['criterion']="entropy"
            classifParams['max_leaf_nodes'] = max_leaf_nodes
            classifParams['min_samples_leaf']=min_samples_leaf
            classifParams['max_depth']=max_depth

            regParams['min_samples_leaf']=min_samples_leaf
            regParams['max_leaf_nodes'] = max_leaf_nodes
            regParams['max_depth'] = max_depth

            sensitivity("./data/",classifParams,regParams)
            '''sensitivity(datasetPath = datasetPath, dataPath = dataPath, filename = filename, perf = perf, NBINS = NBINS,
                NSUBS = NSUBS, srm = srm, srM = srM[k], srs = srs[k], classifParams=classifParams, regParams=regParams)'''



In [40]:
import sys, os
import pandas as pd
from sklearn import tree
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse

from os import listdir
from os.path import isfile, join

def newVersionFilename(path, filename):
    # Get all the files in the {path} directory starting with {filename}
    files = [f for f in listdir(path) if isfile(join(path, f)) and f.startswith(filename+"-")]
    files.sort(reverse=True)
    # If no file yet
    if len(files)==0:
        return path+filename+"-"+str(1).zfill(4)
    # Split the last one
    splitted = files[0].split("-")
    # Get the last version
    num = int(splitted[len(splitted)-2].split(".")[0])
    # Return the full name with new version
    return path+filename+"-"+str(num+1).zfill(4)


def sensitivity2(dataPath, classifParams, regParams):

    #If data fodler does not exists
    if not os.path.exists(dataPath):
        try:
            os.makedirs(dataPath)
        except OSError as exc: # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    
    d = pd.read_csv("dataset.csv") # Open dataset

    resClassification = {"sr":[],"TN":[],"TP":[],"FN":[],"FP":[]}
    resRegression = {"sr":[],"TN":[],"TP":[],"FN":[],"FP":[],"MSE":[]}
    
    perf="value that led to label"
    label="label (0 if usable and 1 if not)"
    
    srm = 100
    srM = d.shape[0]
    srs = 100
    NSUBS = 10
    
    n = d.shape[1]-2
    
    t=0.5
    
    #for sr in range(1,99):
    #for sr in range(srm+1,int(0.9*srM),srs):
    for sr in [n,n*2,n*3,n*4,n*5]:

        shuffle_split = StratifiedShuffleSplit(train_size=sr, n_splits=NSUBS)

        TN = TP = FN = FP = MSE = 0 # Counters for regression results

        c = tree.DecisionTreeRegressor(**regParams)

        for train_index, test_index in shuffle_split.split(d,d[label]):
            c.fit(d.drop([perf,label],axis=1).iloc[train_index], d[perf].iloc[train_index])
            pred = c.predict(d.drop([perf,label],axis=1).iloc[test_index])
            
            #print(list(pred))
            #print(list(clean.label.iloc[test_index]))
            #print()
            dfTest = pd.DataFrame()
            dfTest["perf"] = d[perf].iloc[test_index]
            dfTest["pred"] = pred
            dfTest["label"] = d[label].iloc[test_index]
            dfTest["label_pred"] = 0
            dfTest.loc[dfTest["pred"] >= t, "label_pred"] = 1

            MSE = mse(dfTest["perf"],dfTest["pred"])

            TN += dfTest[(dfTest.label == 0) & (dfTest.label_pred == 0)].shape[0]
            TP += dfTest[(dfTest.label == 1) & (dfTest.label_pred == 1)].shape[0]
            FN += dfTest[(dfTest.label == 1) & (dfTest.label_pred == 0)].shape[0]
            FP += dfTest[(dfTest.label == 0) & (dfTest.label_pred == 1)].shape[0]
            print(pred)
            break

        resRegression["sr"].append(sr)
        resRegression["MSE"].append(MSE/NSUBS)
        resRegression["TN"].append(TN/NSUBS)
        resRegression["TP"].append(TP/NSUBS)
        resRegression["FN"].append(FN/NSUBS)
        resRegression["FP"].append(FP/NSUBS)


        TN = TP = FN = FP = 0 # Counters for classification results

        clean = d.drop([perf],axis=1,errors="ignore")

        c = tree.DecisionTreeClassifier(**classifParams)

        try:

            for train_index, test_index in shuffle_split.split(clean,clean[label]):
                c.fit(clean.drop([label],axis=1).iloc[train_index], clean[label].iloc[train_index])
                pred = c.predict(clean.drop([label],axis=1).iloc[test_index])
                #print(list(pred))
                #print(list(clean.label.iloc[test_index]))
                #print()
                dfTest = pd.DataFrame()
                dfTest["label"] = clean[label].iloc[test_index]
                dfTest["pred"] = pred

                TN += dfTest[(dfTest.label == 0) & (dfTest.pred == 0)].shape[0]
                TP += dfTest[(dfTest.label == 1) & (dfTest.pred == 1)].shape[0]
                FN += dfTest[(dfTest.label == 1) & (dfTest.pred == 0)].shape[0]
                FP += dfTest[(dfTest.label == 0) & (dfTest.pred == 1)].shape[0]


        except Exception as e:
            print(e)
            break
            break

        resClassification["sr"].append(sr)
        resClassification["TN"].append(TN/NSUBS)
        resClassification["TP"].append(TP/NSUBS)
        resClassification["FN"].append(FN/NSUBS)
        resClassification["FP"].append(FP/NSUBS)

        
    newFilename = newVersionFilename(dataPath,"video")
    pd.DataFrame(resClassification).to_csv(newFilename+"-classification.csv", index=False)
    pd.DataFrame(resRegression).to_csv(newFilename+"-regression.csv", index=False)
    
    classifParamsUsed = dict(classifParams)
    classifParamsUsed['file']="dataset"
    classifParamsUsed['results']=newFilename+"-classification.csv"
    
    regParamsUsed = dict(regParams)
    regParamsUsed['file']="dataset"
    regParamsUsed['results']=newFilename+"-regression.csv"

    dfParamsUsed = pd.DataFrame.from_dict([classifParamsUsed,regParamsUsed])
    
    # If params list does not exists, create it
    if not os.path.exists(dataPath+"results-list.csv"):
        dfParamsUsed.to_csv(dataPath+"results-list.csv", index=False)
    # If the list already exists, add the params used
    else:
        paramList = pd.read_csv(dataPath+"results-list.csv")
        frames = [paramList, dfParamsUsed]
        paramList = pd.concat(frames)
        pd.DataFrame(paramList).to_csv(dataPath+"results-list.csv", index=False)     

In [36]:
classifParams = dict(classParamsDefault)
regParams = dict(regParamsDefault)

#for max_depth in [None,10,20,30,40,50,60,70,80,90]:
for max_depth in [None,10,20,30,40,50]:
    #for max_leaf_nodes in [None,20,40,60]:
    #for max_leaf_nodes in [10,30,50,70]:
    for max_leaf_nodes in [10,20,30,40,50,60,70,80,90]:
        for min_samples_leaf in [1,2,4,6,8,10]:

            classifParams = dict(classParamsDefault)
            regParams = dict(regParamsDefault)

            classifParams['criterion']="entropy"
            classifParams['max_leaf_nodes'] = max_leaf_nodes
            classifParams['min_samples_leaf']=min_samples_leaf
            classifParams['max_depth']=max_depth

            regParams['min_samples_leaf']=min_samples_leaf
            regParams['max_leaf_nodes'] = max_leaf_nodes
            regParams['max_depth'] = max_depth

            sensitivity2("./data2/",classifParams,regParams)
            '''sensitivity(datasetPath = datasetPath, dataPath = dataPath, filename = filename, perf = perf, NBINS = NBINS,
                NSUBS = NSUBS, srm = srm, srM = srM[k], srs = srs[k], classifParams=classifParams, regParams=regParams)'''



In [41]:
sensitivity2("./data3/",classifParams,regParams)



[0.35      0.        0.1       0.35      0.        0.        0.
 0.1       0.        0.1421875 0.1       0.        0.1       0.
 0.        0.        0.1421875 0.        0.1421875 0.        0.
 0.1       0.        0.        0.1421875 0.35      0.        0.
 0.        0.        0.1       0.        0.        0.        0.
 0.        0.        0.        0.1421875 0.        0.        0.35
 0.35      0.35      0.        0.        0.        0.        0.1421875
 0.        0.35      0.35      0.        0.35      0.        0.
 0.35      0.        0.        0.1       0.        0.1421875 0.
 0.1       0.1421875 0.        0.        0.        0.        0.
 0.        0.1421875 0.1       0.1421875 0.1421875 0.1421875 0.
 0.        0.1       0.        0.        0.        0.1       0.
 0.1421875 0.        0.        0.1421875 0.        0.1       0.35
 0.35      0.1       0.        0.        0.35      0.1421875 0.
 0.1       0.1421875 0.        0.        0.1       0.        0.1421875
 0.35      0.        0

[0.         0.         0.         0.05       0.         0.
 0.01       0.76315789 0.1953125  0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.76315789 0.         0.         0.         0.1953125  0.01
 0.01       0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.76315789 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.05       0.
 0.         0.         0.01       0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.57272727 0.78       0.78       0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.76315789 0.         0.78       0.         0.
 0.         0.         0.         0.         0.         0.
 0.1953125  0.         0.         0.         0.       