In [None]:
import subprocess

# Params
datasetPath = "../datasets/"    # folder the input "<name>.csv" files are read from
dataPath = "./data_clustered/"  # folder the sensitivity results are written to
resultPath = "./results/"
filenames = ["Apache"]          # dataset names, without the ".csv" extension
perf = "perf"                   # name of the performance column used for labelling


def _countLines(path):
    """Return the number of newline characters in the file at `path`.

    This is the same quantity `wc -l` reports, computed in pure Python so the
    cell needs no shell and fails with a clear error when the file is missing.
    """
    total = 0
    with open(path, "rb") as handle:
        # Read in 1 MiB chunks so large datasets are not loaded at once.
        for chunk in iter(lambda: handle.read(1 << 20), b""):
            total += chunk.count(b"\n")
    return total


n_lines = []
for k, filename in enumerate(filenames):
    n_lines.append(_countLines(datasetPath + filename + ".csv"))

# Params for sensitivity
NBINS = 40  # Number of vertical bins for threshold
NSUBS = 10  # Number of training sets to average on
srm = 1     # Minimum sampling size
srM = []    # Maximum sampling size, one entry per dataset
srs = []    # Sampling step between two iterations, one entry per dataset
for k, filename in enumerate(filenames):
    srM.append(n_lines[k])  # same line count as above; no need to re-read the file
    # A step of 0 (datasets with fewer than 100 lines) would make range() raise.
    srs.append(max(1, srM[k] // 100))

# Fractions of the perf-sorted dataset at which the oracle thresholds are taken
oracle = [0.2, 0.5, 0.8]

# Params for Decision Tree
# Default params (passed as keyword arguments to tree.DecisionTreeClassifier)
# NOTE(review): "min_impurity_split" and "presort" appear to have been removed
# from DecisionTreeClassifier in later scikit-learn releases - confirm against
# the installed version.
treeParamsDefault = {
    "criterion": "gini",
    "splitter": "best",
    "max_features": None,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "min_weight_fraction_leaf": 0.,
    "max_leaf_nodes": None,
    "class_weight": None,
    "random_state": None,
    "min_impurity_split": 1e-7,
    "presort": False
}
# Work on a copy: a plain assignment would alias the dict and the edits below
# would silently overwrite the defaults as well.
treeParams = dict(treeParamsDefault)
# Modify params there
treeParams['criterion'] = "entropy"
treeParams['min_samples_leaf'] = 2

In [None]:
import sys, os
import pandas as pd
from sklearn import tree
from os import listdir
from os.path import isfile, join

def newVersionFilename(path, filename):
    """Return the next free versioned name "<path><filename>-NNNN" (no extension).

    Scans the directory `path` for regular files named "<filename>-<digits>"
    (optionally followed by ".<anything>") and returns the name carrying the
    highest version found plus one, zero-padded to at least four digits.
    With no such file the version is 0001.

    Parameters:
        path: directory to scan; it is concatenated directly with `filename`,
            so it must end with a path separator.
        filename: base name of the versioned files.

    Returns:
        The string path + filename + "-" + version.
    """
    prefix = filename + "-"
    latest = 0
    for entry in listdir(path):
        # Require the "-" right after the base name so that e.g. "Apache2-0007.csv"
        # is not mistaken for a version of "Apache".
        if not (isfile(join(path, entry)) and entry.startswith(prefix)):
            continue
        stem = entry[len(prefix):].split(".")[0]
        if not stem.isdigit():
            continue  # unrelated file such as "<filename>-notes.txt"
        try:
            number = int(stem)
        except ValueError:  # characters that pass isdigit() but are not integers
            continue
        # Compare numerically: a lexicographic sort would rank 9999 above 10000.
        latest = max(latest, number)
    return path+filename+"-"+str(latest+1).zfill(4)

def _ensureDir(path):
    """Create directory `path` (with missing parents); do nothing if it already exists."""
    try:
        os.makedirs(path)
    except OSError:
        # The directory may already exist (possibly created concurrently);
        # only propagate the error if it is still not there.
        if not os.path.isdir(path):
            raise


def _saveRun(res, resultDir, filename, listPath, paramsUsed):
    """Write `res` to a new versioned CSV in `resultDir` and log the run in `listPath`."""
    newFilename = newVersionFilename(resultDir, filename)+".csv"
    pd.DataFrame(res).to_csv(newFilename, index=False)

    # Copy so that the caller's parameter dict is not modified between runs.
    record = dict(paramsUsed)
    record['file'] = filename
    record['results'] = newFilename
    paramList = pd.DataFrame([record])

    # If the list already exists, append to its current content; otherwise create it.
    if os.path.exists(listPath):
        paramList = pd.concat([pd.read_csv(listPath), paramList])
    paramList.to_csv(listPath, index=False)


def sensitivity(datasetPath, dataPath, filename, perf, NBINS, NSUBS, srm, srM, srs, treeParams, oracle=False):
    """Measure how a decision tree's confusion matrix varies with training-set size.

    Reads datasetPath + filename + ".csv", sorts it by the `perf` column and
    derives a list of thresholds. For every threshold, rows with perf > threshold
    are labelled 1 and the others 0. For every sample size in
    range(srm, srM, srs), NSUBS random sub-samples are drawn (without an explicit
    random_state), a tree.DecisionTreeClassifier(**treeParams) is fitted on each,
    and its predictions on the whole dataset are counted as TN/TP/FN/FP. The
    counts are averaged over the NSUBS trees.

    Parameters:
        datasetPath: prefix of the input CSV path (concatenated with `filename`).
        dataPath: output folder; created if missing.
        filename: dataset name without the ".csv" extension.
        perf: name of the column holding the performance value.
        NBINS: without `oracle`, NBINS - 1 evenly spaced positions of the sorted
            data are used as thresholds.
        NSUBS: number of sub-samples (and trees) per sample size.
        srm, srM, srs: start, stop (exclusive) and step of the sample sizes.
        treeParams: keyword arguments for DecisionTreeClassifier.
        oracle: False, or a list of fractions of the sorted dataset at which the
            thresholds are taken.

    Output files (nothing is returned):
        without oracle: one versioned CSV holding all thresholds in `dataPath`,
            plus one row appended to dataPath + "/results-list.csv";
        with oracle: for each fraction f, one versioned CSV in the sub-folder
            str(f) and one row appended to dataPath + "/" + str(f) + "-results-list.csv".
    """
    # Parameters recorded next to each result file (function arguments except
    # treeParams/oracle, followed by the individual tree parameters).
    varParams = {
        "datasetPath": datasetPath,
        "dataPath": dataPath,
        "filename": filename,
        "perf": perf,
        "NBINS": NBINS,
        "NSUBS": NSUBS,
        "srm": srm,
        "srM": srM,
        "srs": srs,
    }
    varParams.update(treeParams)

    # If the data folder does not exist, create it
    _ensureDir(dataPath)

    perf_x264 = ['Watt', 'Energy', 'SSIM', 'PSNR', 'Speed', 'Size', 'Time']
    perf_sac = ['compile-exit', 'compile-real', 'compile-user', 'compile-ioin', 'compile-ioout',
                'compile-maxmem', 'compile-cpu', 'compile-size', 'run-exit',
                'run-real', 'run-user', 'run-maxmem', 'run-cpu']
    # Columns that must never be used as features. "label" and "pred" were this
    # function's working column names and stay excluded. The column named by
    # `perf` is added explicitly: the labels are derived from it, so leaving it
    # in would leak the target whenever it is not one of the names listed above.
    nonFeatures = perf_sac + perf_x264 + ["perf", "label", "pred"]
    if perf not in nonFeatures:
        nonFeatures.append(perf)

    d = pd.read_csv(datasetPath+filename+".csv")  # Open dataset
    d = d.sort_values(by=perf)  # Sort it by perf to get threshold values
    nRows = d.shape[0]
    if not oracle:
        thresholds = [d[perf].iloc[i * nRows//NBINS] for i in range(1, NBINS)]
    else:
        for i in oracle:
            # Same folder the results are written to below, whether or not
            # dataPath ends with a separator.
            _ensureDir(os.path.join(dataPath, str(i)))
        # Clamp so that a fraction of 1.0 selects the last row instead of overflowing.
        thresholds = [d[perf].iloc[min(int(i * nRows), nRows - 1)] for i in oracle]

    features = d.drop(nonFeatures, axis=1, errors="ignore")

    res = {"sr":[],"t":[],"TN":[],"TP":[],"FN":[],"FP":[]}

    # Computation
    for k, t in enumerate(thresholds):
        if oracle:
            # One result file per oracle value: start from an empty table
            res = {"sr":[],"t":[],"TN":[],"TP":[],"FN":[],"FP":[]}
        # Label with the (current) oracle; it only depends on the threshold
        label = (d[perf] > t).astype(int)
        for sr in range(srm, srM, srs):
            print("Computing for sr=%d and t=%.3f..." % (sr, t))
            subs = [features.sample(sr) for _ in range(NSUBS)]  # Subsample trainsets
            TN = TP = FN = FP = 0  # Counters for classification results
            for s in subs:  # We cumulate results for each experiment and average later
                # MACHINE LEARNING PART
                c = tree.DecisionTreeClassifier(**treeParams)
                c.fit(s, label.loc[s.index])
                # END OF LEARNING
                pred = c.predict(features)  # Get model's prediction on the full dataset
                TN += int(((label == 0) & (pred == 0)).sum())
                TP += int(((label == 1) & (pred == 1)).sum())
                FN += int(((label == 1) & (pred == 0)).sum())
                FP += int(((label == 0) & (pred == 1)).sum())
            # Push the results
            res["sr"].append(sr)
            res["t"].append(t)
            res["TN"].append(TN/NSUBS)
            res["TP"].append(TP/NSUBS)
            res["FN"].append(FN/NSUBS)
            res["FP"].append(FP/NSUBS)
        # Save the result as csv, together with the params used
        if oracle:
            _saveRun(res,
                     dataPath+"/"+str(oracle[k])+"/",
                     filename,
                     dataPath+"/"+str(oracle[k])+"-results-list.csv",
                     dict(varParams, oracle=oracle[k]))

    if not oracle:
        _saveRun(res, dataPath+"/", filename, dataPath+"/results-list.csv", varParams)

In [None]:
# Example: run the sensitivity analysis over a small grid of tree settings.
# Every grid point starts from a copy of the defaults, switches to the entropy
# criterion and then takes its own max_features / min_samples_leaf values.
treeParamsArray = []

for max_features in [0.8,0.9]:
    for min_samples_leaf in [1,2]:
        treeParams = dict(treeParamsDefault)
        treeParams.update({
            "criterion": "entropy",
            "max_features": max_features,
            "min_samples_leaf": min_samples_leaf,
        })
        treeParamsArray.append(treeParams)

# One run per (parameter set, dataset) pair: data is read from the {filename}
# file in the {datasetPath} folder and results are written to the {dataPath} folder.
for treeParams in treeParamsArray:
    for k,filename in enumerate(filenames):
        sensitivity(datasetPath, dataPath, filename, perf, NBINS, NSUBS,
                    srm, srM[k], srs[k], treeParams, oracle)