In [14]:
import subprocess

# Parameters
datasetPath = "../datasets/"
dataPath = "./finalData/"
resultPath = "./finalResults/"
#filenames = ["Apache","BerkeleyC","BerkeleyJ","LLVM","SQLite","Dune","HIPAcc","HSMGP","JavaGC"]
filenames = ["JavaGC"]
perf="perf"


# Parameters for the sensitivity analysis
NBINS = 50 # Number of vertical bins for threshold
NSUBS = 10 # Number of training sets to average on
srm = 1 # Minimum sampling size
srM = {} # Maximum sampling size, keyed by dataset name
srs = {} # Sampling step between two iterations, keyed by dataset name
for filename in filenames:
    # Count newline characters in pure Python: this is the same number `wc -l`
    # reports (header line included), without building a shell command from a path.
    with open(datasetPath + filename + ".csv", "rb") as dataset_file:
        srM[filename] = sum(line.endswith(b"\n") for line in dataset_file)
    # Keep the step at 1 or more: a file shorter than 100 lines would otherwise
    # give a step of 0, which range() rejects.
    srs[filename] = max(1, srM[filename] // 100)
    

In [10]:
import sys, os
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse

from os import listdir
from os.path import isfile, join

def newVersionFilename(path, filename):
    """Return the path prefix for the next unused version of ``filename`` in ``path``.

    Files in ``path`` named ``{filename}-{number}...`` are scanned, where the
    number is the text between the ``{filename}-`` prefix and the next ``-`` or
    ``.``. The result is ``{path}/{filename}-{NNNN}`` with ``NNNN`` one above the
    highest number found, zero-padded to at least four digits (``0001`` when no
    versioned file exists yet). No file is created.

    Args:
        path: Directory to scan; it must already exist.
        filename: Base name shared by all versions.

    Returns:
        str: The directory joined with ``{filename}-{NNNN}`` (no extension).

    Raises:
        OSError: If ``path`` cannot be listed.
    """
    prefix = filename + "-"
    versions = []
    for f in listdir(path):
        if not (isfile(join(path, f)) and f.startswith(prefix)):
            continue
        # Version field: what sits between the prefix and the next "-" or ".".
        field = f[len(prefix):].split("-")[0].split(".")[0]
        # Skip files that merely share the prefix but carry no version number.
        if field.isdecimal():
            versions.append(int(field))
    # Compare numerically so that 10000 correctly follows 9999.
    next_version = max(versions, default=0) + 1
    # join() keeps the result inside the scanned directory even when `path`
    # has no trailing separator.
    return join(path, prefix + str(next_version).zfill(4))


def sensitivityClassification(datasetPath, dataPath, filename, perf, NBINS, NSUBS, srm, srM, srs):
    """Evaluate a decision-tree classifier across training sizes and thresholds.

    The dataset ``{datasetPath}{filename}.csv`` is sorted by the ``perf`` column
    and ``NBINS - 1`` thresholds are read from it at evenly spaced row positions.
    For each training size ``sr`` in ``range(srm+1, int(0.9*srM), srs)`` and each
    threshold ``t``, rows are labelled 1 when ``perf > t`` and 0 otherwise. A
    ``DecisionTreeClassifier`` is then fitted on ``NSUBS`` stratified random
    splits with ``sr`` training rows each. The confusion-matrix counts on the
    held-out rows are summed over the splits and divided by ``NSUBS``.

    The results (columns ``sr``, ``t``, ``TN``, ``TP``, ``FN``, ``FP``) are
    written to ``{newVersionFilename(dataPath, filename)}-classification.csv``.

    Args:
        datasetPath: Prefix (directory with trailing separator) of the input CSV.
        dataPath: Output directory; created if missing.
        filename: Dataset name without the ``.csv`` extension.
        perf: Name of the performance column used to derive the labels.
        NBINS: Number of bins; yields ``NBINS - 1`` thresholds.
        NSUBS: Number of random splits averaged per (sr, t) pair.
        srm: Minimum sampling size; the first training size is ``srm + 1``.
        srM: Maximum sampling size; training sizes stay below ``0.9 * srM``.
        srs: Step between two consecutive training sizes.

    Returns:
        None. The outcome is the CSV file described above.
    """
    # exist_ok=True covers the "created concurrently" case without needing errno.
    os.makedirs(dataPath, exist_ok=True)

    d = pd.read_csv(datasetPath+filename+".csv") # Open dataset
    d = d.sort_values(by=perf) # Sort it by perf to get threshold values
    thresholds = [d[perf].iloc[i * d.shape[0]//NBINS] for i in range(1, NBINS)]

    # Features are every column except the target named by `perf` (dropping the
    # literal "perf" would leak the target whenever the column is named
    # differently) and any "label" column. They do not depend on sr or t.
    features = d.drop([perf, "label"], axis=1, errors="ignore")

    resClassification = {"sr":[],"t":[],"TN":[],"TP":[],"FN":[],"FP":[]}
    for sr in range(srm+1,int(0.9*srM),srs):
        for t in thresholds:
            print("Computing for sr=%d and t=%.3f..." % (sr, t))

            shuffle_split = StratifiedShuffleSplit(train_size=sr, n_splits=NSUBS)

            # Binary label: 1 when the performance value is above the threshold
            labels = (d[perf] > t).astype(int)

            TN = TP = FN = FP = 0 # Counters for classification results

            c = tree.DecisionTreeClassifier()

            try:
                for train_index, test_index in shuffle_split.split(features, labels):
                    c.fit(features.iloc[train_index], labels.iloc[train_index])
                    pred = c.predict(features.iloc[test_index])
                    actual = labels.iloc[test_index].to_numpy()

                    TN += int(((actual == 0) & (pred == 0)).sum())
                    TP += int(((actual == 1) & (pred == 1)).sum())
                    FN += int(((actual == 1) & (pred == 0)).sum())
                    FP += int(((actual == 0) & (pred == 1)).sum())
            except Exception as e:
                # Best effort: report the failure and give up on the remaining
                # thresholds for this training size; the next size is still tried.
                print(e)
                break

            resClassification["sr"].append(sr)
            resClassification["t"].append(t)
            resClassification["TN"].append(TN/NSUBS)
            resClassification["TP"].append(TP/NSUBS)
            resClassification["FN"].append(FN/NSUBS)
            resClassification["FP"].append(FP/NSUBS)

    newFilename = newVersionFilename(dataPath,filename)

    pd.DataFrame(resClassification).to_csv(newFilename+"-classification.csv", index=False)

In [11]:
def sensitivityRegression(datasetPath, dataPath, filename, perf, NBINS, NSUBS, srm, srM, srs):
    """Evaluate a decision-tree regressor across training sizes and thresholds.

    The dataset ``{datasetPath}{filename}.csv`` is sorted by the ``perf`` column
    and ``NBINS - 1`` thresholds are read from it at evenly spaced row positions.
    For each training size ``sr`` in ``range(srm+1, int(0.9*srM), srs)`` and each
    threshold ``t``, a ``DecisionTreeRegressor`` is fitted to predict ``perf`` on
    ``NSUBS`` random splits with ``sr`` training rows each, stratified on the
    label ``perf > t``. On the held-out rows, true and predicted values are both
    compared with ``t`` to get confusion-matrix counts, and the mean squared
    error is computed. Counts and MSE are summed over the splits and divided by
    ``NSUBS``.

    The results (columns ``sr``, ``t``, ``TN``, ``TP``, ``FN``, ``FP``, ``MSE``)
    are written to ``{newVersionFilename(dataPath, filename)}-regression.csv``.

    Args:
        datasetPath: Prefix (directory with trailing separator) of the input CSV.
        dataPath: Output directory; created if missing.
        filename: Dataset name without the ``.csv`` extension.
        perf: Name of the performance column to predict.
        NBINS: Number of bins; yields ``NBINS - 1`` thresholds.
        NSUBS: Number of random splits averaged per (sr, t) pair.
        srm: Minimum sampling size; the first training size is ``srm + 1``.
        srM: Maximum sampling size; training sizes stay below ``0.9 * srM``.
        srs: Step between two consecutive training sizes.

    Returns:
        None. The outcome is the CSV file described above.
    """
    # exist_ok=True covers the "created concurrently" case without needing errno.
    os.makedirs(dataPath, exist_ok=True)

    d = pd.read_csv(datasetPath+filename+".csv") # Open dataset
    d = d.sort_values(by=perf) # Sort it by perf to get threshold values
    thresholds = [d[perf].iloc[i * d.shape[0]//NBINS] for i in range(1, NBINS)]

    # Inputs and target do not depend on sr or t, so they are extracted once.
    features = d.drop([perf, "label"], axis=1, errors="ignore")
    target = d[perf]

    resRegression = {"sr":[],"t":[],"TN":[],"TP":[],"FN":[],"FP":[],"MSE":[]}

    for sr in range(srm+1,int(0.9*srM),srs):
        for t in thresholds:
            shuffle_split = StratifiedShuffleSplit(train_size=sr, n_splits=NSUBS)

            # Binary label used for stratification and for the confusion matrix
            labels = (target > t).astype(int)

            TN = TP = FN = FP = MSE = 0 # Counters for regression results

            c = tree.DecisionTreeRegressor()

            for train_index, test_index in shuffle_split.split(features, labels):
                c.fit(features.iloc[train_index], target.iloc[train_index])
                pred = c.predict(features.iloc[test_index])

                actual = labels.iloc[test_index].to_numpy()
                # Predicted label: the predicted value compared with the same threshold
                predicted = (pred > t).astype(int)

                # Accumulate so that MSE/NSUBS below is the mean over all splits,
                # not just the last split's error divided by NSUBS.
                MSE += mse(target.iloc[test_index], pred)

                TN += int(((actual == 0) & (predicted == 0)).sum())
                TP += int(((actual == 1) & (predicted == 1)).sum())
                FN += int(((actual == 1) & (predicted == 0)).sum())
                FP += int(((actual == 0) & (predicted == 1)).sum())

            resRegression["sr"].append(sr)
            resRegression["t"].append(t)
            resRegression["MSE"].append(MSE/NSUBS)
            resRegression["TN"].append(TN/NSUBS)
            resRegression["TP"].append(TP/NSUBS)
            resRegression["FN"].append(FN/NSUBS)
            resRegression["FP"].append(FP/NSUBS)

    newFilename = newVersionFilename(dataPath,filename)
    pd.DataFrame(resRegression).to_csv(newFilename+"-regression.csv", index=False)


In [12]:
def sensitivity(datasetPath, dataPath, filename, perf, NBINS,NSUBS, srm, srM, srs):
    """Run the classification analysis, then the regression analysis.

    Both analyses receive exactly the same settings, passed by keyword.

    Args:
        datasetPath: Forwarded unchanged to both analyses.
        dataPath: Forwarded unchanged to both analyses.
        filename: Forwarded unchanged to both analyses.
        perf: Forwarded unchanged to both analyses.
        NBINS: Forwarded unchanged to both analyses.
        NSUBS: Forwarded unchanged to both analyses.
        srm: Forwarded unchanged to both analyses.
        srM: Forwarded unchanged to both analyses.
        srs: Forwarded unchanged to both analyses.
    """
    # Collect the settings once so both calls are guaranteed to match.
    shared_settings = {
        "datasetPath": datasetPath,
        "dataPath": dataPath,
        "filename": filename,
        "perf": perf,
        "NBINS": NBINS,
        "NSUBS": NSUBS,
        "srm": srm,
        "srM": srM,
        "srs": srs,
    }
    sensitivityClassification(**shared_settings)
    sensitivityRegression(**shared_settings)

In [13]:
# Run both sensitivity analyses for every selected dataset, using that
# dataset's own maximum sampling size and sampling step.
for filename in filenames:
    sensitivity(datasetPath = datasetPath, dataPath = dataPath, filename = filename, perf = perf, NBINS = NBINS,
                        NSUBS = NSUBS, srm = srm, srM = srM[filename], srs = srs[filename])



KeyboardInterrupt: 