In [None]:
import subprocess

# Parameters
datasetPath = "../datasets/"  # Folder the "<name>.csv" datasets are read from
dataPath = "./data/"  # Folder the intermediate CSV files are written to
resultPath = "./results/"  # Folder handed to the R script for its output
filenames = ["Apache","BerkeleyC","BerkeleyJ","Dune","HIPAcc","HSMGP","JavaGC","LLVM","SQLite"]
# NOTE: the full list above is overridden right away, so only "Apache" is processed.
filenames = ["Apache"]
perf = "perf"  # Name of the dataset column holding the performance value


# Parameters for sensitivity
NBINS = 40 # Number of vertical bins for threshold
NSUBS = 10 # Number of training sets to average on
srm = 1 # Minimum sampling size


def _count_lines(path, chunk_size=1 << 20):
    """Return the number of newline characters in the file at *path*.

    This is the same quantity `wc -l` reports, computed without spawning a
    shell so it works on any platform and with any file name.
    """
    total = 0
    with open(path, "rb") as handle:
        # Read fixed-size binary chunks so large datasets are never loaded whole
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            total += chunk.count(b"\n")
    return total


srM = []  # Maximum sampling size, one entry per dataset
srs = []  # Sampling step between two iterations, one entry per dataset
for k, filename in enumerate(filenames):
    srM.append(_count_lines(datasetPath + filename + ".csv"))
    # A step of 0 (files shorter than 100 lines) would make range() raise, so use at least 1
    srs.append(max(1, srM[k] // 100))

In [None]:
import sys, os
import pandas as pd
from sklearn import tree
import numpy as np

def sensitivityRegression(datasetPath, dataPath, filename, perf, NBINS, NSUBS, srm, srM, srs, thresholds=False):
    """Measure how a regression tree, thresholded into a classifier, behaves
    across training-set sizes and performance thresholds.

    For every sampling size in ``range(srm, srM, srs)`` and every threshold,
    ``NSUBS`` random training sets are drawn, a ``DecisionTreeRegressor`` is
    fitted on each to predict the ``perf`` column, and its predictions on the
    whole dataset are compared with the threshold. The resulting confusion
    matrix counts, averaged over the ``NSUBS`` models, are written to
    ``dataPath + filename + "-regression.csv"`` with the columns
    ``sr, t, TN, TP, FN, FP``.

    Parameters:
        datasetPath: folder prefix of the input file ``filename + ".csv"``.
        dataPath: folder prefix of the output file; created if missing.
        filename: dataset name without the ``.csv`` extension.
        perf: name of the column holding the performance value.
        NBINS: number of bins; ``NBINS - 1`` thresholds are taken at evenly
            spaced ranks of the sorted ``perf`` values.
        NSUBS: number of training sets averaged for each (size, threshold).
        srm, srM, srs: start, stop (exclusive) and step of the sampling sizes.
        thresholds: if truthy, a single fraction of the sorted dataset at
            which the only threshold is taken, instead of the ``NBINS`` bins.

    Returns:
        None. The result is only written to disk.
    """
    # exist_ok makes the creation safe even if the folder appears concurrently
    os.makedirs(dataPath, exist_ok=True)

    perf_x264 = ['Watt', 'Energy', 'SSIM', 'PSNR', 'Speed', 'Size', 'Time']
    perf_sac = ['compile-exit', 'compile-real', 'compile-user', 'compile-ioin', 'compile-ioout',
                'compile-maxmem', 'compile-cpu', 'compile-size', 'run-exit',
                'run-real', 'run-user', 'run-maxmem', 'run-cpu']

    d = pd.read_csv(datasetPath+filename+".csv") # Open dataset
    d = d.sort_values(by=perf) # Sort it by perf to get threshold values
    n_rows = d.shape[0]
    if not thresholds:
        thresholds = [d[perf].iloc[i * n_rows//NBINS] for i in range(1, NBINS)]
    else:
        # Clamp so that a fraction of 1.0 selects the last row instead of overflowing
        thresholds = [d[perf].iloc[min(int(thresholds * n_rows), n_rows - 1)]]

    # Features are every column that is neither a known measurement, the
    # target column, nor the working "label" column. This does not depend on
    # the sampling size or threshold, so it is computed once.
    excluded = set(perf_sac + perf_x264 + ["perf", perf, "label"])
    features = d[[col for col in d.columns if col not in excluded]]
    target = d[perf]
    perf_values = target.to_numpy()

    res = {"sr":[],"t":[],"TN":[],"TP":[],"FN":[],"FP":[]}

    # Computation
    for sr in range(srm,srM,srs):
        for t in thresholds:
            print("Computing for sr=%d and t=%.3f..." % (sr, t))
            label = perf_values > t # Label with the (current) oracle
            subs = [features.sample(sr) for i in range(NSUBS)] # Subsample trainsets
            TN = TP = FN = FP = 0 # Counters for classification results
            for s in subs: # We cumulate results for each experiment and average later
                # MACHINE LEARNING PART
                # Settings are chosen to be the closest to J48 algorithm
                c = tree.DecisionTreeRegressor(min_samples_leaf=2)
                c.fit(s, target.loc[s.index])
                # END OF LEARNING
                # Predicted labels are rebuilt from scratch for each model so that
                # positives from a previous subsample cannot carry over into this one
                predicted = c.predict(features) > t
                TN += int(np.count_nonzero(~label & ~predicted))
                TP += int(np.count_nonzero(label & predicted))
                FN += int(np.count_nonzero(label & ~predicted))
                FP += int(np.count_nonzero(~label & predicted))
            # Push the results
            res["sr"].append(sr)
            res["t"].append(t)
            res["TN"].append(TN/NSUBS)
            res["TP"].append(TP/NSUBS)
            res["FN"].append(FN/NSUBS)
            res["FP"].append(FP/NSUBS)
    # Save the result as csv
    pd.DataFrame(res).to_csv(dataPath+filename+"-regression.csv", index=False)


def sensitivityClassification(datasetPath, dataPath, filename, perf, NBINS, NSUBS, srm, srM, srs, thresholds=False):
    """Measure how a classification tree behaves across training-set sizes
    and performance thresholds.

    For every sampling size in ``range(srm, srM, srs)`` and every threshold,
    rows are labelled 1 when their ``perf`` value is above the threshold and
    0 otherwise, ``NSUBS`` random training sets are drawn, and a
    ``DecisionTreeClassifier`` fitted on each is evaluated on the whole
    dataset. The confusion matrix counts, averaged over the ``NSUBS`` models,
    are written to ``dataPath + filename + "-classification.csv"`` with the
    columns ``sr, t, TN, TP, FN, FP``.

    Parameters:
        datasetPath: folder prefix of the input file ``filename + ".csv"``.
        dataPath: folder prefix of the output file; created if missing.
        filename: dataset name without the ``.csv`` extension.
        perf: name of the column holding the performance value.
        NBINS: number of bins; ``NBINS - 1`` thresholds are taken at evenly
            spaced ranks of the sorted ``perf`` values.
        NSUBS: number of training sets averaged for each (size, threshold).
        srm, srM, srs: start, stop (exclusive) and step of the sampling sizes.
        thresholds: if truthy, a single fraction of the sorted dataset at
            which the only threshold is taken, instead of the ``NBINS`` bins.

    Returns:
        None. The result is only written to disk.
    """
    # exist_ok makes the creation safe even if the folder appears concurrently
    os.makedirs(dataPath, exist_ok=True)

    perf_x264 = ['Watt', 'Energy', 'SSIM', 'PSNR', 'Speed', 'Size', 'Time']
    perf_sac = ['compile-exit', 'compile-real', 'compile-user', 'compile-ioin', 'compile-ioout',
                'compile-maxmem', 'compile-cpu', 'compile-size', 'run-exit',
                'run-real', 'run-user', 'run-maxmem', 'run-cpu']

    d = pd.read_csv(datasetPath+filename+".csv") # Open dataset
    d = d.sort_values(by=perf) # Sort it by perf to get threshold values
    n_rows = d.shape[0]
    if not thresholds:
        thresholds = [d[perf].iloc[i * n_rows//NBINS] for i in range(1, NBINS)]
    else:
        # Clamp so that a fraction of 1.0 selects the last row instead of overflowing
        thresholds = [d[perf].iloc[min(int(thresholds * n_rows), n_rows - 1)]]

    # Features are every column that is neither a known measurement, the
    # column the labels are derived from, nor the working "label" column.
    # Excluding `perf` itself keeps the oracle out of the model's inputs
    # whatever the column is called.
    excluded = set(perf_sac + perf_x264 + ["perf", perf, "label"])
    features = d[[col for col in d.columns if col not in excluded]]

    res = {"sr":[],"t":[],"TN":[],"TP":[],"FN":[],"FP":[]}

    # Computation
    for sr in range(srm,srM,srs):
        for t in thresholds:
            print("Computing for sr=%d and t=%.3f..." % (sr, t))
            label = (d[perf] > t).astype(int) # Label with the (current) oracle
            truth = label.to_numpy()
            subs = [features.sample(sr) for i in range(NSUBS)] # Subsample trainsets
            TN = TP = FN = FP = 0 # Counters for classification results
            for s in subs: # We cumulate results for each experiment and average later
                # MACHINE LEARNING PART
                # Settings are chosen to be the closest to J48 algorithm
                c = tree.DecisionTreeClassifier(criterion="entropy", min_samples_leaf=2)
                c.fit(s, label.loc[s.index])
                # END OF LEARNING
                pred = c.predict(features) # Get model's prediction
                TN += int(np.count_nonzero((truth == 0) & (pred == 0)))
                TP += int(np.count_nonzero((truth == 1) & (pred == 1)))
                FN += int(np.count_nonzero((truth == 1) & (pred == 0)))
                FP += int(np.count_nonzero((truth == 0) & (pred == 1)))
            # Push the results
            res["sr"].append(sr)
            res["t"].append(t)
            res["TN"].append(TN/NSUBS)
            res["TP"].append(TP/NSUBS)
            res["FN"].append(FN/NSUBS)
            res["FP"].append(FP/NSUBS)
    # Save the result as csv
    pd.DataFrame(res).to_csv(dataPath+filename+"-classification.csv", index=False)
    
def sensitivity(datasetPath, dataPath, filename, perf, NBINS, NSUBS, srm, srM, srs, thresholds=False):
    """Run the classification and then the regression sensitivity analysis
    on one dataset, with identical settings.

    All arguments, including ``thresholds``, are forwarded unchanged to
    ``sensitivityClassification`` and ``sensitivityRegression``.

    Returns:
        None.
    """
    sensitivityClassification(datasetPath = datasetPath, dataPath = dataPath, filename = filename,
                perf = perf, NBINS = NBINS, NSUBS = NSUBS, srm = srm, srM = srM, srs = srs,
                thresholds = thresholds)
    sensitivityRegression(datasetPath = datasetPath, dataPath = dataPath, filename = filename,
                perf = perf, NBINS = NBINS, NSUBS = NSUBS, srm = srm, srM = srM, srs = srs,
                thresholds = thresholds)

In [None]:
# Machine learning part: for each dataset {filename} found in {datasetPath},
# run the analysis and write its results in the {dataPath} folder.
for k, filename in enumerate(filenames):
    maxSampling = srM[k]   # Maximum sampling size computed for this dataset
    samplingStep = srs[k]  # Sampling step computed for this dataset
    sensitivity(datasetPath, dataPath, filename, perf, NBINS, NSUBS,
                srm, maxSampling, samplingStep)

In [None]:
#If results folder does not exist
if not os.path.exists(resultPath):
    try:
        os.makedirs(resultPath)
    except OSError as exc: # Guard against race condition
        if exc.errno != errno.EEXIST:
            raise

#For each file, get the min and max values to have a global colorscale and then use the R script to create the heatmaps
for filename in filenames:
    metrics = ["accuracy","specificity","precision","recall","npv","negNPV","negAccuracy"]
    minArray={}
    maxArray={}
    for m in metrics:
        minArray[m] = 1
        maxArray[m] = -1


    for file in [filename+"-classification.csv",filename+"-regression.csv"]:
        df = pd.read_csv(dataPath+file)

        df['accuracy']= (df['TP']+df['TN']) / (df['TP']+df['TN']+df['FN']+df['FP'])
        df['specificity']=df['TN']/(df['TN']+df['FP'])
        df['precision']=df['TP']/(df['TP'] + df['FP'])
        df['recall']= df['TP'] / (df['TP']+ df['FN']) # Recall of class 1
        df['npv']=df['TN']/(df['TN']+df['FN'])

        df['index']=df.groupby('sr').cumcount()+1
        df['frequency']=max(df['index'])
        df['index']= df['index'] * 100 / df['frequency']

        df['negNPV']=df['npv']-(df['index']/100)
        df['negAccuracy']=df['accuracy']-(df['index']/100)

        for m in metrics:
            if max(df[m])>maxArray[m] and not np.isnan(max(df[m])):
                maxArray[m] = max(df[m])
            elif np.isnan(max(df[m])):
                if np.nanmax(df[m].values)>maxArray[m]:
                    maxArray[m] = np.nanmax(df[m].values)
            if min(df[m])<minArray[m] and not np.isnan(min(df[m])):
                minArray[m] = min(df[m])
            elif np.isnan(min(df[m])):
                if np.nanmin(df[m].values)<minArray[m]:
                    minArray[m] = np.nanmin(df[m].values)


    minmax = pd.DataFrame([minArray,maxArray])
    minmax.to_csv(dataPath+"minmax-"+filename+".csv", index=False)
    
    !Rscript ./helpers/2.calculateMetrics.R {dataPath}{filename} {resultPath} {dataPath}"minmax-"{filename}".csv"