# The Interplay of Compile-time Options and Run-time Options for Performance Prediction

#### Import libraries

In [1]:
import os

import pandas as pd

import numpy as np

from scipy import stats
import scipy.stats as sc
from scipy.cluster.hierarchy import linkage, leaves_list

import statsmodels.api as sm

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import seaborn as sns

import matplotlib.pyplot as plt

##### Variables to keep

In [2]:
# Root folder of the measurement CSVs, relative to this notebook. The trailing "/"
# matters: later cells build paths by plain string concatenation (data_dir+name_system).
data_dir = "../../data/"

##### Import  and clean data for x264

#### Variables to change

In [18]:
# Sub-folder of data_dir that holds the x264 measurements.
name_system = "x264"
# Expected (rows, columns) of every raw CSV; asserted before a file is cleaned.
shape_csv_system = (201,33)
# Expected number of input CSVs in the processed folder; asserted before cleaning.
nb_inputs = 8 

In [19]:
def time_to_sec(el):
    """Convert a duration string such as ``"1m30.5s"`` into seconds.

    Parameters
    ----------
    el : str
        Duration written as ``<minutes>m<seconds>`` with an optional
        one-letter unit suffix (e.g. ``"0m0.148s"``). Surrounding
        whitespace is ignored.

    Returns
    -------
    float
        ``minutes * 60 + seconds``.

    Raises
    ------
    IndexError
        If ``el`` contains no ``"m"`` separator.
    ValueError
        If the minutes or seconds part is not a valid number.
    """
    tab = el.strip().split("m")
    seconds = tab[1]
    # Only drop the last character when it really is a unit letter; slicing
    # unconditionally would turn "1m30" into 1 min 3 s.
    if seconds and seconds[-1].isalpha():
        seconds = seconds[:-1]
    return float(tab[0])*60+float(seconds)

# Entries found under the system's data folder, without the compile-time options table.
list_dir = os.listdir(f"{data_dir}{name_system}")
list_dir.remove('ctime_options.csv')

# A loop over every entry of list_dir was disabled; only 'default' is processed.
l = 'default'
list_inputs = os.listdir(f"{data_dir}{name_system}/{l}")
assert len(list_inputs) == nb_inputs, l

# Column labels assigned (positionally) once the first three measurement
# columns have been dropped.
cleaned_columns = [
    'configurationID', 'cabac', 'ref', 'deblock', 'analyse', 'me', 'subme',
    'mixed_ref', 'me_range', 'trellis', '8x8dct', 'fast_pskip',
    'chroma_qp_offset', 'bframes', 'b_pyramid', 'b_adapt', 'direct',
    'weightb', 'open_gop', 'weightp', 'scenecut', 'rc_lookahead', 'mbtree',
    'qpmax', 'aq-mode', 'size', 'usertime', 'frames', 'fps', 'kbs',
]

for j, csv_name in enumerate(list_inputs):
    print(csv_name)
    loc = f"{data_dir}{name_system}/{l}/{csv_name}"
    df = pd.read_csv(loc)
    # The file is rewritten in place below, so this check also stops a second pass.
    assert df.shape == shape_csv_system, l + csv_name
    df = df.drop(columns=['usertime', 'fps', 'kbs'])
    df.columns = cleaned_columns
    # Replace the textual duration by a numeric 'time' column in seconds.
    df['time'] = [time_to_sec(raw) for raw in df['usertime']]
    df = df.drop(columns=['usertime'])
    df.to_csv(loc, index=False)

original_videos_LiveMusic_360P_LiveMusic_360P-1d94.csv
original_videos_CoverSong_360P_CoverSong_360P-5d20.csv
original_videos_MusicVideo_360P_MusicVideo_360P-5699.csv
original_videos_LyricVideo_360P_LyricVideo_360P-5e87.csv
original_videos_Sports_360P_Sports_360P-4545.csv
original_videos_Gaming_360P_Gaming_360P-56fe.csv
original_videos_Animation_480P_Animation_480P-087e.csv
original_videos_Lecture_360P_Lecture_360P-114f.csv


### GCC

In [3]:
# Preview one raw gcc measurement file (2mm.csv). The path is built from data_dir
# so the data location stays configured in a single place.
pd.read_csv(data_dir + "gcc/2mm.csv")

Unnamed: 0,configurationID,optim,-floop-interchange,-fprefetch-loop-arrays,-ffloat-store,-fno-asm,size,usertime,systemtime,elapsedtime,cpu,exec
0,0,-O2,1,1,0,0,17552,,0m0.148s,20.408121,,
1,1,-Og,1,1,0,1,17720,,0m0.083s,18.690651,,
2,10,-O1,0,1,1,0,17584,,0m0.107s,6.444763,,
3,11,-Ofast,1,1,1,1,19112,,0m0.124s,7.714287,,
4,12,-O2,1,0,0,1,21648,,0m0.184s,14.688036,,
...,...,...,...,...,...,...,...,...,...,...,...,...
75,77,-Og,1,1,1,0,17720,,0m0.090s,7.286327,,
76,78,-O1,0,0,0,0,21680,,0m0.143s,14.342549,,
77,79,-O1,1,0,1,1,17584,,0m0.119s,6.956464,,
78,8,-Ofast,1,0,0,1,23208,,0m0.174s,14.492661,,


In [4]:
# Sub-folder of data_dir that holds the gcc measurements.
name_system = "gcc"
# Expected (rows, columns) of each gcc CSV.
shape_csv_system = (80,11)
# Expected number of gcc input CSVs.
nb_inputs = 30

In [14]:
def time_to_sec(el):
    """Convert a duration string such as ``"1m30.5s"`` into seconds.

    Parameters
    ----------
    el : str
        Duration written as ``<minutes>m<seconds>`` with an optional
        one-letter unit suffix (e.g. ``"0m0.148s"``). Surrounding
        whitespace is ignored.

    Returns
    -------
    float
        ``minutes * 60 + seconds``.

    Raises
    ------
    IndexError
        If ``el`` contains no ``"m"`` separator.
    ValueError
        If the minutes or seconds part is not a valid number.
    """
    tab = el.strip().split("m")
    seconds = tab[1]
    # Only drop the last character when it really is a unit letter; slicing
    # unconditionally would turn "1m30" into 1 min 3 s.
    if seconds and seconds[-1].isalpha():
        seconds = seconds[:-1]
    return float(tab[0])*60+float(seconds)

# Every entry of the gcc data folder except the 'others' entry is treated as an input CSV.
list_inputs = os.listdir(f"{data_dir}{name_system}")
list_inputs.remove('others')

# Disabled sanity check on the number of inputs:
#assert len(list_inputs) == nb_inputs, 

for j, csv_name in enumerate(list_inputs):
    loc = f"{data_dir}{name_system}/{csv_name}"
    df = pd.read_csv(loc)
    # Disabled cleaning steps, kept for reference:
    #assert df.shape == shape_csv_system, list_inputs[j] 
    #df = df.drop(['usertime', 'cpu', 'exec'], axis=1)
    #df['ctime'] = [*map(time_to_sec, df['systemtime'])]
    #df['exec'] = df['elapsedtime']
    #df = df.drop(['systemtime', 'elapsedtime'], axis=1)
    #df.to_csv(loc, index = False)

    # Report the files whose 'size' column contains at least one missing value.
    if np.isnan(df["size"]).any():
        print(csv_name)

##### Import  and clean data for poppler

In [26]:
# Sub-folder of data_dir that holds the poppler measurements.
name_system = "poppler"
# Expected (rows, columns) of every raw CSV; asserted before a file is cleaned.
shape_csv_system = (16,8)
# Expected number of poppler input CSVs.
nb_inputs = 1480

In [28]:
def time_to_sec(el):
    """Convert a duration string such as ``"1m30.5s"`` into seconds.

    Parameters
    ----------
    el : str
        Duration written as ``<minutes>m<seconds>`` with an optional
        one-letter unit suffix (e.g. ``"0m0.148s"``). Surrounding
        whitespace is ignored.

    Returns
    -------
    float
        ``minutes * 60 + seconds``.

    Raises
    ------
    IndexError
        If ``el`` contains no ``"m"`` separator.
    ValueError
        If the minutes or seconds part is not a valid number.
    """
    tab = el.strip().split("m")
    seconds = tab[1]
    # Only drop the last character when it really is a unit letter; slicing
    # unconditionally would turn "1m30" into 1 min 3 s.
    if seconds and seconds[-1].isalpha():
        seconds = seconds[:-1]
    return float(tab[0])*60+float(seconds)

# Every entry of the poppler data folder except the 'others' entry is an input CSV.
list_inputs = os.listdir(f"{data_dir}{name_system}/")
list_inputs.remove('others')

# Disabled sanity check on the number of inputs:
#assert len(list_inputs) == nb_inputs, "not the right number"

for j, csv_name in enumerate(list_inputs):
    loc = f"{data_dir}{name_system}/{csv_name}"
    df = pd.read_csv(loc)
    assert df.shape == shape_csv_system, f"{df.shape} {csv_name}"
    # Swap the textual 'realtime' duration for a numeric 'time' column (seconds),
    # then overwrite the CSV in place.
    df['time'] = [time_to_sec(raw) for raw in df['realtime']]
    df = df.drop(columns=['realtime'])
    df.to_csv(loc, index=False)

##### Import and clean data for xz

In [14]:
# Sub-folder of data_dir that holds the xz measurements.
name_system = "xz"
# Expected (rows, columns) of every raw CSV; asserted before a file is cleaned.
shape_csv_system = (30,7)
# Expected number of xz input CSVs; asserted before cleaning.
nb_inputs = 48

In [25]:
def time_to_sec(el):
    """Convert a duration such as ``"1m30.5s"`` into seconds.

    Parameters
    ----------
    el : str or float or int
        Either a duration written as ``<minutes>m<seconds>`` with an optional
        one-letter unit suffix (e.g. ``"0m0.148s"``), or a value that is
        already numeric (including NaN), which is returned unchanged.

    Returns
    -------
    float or int
        ``minutes * 60 + seconds`` for strings, ``el`` itself for numbers.

    Raises
    ------
    IndexError
        If a string ``el`` contains no ``"m"`` separator.
    ValueError
        If the minutes or seconds part is not a valid number.
    """
    # isinstance (rather than an exact type comparison) also lets float
    # subclasses such as numpy.float64 pass through untouched.
    if isinstance(el, (int, float)):
        return el
    tab = el.strip().split("m")
    seconds = tab[1]
    # Only drop the last character when it really is a unit letter; slicing
    # unconditionally would turn "1m30" into 1 min 3 s.
    if seconds and seconds[-1].isalpha():
        seconds = seconds[:-1]
    return float(tab[0])*60+float(seconds)

# Every entry of the xz data folder except the 'others' entry is an input CSV.
list_inputs = os.listdir(f"{data_dir}{name_system}")
list_inputs.remove('others')

assert len(list_inputs) == nb_inputs
for j, csv_name in enumerate(list_inputs):
    print(csv_name)
    loc = f"{data_dir}{name_system}/{csv_name}"
    df = pd.read_csv(loc)
    assert df.shape == shape_csv_system, csv_name
    # Swap the 'realtime' column for a 'time' column computed by time_to_sec,
    # then overwrite the CSV in place.
    df['time'] = [time_to_sec(raw) for raw in df['realtime']]
    df = df.drop(columns=['realtime'])
    df.to_csv(loc, index=False)

alice29.csv
progl.csv
paper6.csv
dickens.csv
bib.csv
bible.csv
random.csv
progc.csv
mozilla.csv
fieldsc.csv
kennedy.csv
cp.csv
trans.csv
ptt5.csv
paper4.csv
Ecoli.csv
webster.csv
a.csv
asyoulik.csv
x-ray.csv
progp.csv
reymont.csv
sao.csv
world192.csv
news.csv
book1.csv
ooffice.csv
osdb.csv
geo.csv
obj1.csv
paper5.csv
mr.csv
samba.csv
nci.csv
alphabet.csv
plrabn12.csv
pic.csv
obj2.csv
book2.csv
paper2.csv
xargs.csv
xml.csv
grammar.csv
aaa.csv
sum.csv
lcet10.csv
paper3.csv
paper1.csv


#### Import data for nodejs

In [37]:
# Settings for the nodejs measurements: data sub-folder, expected (rows, columns)
# of each CSV and expected number of input CSVs.
name_system = "nodejs"
shape_csv_system = (50,7)
nb_inputs = 1932

# Every entry of the nodejs data folder except the 'others' entry is an input CSV.
list_inputs = os.listdir(f"{data_dir}{name_system}/")
list_inputs.remove('others')

print(len(list_inputs))

assert len(list_inputs) == nb_inputs, "no"

# NOTE(review): these labels are assigned by position; if a CSV still carries a
# leading index column (e.g. 'Unnamed: 0') every label shifts by one — verify.
nodejs_columns = [
    'jitless', 'experimental-wasm-modules', 'experimental-vm-modules',
    'preserve-symlinks-main', 'no-warnings', 'node-memory-debug', 'ops',
]

for j, csv_name in enumerate(list_inputs):
    loc = f"{data_dir}{name_system}/{csv_name}"
    df = pd.read_csv(loc)
    assert df.shape == shape_csv_system, csv_name
    df.columns = nodejs_columns
    # Report the files whose first 'ops' measurement is missing.
    if np.isnan(df.loc[0, 'ops']):
        print(csv_name)
    # Writing the renamed frame back is disabled:
    #df.to_csv(loc, index = False)

1932


In [30]:
# Display the DataFrame currently bound to `df` in the kernel.
# NOTE(review): `df` is only assigned inside the per-file loops of other cells, so
# what is shown depends on which cell ran last — confirm this is the intended frame.
df

Unnamed: 0,--jitless,--experimental-wasm-modules,--experimental-vm-modules,--preserve-symlinks-main,--no-warnings,--node-memory-debug
0,0,0,1,1,0,0
1,1,0,1,1,1,0
2,0,0,0,0,0,0
3,1,0,1,0,1,0
4,0,0,0,1,0,1
5,1,0,0,1,0,1
6,1,1,0,0,1,0
7,0,1,1,1,1,1
8,0,0,1,1,1,0
9,0,1,1,1,1,0
