# Compiler Optimization Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw cBench measurements and split them into targets (y) and flag inputs.
flags_csv = pd.read_csv('../raw_data/cBench_onPandaboard_24app_5ds.csv')

# Targets: APP_NAME followed by every column from position 8 up to (not including) the last one.
y = pd.concat([flags_csv[['APP_NAME']], flags_csv.iloc[:, 8:-1]], axis=1)
print('y:', y.shape)

# Inputs: the first eight columns with 'code_size' appended as the final column.
flags = pd.concat([flags_csv.iloc[:, :8], flags_csv[['code_size']]], axis=1)
print('flags:', flags.shape)

y: (3072, 6)
flags: (3072, 9)


In [3]:
# Encode cells numerically: the literal 'X' becomes 0, then any string
# matching the regex '-' becomes 1.
flags = (
    flags
    .replace(to_replace='X', value=0)
    .replace(to_replace='-', value=1, regex=True)
)
flags.head(5)

Unnamed: 0,APP_NAME,funsafe_math_optimizations,fno_guess_branch_probability,fno_ivopts,fno_tree_loop_optimize,fno_inline_functions,funroll_all_loops,o2,code_size
0,automotive_bitcount,0,0,0,0,0,0,0,457478
1,automotive_bitcount,0,0,0,0,0,0,1,457478
2,automotive_bitcount,0,0,0,0,0,1,0,461574
3,automotive_bitcount,0,0,0,0,0,1,1,461574
4,automotive_bitcount,0,0,0,0,1,0,0,457478


In [4]:
# Load the static features and keep one row per distinct feature vector.
static_csv = pd.read_csv('../raw_data/ft_Milepost_cbench.csv')
print('static:', static_csv.shape)

# reset_index() is called without drop=True on purpose: the old row labels
# survive as an 'index' column.
static = (
    static_csv
    .drop(columns=['DATASET'])
    .drop_duplicates(keep='first')
    .reset_index()
)
static.head()

static: (120, 55)


Unnamed: 0,index,APP_NAME,noBasicBlock,nobasicBlockSingleSocc,nobasicBlock2Socc,noBasicBlockSoccMore,noBasicBlockPred,noBasicBlock2Pred,noBasicBlockPredMore,ft8,...,ft46,ft47,ft48,ft49,ft50,ft51,ft52,ft53,ft54,ft55
0,0,automotive_bitcount,129,37,31,0,61,26,2,26,...,9,49,2,8.90872,21.6932,26,5,58,3,28
1,5,automotive_qsort1,85,34,37,0,47,26,4,27,...,2,39,2,4.99028,8.42857,22,12,43,2,32
2,10,automotive_susan_c,628,218,333,2,452,114,20,182,...,6,406,6,18.8668,52.1316,100,55,431,0,155
3,15,automotive_susan_e,628,218,333,2,452,114,20,182,...,6,406,6,18.8668,52.1316,100,55,431,0,155
4,20,automotive_susan_s,628,218,333,2,452,114,20,182,...,6,406,6,18.8668,52.1316,100,55,431,0,155


In [5]:
# Load the dynamic features and widen them: one block of columns per dataset.
dynamic_csv = pd.read_csv('../raw_data/ft_MICA_cbench.csv')
print('dynamic:', dynamic_csv.shape)

dynamic = dynamic_csv.rename(columns={'APPLICATION_NAME': 'APP_NAME'})

# One frame per DATASET value: at most the first 24 rows, DATASET removed, and
# every column suffixed with '_' plus the last character of the dataset name.
dynamic_list = [
    dynamic[dynamic['DATASET'] == val]
    .iloc[:24]
    .drop(columns=['DATASET'])
    .rename(columns=lambda col: col + '_' + val[-1:])
    .reset_index()
    for val in dynamic['DATASET'].unique()
]

# NOTE(review): the frames are joined side by side by row position, not by
# application name -- this assumes every dataset lists the apps in the same order.
dynamic = pd.concat(dynamic_list, axis=1, sort=False).drop(columns='index')

# Keep a single application-name column (the one from suffix 1).
dynamic = (
    dynamic
    .drop(columns=['APP_NAME_' + str(i) for i in range(2, 6)])
    .rename(columns={'APP_NAME_1': 'APP_NAME'})
)
dynamic.head()

dynamic: (120, 47)


Unnamed: 0,APP_NAME,ILP32_1,arithmetic_1,InstrFootprint64_1,DataFootprint64_1,mem_access_1,memReuseDist0-2_1,memReuseDist2-4_1,memReuseDist4-8_1,memReuseDist8-16_1,...,mem_read_global_stride_0_5,mem_read_global_stride_8_5,mem_write_cnt_5,mem_write_local_stride_0_5,mem_write_local_stride_8_5,mem_write_global_stride_0_5,mem_write_global_stride_8_5,mem_write_global_stride_64_5,mem_write_global_stride_512_5,mem_write_global_stride_4096_5
0,automotive_bitcount,81759232,336803788,2446,1084,56205320,2121,48231601,2552206,5409095,...,5169035,124553755,71763110,84,71759394,2,3630,71760056,71761599,71761792
1,automotive_qsort1,15327129,85898554,16248,1116,27570282,15881,11920302,4022153,3366741,...,206142,4350007,9859011,125299,9288625,2,1378231,4030051,6021071,6580534
2,automotive_susan_c,5690408,35704170,25303,1236,13898555,23984,8514467,2137884,1001556,...,2777151,6474197,11958143,28,11765537,2,1487168,8530450,11947805,11949687
3,automotive_susan_e,13743232,72987520,28082,1291,28030322,27472,21211596,2106466,1313504,...,721758,4107067,3485964,190019,3164643,2,1784033,2744417,3276891,3338188
4,automotive_susan_s,128865750,767907553,11627,1243,191000695,11250,3572567,167162424,11536642,...,904,769967,15160450,28,15097246,28,10782584,13708715,13712373,13719994


In [6]:
# Build one wide table: for every application, its flag rows placed side by side
# with that application's static features, dynamic features and target columns.
apps = list(static['APP_NAME'].unique())

# Collect the per-application frames and concatenate once at the end instead of
# growing `data` inside the loop.
app_frames = []
for app in apps:
    # Every piece gets a fresh 0..n-1 index so that the column-wise concat below
    # lines rows up by position (the first application is handled the same way
    # as all the others).
    flags_temp = flags[flags['APP_NAME'] == app].reset_index(drop=True)

    # Number of flag rows for this application; the per-app feature rows are
    # repeated this many times so every flag row carries the features.
    n_rows = len(flags_temp)
    if n_rows == 0:
        # No flag rows for this application: nothing to assemble.
        continue

    static_temp = static[static['APP_NAME'] == app].drop('APP_NAME', axis=1)
    static_temp = pd.concat([static_temp] * n_rows, ignore_index=True)

    dynamic_temp = dynamic[dynamic['APP_NAME'] == app].drop('APP_NAME', axis=1)
    dynamic_temp = pd.concat([dynamic_temp] * n_rows, ignore_index=True)

    y_temp = y[y['APP_NAME'] == app].reset_index(drop=True).drop('APP_NAME', axis=1)

    data_app = pd.concat([flags_temp, static_temp, dynamic_temp, y_temp], axis=1, sort=False)
    app_frames.append(data_app)

data = pd.concat(app_frames, axis=0, sort=False)

# Drop the leftover 'index' column carried along from the `static` frame.
data = data.reset_index(drop=True).drop('index', axis=1)

In [7]:
# Preview the first five rows of the assembled table.
data.head(n=5)

Unnamed: 0,APP_NAME,funsafe_math_optimizations,fno_guess_branch_probability,fno_ivopts,fno_tree_loop_optimize,fno_inline_functions,funroll_all_loops,o2,code_size,noBasicBlock,...,mem_write_global_stride_0_5,mem_write_global_stride_8_5,mem_write_global_stride_64_5,mem_write_global_stride_512_5,mem_write_global_stride_4096_5,execution_time_1,execution_time_2,execution_time_3,execution_time_4,execution_time_5
0,automotive_bitcount,0,0,0,0,0,0,0,457478,129,...,2,3630,71760056,71761599,71761792,2010.64,3302.71,4770.88,6314.6,7889.87
1,automotive_bitcount,0,0,0,0,0,0,1,457478,129,...,2,3630,71760056,71761599,71761792,1777.07,3251.86,4731.59,6372.31,8459.09
2,automotive_bitcount,0,0,0,0,0,1,0,461574,129,...,2,3630,71760056,71761599,71761792,2504.95,3295.54,4645.23,6286.89,7816.15
3,automotive_bitcount,0,0,0,0,0,1,1,461574,129,...,2,3630,71760056,71761599,71761792,1715.03,3143.13,4557.01,6103.12,7736.7
4,automotive_bitcount,0,0,0,0,1,0,0,457478,129,...,2,3630,71760056,71761599,71761792,1695.72,3151.09,4782.62,6558.69,8059.82


In [8]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column except the first (APP_NAME) to the [0, 1] range; the
# scaler is fitted on the same values it transforms.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = pd.DataFrame(
    scaler.fit_transform(data.iloc[:, 1:].values),
    columns=data.columns[1:],
)

# Put the application name back as the leading column so the layout matches `data`.
scaled_data.insert(0, 'APP_NAME', data['APP_NAME'])


In [9]:
def data_to_csv(dataset, csv_name, out_dir='../data/'):
    """Write ``dataset`` to ``<out_dir>/<csv_name>`` as CSV without the index.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table to serialise.
    csv_name : str or path-like
        Target file name; ``'.csv'`` is appended when the name does not
        already end with it.
    out_dir : str or path-like, optional
        Directory the file is written to. Defaults to ``'../data/'``.
        Missing directories are created.
    """
    from pathlib import Path

    # Convert first so non-string names (e.g. Path objects) are accepted.
    file_name = str(csv_name)
    if not file_name.endswith('.csv'):
        file_name += '.csv'

    target = Path(out_dir) / file_name
    target.parent.mkdir(parents=True, exist_ok=True)

    # Let pandas open the file itself: it streams the rows and controls newline
    # handling, instead of building the whole CSV as one string and pushing it
    # through a text-mode handle.
    dataset.to_csv(target, index=False)

In [10]:
# Persist the unscaled table; the helper appends the '.csv' extension.
data_to_csv(dataset=data, csv_name='data')

In [11]:
# Preview the first five rows of the min-max scaled table.
scaled_data.head(n=5)

Unnamed: 0,APP_NAME,funsafe_math_optimizations,fno_guess_branch_probability,fno_ivopts,fno_tree_loop_optimize,fno_inline_functions,funroll_all_loops,o2,code_size,noBasicBlock,...,mem_write_global_stride_0_5,mem_write_global_stride_8_5,mem_write_global_stride_64_5,mem_write_global_stride_512_5,mem_write_global_stride_4096_5,execution_time_1,execution_time_2,execution_time_3,execution_time_4,execution_time_5
0,automotive_bitcount,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002404,0.01336,...,0.0,0.0,1.0,1.0,1.0,0.301362,0.461115,0.261839,0.776975,0.506382
1,automotive_bitcount,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.002404,0.01336,...,0.0,0.0,1.0,1.0,1.0,0.266198,0.453989,0.259676,0.784096,0.543015
2,automotive_bitcount,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.012652,0.01336,...,0.0,0.0,1.0,1.0,1.0,0.375778,0.460111,0.254922,0.773556,0.501638
3,automotive_bitcount,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.012652,0.01336,...,0.0,0.0,1.0,1.0,1.0,0.256859,0.43875,0.250065,0.750881,0.496525
4,automotive_bitcount,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.002404,0.01336,...,0.0,0.0,1.0,1.0,1.0,0.253951,0.439866,0.262485,0.807094,0.51732
