#### This notebook performs dimensionality (feature) selection on a given dataset using a genetic algorithm.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.cm as cm
import os
import sys
import time

In [None]:
# Mount Google Drive inside the Colab runtime; the data path configured below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Commonly Used Variables

In [None]:
# Root data folder on the mounted Drive; benchmark sub-folders are looked up relative to it.
common_path = '/content/drive/My Drive/Colab Notebooks/spec2006-perf-simulations/'

# Extreme integer sentinels derived from the platform's sys.maxsize.
MAX_VAL = sys.maxsize
MIN_VAL = -MAX_VAL - 1

# Commonly Used Functions

In [0]:
def get_dataframe_from_directory(dir_path) :
    '''
        Load every CSV file found directly inside ``dir_path`` and stack them into one dataframe.

        Each file is read with pandas, every column is coerced to numeric (values that
        cannot be parsed become NaN) and the NaNs are then replaced by the median of the
        corresponding column of that same file.

        Parameters
        ----------
        dir_path : str
            Directory to scan; a trailing '/' is optional.

        Returns
        -------
        df : pandas.DataFrame or None
            Row-wise concatenation of the cleaned files (per-file row labels are kept),
            or None when the directory contains no CSV file.
        csvFiles : list of str
            Sorted names of the CSV files that were loaded.
        fileSizes : list of int
            Number of rows of each loaded file, aligned index-by-index with ``csvFiles``.
    '''
    # Keep only the CSV entries so the returned names stay aligned with fileSizes
    # even when the directory also holds other files.
    csvFiles = sorted(entry for entry in os.listdir(dir_path) if entry.endswith('.csv'))
    dfList = []
    fileSizes = []
    for entry in csvFiles :
        frame = pd.read_csv(os.path.join(dir_path, entry))
        for column in list(frame) :
            frame[column] = pd.to_numeric(frame[column], errors='coerce')   # convert all strings to NaN
        frame = frame.fillna(frame.median())                                # convert all NaN to the column median
        dfList.append(frame)
        fileSizes.append(frame.shape[0])
    df = pd.concat(dfList) if dfList else None
    return df, csvFiles, fileSizes

def get_dataframe_from_directories(listOfDir=None) :
    '''
        Load the CSV files of several directories into a single dataframe.

        Parameters
        ----------
        listOfDir : list of str, optional
            Directories to load, processed in the given order.

        Returns
        -------
        df : pandas.DataFrame or None
            Concatenation of the per-directory dataframes; None when ``listOfDir`` is
            empty or none of the directories produced a dataframe.
        listOfFiles : list of str
            File names reported by each directory, in directory order.
        dirSizeList : list of int
            Total number of rows contributed by each directory.
        fileSizeList : list of int
            Number of rows of every loaded file, in directory order.
    '''
    if not listOfDir :
        # Same arity as the regular return so callers can always unpack four values.
        return None, [], [], []
    dfList = []
    listOfFiles = []
    fileSizeList = []
    dirSizeList = []
    for directory in listOfDir :
        dirDf, files, fileSizes = get_dataframe_from_directory(directory)
        if dirDf is not None :      # a directory without CSV files contributes no rows
            dfList.append(dirDf)
        listOfFiles += files
        fileSizeList += fileSizes
        dirSizeList.append(sum(fileSizes))

    # pd.concat raises ValueError when it has nothing to concatenate; report None instead.
    df = pd.concat(dfList) if dfList else None
    return df, listOfFiles, dirSizeList, fileSizeList


def euclidean_dist(first, second) :
    '''
    Euclidean (L2) distance between the data points ``first`` and ``second``.

    Arguments that are not plain numpy arrays are converted with np.array first.
    '''
    first, second = (point if type(point) is np.ndarray else np.array(point)
                     for point in (first, second))
    delta = first - second
    return np.sqrt(np.sum(np.square(delta)))

def divide_by_instruction(x):
    '''
        Return the pandas series 'x' with every entry divided by the number of
        instructions, i.e. by its own 'inst_retired.any' entry.
    '''
    instruction_count = x['inst_retired.any']
    return x / instruction_count

def preprocessData(df, per_instruction_factor=1000) :
    '''
        Normalise every row by its 'inst_retired.any' value and rescale the result.

        Parameters
        ----------
        df : pandas.DataFrame
            Numeric frame that contains an 'inst_retired.any' column.
        per_instruction_factor : number, default 1000
            Multiplier applied to the per-instruction values (with the default, each
            value is expressed per 1000 instructions).

        Returns
        -------
        pandas.DataFrame
            The normalised, rescaled frame without its first column.
            NOTE(review): the first column is dropped by position; presumably it is
            'inst_retired.any' itself (constant 1 after the division) - confirm the
            column order of the input.
    '''
    # Vectorised equivalent of dividing each row by its own instruction count.
    per_instruction = df.div(df['inst_retired.any'], axis=0)
    # Honour the caller-supplied factor instead of a fixed 1000.
    return per_instruction.iloc[:, 1:] * per_instruction_factor

def getAllDirectories() :
    '''
        Build the '<entry>/yinputs' path for every entry listed in ``common_path``
        (sorted by name), skipping the entry named 'listOfFiles.list'.
    '''
    return [common_path + entry + '/yinputs'
            for entry in sorted(os.listdir(common_path))
            if entry != 'listOfFiles.list']

###### GENETIC ALGORITHM UTILITIES ######
def getInitialPopulation(n_features, POPULATION_SIZE=5) :
    '''
        Create POPULATION_SIZE random chromosomes, each a string of n_features
        characters drawn from '0' and '1' via np.random.choice.
    '''
    alleles = ['0', '1']
    return [''.join(np.random.choice(alleles, n_features))
            for _ in range(POPULATION_SIZE)]

In [19]:
# Directories whose CSV files are loaded; swap in the commented line to use every directory.
listOfDir = [ common_path + '403.gcc/yinputs' ]
# listOfDir = getAllDirectories()
df, listOfFiles, dirSizeList, fileSizeList = get_dataframe_from_directories(listOfDir)
# Drop the leading column of the raw frame, then normalise the remainder per instruction.
df_model = preprocessData(df.iloc[:, 1:])
display("Dataframe for the list of directories has been created...", df_model, listOfFiles)

'Dataframe for the list of directories has been created...'

Unnamed: 0,cycles,cpu_clk_unhalted.thread,cpu_clk_unhalted.ref_tsc,msr/tsc/,fp_arith_inst_retired.scalar_single,fp_arith_inst_retired.scalar_double,fp_arith_inst_retired.128b_packed_double,fp_arith_inst_retired.128b_packed_single,fp_arith_inst_retired.256b_packed_double,fp_arith_inst_retired.256b_packed_single,cpu_clk_thread_unhalted.one_thread_active,cpu_clk_thread_unhalted.ref_xclk_any,cpu_clk_unhalted.ref_tsc:u,uops_issued.any,uops_retired.retire_slots,icache_64b.iftag_hit,icache_64b.iftag_miss,idq.dsb_uops,lsd.uops,idq.mite_uops,idq.ms_uops,l1d_pend_miss.pending,l1d_pend_miss.pending_cycles_any,l1d_pend_miss.pending_cycles,mem_load_retired.l1_miss,mem_load_retired.fb_hit,uops_executed.thread,uops_executed.core_cycles_ge_1
0,357.485971,282.515162,312.280398,313.756999,0.0,0.002296,0.0,0.0,0.0,0.0,2.663777,2.763129,280.108699,653.883511,587.555720,132.568295,7.231947,364.738497,0.0,380.907875,21.766146,79.338075,55.684441,55.685199,2.811646,1.618649,682.975378,238.184004
1,370.702966,371.691158,326.486384,326.756751,0.0,0.000000,0.0,0.0,0.0,0.0,2.870947,2.886037,308.933802,813.647344,645.710984,115.660700,0.010646,362.662649,0.0,205.828947,222.756784,53.852411,16.400923,16.400880,2.147492,1.571957,774.591064,258.194428
2,353.989887,352.451906,310.677422,311.166990,0.0,0.000000,0.0,0.0,0.0,0.0,2.737047,2.746203,268.666196,784.490770,622.011981,132.101222,0.447268,72.304784,0.0,0.732267,0.136737,142.108051,210.821362,210.821223,1.912185,2.955333,661.922970,273.088811
3,836.250612,833.660967,727.953267,733.866627,0.0,0.000000,0.0,0.0,0.0,0.0,6.470074,6.484594,718.149800,458.968649,428.782749,84.368153,0.107101,408.708818,0.0,19.894388,0.939609,89.095293,70.326908,52.953340,2.268360,6.854987,605.389868,223.369117
4,785.605789,785.237800,844.823325,846.496206,0.0,0.000000,0.0,0.0,0.0,0.0,7.446459,7.480184,706.810197,1602.015770,399.171794,676.499942,0.061992,950.282251,0.0,28.717056,28.893286,1044.572347,586.671002,586.670195,10.417044,10.602538,653.435063,271.291254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,293.242065,293.650594,255.657861,258.901794,0.0,0.003968,0.0,0.0,0.0,0.0,1.999137,2.279728,255.794234,588.501414,540.762946,133.163891,3.361023,461.423440,0.0,110.333856,4.650081,49.156429,76.414669,69.220124,2.449943,3.038946,627.561838,226.357605
125,297.864261,298.067622,262.142958,262.528988,0.0,0.002529,0.0,0.0,0.0,0.0,2.290254,2.318907,261.179427,691.009976,498.796753,153.234339,3.621732,358.747679,0.0,241.700898,70.073761,69.999651,96.615701,96.519425,2.935964,3.714781,606.175868,210.778556
126,595.938625,595.588892,522.660774,523.571981,0.0,0.000218,0.0,0.0,0.0,0.0,4.570278,4.612072,518.979503,934.899697,716.711156,179.528666,2.136171,338.178283,0.0,190.990209,203.124006,307.850947,297.267083,297.266426,3.607320,4.298839,750.074225,330.203847
127,306.419525,305.946900,268.660094,268.965311,0.0,0.001218,0.0,0.0,0.0,0.0,2.295699,2.373740,266.094455,442.801925,503.511407,109.770232,2.306566,440.739196,0.0,144.604587,23.660256,117.228547,110.274310,110.274194,4.218064,5.854630,569.871574,207.567924


['gcc_input1.csv',
 'gcc_input2.csv',
 'gcc_input3.csv',
 'gcc_input4.csv',
 'gcc_input5.csv',
 'gcc_input6.csv',
 'gcc_input7.csv',
 'gcc_input8.csv',
 'gcc_input9.csv']

# Genetic Algorithm

In [0]:
# One gene per column of the model dataframe.
n_features = df_model.shape[1]
POPULATION_SIZE = 5
# population = getInitialPopulation(n_features, POPULATION_SIZE)
# Fixed starting chromosomes used in place of the random draw above.
population = [
    '1010111100111010101100000100',
    '0000011111000010001101000100',
    '0011000100111111000100010010',
    '1000011110000111111111100110',
    '0111111110001010100001010010',
]

