In [1]:
import pandas as pd
import numpy as np
import json

import math
from scipy.spatial.distance import euclidean

# Lectura de datos

In [2]:
# Read the raw ISCX export and build one DataFrame row per flow record
# stored under dataroot -> TestbedSunJun13Flows.
with open('ISCX.json', 'r') as file:
    ISCX_data = json.load(file)

flujos = ISCX_data['dataroot']['TestbedSunJun13Flows']
df = pd.DataFrame(flujos)

**Transformamos en valores numéricos las dos posibles categorías: Normal -> 0 y Attack -> 1.**

In [3]:
# Encode the label column numerically: Normal -> 0, Attack -> 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `df.loc[:, 'Tag']`: that expression yields a
# temporary Series, and with pandas Copy-on-Write an in-place change to it
# never reaches `df`.
df['Tag'] = df['Tag'].replace({'Normal': 0, 'Attack': 1})

In [4]:
# Keep an independent copy of the labels (used later as the target for
# feature selection) and persist it next to the preprocessed datasets.
tags = df['Tag'].copy()
tags.to_csv('ISCX_datasets_preprocesados/ISCX_tags.csv')

# Ingeniería de características: Extracción de variables

**Bit String-based Extraction**

In [5]:
def to_binary_cols(src, dst):
    """Encode two dotted-decimal addresses as one flat list of bits.

    Each dot-separated field of ``src`` and then of ``dst`` is converted to
    an integer and written as (at least) eight binary digits, most
    significant bit first.

    Args:
        src: Source address string, e.g. ``'192.168.5.122'``.
        dst: Destination address string in the same format.

    Returns:
        list[int]: The bits of ``src`` followed by the bits of ``dst``
        (64 entries for two 4-octet addresses with octets in 0..255).

    Raises:
        ValueError: If a field is not a non-negative integer literal.
    """
    bits = []
    for octeto in src.split('.') + dst.split('.'):
        # '08b' zero-pads to eight digits and emits no '0b' prefix, so no
        # prefix stripping or manual left-padding is needed.
        bits.extend(int(bit) for bit in format(int(octeto), '08b'))

    return bits

**Distancia Euclidiana**

In [6]:
# Lookup table of 2**0 .. 2**32. The extra 2**32 entry is included so the
# divisor used by the "normalization" feature is precomputed as well.
potencias2 = [2 ** k for k in range(33)]
print(potencias2)


def distancia_euclidiana_bits(src, dst):
    """Absolute difference between two 32-bit values given as bit lists.

    Both arguments are indexable sequences of at least 32 bits, most
    significant bit first. Each position contributes its power of two times
    the bit difference, so the result equals ``abs(int(src) - int(dst))``.
    """
    acumulado = 0
    # reversed(potencias2[:32]) yields 2**31, 2**30, ..., 2**0 in step with
    # bit positions 0..31.
    for pos, peso in enumerate(reversed(potencias2[:32])):
        acumulado += peso * (src[pos] - dst[pos])

    return abs(acumulado)

[1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648, 4294967296]


**XOR**

In [7]:
def distancia_xor(src, dst):
    """Return the power of two of the most significant differing bit.

    ``src`` and ``dst`` are bit sequences (most significant bit first). The
    first position where they differ decides the result; if the first 32
    positions are all equal the distance is 0.
    """
    for pos in range(32):
        if src[pos] != dst[pos]:
            return potencias2[31 - pos]

    return 0

**XOR+**

In [8]:
def distancia_xorplus(eucl,xor):
    """Combine the bit-level "Euclidean" and XOR distances by adding them."""
    combinada = eucl + xor
    return combinada

**Distancia Aritmética**

In [9]:
def dif_aritmetica(src, dst):
    """Natural log of the absolute difference of two "dot-stripped" addresses.

    Each address string has its dots removed and the remaining digits are
    read as one decimal integer. Returns the integer 0 when both integers
    are equal, otherwise ``math.log`` of their absolute difference.
    """
    # NOTE(review): stripping the dots is not a positional encoding, e.g.
    # '1.11.1.1' and '11.1.1.1' both become 11111. Also a difference of 1
    # yields log(1) == 0.0, the same magnitude as identical addresses.
    # Confirm this is the intended feature definition.
    origen, destino = (int(ip.replace('.', '')) for ip in (src, dst))
    diferencia = abs(origen - destino)
    return math.log(diferencia) if diferencia else diferencia

**Distancia Euclidiana y de Hamming sobre los bytes en formato decimal. También se calcula la "normalización" de cada IP: su valor entero de 32 bits dividido por 2^32.**

In [10]:
def hamming(src, dst):
    """Weighted Euclidean distance between two sequences of four octets.

    Despite its name this is not a bit-count Hamming distance: the squared
    octet differences are weighted 1000, 100, 10 and 1 (first octet
    heaviest) and the square root of the weighted sum is returned.
    """
    pesos = (1000, 100, 10, 1)
    total = 0
    # Accumulate left to right so the floating-point additions happen in the
    # same order as a single chained expression.
    for i, peso in enumerate(pesos):
        total += peso * math.pow(src[i] - dst[i], 2)
    return math.sqrt(total)


def normalizacion(ip):
    """Map a sequence of four octets to a fraction of 2**32.

    The octets are weighted by 2**24, 2**16, 2**8 and 2**0 (first octet
    most significant), summed, and divided by 2**32.
    """
    valor = 0
    for i, desplazamiento in enumerate((24, 16, 8, 0)):
        valor += ip[i] * potencias2[desplazamiento]

    return valor / potencias2[32]
    
    
def dist_euclidiana_hamming_norm(src, dst):
    """Compute the octet-level features for a pair of dotted addresses.

    Both strings are split on '.' and parsed into integer lists.

    Returns:
        tuple: (Euclidean distance between the octet lists,
        weighted ``hamming`` distance, ``normalizacion`` of the source,
        ``normalizacion`` of the destination).
    """
    octetos_src = list(map(int, src.split('.')))
    octetos_dst = list(map(int, dst.split('.')))

    return (
        euclidean(octetos_src, octetos_dst),
        hamming(octetos_src, octetos_dst),
        normalizacion(octetos_src),
        normalizacion(octetos_dst),
    )

Función que calcula todas las características de una fila en una sola pasada, para evitar recorrer todas las filas múltiples veces.

In [11]:
def desarrolloCaracteristicas(src,dst):
    """Build the full feature vector for one (source, destination) pair.

    Returns a list made of the bits produced by ``to_binary_cols`` followed
    by: bit-level distance, XOR distance, their sum (XOR+), the arithmetic
    difference, and the four values returned by
    ``dist_euclidiana_hamming_norm``.
    """
    bits = to_binary_cols(src, dst)
    bits_src, bits_dst = bits[:32], bits[32:]

    eucl = distancia_euclidiana_bits(bits_src, bits_dst)
    xor = distancia_xor(bits_src, bits_dst)

    derivadas = [eucl, xor, distancia_xorplus(eucl, xor), dif_aritmetica(src, dst)]
    derivadas.extend(dist_euclidiana_hamming_norm(src, dst))

    return bits + derivadas

In [12]:
# Keep only the two address columns and the label; the remaining columns
# are not used by the feature extraction below.
df = df[['source', 'destination', 'Tag']]
df.head()

Unnamed: 0,source,destination,Tag
0,192.168.5.122,224.0.0.251,0
1,192.168.5.122,224.0.0.251,0
2,192.168.2.113,192.168.5.122,0
3,192.168.2.113,192.168.5.122,0
4,192.168.2.113,207.241.148.80,0


In [13]:
# Column labels for the 64 address bits: s31..s0 for the source and
# d31..d0 for the destination (most significant bit first).
bits_index = ['s{}'.format(i) for i in range(31, -1, -1)] + ['d{}'.format(i) for i in range(31, -1, -1)]

# Bit columns followed by the eight derived distance/normalization features,
# in the order returned by desarrolloCaracteristicas.
columnas_features = np.append(bits_index,
                              ['eucl', 'xor', 'xor+',
                               'arit', 'eucl_dec', 'hamm',
                               'norm_src', 'norm_dst'])

# One pass over the rows: every feature of a flow is computed together and
# expanded into its own column.
df[columnas_features] = df.apply(
    lambda fila: desarrolloCaracteristicas(fila['source'], fila['destination']),
    axis=1, result_type='expand')

# From here on `df` holds only the engineered features.
df = df[columnas_features]

df.to_csv('ISCX_datasets_preprocesados/no.csv')
df.head()

Unnamed: 0,s31,s30,s29,s28,s27,s26,s25,s24,s23,s22,...,d1,d0,eucl,xor,xor+,arit,eucl_dec,hamm,norm_src,norm_dst
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,525859713.0,536870912.0,1062731000.0,21.364743,214.275524,1965.525629,0.752564,0.875
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,525859713.0,536870912.0,1062731000.0,21.364743,214.275524,1965.525629,0.752564,0.875
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,777.0,1024.0,1801.0,8.009363,9.486833,13.076697,0.752564,0.752564
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,777.0,1024.0,1801.0,8.009363,9.486833,13.076697,0.752564,0.752564
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,256479711.0,134217728.0,390697400.0,23.657252,167.209449,985.976166,0.752564,0.81228


# Ingeniería de características: Transformación de los datos

In [28]:
from sklearn.preprocessing import StandardScaler

# StandardScaler-transformed copy of the feature frame, written to disk.
columnas_features = np.append(bits_index, ['eucl', 'xor', 'xor+', 'arit', 'eucl_dec', 'hamm',
                                            'norm_src', 'norm_dst'])
valores_standard = StandardScaler().fit_transform(df)

df_standardScaler = df.copy()
df_standardScaler[columnas_features] = valores_standard

df_standardScaler.to_csv('ISCX_datasets_preprocesados/standard.csv')
df_standardScaler.head()

Unnamed: 0,s31,s30,s29,s28,s27,s26,s25,s24,s23,s22,...,d1,d0,eucl,xor,xor+,arit,eucl_dec,hamm,norm_src,norm_dst
0,0.041249,0.073837,-0.013739,-0.014758,-0.0434,-0.043148,-0.06238,-0.048024,0.038415,-0.047872,...,0.657051,1.152252,-0.407338,-0.494026,-0.459682,0.416685,0.658888,0.048462,0.062827,1.314719
1,0.041249,0.073837,-0.013739,-0.014758,-0.0434,-0.043148,-0.06238,-0.048024,0.038415,-0.047872,...,0.657051,1.152252,-0.407338,-0.494026,-0.459682,0.416685,0.658888,0.048462,0.062827,1.314719
2,0.041249,0.073837,-0.013739,-0.014758,-0.0434,-0.043148,-0.06238,-0.048024,0.038415,-0.047872,...,0.657051,-0.867866,-1.001321,-1.025664,-1.02805,-1.598216,-1.624509,-1.125236,0.062822,0.783218
3,0.041249,0.073837,-0.013739,-0.014758,-0.0434,-0.043148,-0.06238,-0.048024,0.038415,-0.047872,...,0.657051,-0.867866,-1.001321,-1.025664,-1.02805,-1.598216,-1.624509,-1.125236,0.062822,0.783218
4,0.041249,0.073837,-0.013739,-0.014758,-0.0434,-0.043148,-0.06238,-0.048024,0.038415,-0.047872,...,-1.521952,-0.867866,-0.711616,-0.892755,-0.819098,0.762551,0.134101,-0.540385,0.062822,1.042449


In [16]:
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler-transformed copy of the feature frame, written to disk.
columnas_features = np.append(bits_index, ['eucl', 'xor', 'xor+', 'arit', 'eucl_dec', 'hamm',
                                            'norm_src', 'norm_dst'])
valores_minmax = MinMaxScaler().fit_transform(df)

df_minMaxScaler = df.copy()
df_minMaxScaler[columnas_features] = valores_minmax

df_minMaxScaler.to_csv('ISCX_datasets_preprocesados/minMax.csv')
df_minMaxScaler.head()

Unnamed: 0,s31,s30,s29,s28,s27,s26,s25,s24,s23,s22,...,d1,d0,eucl,xor,xor+,arit,eucl_dec,hamm,norm_src,norm_dst
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.1224363,0.25,0.1649575,0.813414,0.420148,0.23125,0.89007,0.875
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.1224363,0.25,0.1649575,0.813414,0.420148,0.23125,0.89007,0.875
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.809094e-07,4.768372e-07,2.79552e-07,0.304938,0.018602,0.001539,0.890069,0.752564
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.809094e-07,4.768372e-07,2.79552e-07,0.304938,0.018602,0.001539,0.890069,0.752564
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.05971634,0.0625,0.06064422,0.900696,0.327862,0.116003,0.890069,0.81228


In [17]:
from sklearn.preprocessing import RobustScaler

# RobustScaler-transformed copy of the feature frame, written to disk.
columnas_features = np.append(bits_index, ['eucl', 'xor', 'xor+', 'arit', 'eucl_dec', 'hamm',
                                            'norm_src', 'norm_dst'])
valores_robust = RobustScaler().fit_transform(df)

df_robustScaler = df.copy()
df_robustScaler[columnas_features] = valores_robust

df_robustScaler.to_csv('ISCX_datasets_preprocesados/robust.csv')
df_robustScaler.head()

Unnamed: 0,s31,s30,s29,s28,s27,s26,s25,s24,s23,s22,...,d1,d0,eucl,xor,xor+,arit,eucl_dec,hamm,norm_src,norm_dst
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.111404,0.129032,0.114648,0.042934,0.435779,0.230957,1.48855,0.2493939
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.111404,0.129032,0.114648,0.042934,0.435779,0.230957,1.48855,0.2493939
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.167265,-0.129032,-0.153216,-5.861853,-0.998006,-0.305728,0.005725,4.941773e-07
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.167265,-0.129032,-0.153216,-5.861853,-0.998006,-0.305728,0.005725,4.941773e-07
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,-0.031349,-0.064516,-0.05474,1.056517,0.106255,-0.0383,0.005725,0.1216378


In [19]:
from sklearn.preprocessing import Normalizer

# Row-wise L1-normalized copy of the feature frame, written to disk.
columnas_features = np.append(bits_index, ['eucl', 'xor', 'xor+', 'arit', 'eucl_dec', 'hamm',
                                            'norm_src', 'norm_dst'])
valores_l1 = Normalizer(norm='l1').fit_transform(df)

df_norml1 = df.copy()
df_norml1[columnas_features] = valores_l1

df_norml1.to_csv('ISCX_datasets_preprocesados/norml1.csv')
df_norml1.head()

Unnamed: 0,s31,s30,s29,s28,s27,s26,s25,s24,s23,s22,...,d1,d0,eucl,xor,xor+,arit,eucl_dec,hamm,norm_src,norm_dst
0,4.704856e-10,4.704856e-10,0.0,0.0,0.0,0.0,0.0,0.0,4.704856e-10,0.0,...,4.704856e-10,4.704856e-10,0.247409,0.25259,0.499999,1.00518e-08,1.008136e-07,9.247515e-07,3.540704e-10,4.116749e-10
1,4.704856e-10,4.704856e-10,0.0,0.0,0.0,0.0,0.0,0.0,4.704856e-10,0.0,...,4.704856e-10,4.704856e-10,0.247409,0.25259,0.499999,1.00518e-08,1.008136e-07,9.247515e-07,3.540704e-10,4.116749e-10
2,0.0002735171,0.0002735171,0.0,0.0,0.0,0.0,0.0,0.0,0.0002735171,0.0,...,0.0002735171,0.0,0.212523,0.280082,0.492604,0.002190698,0.002594811,0.003576701,0.000205839,0.0002058391
3,0.0002735171,0.0002735171,0.0,0.0,0.0,0.0,0.0,0.0,0.0002735171,0.0,...,0.0002735171,0.0,0.212523,0.280082,0.492604,0.002190698,0.002594811,0.003576701,0.000205839,0.0002058391
4,1.279761e-09,1.279761e-09,0.0,0.0,0.0,0.0,0.0,0.0,1.279761e-09,0.0,...,0.0,0.0,0.328233,0.171767,0.499999,3.027562e-08,2.139881e-07,1.261814e-06,9.631014e-10,1.039524e-09


In [23]:
# Imported here as well so this cell does not depend on another cell having
# run first.
from sklearn.preprocessing import Normalizer

# Row-wise L2-normalized version of the feature frame, written to disk.
# The frame reuses df's index so it stays row-aligned with the other
# preprocessed frames, which are built from df.copy().
df_norml2 = pd.DataFrame(Normalizer(norm='l2').fit_transform(df),
                         index=df.index,
                         columns=np.append(bits_index, ['eucl', 'xor', 'xor+', 'arit', 'eucl_dec', 'hamm',
                                                        'norm_src', 'norm_dst']))

df_norml2.to_csv('ISCX_datasets_preprocesados/norml2.csv')
df_norml2.head()

Unnamed: 0,s31,s30,s29,s28,s27,s26,s25,s24,s23,s22,...,d1,d0,eucl,xor,xor+,arit,eucl_dec,hamm,norm_src,norm_dst
0,7.682869e-10,7.682869e-10,0.0,0.0,0.0,0.0,0.0,0.0,7.682869e-10,0.0,...,7.682869e-10,7.682869e-10,0.404011,0.412471,0.816482,1.641425e-08,1.646251e-07,2e-06,5.781849e-10,6.72251e-10
1,7.682869e-10,7.682869e-10,0.0,0.0,0.0,0.0,0.0,0.0,7.682869e-10,0.0,...,7.682869e-10,7.682869e-10,0.404011,0.412471,0.816482,1.641425e-08,1.646251e-07,2e-06,5.781849e-10,6.72251e-10
2,0.0004519267,0.0004519267,0.0,0.0,0.0,0.0,0.0,0.0,0.0004519267,0.0,...,0.0004519267,0.0,0.351147,0.462773,0.81392,0.003619645,0.004287353,0.00591,0.0003401036,0.0003401037
3,0.0004519267,0.0004519267,0.0,0.0,0.0,0.0,0.0,0.0,0.0004519267,0.0,...,0.0004519267,0.0,0.351147,0.462773,0.81392,0.003619645,0.004287353,0.00591,0.0003401036,0.0003401037
4,2.056548e-09,2.056548e-09,0.0,0.0,0.0,0.0,0.0,0.0,2.056548e-09,0.0,...,0.0,0.0,0.527463,0.276025,0.803488,4.865227e-08,3.438743e-07,2e-06,1.547683e-09,1.670493e-09


# Feature selection

In [24]:
# Empty results table: one row per (feature-selection method, preprocessing)
# pair and ten columns for the top-10 feature names of that pair.
metodos_seleccion = ['kbest_chi2', 'kbest_fclass', 'extraTrees', 'randomForest']
preprocesamientos = ['no', 'standard', 'minMax', 'robust', 'norm_l1', 'norm_l2']

multiIndex = pd.MultiIndex.from_product(
    [metodos_seleccion, preprocesamientos],
    names=['featureSelection', 'preprocesamiento'])

df_features = pd.DataFrame(index=multiIndex, columns=np.arange(10))
df_features

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
featureSelection,preprocesamiento,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kbest_chi2,no,,,,,,,,,,
kbest_chi2,standard,,,,,,,,,,
kbest_chi2,minMax,,,,,,,,,,
kbest_chi2,robust,,,,,,,,,,
kbest_chi2,norm_l1,,,,,,,,,,
kbest_chi2,norm_l2,,,,,,,,,,
kbest_fclass,no,,,,,,,,,,
kbest_fclass,standard,,,,,,,,,,
kbest_fclass,minMax,,,,,,,,,,
kbest_fclass,robust,,,,,,,,,,


In [36]:
def sin_negativos(columna):
    """Shift a column so that it contains no negative values.

    If the column's minimum is negative, that minimum is subtracted from
    every element (i.e. its absolute value is added) so the smallest value
    becomes 0. Otherwise the column is returned unchanged.
    """
    minimo = columna.min()
    return columna - minimo if minimo < 0 else columna

# Non-negative variants of the two scaled frames that can contain negative
# values; every column is shifted independently with sin_negativos.
df_standardScaler_positivos = df_standardScaler.copy()
df_robustScaler_positivos = df_robustScaler.copy()

for escalado in (df_standardScaler_positivos, df_robustScaler_positivos):
    for col in df.columns:
        escalado[col] = sin_negativos(escalado[col])

In [37]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Datasets scored with chi2, keyed by the preprocessing label used in
# df_features. The shifted (non-negative) copies are used for the standard
# and robust scalings.
datos_chi2 = [
    ('no', df),
    ('standard', df_standardScaler_positivos),
    ('minMax', df_minMaxScaler),
    ('robust', df_robustScaler_positivos),
    ('norm_l1', df_norml1),
    ('norm_l2', df_norml2),
]

for preprocesamiento, datos in datos_chi2:
    df_kbest = SelectKBest(chi2).fit(datos, tags)
    # np.argsort places NaN last, so reversing a plain argsort would rank
    # NaN-scored features first. Map NaN to -inf so they end up at the
    # bottom of the ranking instead.
    puntuaciones = np.where(np.isnan(df_kbest.scores_), -np.inf, df_kbest.scores_)
    indices = np.argsort(puntuaciones)[::-1]
    # Store the names of the ten best-scoring features for this dataset.
    df_features.loc[('kbest_chi2', preprocesamiento)] = [df.columns[col] for col in indices[:10]]

df_features.loc['kbest_chi2']

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
preprocesamiento,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
no,xor+,xor,eucl,hamm,eucl_dec,arit,s0,s1,d18,d12
standard,s11,s18,s26,s27,s16,s25,s12,eucl_dec,s17,s0
minMax,s0,s1,d18,d12,d25,xor,d24,d11,d22,d15
robust,norm_src,arit,eucl_dec,s0,s1,d18,d12,d25,hamm,xor
norm_l1,s3,s8,s0,d8,eucl,arit,s5,s6,d19,d21
norm_l2,s3,s8,d8,s0,s5,s6,d19,d21,d23,d31


In [38]:
from sklearn.feature_selection import f_classif

# Datasets scored with the ANOVA F-test, keyed by the preprocessing label
# used in df_features.
datos_fclass = [
    ('no', df),
    ('standard', df_standardScaler),
    ('minMax', df_minMaxScaler),
    ('robust', df_robustScaler),
    ('norm_l1', df_norml1),
    ('norm_l2', df_norml2),
]

for preprocesamiento, datos in datos_fclass:
    df_kbest = SelectKBest(f_classif).fit(datos, tags)
    # np.argsort places NaN last, so reversing a plain argsort would rank
    # NaN-scored features first. Map NaN to -inf so they end up at the
    # bottom of the ranking instead.
    puntuaciones = np.where(np.isnan(df_kbest.scores_), -np.inf, df_kbest.scores_)
    indices = np.argsort(puntuaciones)[::-1]
    # Store the names of the ten best-scoring features for this dataset.
    df_features.loc[('kbest_fclass', preprocesamiento)] = [df.columns[col] for col in indices[:10]]

df_features.loc['kbest_fclass']

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
preprocesamiento,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
no,arit,eucl_dec,s1,hamm,d12,d18,xor+,s0,xor,d25
standard,arit,eucl_dec,s1,hamm,d12,d18,xor+,s0,xor,d25
minMax,arit,eucl_dec,s1,hamm,d12,d18,xor+,s0,xor,d25
robust,arit,eucl_dec,s1,hamm,d12,d18,xor+,s0,xor,d25
norm_l1,arit,eucl,s3,s8,s0,d8,s5,s6,d19,d21
norm_l2,eucl,s3,arit,s8,d8,xor,s0,s5,s6,d19


In [39]:
from sklearn.ensemble import ExtraTreesClassifier

# Datasets ranked by ExtraTrees feature importance, keyed by the
# preprocessing label used in df_features.
datos_extraTrees = [
    ('no', df),
    ('standard', df_standardScaler),
    ('minMax', df_minMaxScaler),
    ('robust', df_robustScaler),
    ('norm_l1', df_norml1),
    ('norm_l2', df_norml2),
]

for preprocesamiento, datos in datos_extraTrees:
    # Same hyper-parameters and seed for every dataset so the rankings are
    # comparable across preprocessings.
    extraTrees = ExtraTreesClassifier(n_estimators=50, random_state=1).fit(datos, tags)
    indices = np.argsort(extraTrees.feature_importances_)[::-1]
    df_features.loc[('extraTrees', preprocesamiento)] = [df.columns[col] for col in indices[:10]]

df_features.loc['extraTrees']

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
preprocesamiento,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
no,eucl_dec,d10,s1,s3,d4,s2,arit,s8,d8,s0
standard,eucl_dec,s1,d4,d10,s2,arit,s3,s8,s0,s4
minMax,s1,eucl_dec,d10,arit,s2,d4,s3,hamm,d8,s8
robust,d10,s1,eucl_dec,d4,arit,s8,s2,s3,d1,s0
norm_l1,s8,s31,d6,xor,d30,d21,eucl,s23,s3,d5
norm_l2,d9,d19,s8,s19,d0,s5,xor,d21,d6,s30


In [40]:
from sklearn.ensemble import RandomForestClassifier

# Datasets ranked by RandomForest feature importance, keyed by the
# preprocessing label used in df_features.
datos_randomForest = [
    ('no', df),
    ('standard', df_standardScaler),
    ('minMax', df_minMaxScaler),
    ('robust', df_robustScaler),
    ('norm_l1', df_norml1),
    ('norm_l2', df_norml2),
]

for preprocesamiento, datos in datos_randomForest:
    # Same hyper-parameters and seed for every dataset so the rankings are
    # comparable across preprocessings.
    randomForest = RandomForestClassifier(n_estimators=50, random_state=1).fit(datos, tags)
    indices = np.argsort(randomForest.feature_importances_)[::-1]
    df_features.loc[('randomForest', preprocesamiento)] = [df.columns[col] for col in indices[:10]]

df_features.loc['randomForest']

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
preprocesamiento,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
no,xor+,arit,eucl_dec,xor,hamm,s4,eucl,s8,s3,s2
standard,arit,eucl_dec,xor,norm_src,hamm,xor+,norm_dst,eucl,s3,s2
minMax,eucl_dec,arit,hamm,xor,s4,s8,xor+,s3,s2,d4
robust,arit,eucl_dec,norm_src,hamm,xor,xor+,s4,s8,eucl,s2
norm_l1,s3,s8,eucl,d4,d31,d23,d19,d30,norm_src,s4
norm_l2,s3,s8,xor,eucl,s5,d21,s31,d4,s4,d30


In [41]:
# Display the feature-ranking table for every selector/preprocessing pair.
df_features

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
featureSelection,preprocesamiento,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kbest_chi2,no,xor+,xor,eucl,hamm,eucl_dec,arit,s0,s1,d18,d12
kbest_chi2,standard,s11,s18,s26,s27,s16,s25,s12,eucl_dec,s17,s0
kbest_chi2,minMax,s0,s1,d18,d12,d25,xor,d24,d11,d22,d15
kbest_chi2,robust,norm_src,arit,eucl_dec,s0,s1,d18,d12,d25,hamm,xor
kbest_chi2,norm_l1,s3,s8,s0,d8,eucl,arit,s5,s6,d19,d21
kbest_chi2,norm_l2,s3,s8,d8,s0,s5,s6,d19,d21,d23,d31
kbest_fclass,no,arit,eucl_dec,s1,hamm,d12,d18,xor+,s0,xor,d25
kbest_fclass,standard,arit,eucl_dec,s1,hamm,d12,d18,xor+,s0,xor,d25
kbest_fclass,minMax,arit,eucl_dec,s1,hamm,d12,d18,xor+,s0,xor,d25
kbest_fclass,robust,arit,eucl_dec,s1,hamm,d12,d18,xor+,s0,xor,d25


In [42]:
# Persist the feature-ranking table alongside the preprocessed datasets.
df_features.to_csv('ISCX_datasets_preprocesados/ISCX_features.csv')