In [1]:
import pandas as pd
import numpy as np
import json

import math
from scipy.spatial.distance import euclidean

# Lectura de datos

In [2]:
df = pd.read_csv('UNSW.csv', names=np.arange(1,50), skiprows=1)

In [3]:
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,40,41,42,43,44,45,46,47,48,49
0,59.166.0.9,7045,149.171.126.7,25,tcp,FIN,0.201886,37552,3380,31,...,,2,2,7,4,1,1,3,,0
1,59.166.0.9,9685,149.171.126.2,80,tcp,FIN,5.864748,19410,1087890,31,...,,3,1,4,4,1,1,1,,0
2,59.166.0.2,1421,149.171.126.4,53,udp,CON,0.001391,146,178,31,...,,3,5,2,7,1,1,4,,0
3,59.166.0.2,21553,149.171.126.2,25,tcp,FIN,0.053948,37812,3380,31,...,,1,1,4,7,1,1,3,,0
4,59.166.0.8,45212,149.171.126.4,53,udp,CON,0.000953,146,178,31,...,,2,5,2,1,1,1,2,,0


In [4]:
df = df[[1,2,3,4,49]]
df.columns = ['source', 'Src Pt', 'destination', 'Dst Pt', 'Tag']
df.columns

Index(['source', 'Src Pt', 'destination', 'Dst Pt', 'Tag'], dtype='object')

In [5]:
tags = df.loc[:,'Tag']
tags.to_csv('UNSW_datasets_preprocesados/UNSW_tags.csv')

# Ingeniería de características: Extracción de variables

**Bit String-based Extraction**

In [6]:
def to_binary_cols(src, dst):
    array = []  
    for item in src.split('.') + dst.split('.'):
        item = format(int(item),'#08b')
        item = [ int(bit) for bit in item.split('b')[1]]
        while len(item) < 8:
            item.insert(0,0)
        array.extend(item)
        
    return array

**Distancia Euclidiana**

In [7]:
potencias2 = [pow(2,k) for k in range(33)]   # 2^32 para tener calculado también el valor de la "normalizacion"
print(potencias2)

def distancia_euclidiana_bits(src, dst):
    dist = 0
    for k in range(0,32):
        dist += potencias2[31-k] * (src[k]-dst[k])
    
    return abs(dist)

[1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648, 4294967296]


**XOR**

In [8]:
def distancia_xor(src,dst):
    k = 0
    while k < 32:
        if src[k] != dst[k]:
            return potencias2[31-k]
        k += 1
    
    return 0

**XOR+**

In [9]:
def distancia_xorplus(eucl,xor):
    return eucl + xor

**Distancia Aritmética**

In [10]:
def dif_aritmetica(src, dst):
    src = int(src.replace('.',''))
    dst = int(dst.replace('.',''))
    dif = abs(src-dst)
    if dif == 0:
        return dif
    return math.log(dif)

**Distancia Euclidiana y de Hamming sobre los bytes en formato decimal. También "normalización".**

In [11]:
def hamming(src, dst):
    return math.sqrt(1000 * math.pow(src[0]-dst[0],2) +
                     100 * math.pow(src[1]-dst[1],2) +
                     10 * math.pow(src[2]-dst[2],2) +
                     1 * math.pow(src[3]-dst[3],2)
    )


def normalizacion(ip):
    suma = 0
    for i in range(4):
        suma += ip[i] * potencias2[24 - 8 * i]
    
    return suma / potencias2[32]
    
    
def dist_euclidiana_hamming_norm(src, dst):
    src = [int(byte) for byte in src.split('.')]
    dst = [int(byte) for byte in dst.split('.')]
    
    return euclidean(src, dst), hamming(src, dst), normalizacion(src), normalizacion(dst)

Desarrollo características para evitar recorrer todas las filas múltiples veces.

In [12]:
def desarrolloCaracteristicas(src,dst):
    bits = to_binary_cols(src,dst)
    eucl = distancia_euclidiana_bits(bits[:32],bits[32:])
    xor = distancia_xor(bits[:32],bits[32:])
  
    return bits + [eucl, xor, distancia_xorplus(eucl,xor), dif_aritmetica(src, dst), *dist_euclidiana_hamming_norm(src, dst)] 

In [13]:
df = df[['source', 'destination', 'Tag']]
df.head()

Unnamed: 0,source,destination,Tag
0,59.166.0.9,149.171.126.7,0
1,59.166.0.9,149.171.126.2,0
2,59.166.0.2,149.171.126.4,0
3,59.166.0.2,149.171.126.2,0
4,59.166.0.8,149.171.126.4,0


In [14]:
bits_index = ['s' + str(i) for i in np.arange(31,-1,-1)] + ['d' + str(i) for i in np.arange(31,-1,-1)]

df[np.append(bits_index,
                       ['eucl', 'xor', 'xor+',
                        'arit', 'eucl_dec', 'hamm',
                        'norm_src', 'norm_dst'])] = df.apply(
                                                    lambda x: desarrolloCaracteristicas(x['source'],x['destination']), 
                                                    axis=1, result_type='expand')

df = df[np.append(bits_index, ['eucl', 'xor', 'xor+', 'arit', 'eucl_dec', 'hamm', 'norm_src', 'norm_dst'])]

df.to_csv('UNSW_datasets_preprocesados/no.csv')
df.head()

Unnamed: 0,s31,s30,s29,s28,s27,s26,s25,s24,s23,s22,...,d1,d0,eucl,xor,xor+,arit,eucl_dec,hamm,norm_src,norm_dst
0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1510309000.0,2147484000.0,3657793000.0,21.119216,154.93547,2874.241465,0.233002,0.584648
1,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,1510309000.0,2147484000.0,3657793000.0,21.119216,155.080624,2874.249293,0.233002,0.584648
2,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1510309000.0,2147484000.0,3657793000.0,21.119216,154.93547,2874.241465,0.233002,0.584648
3,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,1510309000.0,2147484000.0,3657793000.0,21.119216,154.922561,2874.240769,0.233002,0.584648
4,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1510309000.0,2147484000.0,3657793000.0,21.119216,154.974191,2874.243553,0.233002,0.584648


# Ingeniería de características: Transformación de los datos

In [15]:
from sklearn.preprocessing import StandardScaler
df_standardScaler = pd.DataFrame(StandardScaler().fit_transform(df),
                                 columns = np.append(bits_index, ['eucl', 'xor', 'xor+', 'arit', 'eucl_dec', 'hamm', 
                                            'norm_src', 'norm_dst']))

df_standardScaler.to_csv('UNSW_datasets_preprocesados/standard.csv')
df_standardScaler.head()

Unnamed: 0,s31,s30,s29,s28,s27,s26,s25,s24,s23,s22,...,d1,d0,eucl,xor,xor+,arit,eucl_dec,hamm,norm_src,norm_dst
0,-0.795265,0.0,0.45038,0.547903,0.440627,-0.795265,0.440627,0.078014,0.547903,0.0,...,1.162439,1.083925,0.787671,0.802387,0.798077,-0.597636,0.519031,0.769685,-0.774626,-0.255662
1,-0.795265,0.0,0.45038,0.547903,0.440627,-0.795265,0.440627,0.078014,0.547903,0.0,...,1.162439,-0.922573,0.787671,0.802387,0.798077,-0.597636,0.530913,0.769696,-0.774626,-0.255662
2,-0.795265,0.0,0.45038,0.547903,0.440627,-0.795265,0.440627,0.078014,0.547903,0.0,...,-0.86026,-0.922573,0.787671,0.802387,0.798077,-0.597636,0.519031,0.769685,-0.774626,-0.255662
3,-0.795265,0.0,0.45038,0.547903,0.440627,-0.795265,0.440627,0.078014,0.547903,0.0,...,1.162439,-0.922573,0.787671,0.802387,0.798077,-0.597636,0.517974,0.769684,-0.774626,-0.255662
4,-0.795265,0.0,0.45038,0.547903,0.440627,-0.795265,0.440627,0.078014,0.547903,0.0,...,-0.86026,-0.922573,0.787671,0.802387,0.798077,-0.597636,0.5222,0.769688,-0.774626,-0.255662


In [16]:
from sklearn.preprocessing import MinMaxScaler
df_minMaxScaler = pd.DataFrame(MinMaxScaler().fit_transform(df),
                                 columns = np.append(bits_index, ['eucl', 'xor', 'xor+', 'arit', 'eucl_dec', 'hamm', 
                                            'norm_src', 'norm_dst']))

df_minMaxScaler.to_csv('UNSW_datasets_preprocesados/minMax.csv')
df_minMaxScaler.head()

Unnamed: 0,s31,s30,s29,s28,s27,s26,s25,s24,s23,s22,...,d1,d0,eucl,xor,xor+,arit,eucl_dec,hamm,norm_src,norm_dst
0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,1.0,1.0,0.420971,1.0,0.637783,0.891915,0.545954,0.422467,0.299907,0.652407
1,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.420971,1.0,0.637783,0.891915,0.546465,0.422468,0.299907,0.652407
2,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.420971,1.0,0.637783,0.891915,0.545954,0.422467,0.299907,0.652407
3,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.420971,1.0,0.637783,0.891915,0.545908,0.422467,0.299907,0.652407
4,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.420971,1.0,0.637783,0.891915,0.54609,0.422467,0.299907,0.652407


In [17]:
from sklearn.preprocessing import RobustScaler
df_robustScaler = pd.DataFrame(RobustScaler().fit_transform(df),
                                 columns = np.append(bits_index, ['eucl', 'xor', 'xor+', 'arit', 'eucl_dec', 'hamm', 
                                            'norm_src', 'norm_dst']))

df_robustScaler.to_csv('UNSW_datasets_preprocesados/robust.csv')
df_robustScaler.head()

Unnamed: 0,s31,s30,s29,s28,s27,s26,s25,s24,s23,s22,...,d1,d0,eucl,xor,xor+,arit,eucl_dec,hamm,norm_src,norm_dst
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.847837e-09,0.0,7.426774e-10,-1.759791e-09,0.000593,3.833452e-07,6.62116e-10,-0.090909
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,-2.771756e-09,0.0,-1.114016e-09,-3.226285e-09,0.009479,6.133515e-06,6.62116e-10,-0.545455
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.543511e-09,0.0,2.228032e-09,-5.865971e-10,0.000593,3.833452e-07,-3.972696e-09,-0.363636
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.695674e-09,0.0,1.485355e-09,-1.173194e-09,-0.000198,-1.277818e-07,-3.972696e-09,-0.545455
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-2.346388e-09,0.002963,1.916725e-06,0.0,-0.363636


In [18]:
from sklearn.preprocessing import Normalizer
df_norml1 = pd.DataFrame(Normalizer(norm='l1').fit_transform(df),
                                 columns = np.append(bits_index, ['eucl', 'xor', 'xor+', 'arit', 'eucl_dec', 'hamm', 
                                            'norm_src', 'norm_dst']))

df_norml1.to_csv('UNSW_datasets_preprocesados/norml1.csv')
df_norml1.head()

Unnamed: 0,s31,s30,s29,s28,s27,s26,s25,s24,s23,s22,...,d1,d0,eucl,xor,xor+,arit,eucl_dec,hamm,norm_src,norm_dst
0,0.0,0.0,1.366944e-10,1.366944e-10,1.366944e-10,0.0,1.366944e-10,1.366944e-10,1.366944e-10,0.0,...,1.366944e-10,1.366944e-10,0.206451,0.293549,0.5,2.886878e-09,2.117881e-08,3.928927e-07,3.185003e-11,7.99181e-11
1,0.0,0.0,1.366944e-10,1.366944e-10,1.366944e-10,0.0,1.366944e-10,1.366944e-10,1.366944e-10,0.0,...,1.366944e-10,0.0,0.206451,0.293549,0.5,2.886878e-09,2.119865e-08,3.928938e-07,3.185003e-11,7.99181e-11
2,0.0,0.0,1.366944e-10,1.366944e-10,1.366944e-10,0.0,1.366944e-10,1.366944e-10,1.366944e-10,0.0,...,0.0,0.0,0.206451,0.293549,0.5,2.886878e-09,2.117881e-08,3.928927e-07,3.185003e-11,7.99181e-11
3,0.0,0.0,1.366944e-10,1.366944e-10,1.366944e-10,0.0,1.366944e-10,1.366944e-10,1.366944e-10,0.0,...,1.366944e-10,0.0,0.206451,0.293549,0.5,2.886878e-09,2.117705e-08,3.928926e-07,3.185003e-11,7.99181e-11
4,0.0,0.0,1.366944e-10,1.366944e-10,1.366944e-10,0.0,1.366944e-10,1.366944e-10,1.366944e-10,0.0,...,0.0,0.0,0.206451,0.293549,0.5,2.886878e-09,2.11841e-08,3.92893e-07,3.185003e-11,7.99181e-11


In [19]:
df_norml2 = pd.DataFrame(Normalizer(norm='l2').fit_transform(df),
                        columns = np.append(bits_index, ['eucl', 'xor', 'xor+', 'arit', 'eucl_dec', 'hamm', 
                                            'norm_src', 'norm_dst']))

df_norml2.to_csv('UNSW_datasets_preprocesados/norml2.csv')
df_norml2.head()

Unnamed: 0,s31,s30,s29,s28,s27,s26,s25,s24,s23,s22,...,d1,d0,eucl,xor,xor+,arit,eucl_dec,hamm,norm_src,norm_dst
0,0.0,0.0,2.221007e-10,2.221007e-10,2.221007e-10,0.0,2.221007e-10,2.221007e-10,2.221007e-10,0.0,...,2.221007e-10,2.221007e-10,0.335441,0.476958,0.812398,4.690592e-09,3.441127e-08,6.38371e-07,5.174984e-11,1.298507e-10
1,0.0,0.0,2.221007e-10,2.221007e-10,2.221007e-10,0.0,2.221007e-10,2.221007e-10,2.221007e-10,0.0,...,2.221007e-10,0.0,0.335441,0.476958,0.812398,4.690592e-09,3.444351e-08,6.383727e-07,5.174984e-11,1.298507e-10
2,0.0,0.0,2.221007e-10,2.221007e-10,2.221007e-10,0.0,2.221007e-10,2.221007e-10,2.221007e-10,0.0,...,0.0,0.0,0.335441,0.476958,0.812398,4.690592e-09,3.441127e-08,6.38371e-07,5.174984e-11,1.298507e-10
3,0.0,0.0,2.221007e-10,2.221007e-10,2.221007e-10,0.0,2.221007e-10,2.221007e-10,2.221007e-10,0.0,...,2.221007e-10,0.0,0.335441,0.476958,0.812398,4.690592e-09,3.440841e-08,6.383708e-07,5.174984e-11,1.298507e-10
4,0.0,0.0,2.221007e-10,2.221007e-10,2.221007e-10,0.0,2.221007e-10,2.221007e-10,2.221007e-10,0.0,...,0.0,0.0,0.335441,0.476958,0.812398,4.690592e-09,3.441987e-08,6.383714e-07,5.174984e-11,1.298507e-10


# Feature selection

In [20]:
multiIndex = [['kbest_chi2', 'kbest_fclass', 'extraTrees', 'randomForest'], ['no', 'standard', 'minMax', 'robust', 'norm_l1', 'norm_l2']]

multiIndex = pd.MultiIndex.from_product(multiIndex, names=['featureSelection', 'preprocesamiento'])
df_features = pd.DataFrame(None, index=multiIndex, columns=np.arange(10))
df_features

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
featureSelection,preprocesamiento,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kbest_chi2,no,,,,,,,,,,
kbest_chi2,standard,,,,,,,,,,
kbest_chi2,minMax,,,,,,,,,,
kbest_chi2,robust,,,,,,,,,,
kbest_chi2,norm_l1,,,,,,,,,,
kbest_chi2,norm_l2,,,,,,,,,,
kbest_fclass,no,,,,,,,,,,
kbest_fclass,standard,,,,,,,,,,
kbest_fclass,minMax,,,,,,,,,,
kbest_fclass,robust,,,,,,,,,,


In [24]:
# Transformamos los valores negativos de cada columna en positivos (>=0) sumando el valor absoluto el minimo valor si este es negativo.
def sin_negativos(columna):
    minVal = columna.min()
    if minVal < 0:
        return columna - minVal
    return columna

df_standardScaler_positivos = df_standardScaler.copy()
df_robustScaler_positivos = df_robustScaler.copy() 

for col in df.columns:
    df_standardScaler_positivos[col] = sin_negativos(df_standardScaler_positivos[col])    
    df_robustScaler_positivos[col] = sin_negativos(df_robustScaler_positivos[col])

In [25]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

df_kbest = SelectKBest(chi2).fit(df, tags)
indices = np.argsort(df_kbest.scores_)[::-1]
df_features.loc[('kbest_chi2','no')] = [df.columns[col] for col in indices[:10]]

df_kbest = SelectKBest(chi2).fit(df_standardScaler_positivos, tags)
indices = np.argsort(df_kbest.scores_)[::-1]
df_features.loc[('kbest_chi2','standard')] = [df.columns[col] for col in indices[:10]]

df_kbest = SelectKBest(chi2).fit(df_minMaxScaler, tags)
indices = np.argsort(df_kbest.scores_)[::-1]
df_features.loc[('kbest_chi2','minMax')] = [df.columns[col] for col in indices[:10]]

df_kbest = SelectKBest(chi2).fit(df_robustScaler_positivos, tags)
indices = np.argsort(df_kbest.scores_)[::-1]
df_features.loc[('kbest_chi2','robust')] = [df.columns[col] for col in indices[:10]]

df_kbest = SelectKBest(chi2).fit(df_norml1, tags)
indices = np.argsort(df_kbest.scores_)[::-1]
df_features.loc[('kbest_chi2','norm_l1')] = [df.columns[col] for col in indices[:10]]

df_kbest = SelectKBest(chi2).fit(df_norml2, tags)
indices = np.argsort(df_kbest.scores_)[::-1]
df_features.loc[('kbest_chi2','norm_l2')] = [df.columns[col] for col in indices[:10]]

df_features.loc['kbest_chi2']

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
preprocesamiento,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
no,d22,s20,s5,s6,s22,s7,s30,xor+,xor,eucl
standard,s21,s5,s30,s6,s7,d22,s22,s20,s15,s26
minMax,s22,s5,s30,s6,s7,d22,s20,s21,s15,s31
robust,d22,s30,s22,s21,s20,s7,s6,s5,norm_dst,s15
norm_l1,d22,s30,s22,s7,s6,s5,s20,eucl_dec,hamm,eucl
norm_l2,d22,s30,s22,s7,s6,s5,s20,eucl_dec,hamm,eucl


In [26]:
from sklearn.feature_selection import f_classif

df_kbest = SelectKBest(f_classif).fit(df, tags)
indices = np.argsort(df_kbest.scores_)[::-1]
df_features.loc[('kbest_fclass','no')] = [df.columns[col] for col in indices[:10]]

df_kbest = SelectKBest(f_classif).fit(df_standardScaler, tags)
indices = np.argsort(df_kbest.scores_)[::-1]
df_features.loc[('kbest_fclass','standard')] = [df.columns[col] for col in indices[:10]]

df_kbest = SelectKBest(f_classif).fit(df_minMaxScaler, tags)
indices = np.argsort(df_kbest.scores_)[::-1]
df_features.loc[('kbest_fclass','minMax')] = [df.columns[col] for col in indices[:10]]

df_kbest = SelectKBest(f_classif).fit(df_robustScaler, tags)
indices = np.argsort(df_kbest.scores_)[::-1]
df_features.loc[('kbest_fclass','robust')] = [df.columns[col] for col in indices[:10]]

df_kbest = SelectKBest(f_classif).fit(df_norml1, tags)
indices = np.argsort(df_kbest.scores_)[::-1]
df_features.loc[('kbest_fclass','norm_l1')] = [df.columns[col] for col in indices[:10]]

df_kbest = SelectKBest(f_classif).fit(df_norml2, tags)
indices = np.argsort(df_kbest.scores_)[::-1]
df_features.loc[('kbest_fclass','norm_l2')] = [df.columns[col] for col in indices[:10]]

df_features.loc['kbest_fclass']

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
preprocesamiento,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
no,s22,d22,s5,s6,s7,s20,s30,s21,s15,s17
standard,s22,d22,s5,s6,s7,s20,s30,s21,s15,s17
minMax,s22,d22,s5,s6,s7,s20,s30,s21,s15,s17
robust,s22,d22,s5,s6,s7,s20,s30,s21,s15,s17
norm_l1,s7,s30,s22,d22,s20,s5,s6,s29,s18,eucl
norm_l2,s7,s30,s22,d22,s20,s5,s6,s29,s18,eucl


In [27]:
from sklearn.ensemble import ExtraTreesClassifier

extraTrees = ExtraTreesClassifier(n_estimators=50, random_state=1).fit(df, tags)
indices = np.argsort(extraTrees.feature_importances_)[::-1]
df_features.loc[('extraTrees','no')] = [df.columns[col] for col in indices[:10]]

extraTrees = ExtraTreesClassifier(n_estimators=50, random_state=1).fit(df_standardScaler, tags)
indices = np.argsort(extraTrees.feature_importances_)[::-1]
df_features.loc[('extraTrees','standard')] = [df.columns[col] for col in indices[:10]]

extraTrees = ExtraTreesClassifier(n_estimators=50, random_state=1).fit(df_minMaxScaler, tags)
indices = np.argsort(extraTrees.feature_importances_)[::-1]
df_features.loc[('extraTrees','minMax')] = [df.columns[col] for col in indices[:10]]

extraTrees = ExtraTreesClassifier(n_estimators=50, random_state=1).fit(df_robustScaler, tags)
indices = np.argsort(extraTrees.feature_importances_)[::-1]
df_features.loc[('extraTrees','robust')] = [df.columns[col] for col in indices[:10]]

extraTrees = ExtraTreesClassifier(n_estimators=50, random_state=1).fit(df_norml1, tags)
indices = np.argsort(extraTrees.feature_importances_)[::-1]
df_features.loc[('extraTrees','norm_l1')] = [df.columns[col] for col in indices[:10]]

extraTrees = ExtraTreesClassifier(n_estimators=50, random_state=1).fit(df_norml2, tags)
indices = np.argsort(extraTrees.feature_importances_)[::-1]
df_features.loc[('extraTrees','norm_l2')] = [df.columns[col] for col in indices[:10]]
                                          
df_features.loc['extraTrees']

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
preprocesamiento,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
no,s28,s15,s17,s23,s26,d11,s9,norm_src,s31,s12
standard,s28,s15,s17,s23,s26,norm_src,d11,s9,d28,s31
minMax,s28,s15,s17,s23,s26,norm_src,d11,s9,d28,s31
robust,s28,s15,s17,s23,s26,d11,s9,norm_src,s31,d28
norm_l1,xor+,hamm,eucl,xor,s25,eucl_dec,d7,d25,s1,d5
norm_l2,hamm,xor+,eucl,xor,eucl_dec,s25,d7,d5,d19,d13


In [28]:
from sklearn.ensemble import RandomForestClassifier

randomForest = RandomForestClassifier(n_estimators=50, random_state=1).fit(df, tags)
indices = np.argsort(randomForest.feature_importances_)[::-1]
df_features.loc[('randomForest','no')] = [df.columns[col] for col in indices[:10]]

randomForest = RandomForestClassifier(n_estimators=50, random_state=1).fit(df_standardScaler, tags)
indices = np.argsort(randomForest.feature_importances_)[::-1]
df_features.loc[('randomForest','standard')] = [df.columns[col] for col in indices[:10]]

randomForest = RandomForestClassifier(n_estimators=50, random_state=1).fit(df_minMaxScaler, tags)
indices = np.argsort(randomForest.feature_importances_)[::-1]
df_features.loc[('randomForest','minMax')] = [df.columns[col] for col in indices[:10]]

randomForest = RandomForestClassifier(n_estimators=50, random_state=1).fit(df_robustScaler, tags)
indices = np.argsort(randomForest.feature_importances_)[::-1]
df_features.loc[('randomForest','robust')] = [df.columns[col] for col in indices[:10]]

randomForest = RandomForestClassifier(n_estimators=50, random_state=1).fit(df_norml1, tags)
indices = np.argsort(randomForest.feature_importances_)[::-1]
df_features.loc[('randomForest','norm_l1')] = [df.columns[col] for col in indices[:10]]

randomForest = RandomForestClassifier(n_estimators=50, random_state=1).fit(df_norml2, tags)
indices = np.argsort(randomForest.feature_importances_)[::-1]
df_features.loc[('randomForest','norm_l2')] = [df.columns[col] for col in indices[:10]]

df_features.loc['randomForest']

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
preprocesamiento,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
no,norm_src,s23,s15,s31,s17,s28,arit,eucl_dec,s13,d11
standard,norm_src,s23,s15,s31,s17,s28,eucl_dec,arit,s13,d11
minMax,norm_src,s23,s15,s31,s17,s28,eucl_dec,arit,s13,d11
robust,norm_src,s23,s15,s17,s31,s28,arit,eucl_dec,d11,s10
norm_l1,eucl,hamm,xor,xor+,d19,norm_dst,norm_src,s25,d25,d12
norm_l2,hamm,eucl,xor+,xor,d19,norm_dst,norm_src,s25,d25,d21


In [29]:
df_features

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
featureSelection,preprocesamiento,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kbest_chi2,no,d22,s20,s5,s6,s22,s7,s30,xor+,xor,eucl
kbest_chi2,standard,s21,s5,s30,s6,s7,d22,s22,s20,s15,s26
kbest_chi2,minMax,s22,s5,s30,s6,s7,d22,s20,s21,s15,s31
kbest_chi2,robust,d22,s30,s22,s21,s20,s7,s6,s5,norm_dst,s15
kbest_chi2,norm_l1,d22,s30,s22,s7,s6,s5,s20,eucl_dec,hamm,eucl
kbest_chi2,norm_l2,d22,s30,s22,s7,s6,s5,s20,eucl_dec,hamm,eucl
kbest_fclass,no,s22,d22,s5,s6,s7,s20,s30,s21,s15,s17
kbest_fclass,standard,s22,d22,s5,s6,s7,s20,s30,s21,s15,s17
kbest_fclass,minMax,s22,d22,s5,s6,s7,s20,s30,s21,s15,s17
kbest_fclass,robust,s22,d22,s5,s6,s7,s20,s30,s21,s15,s17


In [30]:
df_features.to_csv('UNSW_datasets_preprocesados/UNSW_features.csv')