In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

df = pd.read_csv('ISCX_datasets_preprocesados/minMax.csv')
df_tags = pd.read_csv('ISCX_datasets_preprocesados/ISCX_tags.csv', index_col=0, header=None, names=['Tag'])

df_features = pd.read_csv('ISCX_datasets_preprocesados/ISCX_features.csv', index_col=['featureSelection', 'preprocesamiento'])
df_features.loc[pd.IndexSlice[:, 'minMax'],:]

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
featureSelection,preprocesamiento,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kbest_chi2,minMax,s0,s1,d18,d12,d25,xor,d24,d11,d22,d15
kbest_fclass,minMax,arit,eucl_dec,s1,hamm,d12,d18,xor+,s0,xor,d25
extraTrees,minMax,s1,eucl_dec,d10,arit,s2,d4,s3,hamm,d8,s8
randomForest,minMax,eucl_dec,arit,hamm,xor,s4,s8,xor+,s3,s2,d4


In [2]:
def save_results(real, prediccion, indexs):
    df_results.loc[indexs] =  [round(accuracy_score(real, prediccion), 4),
                              round(precision_score(real, prediccion), 4),
                              round(recall_score(real, prediccion), 4),
                              round(f1_score(real, prediccion), 4),
                              round(matthews_corrcoef(real, prediccion), 4),
                              round(roc_auc_score(real, prediccion), 4)]

Se cargan los resultados ya logrados con anterioridad para no tener que ejecutar de nuevo todos los procesos, sólo los deseados.

In [3]:
from os import path

if path.exists('ISCX_results/ISCX_minMaxScaler_results.csv'):
    df_results = pd.read_csv('ISCX_results/ISCX_minMaxScaler_results.csv', index_col=['method', 'featureSelection'])
else:
    multiIndex = [['kmeans', 'minibatch', 'birch', 'mincovdet', 'isolation'],['kbest_chi2', 'kbest_fclass', 'extraTrees', 'randomForest']]

    multiIndex = pd.MultiIndex.from_product(multiIndex, names=['method', 'featureSelection'])
    df_results = pd.DataFrame(None, index=multiIndex, columns=['accuracy', 'precision', 'recall', 'f1_score', 'mathews_corr', 'AUC'])
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,,,,,,
kmeans,kbest_fclass,,,,,,
kmeans,extraTrees,,,,,,
kmeans,randomForest,,,,,,
minibatch,kbest_chi2,,,,,,
minibatch,kbest_fclass,,,,,,
minibatch,extraTrees,,,,,,
minibatch,randomForest,,,,,,
birch,kbest_chi2,,,,,,
birch,kbest_fclass,,,,,,


# KMeans

In [4]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('kbest_chi2', 'minMax')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'kbest_chi2'))
df_results.loc[('kmeans', 'kbest_chi2')]

accuracy        0.7432
precision       0.2218
recall          0.9868
f1_score        0.3622
mathews_corr    0.3957
AUC             0.8553
Name: (kmeans, kbest_chi2), dtype: object

In [5]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('kbest_fclass', 'minMax')]])

save_results(df_tags, (kmeans.labels_ +1) % 2, ('kmeans', 'kbest_fclass'))
df_results.loc[('kmeans', 'kbest_fclass')]

accuracy        0.7107
precision       0.2019
recall          0.9871
f1_score        0.3352
mathews_corr     0.368
AUC             0.8379
Name: (kmeans, kbest_fclass), dtype: object

In [6]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('extraTrees', 'minMax')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'extraTrees'))
df_results.loc[('kmeans', 'extraTrees')]

accuracy        0.4885
precision       0.1238
recall          0.9749
f1_score        0.2198
mathews_corr    0.2252
AUC             0.7123
Name: (kmeans, extraTrees), dtype: object

In [7]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('randomForest', 'minMax')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'randomForest'))
df_results.loc[('kmeans', 'randomForest')]

accuracy        0.5341
precision       0.1334
recall          0.9654
f1_score        0.2344
mathews_corr    0.2439
AUC             0.7325
Name: (kmeans, randomForest), dtype: object

# MiniBatch KMeans

In [9]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('kbest_chi2', 'minMax')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'kbest_chi2'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'kbest_chi2')]

accuracy        0.4379
precision       0.1152
recall          0.9889
f1_score        0.2063
mathews_corr    0.2079
AUC             0.6914
Name: (minibatch, kbest_chi2), dtype: object

In [10]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('kbest_fclass', 'minMax')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'kbest_fclass'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'kbest_fclass')]

accuracy        0.6921
precision        0.192
recall          0.9872
f1_score        0.3215
mathews_corr    0.3534
AUC             0.8279
Name: (minibatch, kbest_fclass), dtype: object

In [11]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('extraTrees', 'minMax')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'extraTrees'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'extraTrees')]

accuracy        0.5132
precision       0.1117
recall          0.8036
f1_score        0.1961
mathews_corr    0.1539
AUC             0.6468
Name: (minibatch, extraTrees), dtype: object

In [12]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('randomForest', 'minMax')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'randomForest'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'randomForest')]

accuracy        0.5344
precision       0.1335
recall          0.9654
f1_score        0.2345
mathews_corr    0.2441
AUC             0.7327
Name: (minibatch, randomForest), dtype: object

# BIRCH

In [25]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('kbest_chi2', 'minMax')]])

save_results(df_tags, (birch + 1) % 2, ('birch', 'kbest_chi2'))
df_results.loc[('birch', 'kbest_chi2')]

accuracy        0.3844
precision       0.1064
recall          0.9911
f1_score        0.1922
mathews_corr    0.1847
AUC             0.6635
Name: (birch, kbest_chi2), dtype: object

In [26]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('kbest_fclass', 'minMax')]])

save_results(df_tags, (birch + 1) % 2, ('birch', 'kbest_fclass'))
df_results.loc[('birch', 'kbest_fclass')]

accuracy        0.4885
precision       0.1251
recall          0.9878
f1_score         0.222
mathews_corr    0.2316
AUC             0.7183
Name: (birch, kbest_fclass), dtype: object

In [27]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('extraTrees', 'minMax')]])

save_results(df_tags, (birch + 1) % 2, ('birch', 'extraTrees'))
df_results.loc[('birch', 'extraTrees')]

accuracy        0.3919
precision       0.1075
recall          0.9896
f1_score        0.1939
mathews_corr    0.1872
AUC             0.6669
Name: (birch, extraTrees), dtype: object

In [28]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('randomForest', 'minMax')]])

save_results(df_tags, (birch + 1) % 2, ('birch', 'randomForest'))
df_results.loc[('birch', 'randomForest')]

accuracy         0.383
precision        0.105
recall          0.9772
f1_score        0.1896
mathews_corr    0.1765
AUC             0.6564
Name: (birch, randomForest), dtype: object

# MinCovDet

In [13]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.2, support_fraction=1,
                             random_state=4).fit_predict(df[df_features.loc[('kbest_chi2', 'minMax')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_chi2'))
df_results.loc[('mincovdet', 'kbest_chi2')]

accuracy        0.7275
precision       0.0035
recall          0.0094
f1_score        0.0051
mathews_corr   -0.1346
AUC             0.3971
Name: (mincovdet, kbest_chi2), dtype: object

In [14]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.2, support_fraction=1,
                             random_state=4).fit_predict(df[df_features.loc[('kbest_fclass', 'minMax')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_fclass'))
df_results.loc[('mincovdet', 'kbest_fclass')]

accuracy        0.7231
precision       0.0034
recall          0.0094
f1_score         0.005
mathews_corr   -0.1366
AUC             0.3947
Name: (mincovdet, kbest_fclass), dtype: object

In [15]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.2, support_fraction=1,
                             random_state=4).fit_predict(df[df_features.loc[('extraTrees', 'minMax')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'extraTrees'))
df_results.loc[('mincovdet', 'extraTrees')]

accuracy        0.7515
precision       0.0636
recall           0.172
f1_score        0.0928
mathews_corr   -0.0198
AUC             0.4849
Name: (mincovdet, extraTrees), dtype: object

In [16]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.2, support_fraction=1,
                             random_state=4).fit_predict(df[df_features.loc[('randomForest', 'minMax')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'randomForest'))
df_results.loc[('mincovdet', 'randomForest')]

accuracy        0.7398
precision       0.0342
recall          0.0925
f1_score        0.0499
mathews_corr   -0.0759
AUC              0.442
Name: (mincovdet, randomForest), dtype: object

# Isolation Forest

In [18]:
isolation = IsolationForest(n_estimators=100, contamination=0.2, random_state=0, 
                            behaviour='new').fit_predict(df[df_features.loc[('kbest_chi2', 'minMax')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'kbest_chi2'))
df_results.loc[('isolation', 'kbest_chi2')]

accuracy        0.7301
precision       0.0042
recall          0.0113
f1_score        0.0061
mathews_corr   -0.1322
AUC             0.3993
Name: (isolation, kbest_chi2), dtype: object

In [19]:
isolation = IsolationForest(n_estimators=100, contamination=0.2, random_state=0, 
                            behaviour='new').fit_predict(df[df_features.loc[('kbest_fclass', 'minMax')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'kbest_fclass'))
df_results.loc[('isolation', 'kbest_fclass')]

accuracy        0.7308
precision       0.0118
recall          0.0319
f1_score        0.0172
mathews_corr   -0.1187
AUC             0.4093
Name: (isolation, kbest_fclass), dtype: object

In [20]:
isolation = IsolationForest(n_estimators=100, contamination=0.2, random_state=0, 
                            behaviour='new').fit_predict(df[df_features.loc[('extraTrees', 'minMax')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'extraTrees'))
df_results.loc[('isolation', 'extraTrees')]

accuracy        0.7423
precision       0.0405
recall          0.1095
f1_score        0.0591
mathews_corr   -0.0639
AUC             0.4512
Name: (isolation, extraTrees), dtype: object

In [21]:
isolation = IsolationForest(n_estimators=100, contamination=0.2, random_state=0, 
                            behaviour='new').fit_predict(df[df_features.loc[('randomForest', 'minMax')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'randomForest'))
df_results.loc[('isolation', 'randomForest')]

accuracy        0.7309
precision       0.0116
recall          0.0314
f1_score         0.017
mathews_corr   -0.1189
AUC             0.4091
Name: (isolation, randomForest), dtype: object

# Resumen resultados aplicando la transformación MinMaxScaler a los datos

In [29]:
df_results.to_csv('ISCX_results/ISCX_minMaxScaler_results.csv')
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,0.7432,0.2218,0.9868,0.3622,0.3957,0.8553
kmeans,kbest_fclass,0.7107,0.2019,0.9871,0.3352,0.368,0.8379
kmeans,extraTrees,0.4885,0.1238,0.9749,0.2198,0.2252,0.7123
kmeans,randomForest,0.5341,0.1334,0.9654,0.2344,0.2439,0.7325
minibatch,kbest_chi2,0.4379,0.1152,0.9889,0.2063,0.2079,0.6914
minibatch,kbest_fclass,0.6921,0.192,0.9872,0.3215,0.3534,0.8279
minibatch,extraTrees,0.5132,0.1117,0.8036,0.1961,0.1539,0.6468
minibatch,randomForest,0.5344,0.1335,0.9654,0.2345,0.2441,0.7327
birch,kbest_chi2,0.3844,0.1064,0.9911,0.1922,0.1847,0.6635
birch,kbest_fclass,0.4885,0.1251,0.9878,0.222,0.2316,0.7183
