In [1]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

# Feature matrix (the file name suggests robust-scaled data -- TODO confirm).
df = pd.read_csv('ISCX_datasets_preprocesados/robust.csv')

# Labels: the file has no header row, column 0 is used as the index and the
# column name 'Tag' is supplied explicitly.
df_tags = pd.read_csv(
    'ISCX_datasets_preprocesados/ISCX_tags.csv',
    header=None,
    names=['Tag'],
    index_col=0,
)

# Selected feature names, indexed by (featureSelection, preprocesamiento).
df_features = pd.read_csv(
    'ISCX_datasets_preprocesados/ISCX_features.csv',
    index_col=['featureSelection', 'preprocesamiento'],
)

# Show only the rows whose second index level is 'robust'.
df_features.loc[pd.IndexSlice[:, 'robust'], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
featureSelection,preprocesamiento,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kbest_chi2,robust,norm_src,arit,eucl_dec,s0,s1,d18,d12,d25,hamm,xor
kbest_fclass,robust,arit,eucl_dec,s1,hamm,d12,d18,xor+,s0,xor,d25
extraTrees,robust,d10,s1,eucl_dec,d4,arit,s8,s2,s3,d1,s0
randomForest,robust,arit,eucl_dec,norm_src,hamm,xor,xor+,s4,s8,eucl,s2


In [2]:
def save_results(real, prediccion, indexs):
    """Score predictions against the true labels and store them in ``df_results``.

    Writes one row of the module-level ``df_results`` frame, selected by
    ``indexs``, with six scores rounded to 4 decimals, in this order:
    accuracy, precision, recall, F1, Matthews correlation, ROC AUC.

    Args:
        real: Ground-truth labels, passed as the first argument of each metric.
        prediccion: Predicted labels, passed as the second argument.
        indexs: Row key used with ``df_results.loc``; callers in this notebook
            pass a ``(method, featureSelection)`` tuple.
    """
    # Order matters: it must match the column order of df_results.
    metric_functions = (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
        roc_auc_score,
    )
    scores = [round(metric(real, prediccion), 4) for metric in metric_functions]
    df_results.loc[indexs] = scores

In [3]:
from os import path

# Resume from previously saved scores when the results file exists; otherwise
# start from an empty table with one row per (method, featureSelection) pair.
results_csv = 'ISCX_results/ISCX_robustScaler_results.csv'

if path.exists(results_csv):
    df_results = pd.read_csv(results_csv, index_col=['method', 'featureSelection'])
else:
    index_levels = [['kmeans', 'minibatch', 'birch', 'mincovdet', 'isolation'],
                    ['kbest_chi2', 'kbest_fclass', 'extraTrees', 'randomForest']]
    multiIndex = pd.MultiIndex.from_product(index_levels, names=['method', 'featureSelection'])

    # dtype=float gives NaN placeholders in numeric columns, so a freshly
    # created table has the same float64 dtype as one reloaded from the CSV
    # (building it from None would yield object columns instead).
    df_results = pd.DataFrame(index=multiIndex,
                              columns=['accuracy', 'precision', 'recall', 'f1_score', 'mathews_corr', 'AUC'],
                              dtype=float)
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,0.9244,0.0043,0.0001,0.0002,-0.011,0.4991
kmeans,kbest_fclass,0.8504,0.3172,0.8886,0.4675,0.4752,0.868
kmeans,extraTrees,0.8504,0.3172,0.8886,0.4675,0.4752,0.868
kmeans,randomForest,0.9244,0.0043,0.0001,0.0002,-0.011,0.4991
minibatch,kbest_chi2,0.9239,0.3009,0.0225,0.0419,0.0647,0.5092
minibatch,kbest_fclass,0.8504,0.3172,0.8886,0.4675,0.4752,0.868
minibatch,extraTrees,0.8504,0.3172,0.8886,0.4675,0.4752,0.868
minibatch,randomForest,0.9239,0.3009,0.0225,0.0419,0.0647,0.5092
birch,kbest_chi2,0.9244,0.0043,0.0001,0.0002,-0.011,0.4991
birch,kbest_fclass,0.8506,0.3174,0.8886,0.4677,0.4754,0.8681


# KMeans

In [4]:
# 2-cluster KMeans on the columns listed for ('kbest_chi2', 'robust') in df_features.
# NOTE(review): algorithm='full' is only accepted by older scikit-learn releases
# (later renamed 'lloyd') -- confirm against the installed version.
kmeans = KMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
    algorithm='full',
)
kmeans.fit(df[df_features.loc[('kbest_chi2', 'robust')]])

# The raw cluster ids are scored directly against the tags; then show the stored row.
save_results(df_tags, kmeans.labels_, ('kmeans', 'kbest_chi2'))
df_results.loc[('kmeans', 'kbest_chi2')]

accuracy        0.9244
precision       0.0043
recall          0.0001
f1_score        0.0002
mathews_corr   -0.0110
AUC             0.4991
Name: (kmeans, kbest_chi2), dtype: float64

In [5]:
# 2-cluster KMeans on the columns listed for ('kbest_fclass', 'robust') in df_features.
# NOTE(review): algorithm='full' is only accepted by older scikit-learn releases
# (later renamed 'lloyd') -- confirm against the installed version.
kmeans = KMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
    algorithm='full',
)
kmeans.fit(df[df_features.loc[('kbest_fclass', 'robust')]])

# The raw cluster ids are scored directly against the tags; then show the stored row.
save_results(df_tags, kmeans.labels_, ('kmeans', 'kbest_fclass'))
df_results.loc[('kmeans', 'kbest_fclass')]

accuracy        0.8504
precision       0.3172
recall          0.8886
f1_score        0.4675
mathews_corr    0.4752
AUC             0.8680
Name: (kmeans, kbest_fclass), dtype: float64

In [6]:
# 2-cluster KMeans on the columns listed for ('extraTrees', 'robust') in df_features.
# NOTE(review): algorithm='full' is only accepted by older scikit-learn releases
# (later renamed 'lloyd') -- confirm against the installed version.
kmeans = KMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
    algorithm='full',
)
kmeans.fit(df[df_features.loc[('extraTrees', 'robust')]])

# The raw cluster ids are scored directly against the tags; then show the stored row.
save_results(df_tags, kmeans.labels_, ('kmeans', 'extraTrees'))
df_results.loc[('kmeans', 'extraTrees')]

accuracy        0.8504
precision       0.3172
recall          0.8886
f1_score        0.4675
mathews_corr    0.4752
AUC             0.8680
Name: (kmeans, extraTrees), dtype: float64

In [7]:
# 2-cluster KMeans on the columns listed for ('randomForest', 'robust') in df_features.
# NOTE(review): algorithm='full' is only accepted by older scikit-learn releases
# (later renamed 'lloyd') -- confirm against the installed version.
kmeans = KMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
    algorithm='full',
)
kmeans.fit(df[df_features.loc[('randomForest', 'robust')]])

# The raw cluster ids are scored directly against the tags; then show the stored row.
save_results(df_tags, kmeans.labels_, ('kmeans', 'randomForest'))
df_results.loc[('kmeans', 'randomForest')]

accuracy        0.9244
precision       0.0043
recall          0.0001
f1_score        0.0002
mathews_corr   -0.0110
AUC             0.4991
Name: (kmeans, randomForest), dtype: float64

# MiniBatch KMeans

In [8]:
# 2-cluster MiniBatchKMeans on the columns listed for ('kbest_chi2', 'robust').
minibatch = MiniBatchKMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
)
minibatch.fit(df[df_features.loc[('kbest_chi2', 'robust')]])

# NOTE(review): the original comment here said cluster labels 0 and 1 are
# swapped, but labels_ is passed to save_results unchanged -- confirm which is intended.
save_results(df_tags, minibatch.labels_, ('minibatch', 'kbest_chi2'))
df_results.loc[('minibatch', 'kbest_chi2')]

accuracy        0.9239
precision       0.3009
recall          0.0225
f1_score        0.0419
mathews_corr    0.0647
AUC             0.5092
Name: (minibatch, kbest_chi2), dtype: float64

In [9]:
# 2-cluster MiniBatchKMeans on the columns listed for ('kbest_fclass', 'robust').
minibatch = MiniBatchKMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
)
minibatch.fit(df[df_features.loc[('kbest_fclass', 'robust')]])

# NOTE(review): the original comment here said cluster labels 0 and 1 are
# swapped, but labels_ is passed to save_results unchanged -- confirm which is intended.
save_results(df_tags, minibatch.labels_, ('minibatch', 'kbest_fclass'))
df_results.loc[('minibatch', 'kbest_fclass')]

accuracy        0.8504
precision       0.3172
recall          0.8886
f1_score        0.4675
mathews_corr    0.4752
AUC             0.8680
Name: (minibatch, kbest_fclass), dtype: float64

In [10]:
# 2-cluster MiniBatchKMeans on the columns listed for ('extraTrees', 'robust').
minibatch = MiniBatchKMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
)
minibatch.fit(df[df_features.loc[('extraTrees', 'robust')]])

# NOTE(review): the original comment here said cluster labels 0 and 1 are
# swapped, but labels_ is passed to save_results unchanged -- confirm which is intended.
save_results(df_tags, minibatch.labels_, ('minibatch', 'extraTrees'))
df_results.loc[('minibatch', 'extraTrees')]

accuracy        0.8504
precision       0.3172
recall          0.8886
f1_score        0.4675
mathews_corr    0.4752
AUC             0.8680
Name: (minibatch, extraTrees), dtype: float64

In [11]:
# 2-cluster MiniBatchKMeans on the columns listed for ('randomForest', 'robust').
minibatch = MiniBatchKMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
)
minibatch.fit(df[df_features.loc[('randomForest', 'robust')]])

# NOTE(review): the original comment here said cluster labels 0 and 1 are
# swapped, but labels_ is passed to save_results unchanged -- confirm which is intended.
save_results(df_tags, minibatch.labels_, ('minibatch', 'randomForest'))
df_results.loc[('minibatch', 'randomForest')]

accuracy        0.9239
precision       0.3009
recall          0.0225
f1_score        0.0419
mathews_corr    0.0647
AUC             0.5092
Name: (minibatch, randomForest), dtype: float64

# BIRCH

In [12]:
# BIRCH with 2 final clusters on the columns listed for ('kbest_chi2', 'robust');
# `birch` holds the predicted cluster ids, not the estimator.
birch = Birch(
    n_clusters=2,
    threshold=1,
    branching_factor=10,
).fit_predict(df[df_features.loc[('kbest_chi2', 'robust')]])

# Score the cluster ids against the tags, then show the stored row.
save_results(df_tags, birch, ('birch', 'kbest_chi2'))
df_results.loc[('birch', 'kbest_chi2')]

accuracy        0.9244
precision       0.0043
recall          0.0001
f1_score        0.0002
mathews_corr   -0.0110
AUC             0.4991
Name: (birch, kbest_chi2), dtype: float64

In [13]:
# BIRCH with 2 final clusters on the columns listed for ('kbest_fclass', 'robust');
# `birch` holds the predicted cluster ids, not the estimator.
# NOTE(review): threshold=0.1 here while the other BIRCH cells use threshold=1 --
# confirm this difference is intentional.
birch = Birch(
    n_clusters=2,
    threshold=0.1,
    branching_factor=10,
).fit_predict(df[df_features.loc[('kbest_fclass', 'robust')]])

# Score the cluster ids against the tags, then show the stored row.
save_results(df_tags, birch, ('birch', 'kbest_fclass'))
df_results.loc[('birch', 'kbest_fclass')]

accuracy        0.8506
precision       0.3174
recall          0.8886
f1_score        0.4677
mathews_corr    0.4754
AUC             0.8681
Name: (birch, kbest_fclass), dtype: float64

In [14]:
# BIRCH with 2 final clusters on the columns listed for ('extraTrees', 'robust');
# `birch` holds the predicted cluster ids, not the estimator.
birch = Birch(
    n_clusters=2,
    threshold=1,
    branching_factor=10,
).fit_predict(df[df_features.loc[('extraTrees', 'robust')]])

# Score the cluster ids against the tags, then show the stored row.
save_results(df_tags, birch, ('birch', 'extraTrees'))
df_results.loc[('birch', 'extraTrees')]

accuracy        0.8504
precision       0.3172
recall          0.8886
f1_score        0.4675
mathews_corr    0.4752
AUC             0.8680
Name: (birch, extraTrees), dtype: float64

In [15]:
# BIRCH with 2 final clusters on the columns listed for ('randomForest', 'robust');
# `birch` holds the predicted cluster ids, not the estimator.
birch = Birch(
    n_clusters=2,
    threshold=1,
    branching_factor=10,
).fit_predict(df[df_features.loc[('randomForest', 'robust')]])

# Score the cluster ids against the tags, then show the stored row.
save_results(df_tags, birch, ('birch', 'randomForest'))
df_results.loc[('birch', 'randomForest')]

accuracy        0.9244
precision       0.0043
recall          0.0001
f1_score        0.0002
mathews_corr   -0.0110
AUC             0.4991
Name: (birch, randomForest), dtype: float64

# MinCovDet

In [16]:
# EllipticEnvelope predictions on the columns listed for ('kbest_chi2', 'robust').
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
).fit_predict(df[df_features.loc[('kbest_chi2', 'robust')]])

# Recode the predictions: 1 becomes 0 and -1 becomes 1 (scikit-learn outlier
# detectors return -1 for outliers), so 1 marks a flagged sample when scoring.
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_chi2'))
df_results.loc[('mincovdet', 'kbest_chi2')]

accuracy        0.7308
precision       0.0118
recall          0.0319
f1_score        0.0172
mathews_corr   -0.1187
AUC             0.4093
Name: (mincovdet, kbest_chi2), dtype: float64

In [17]:
# EllipticEnvelope predictions on the columns listed for ('kbest_fclass', 'robust').
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
).fit_predict(df[df_features.loc[('kbest_fclass', 'robust')]])

# Recode the predictions: 1 becomes 0 and -1 becomes 1 (scikit-learn outlier
# detectors return -1 for outliers), so 1 marks a flagged sample when scoring.
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_fclass'))
df_results.loc[('mincovdet', 'kbest_fclass')]

accuracy        0.7296
precision       0.0035
recall          0.0094
f1_score        0.0051
mathews_corr   -0.1336
AUC             0.3982
Name: (mincovdet, kbest_fclass), dtype: float64

In [18]:
# EllipticEnvelope predictions on the columns listed for ('extraTrees', 'robust').
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
).fit_predict(df[df_features.loc[('extraTrees', 'robust')]])

# Recode the predictions: 1 becomes 0 and -1 becomes 1 (scikit-learn outlier
# detectors return -1 for outliers), so 1 marks a flagged sample when scoring.
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(df_tags, minCovDet, ('mincovdet', 'extraTrees'))
df_results.loc[('mincovdet', 'extraTrees')]

accuracy        0.7589
precision       0.0746
recall          0.1983
f1_score        0.1084
mathews_corr    0.0013
AUC             0.5010
Name: (mincovdet, extraTrees), dtype: float64

In [19]:
# EllipticEnvelope predictions on the columns listed for ('randomForest', 'robust').
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
).fit_predict(df[df_features.loc[('randomForest', 'robust')]])

# Recode the predictions: 1 becomes 0 and -1 becomes 1 (scikit-learn outlier
# detectors return -1 for outliers), so 1 marks a flagged sample when scoring.
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(df_tags, minCovDet, ('mincovdet', 'randomForest'))
df_results.loc[('mincovdet', 'randomForest')]

accuracy        0.7523
precision       0.0685
recall          0.1868
f1_score        0.1003
mathews_corr   -0.0103
AUC             0.4921
Name: (mincovdet, randomForest), dtype: float64

# Isolation Forest

In [20]:
# IsolationForest predictions on the columns listed for ('kbest_chi2', 'robust').
# NOTE(review): the behaviour='new' argument exists only in older scikit-learn
# releases -- confirm against the installed version.
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
).fit_predict(df[df_features.loc[('kbest_chi2', 'robust')]])

# Recode the predictions: 1 becomes 0 and -1 becomes 1 (scikit-learn outlier
# detectors return -1 for outliers), so 1 marks a flagged sample when scoring.
isolation = np.where(isolation == -1, 1, 0)
save_results(df_tags, isolation, ('isolation', 'kbest_chi2'))
df_results.loc[('isolation', 'kbest_chi2')]

accuracy        0.7310
precision       0.0121
recall          0.0326
f1_score        0.0176
mathews_corr   -0.1182
AUC             0.4097
Name: (isolation, kbest_chi2), dtype: float64

In [21]:
# IsolationForest predictions on the columns listed for ('kbest_fclass', 'robust').
# NOTE(review): the behaviour='new' argument exists only in older scikit-learn
# releases -- confirm against the installed version.
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
).fit_predict(df[df_features.loc[('kbest_fclass', 'robust')]])

# Recode the predictions: 1 becomes 0 and -1 becomes 1 (scikit-learn outlier
# detectors return -1 for outliers), so 1 marks a flagged sample when scoring.
isolation = np.where(isolation == -1, 1, 0)
save_results(df_tags, isolation, ('isolation', 'kbest_fclass'))
df_results.loc[('isolation', 'kbest_fclass')]

accuracy        0.7309
precision       0.0119
recall          0.0321
f1_score        0.0173
mathews_corr   -0.1185
AUC             0.4094
Name: (isolation, kbest_fclass), dtype: float64

In [23]:
# IsolationForest predictions on the columns listed for ('extraTrees', 'robust').
# NOTE(review): the behaviour='new' argument exists only in older scikit-learn
# releases -- confirm against the installed version.
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
).fit_predict(df[df_features.loc[('extraTrees', 'robust')]])

# Recode the predictions: 1 becomes 0 and -1 becomes 1 (scikit-learn outlier
# detectors return -1 for outliers), so 1 marks a flagged sample when scoring.
isolation = np.where(isolation == -1, 1, 0)
save_results(df_tags, isolation, ('isolation', 'extraTrees'))
df_results.loc[('isolation', 'extraTrees')]

accuracy        0.7498
precision       0.0471
recall          0.1242
f1_score        0.0683
mathews_corr   -0.0503
AUC             0.4619
Name: (isolation, extraTrees), dtype: float64

In [24]:
# IsolationForest predictions on the columns listed for ('randomForest', 'robust').
# NOTE(review): the behaviour='new' argument exists only in older scikit-learn
# releases -- confirm against the installed version.
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
).fit_predict(df[df_features.loc[('randomForest', 'robust')]])

# Recode the predictions: 1 becomes 0 and -1 becomes 1 (scikit-learn outlier
# detectors return -1 for outliers), so 1 marks a flagged sample when scoring.
isolation = np.where(isolation == -1, 1, 0)
save_results(df_tags, isolation, ('isolation', 'randomForest'))
df_results.loc[('isolation', 'randomForest')]

accuracy        0.7318
precision       0.0115
recall          0.0308
f1_score        0.0167
mathews_corr   -0.1189
AUC             0.4093
Name: (isolation, randomForest), dtype: float64

# Resumen resultados con preprocesamiento RobustScaler de los datos

In [25]:
# Persist the scores to the same file the load-or-create cell reads back.
# Create the output directory first: on a fresh run (no results file yet) the
# directory may not exist either, and to_csv would fail after all models ran.
os.makedirs('ISCX_results', exist_ok=True)
df_results.to_csv('ISCX_results/ISCX_robustScaler_results.csv')
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,0.9244,0.0043,0.0001,0.0002,-0.011,0.4991
kmeans,kbest_fclass,0.8504,0.3172,0.8886,0.4675,0.4752,0.868
kmeans,extraTrees,0.8504,0.3172,0.8886,0.4675,0.4752,0.868
kmeans,randomForest,0.9244,0.0043,0.0001,0.0002,-0.011,0.4991
minibatch,kbest_chi2,0.9239,0.3009,0.0225,0.0419,0.0647,0.5092
minibatch,kbest_fclass,0.8504,0.3172,0.8886,0.4675,0.4752,0.868
minibatch,extraTrees,0.8504,0.3172,0.8886,0.4675,0.4752,0.868
minibatch,randomForest,0.9239,0.3009,0.0225,0.0419,0.0647,0.5092
birch,kbest_chi2,0.9244,0.0043,0.0001,0.0002,-0.011,0.4991
birch,kbest_fclass,0.8506,0.3174,0.8886,0.4677,0.4754,0.8681
