In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

df = pd.read_csv('ISCX_datasets_preprocesados/standard.csv')
df_tags = pd.read_csv('ISCX_datasets_preprocesados/ISCX_tags.csv', index_col=0, header=None, names=['Tag'])

df_features = pd.read_csv('ISCX_datasets_preprocesados/ISCX_features.csv', index_col=['featureSelection', 'preprocesamiento'])
df_features.loc[pd.IndexSlice[:, 'standard'],:]

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
featureSelection,preprocesamiento,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kbest_chi2,standard,s11,s18,s26,s27,s16,s25,s12,eucl_dec,s17,s0
kbest_fclass,standard,arit,eucl_dec,s1,hamm,d12,d18,xor+,s0,xor,d25
extraTrees,standard,eucl_dec,s1,d4,d10,s2,arit,s3,s8,s0,s4
randomForest,standard,arit,eucl_dec,xor,norm_src,hamm,xor+,norm_dst,eucl,s3,s2


In [2]:
def save_results(real, prediccion, indexs):
    df_results.loc[indexs] =  [round(accuracy_score(real, prediccion), 4),
                              round(precision_score(real, prediccion), 4),
                              round(recall_score(real, prediccion), 4),
                              round(f1_score(real, prediccion), 4),
                              round(matthews_corrcoef(real, prediccion), 4),
                              round(roc_auc_score(real, prediccion), 4)]

In [3]:
from os import path

if path.exists('ISCX_results/ISCX_standardScaler_results.csv'):
    df_results = pd.read_csv('ISCX_results/ISCX_standardScaler_results.csv', index_col=['method', 'featureSelection'])
else:
    multiIndex = [['kmeans', 'minibatch', 'birch', 'mincovdet', 'isolation'],['kbest_chi2', 'kbest_fclass', 'extraTrees', 'randomForest']]

    multiIndex = pd.MultiIndex.from_product(multiIndex, names=['method', 'featureSelection'])
    df_results = pd.DataFrame(None, index=multiIndex, columns=['accuracy', 'precision', 'recall', 'f1_score', 'mathews_corr', 'AUC'])
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,0.9274,0.8401,0.0222,0.0433,0.1296,0.5109
kmeans,kbest_fclass,0.5205,0.1323,0.9873,0.2333,0.2475,0.7353
kmeans,extraTrees,0.5849,0.1293,0.8054,0.2228,0.1956,0.6863
kmeans,randomForest,0.5171,0.1315,0.9875,0.2321,0.2459,0.7335
minibatch,kbest_chi2,0.6987,0.1727,0.8124,0.2849,0.2757,0.751
minibatch,kbest_fclass,0.5198,0.1321,0.9873,0.233,0.2471,0.7349
minibatch,extraTrees,0.58,0.128,0.8056,0.2208,0.1928,0.6838
minibatch,randomForest,0.5171,0.1315,0.9875,0.2321,0.2459,0.7335
birch,kbest_chi2,,,,,,
birch,kbest_fclass,,,,,,


# KMeans

In [4]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('kbest_chi2', 'standard')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'kbest_chi2'))
df_results.loc[('kmeans', 'kbest_chi2')]

accuracy        0.9274
precision       0.8401
recall          0.0222
f1_score        0.0433
mathews_corr    0.1296
AUC             0.5109
Name: (kmeans, kbest_chi2), dtype: float64

In [5]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('kbest_fclass', 'standard')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'kbest_fclass'))
df_results.loc[('kmeans', 'kbest_fclass')]

accuracy        0.5205
precision       0.1323
recall          0.9873
f1_score        0.2333
mathews_corr    0.2475
AUC             0.7353
Name: (kmeans, kbest_fclass), dtype: float64

In [6]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('extraTrees', 'standard')]])

save_results(df_tags, (kmeans.labels_ + 1) % 2, ('kmeans', 'extraTrees'))
df_results.loc[('kmeans', 'extraTrees')]

accuracy        0.5603
precision       0.0049
recall          0.0245
f1_score        0.0082
mathews_corr   -0.2019
AUC             0.3137
Name: (kmeans, extraTrees), dtype: float64

In [7]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('randomForest', 'standard')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'randomForest'))
df_results.loc[('kmeans', 'randomForest')]

accuracy        0.5171
precision       0.1315
recall          0.9875
f1_score        0.2321
mathews_corr    0.2459
AUC             0.7335
Name: (kmeans, randomForest), dtype: float64

# MiniBatch KMeans

In [8]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('kbest_chi2', 'standard')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'kbest_chi2'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'kbest_chi2')]

accuracy        0.6987
precision       0.1727
recall          0.8124
f1_score        0.2849
mathews_corr    0.2757
AUC             0.7510
Name: (minibatch, kbest_chi2), dtype: float64

In [9]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('kbest_fclass', 'standard')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'kbest_fclass'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'kbest_fclass')]

accuracy        0.5198
precision       0.1321
recall          0.9873
f1_score        0.2330
mathews_corr    0.2471
AUC             0.7349
Name: (minibatch, kbest_fclass), dtype: float64

In [10]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('extraTrees', 'standard')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'extraTrees'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'extraTrees')]

accuracy        0.7422
precision       0.2145
recall          0.9353
f1_score        0.3490
mathews_corr    0.3706
AUC             0.8310
Name: (minibatch, extraTrees), dtype: float64

In [11]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('randomForest', 'standard')]])

save_results(df_tags, (minibatch.labels_ + 1) % 2, ('minibatch', 'randomForest'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'randomForest')]

accuracy        0.5171
precision       0.1315
recall          0.9875
f1_score        0.2321
mathews_corr    0.2459
AUC             0.7335
Name: (minibatch, randomForest), dtype: float64

# BIRCH

In [22]:
birch = Birch(n_clusters=2, threshold=1, branching_factor=10).fit_predict(df[df_features.loc[('kbest_chi2', 'standard')]])

save_results(df_tags, (birch + 1) % 2, ('birch', 'kbest_chi2'))
df_results.loc[('birch', 'kbest_chi2')]

accuracy        0.9274
precision       0.8370
recall          0.0222
f1_score        0.0433
mathews_corr    0.1293
AUC             0.5109
Name: (birch, kbest_chi2), dtype: float64

In [25]:
birch = Birch(n_clusters=2, threshold=1, branching_factor=10).fit_predict(df[df_features.loc[('kbest_fclass', 'standard')]])

save_results(df_tags, (birch + 1) % 2, ('birch', 'kbest_fclass'))
df_results.loc[('birch', 'kbest_fclass')]

accuracy        0.5139
precision       0.1308
recall          0.9880
f1_score        0.2310
mathews_corr    0.2445
AUC             0.7321
Name: (birch, kbest_fclass), dtype: float64

In [27]:
birch = Birch(n_clusters=2, threshold=1, branching_factor=10).fit_predict(df[df_features.loc[('extraTrees', 'standard')]])

save_results(df_tags, birch, ('birch', 'extraTrees'))
df_results.loc[('birch', 'extraTrees')]

accuracy        0.8496
precision       0.3159
recall          0.8886
f1_score        0.4662
mathews_corr    0.4739
AUC             0.8676
Name: (birch, extraTrees), dtype: float64

In [29]:
birch = Birch(n_clusters=2, threshold=1, branching_factor=10).fit_predict(df[df_features.loc[('randomForest', 'standard')]])

save_results(df_tags, birch, ('birch', 'randomForest'))
df_results.loc[('birch', 'randomForest')]

accuracy        0.9244
precision       0.0043
recall          0.0001
f1_score        0.0002
mathews_corr   -0.0110
AUC             0.4991
Name: (birch, randomForest), dtype: float64

# MinCovDet

In [12]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.2, support_fraction=1,
                             random_state=4).fit_predict(df[df_features.loc[('kbest_chi2', 'standard')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_chi2'))
df_results.loc[('mincovdet', 'kbest_chi2')]

accuracy        0.8586
precision       0.3310
recall          0.8951
f1_score        0.4833
mathews_corr    0.4911
AUC             0.8754
Name: (mincovdet, kbest_chi2), dtype: float64

In [13]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.2, support_fraction=1,
                             random_state=4).fit_predict(df[df_features.loc[('kbest_fclass', 'standard')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_fclass'))
df_results.loc[('mincovdet', 'kbest_fclass')]

accuracy        0.7231
precision       0.0034
recall          0.0094
f1_score        0.0050
mathews_corr   -0.1366
AUC             0.3947
Name: (mincovdet, kbest_fclass), dtype: float64

In [14]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.2, support_fraction=1,
                             random_state=4).fit_predict(df[df_features.loc[('extraTrees', 'standard')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'extraTrees'))
df_results.loc[('mincovdet', 'extraTrees')]

accuracy        0.7301
precision       0.0112
recall          0.0303
f1_score        0.0163
mathews_corr   -0.1201
AUC             0.4081
Name: (mincovdet, extraTrees), dtype: float64

In [15]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.2, support_fraction=1,
                             random_state=4).fit_predict(df[df_features.loc[('randomForest', 'standard')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'randomForest'))
df_results.loc[('mincovdet', 'randomForest')]

accuracy        0.7410
precision       0.0355
recall          0.0956
f1_score        0.0517
mathews_corr   -0.0733
AUC             0.4440
Name: (mincovdet, randomForest), dtype: float64

# Isolation Forest

In [16]:
isolation = IsolationForest(n_estimators=100, contamination=0.2, random_state=0, 
                            behaviour='new').fit_predict(df[df_features.loc[('kbest_chi2', 'standard')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'kbest_chi2'))
df_results.loc[('isolation', 'kbest_chi2')]

accuracy        0.7356
precision       0.0238
recall          0.0644
f1_score        0.0348
mathews_corr   -0.0957
AUC             0.4268
Name: (isolation, kbest_chi2), dtype: float64

In [17]:
isolation = IsolationForest(n_estimators=100, contamination=0.2, random_state=0, 
                            behaviour='new').fit_predict(df[df_features.loc[('kbest_fclass', 'standard')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'kbest_fclass'))
df_results.loc[('isolation', 'kbest_fclass')]

accuracy        0.7320
precision       0.0119
recall          0.0321
f1_score        0.0174
mathews_corr   -0.1180
AUC             0.4100
Name: (isolation, kbest_fclass), dtype: float64

In [19]:
isolation = IsolationForest(n_estimators=100, contamination=0.2, random_state=0, 
                            behaviour='new').fit_predict(df[df_features.loc[('extraTrees', 'standard')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'extraTrees'))
df_results.loc[('isolation', 'extraTrees')]

accuracy        0.7281
precision       0.0039
recall          0.0104
f1_score        0.0056
mathews_corr   -0.1337
AUC             0.3979
Name: (isolation, extraTrees), dtype: float64

In [20]:
isolation = IsolationForest(n_estimators=100, contamination=0.2, random_state=0, 
                            behaviour='new').fit_predict(df[df_features.loc[('randomForest', 'standard')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'randomForest'))
df_results.loc[('isolation', 'randomForest')]

accuracy        0.7307
precision       0.0115
recall          0.0311
f1_score        0.0168
mathews_corr   -0.1192
AUC             0.4088
Name: (isolation, randomForest), dtype: float64

# Resumen resultados sin preprocesamiento de los datos

In [30]:
df_results.to_csv('ISCX_results/ISCX_standardScaler_results.csv')
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,0.9274,0.8401,0.0222,0.0433,0.1296,0.5109
kmeans,kbest_fclass,0.5205,0.1323,0.9873,0.2333,0.2475,0.7353
kmeans,extraTrees,0.5603,0.0049,0.0245,0.0082,-0.2019,0.3137
kmeans,randomForest,0.5171,0.1315,0.9875,0.2321,0.2459,0.7335
minibatch,kbest_chi2,0.6987,0.1727,0.8124,0.2849,0.2757,0.751
minibatch,kbest_fclass,0.5198,0.1321,0.9873,0.233,0.2471,0.7349
minibatch,extraTrees,0.7422,0.2145,0.9353,0.349,0.3706,0.831
minibatch,randomForest,0.5171,0.1315,0.9875,0.2321,0.2459,0.7335
birch,kbest_chi2,0.9274,0.837,0.0222,0.0433,0.1293,0.5109
birch,kbest_fclass,0.5139,0.1308,0.988,0.231,0.2445,0.7321
