In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

# Feature matrix that every model below is fitted on.
df = pd.read_csv('ISCX_datasets_preprocesados/norml2.csv')

# Per-row tags; passed as the ground truth (`real`) to save_results in later cells.
df_tags = pd.read_csv(
    'ISCX_datasets_preprocesados/ISCX_tags.csv',
    index_col=0,
    header=None,
    names=['Tag'],
)

# Table indexed by (featureSelection, preprocesamiento); each row is used below
# as a list of column names to select from `df`.
df_features = pd.read_csv(
    'ISCX_datasets_preprocesados/ISCX_features.csv',
    index_col=['featureSelection', 'preprocesamiento'],
)

# Show only the rows whose second index level is 'norm_l2'.
df_features.loc[pd.IndexSlice[:, 'norm_l2'], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
featureSelection,preprocesamiento,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kbest_chi2,norm_l2,s3,s8,d8,s0,s5,s6,d19,d21,d23,d31
kbest_fclass,norm_l2,eucl,s3,arit,s8,d8,xor,s0,s5,s6,d19
extraTrees,norm_l2,d9,d19,s8,s19,d0,s5,xor,d21,d6,s30
randomForest,norm_l2,s3,s8,xor,eucl,s5,d21,s31,d4,s4,d30


In [2]:
def save_results(real, prediccion, indexs):
    """Score a prediction against the true labels and store it in ``df_results``.

    Computes accuracy, precision, recall, F1, Matthews correlation and ROC AUC
    (in that order), rounds each to 4 decimals and writes them into the row of
    the module-level ``df_results`` frame addressed by ``indexs``.

    Parameters
    ----------
    real : array-like
        Ground-truth labels.
    prediccion : array-like
        Predicted labels, same length as ``real``.
    indexs : tuple
        Row key of ``df_results``, e.g. ``('kmeans', 'kbest_chi2')``.
    """
    # Order must match the column order of df_results.
    scorers = (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
        roc_auc_score,
    )
    df_results.loc[indexs] = [round(scorer(real, prediccion), 4) for scorer in scorers]

In [3]:
from os import path

# Start from an empty (method x featureSelection) table unless a previous run
# already saved results, in which case those are reloaded.
if not path.exists('ISCX_results/ISCX_norml2_results.csv'):
    multiIndex = pd.MultiIndex.from_product(
        [
            ['kmeans', 'minibatch', 'birch', 'mincovdet', 'isolation'],
            ['kbest_chi2', 'kbest_fclass', 'extraTrees', 'randomForest'],
        ],
        names=['method', 'featureSelection'],
    )
    df_results = pd.DataFrame(
        None,
        index=multiIndex,
        columns=['accuracy', 'precision', 'recall', 'f1_score', 'mathews_corr', 'AUC'],
    )
else:
    df_results = pd.read_csv(
        'ISCX_results/ISCX_norml2_results.csv',
        index_col=['method', 'featureSelection'],
    )
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,,,,,,
kmeans,kbest_fclass,,,,,,
kmeans,extraTrees,,,,,,
kmeans,randomForest,,,,,,
minibatch,kbest_chi2,,,,,,
minibatch,kbest_fclass,,,,,,
minibatch,extraTrees,,,,,,
minibatch,randomForest,,,,,,
birch,kbest_chi2,,,,,,
birch,kbest_fclass,,,,,,


# KMeans

In [4]:
# Two-cluster KMeans on the columns listed in the ('kbest_chi2', 'norm_l2') row.
# NOTE(review): algorithm='full' was renamed in later scikit-learn releases — confirm the pinned version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full')
kmeans.fit(df[df_features.loc[('kbest_chi2', 'norm_l2')]])

# Cluster ids are scored against the tags as-is (no 0/1 flip in this cell).
save_results(real=df_tags, prediccion=kmeans.labels_, indexs=('kmeans', 'kbest_chi2'))
df_results.loc[('kmeans', 'kbest_chi2')]

accuracy        0.9292
precision       0.5644
recall          0.1837
f1_score        0.2772
mathews_corr    0.2944
AUC             0.5862
Name: (kmeans, kbest_chi2), dtype: object

In [5]:
# Two-cluster KMeans on the columns listed in the ('kbest_fclass', 'norm_l2') row.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full')
kmeans.fit(df[df_features.loc[('kbest_fclass', 'norm_l2')]])

# Cluster ids are scored against the tags as-is (no 0/1 flip in this cell).
save_results(real=df_tags, prediccion=kmeans.labels_, indexs=('kmeans', 'kbest_fclass'))
df_results.loc[('kmeans', 'kbest_fclass')]

accuracy        0.7723
precision       0.2329
recall           0.907
f1_score        0.3706
mathews_corr    0.3863
AUC             0.8343
Name: (kmeans, kbest_fclass), dtype: object

In [6]:
# Two-cluster KMeans on the columns listed in the ('extraTrees', 'norm_l2') row.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full')
kmeans.fit(df[df_features.loc[('extraTrees', 'norm_l2')]])

# Cluster ids are scored against the tags as-is (no 0/1 flip in this cell).
save_results(real=df_tags, prediccion=kmeans.labels_, indexs=('kmeans', 'extraTrees'))
df_results.loc[('kmeans', 'extraTrees')]

accuracy        0.7704
precision       0.2313
recall           0.907
f1_score        0.3686
mathews_corr    0.3843
AUC             0.8332
Name: (kmeans, extraTrees), dtype: object

In [7]:
# Two-cluster KMeans on the columns listed in the ('randomForest', 'norm_l2') row.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full')
kmeans.fit(df[df_features.loc[('randomForest', 'norm_l2')]])

# Swap cluster ids 0 <-> 1 before scoring; presumably the cluster numbering
# came out opposite to the tag encoding for this feature set — verify.
save_results(real=df_tags, prediccion=1 - kmeans.labels_, indexs=('kmeans', 'randomForest'))
df_results.loc[('kmeans', 'randomForest')]

accuracy        0.7723
precision       0.2329
recall           0.907
f1_score        0.3706
mathews_corr    0.3863
AUC             0.8343
Name: (kmeans, randomForest), dtype: object

# MiniBatch KMeans

In [8]:
# Two-cluster MiniBatchKMeans on the columns listed in the ('kbest_chi2', 'norm_l2') row.
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('kbest_chi2', 'norm_l2')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'kbest_chi2'))    # Cluster ids are used as-is here (no 0/1 swap)
df_results.loc[('minibatch', 'kbest_chi2')]

accuracy        0.9213
precision       0.3233
recall          0.0591
f1_score           0.1
mathews_corr    0.1116
AUC             0.5246
Name: (minibatch, kbest_chi2), dtype: object

In [9]:
# Two-cluster MiniBatchKMeans on the columns listed in the ('kbest_fclass', 'norm_l2') row.
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('kbest_fclass', 'norm_l2')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'kbest_fclass'))    # Cluster ids are used as-is here (no 0/1 swap)
df_results.loc[('minibatch', 'kbest_fclass')]

accuracy        0.7724
precision       0.2329
recall           0.907
f1_score        0.3706
mathews_corr    0.3863
AUC             0.8343
Name: (minibatch, kbest_fclass), dtype: object

In [10]:
# Two-cluster MiniBatchKMeans on the columns listed in the ('extraTrees', 'norm_l2') row.
minibatch = MiniBatchKMeans(n_clusters=2, n_init=10, random_state=0)
minibatch.fit(df[df_features.loc[('extraTrees', 'norm_l2')]])

# Cluster ids 0 and 1 are swapped before scoring.
save_results(real=df_tags, prediccion=1 - minibatch.labels_, indexs=('minibatch', 'extraTrees'))
df_results.loc[('minibatch', 'extraTrees')]

accuracy        0.7704
precision       0.2313
recall           0.907
f1_score        0.3685
mathews_corr    0.3843
AUC             0.8332
Name: (minibatch, extraTrees), dtype: object

In [11]:
# Two-cluster MiniBatchKMeans on the columns listed in the ('randomForest', 'norm_l2') row.
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('randomForest', 'norm_l2')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'randomForest'))    # Cluster ids are used as-is here (no 0/1 swap)
df_results.loc[('minibatch', 'randomForest')]

accuracy        0.7722
precision       0.2328
recall           0.907
f1_score        0.3705
mathews_corr    0.3862
AUC             0.8342
Name: (minibatch, randomForest), dtype: object

# BIRCH

In [22]:
# BIRCH with two final clusters on the ('kbest_chi2', 'norm_l2') columns;
# `birch` holds the per-row cluster ids returned by fit_predict.
birch = Birch(
    n_clusters=2,
    threshold=0.1,
    branching_factor=10,
).fit_predict(df[df_features.loc[('kbest_chi2', 'norm_l2')]])

# Cluster ids 0 and 1 are swapped before scoring.
save_results(real=df_tags, prediccion=1 - birch, indexs=('birch', 'kbest_chi2'))
df_results.loc[('birch', 'kbest_chi2')]

accuracy        0.9227
precision        0.359
recall          0.0591
f1_score        0.1016
mathews_corr     0.121
AUC             0.5254
Name: (birch, kbest_chi2), dtype: object

In [24]:
# BIRCH with two final clusters on the ('kbest_fclass', 'norm_l2') columns;
# `birch` holds the per-row cluster ids returned by fit_predict.
birch = Birch(
    n_clusters=2,
    threshold=0.1,
    branching_factor=10,
).fit_predict(df[df_features.loc[('kbest_fclass', 'norm_l2')]])

# Cluster ids 0 and 1 are swapped before scoring.
save_results(real=df_tags, prediccion=1 - birch, indexs=('birch', 'kbest_fclass'))
df_results.loc[('birch', 'kbest_fclass')]

accuracy        0.9199
precision       0.2933
recall          0.0592
f1_score        0.0986
mathews_corr    0.1032
AUC             0.5239
Name: (birch, kbest_fclass), dtype: object

In [25]:
# BIRCH with two final clusters on the ('extraTrees', 'norm_l2') columns;
# `birch` holds the per-row cluster ids returned by fit_predict.
birch = Birch(
    n_clusters=2,
    threshold=0.1,
    branching_factor=10,
).fit_predict(df[df_features.loc[('extraTrees', 'norm_l2')]])

# Cluster ids are scored as-is in this cell (no 0/1 swap).
save_results(real=df_tags, prediccion=birch, indexs=('birch', 'extraTrees'))
df_results.loc[('birch', 'extraTrees')]

accuracy        0.9227
precision        0.359
recall          0.0591
f1_score        0.1016
mathews_corr     0.121
AUC             0.5254
Name: (birch, extraTrees), dtype: object

In [26]:
# BIRCH with two final clusters on the ('randomForest', 'norm_l2') columns;
# `birch` holds the per-row cluster ids returned by fit_predict.
birch = Birch(
    n_clusters=2,
    threshold=0.1,
    branching_factor=10,
).fit_predict(df[df_features.loc[('randomForest', 'norm_l2')]])

# Cluster ids 0 and 1 are swapped before scoring.
save_results(real=df_tags, prediccion=1 - birch, indexs=('birch', 'randomForest'))
df_results.loc[('birch', 'randomForest')]

accuracy        0.9199
precision       0.2933
recall          0.0592
f1_score        0.0986
mathews_corr    0.1032
AUC             0.5239
Name: (birch, randomForest), dtype: object

# MinCovDet

In [12]:
# Elliptic-envelope outlier detection on the ('kbest_chi2', 'norm_l2') columns.
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
).fit_predict(df[df_features.loc[('kbest_chi2', 'norm_l2')]])

# Re-encode the +1 / -1 predictions as 0 / 1 so that -1 becomes the positive class.
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(real=df_tags, prediccion=minCovDet, indexs=('mincovdet', 'kbest_chi2'))
df_results.loc[('mincovdet', 'kbest_chi2')]



accuracy        0.8657
precision        0.351
recall          0.9633
f1_score        0.5146
mathews_corr    0.5343
AUC             0.9106
Name: (mincovdet, kbest_chi2), dtype: object

In [13]:
# Elliptic-envelope outlier detection on the ('kbest_fclass', 'norm_l2') columns.
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
).fit_predict(df[df_features.loc[('kbest_fclass', 'norm_l2')]])

# Re-encode the +1 / -1 predictions as 0 / 1 so that -1 becomes the positive class.
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(real=df_tags, prediccion=minCovDet, indexs=('mincovdet', 'kbest_fclass'))
df_results.loc[('mincovdet', 'kbest_fclass')]



accuracy        0.8696
precision       0.3578
recall           0.963
f1_score        0.5217
mathews_corr    0.5407
AUC             0.9125
Name: (mincovdet, kbest_fclass), dtype: object

In [14]:
# Elliptic-envelope outlier detection on the ('extraTrees', 'norm_l2') columns.
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
).fit_predict(df[df_features.loc[('extraTrees', 'norm_l2')]])

# Re-encode the +1 / -1 predictions as 0 / 1 so that -1 becomes the positive class.
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(real=df_tags, prediccion=minCovDet, indexs=('mincovdet', 'extraTrees'))
df_results.loc[('mincovdet', 'extraTrees')]



accuracy        0.8567
precision        0.324
recall          0.8648
f1_score        0.4714
mathews_corr     0.474
AUC             0.8604
Name: (mincovdet, extraTrees), dtype: object

In [23]:
# Elliptic-envelope outlier detection on the ('randomForest', 'norm_l2') columns.
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
).fit_predict(df[df_features.loc[('randomForest', 'norm_l2')]])

# Re-encode the +1 / -1 predictions as 0 / 1 so that -1 becomes the positive class.
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(real=df_tags, prediccion=minCovDet, indexs=('mincovdet', 'randomForest'))
df_results.loc[('mincovdet', 'randomForest')]



accuracy        0.9004
precision       0.4123
recall          0.8186
f1_score        0.5484
mathews_corr    0.5364
AUC             0.8628
Name: (mincovdet, randomForest), dtype: object

# Isolation Forest

In [16]:
# Isolation Forest on the ('kbest_chi2', 'norm_l2') columns.
# NOTE(review): `behaviour` is a transitional argument of older scikit-learn
# releases and is not accepted by newer ones — confirm the pinned version.
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
).fit_predict(df[df_features.loc[('kbest_chi2', 'norm_l2')]])

# Re-encode the +1 / -1 predictions as 0 / 1 so that -1 becomes the positive class.
isolation = np.where(isolation == -1, 1, 0)
save_results(real=df_tags, prediccion=isolation, indexs=('isolation', 'kbest_chi2'))
df_results.loc[('isolation', 'kbest_chi2')]

accuracy        0.8572
precision       0.3368
recall          0.9633
f1_score        0.4991
mathews_corr    0.5203
AUC              0.906
Name: (isolation, kbest_chi2), dtype: object

In [17]:
# Isolation Forest on the ('kbest_fclass', 'norm_l2') columns.
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
).fit_predict(df[df_features.loc[('kbest_fclass', 'norm_l2')]])

# Re-encode the +1 / -1 predictions as 0 / 1 so that -1 becomes the positive class.
isolation = np.where(isolation == -1, 1, 0)
save_results(real=df_tags, prediccion=isolation, indexs=('isolation', 'kbest_fclass'))
df_results.loc[('isolation', 'kbest_fclass')]

accuracy        0.8497
precision       0.3082
recall          0.8309
f1_score        0.4496
mathews_corr    0.4467
AUC             0.8411
Name: (isolation, kbest_fclass), dtype: object

In [18]:
# Isolation Forest on the ('extraTrees', 'norm_l2') columns.
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
).fit_predict(df[df_features.loc[('extraTrees', 'norm_l2')]])

# Re-encode the +1 / -1 predictions as 0 / 1 so that -1 becomes the positive class.
isolation = np.where(isolation == -1, 1, 0)
save_results(real=df_tags, prediccion=isolation, indexs=('isolation', 'extraTrees'))
df_results.loc[('isolation', 'extraTrees')]

accuracy        0.8576
precision        0.327
recall          0.8765
f1_score        0.4763
mathews_corr    0.4809
AUC             0.8663
Name: (isolation, extraTrees), dtype: object

In [19]:
# Isolation Forest on the ('randomForest', 'norm_l2') columns.
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
).fit_predict(df[df_features.loc[('randomForest', 'norm_l2')]])

# Re-encode the +1 / -1 predictions as 0 / 1 so that -1 becomes the positive class.
isolation = np.where(isolation == -1, 1, 0)
save_results(real=df_tags, prediccion=isolation, indexs=('isolation', 'randomForest'))
df_results.loc[('isolation', 'randomForest')]

accuracy        0.8553
precision       0.3228
recall           0.873
f1_score        0.4714
mathews_corr    0.4755
AUC             0.8634
Name: (isolation, randomForest), dtype: object

# Resumen resultados con preprocesamiento norm_l2 de los datos

In [27]:
# Persist the metrics table; the results-initialisation cell reloads this same
# file when it already exists, so a later session resumes from these values.
df_results.to_csv('ISCX_results/ISCX_norml2_results.csv')
# Display the full table as the cell output.
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,0.9292,0.5644,0.1837,0.2772,0.2944,0.5862
kmeans,kbest_fclass,0.7723,0.2329,0.907,0.3706,0.3863,0.8343
kmeans,extraTrees,0.7704,0.2313,0.907,0.3686,0.3843,0.8332
kmeans,randomForest,0.7723,0.2329,0.907,0.3706,0.3863,0.8343
minibatch,kbest_chi2,0.9213,0.3233,0.0591,0.1,0.1116,0.5246
minibatch,kbest_fclass,0.7724,0.2329,0.907,0.3706,0.3863,0.8343
minibatch,extraTrees,0.7704,0.2313,0.907,0.3685,0.3843,0.8332
minibatch,randomForest,0.7722,0.2328,0.907,0.3705,0.3862,0.8342
birch,kbest_chi2,0.9227,0.359,0.0591,0.1016,0.121,0.5254
birch,kbest_fclass,0.9199,0.2933,0.0592,0.0986,0.1032,0.5239
