In [1]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

# Feature matrix stored under the "norml1" file name.
df = pd.read_csv('ISCX_datasets_preprocesados/norml1.csv')

# Tags file: read without a header row, first column as index, data column named 'Tag'.
df_tags = pd.read_csv(
    'ISCX_datasets_preprocesados/ISCX_tags.csv',
    index_col=0,
    header=None,
    names=['Tag'],
)

# Selected feature names, indexed by (featureSelection, preprocesamiento).
df_features = pd.read_csv(
    'ISCX_datasets_preprocesados/ISCX_features.csv',
    index_col=['featureSelection', 'preprocesamiento'],
)

# Display only the rows whose preprocessing level is 'norm_l1'.
df_features.loc[pd.IndexSlice[:, 'norm_l1'], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
featureSelection,preprocesamiento,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kbest_chi2,norm_l1,s3,s8,s0,d8,eucl,arit,s5,s6,d19,d21
kbest_fclass,norm_l1,arit,eucl,s3,s8,s0,d8,s5,s6,d19,d21
extraTrees,norm_l1,s8,s31,d6,xor,d30,d21,eucl,s23,s3,d5
randomForest,norm_l1,s3,s8,eucl,d4,d31,d23,d19,d30,norm_src,s4


In [2]:
def save_results(real, prediccion, indexs):
    """Score a prediction against the reference tags and store the metrics.

    Computes accuracy, precision, recall, F1, Matthews correlation and ROC AUC
    (in that order), rounds each to 4 decimals and writes them into the row
    ``indexs`` of the module-level ``df_results`` frame.

    Parameters
    ----------
    real : reference labels, passed as ``y_true`` to every metric.
    prediccion : predicted labels, passed as ``y_pred`` to every metric.
    indexs : row key of ``df_results`` to overwrite, e.g. ``('kmeans', 'kbest_chi2')``.
    """
    # Order must match the column order of df_results.
    metric_functions = (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
        roc_auc_score,
    )
    df_results.loc[indexs] = [round(metric(real, prediccion), 4) for metric in metric_functions]

Se cargan los resultados obtenidos anteriormente (si existen) para no tener que volver a ejecutar todos los procesos, sino sólo los deseados.

In [3]:
from os import path

# Single location of the cached metrics. The same path is used for the
# existence check and for the read, so the two can no longer disagree
# (previously the check looked inside 'ISCX_results/' while the read used
# the bare file name in the working directory).
results_file = 'ISCX_results/ISCX_norml1_results.csv'

if path.exists(results_file):
    # Reuse metrics saved by an earlier run so that only the desired cells need re-executing.
    df_results = pd.read_csv(results_file, index_col=['method', 'featureSelection'])
else:
    # No cache yet: build an empty table with one row per (method, featureSelection) pair.
    multiIndex = [['kmeans', 'minibatch', 'birch', 'mincovdet', 'isolation'],['kbest_chi2', 'kbest_fclass', 'extraTrees', 'randomForest']]

    multiIndex = pd.MultiIndex.from_product(multiIndex, names=['method', 'featureSelection'])
    df_results = pd.DataFrame(None, index=multiIndex, columns=['accuracy', 'precision', 'recall', 'f1_score', 'mathews_corr', 'AUC'])
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,,,,,,
kmeans,kbest_fclass,,,,,,
kmeans,extraTrees,,,,,,
kmeans,randomForest,,,,,,
minibatch,kbest_chi2,,,,,,
minibatch,kbest_fclass,,,,,,
minibatch,extraTrees,,,,,,
minibatch,randomForest,,,,,,
birch,kbest_chi2,,,,,,
birch,kbest_fclass,,,,,,


# KMeans

In [4]:
# KMeans with 2 clusters on the columns listed for ('kbest_chi2', 'norm_l1').
# NOTE(review): algorithm='full' is only accepted by older scikit-learn releases — confirm the pinned version.
kmeans = KMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
    algorithm='full',
)
kmeans.fit(df[df_features.loc[('kbest_chi2', 'norm_l1')]])

# Score the cluster labels against the tags, then show the stored row.
save_results(df_tags, kmeans.labels_, ('kmeans', 'kbest_chi2'))
df_results.loc[('kmeans', 'kbest_chi2')]

accuracy        0.7982
precision        0.245
recall          0.8316
f1_score        0.3784
mathews_corr    0.3784
AUC             0.8136
Name: (kmeans, kbest_chi2), dtype: object

In [5]:
# KMeans with 2 clusters on the columns listed for ('kbest_fclass', 'norm_l1').
kmeans = KMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
    algorithm='full',
)
kmeans.fit(df[df_features.loc[('kbest_fclass', 'norm_l1')]])

# Score the cluster labels against the tags, then show the stored row.
save_results(df_tags, kmeans.labels_, ('kmeans', 'kbest_fclass'))
df_results.loc[('kmeans', 'kbest_fclass')]

accuracy        0.7982
precision        0.245
recall          0.8316
f1_score        0.3784
mathews_corr    0.3784
AUC             0.8136
Name: (kmeans, kbest_fclass), dtype: object

In [6]:
# KMeans with 2 clusters on the columns listed for ('extraTrees', 'norm_l1').
kmeans = KMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
    algorithm='full',
)
kmeans.fit(df[df_features.loc[('extraTrees', 'norm_l1')]])

# Cluster ids 0 and 1 are swapped before scoring
# (presumably so the ids line up with the tag coding — confirm).
save_results(df_tags, 1 - kmeans.labels_, ('kmeans', 'extraTrees'))
df_results.loc[('kmeans', 'extraTrees')]

accuracy        0.7736
precision       0.2338
recall           0.907
f1_score        0.3718
mathews_corr    0.3875
AUC             0.8349
Name: (kmeans, extraTrees), dtype: object

In [7]:
# KMeans with 2 clusters on the columns listed for ('randomForest', 'norm_l1').
kmeans = KMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
    algorithm='full',
)
kmeans.fit(df[df_features.loc[('randomForest', 'norm_l1')]])

# Score the cluster labels against the tags, then show the stored row.
save_results(df_tags, kmeans.labels_, ('kmeans', 'randomForest'))
df_results.loc[('kmeans', 'randomForest')]

accuracy        0.7982
precision        0.245
recall          0.8316
f1_score        0.3784
mathews_corr    0.3784
AUC             0.8136
Name: (kmeans, randomForest), dtype: object

# MiniBatch KMeans

In [8]:
# MiniBatchKMeans with 2 clusters on the columns listed for ('kbest_chi2', 'norm_l1').
minibatch = MiniBatchKMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
)
minibatch.fit(df[df_features.loc[('kbest_chi2', 'norm_l1')]])

# Cluster ids 0 and 1 are swapped before scoring against the tags.
save_results(df_tags, 1 - minibatch.labels_, ('minibatch', 'kbest_chi2'))
df_results.loc[('minibatch', 'kbest_chi2')]

accuracy        0.7981
precision       0.2449
recall          0.8316
f1_score        0.3784
mathews_corr    0.3784
AUC             0.8135
Name: (minibatch, kbest_chi2), dtype: object

In [9]:
# MiniBatchKMeans with 2 clusters on the columns listed for ('kbest_fclass', 'norm_l1').
minibatch = MiniBatchKMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
)
minibatch.fit(df[df_features.loc[('kbest_fclass', 'norm_l1')]])

# Cluster ids 0 and 1 are swapped before scoring against the tags.
save_results(df_tags, 1 - minibatch.labels_, ('minibatch', 'kbest_fclass'))
df_results.loc[('minibatch', 'kbest_fclass')]

accuracy        0.7981
precision       0.2449
recall          0.8316
f1_score        0.3784
mathews_corr    0.3784
AUC             0.8135
Name: (minibatch, kbest_fclass), dtype: object

In [10]:
# MiniBatchKMeans with 2 clusters on the columns listed for ('extraTrees', 'norm_l1').
minibatch = MiniBatchKMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
)
minibatch.fit(df[df_features.loc[('extraTrees', 'norm_l1')]])

# Labels are scored as-is here: no 0/1 swap is applied in this cell.
save_results(df_tags, minibatch.labels_, ('minibatch', 'extraTrees'))
df_results.loc[('minibatch', 'extraTrees')]

accuracy        0.7736
precision       0.2338
recall           0.907
f1_score        0.3718
mathews_corr    0.3875
AUC             0.8349
Name: (minibatch, extraTrees), dtype: object

In [11]:
# MiniBatchKMeans with 2 clusters on the columns listed for ('randomForest', 'norm_l1').
minibatch = MiniBatchKMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
)
minibatch.fit(df[df_features.loc[('randomForest', 'norm_l1')]])

# Cluster ids 0 and 1 are swapped before scoring against the tags.
save_results(df_tags, 1 - minibatch.labels_, ('minibatch', 'randomForest'))
df_results.loc[('minibatch', 'randomForest')]

accuracy        0.7981
precision       0.2449
recall          0.8316
f1_score        0.3784
mathews_corr    0.3784
AUC             0.8135
Name: (minibatch, randomForest), dtype: object

# BIRCH

In [22]:
# BIRCH with 2 clusters on the columns listed for ('kbest_chi2', 'norm_l1').
birch = Birch(
    n_clusters=2,
    threshold=0.1,
    branching_factor=10,
)
# From here on `birch` holds the per-row labels returned by fit_predict.
birch = birch.fit_predict(df[df_features.loc[('kbest_chi2', 'norm_l1')]])

# Score the labels against the tags, then show the stored row.
save_results(df_tags, birch, ('birch', 'kbest_chi2'))
df_results.loc[('birch', 'kbest_chi2')]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


accuracy        0.9261
precision            0
recall               0
f1_score             0
mathews_corr         0
AUC                0.5
Name: (birch, kbest_chi2), dtype: object

In [24]:
# BIRCH with 2 clusters on the columns listed for ('kbest_fclass', 'norm_l1').
birch = Birch(
    n_clusters=2,
    threshold=0.1,
    branching_factor=10,
)
# From here on `birch` holds the per-row labels returned by fit_predict.
birch = birch.fit_predict(df[df_features.loc[('kbest_fclass', 'norm_l1')]])

# Score the labels against the tags, then show the stored row.
save_results(df_tags, birch, ('birch', 'kbest_fclass'))
df_results.loc[('birch', 'kbest_fclass')]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


accuracy        0.9261
precision            0
recall               0
f1_score             0
mathews_corr         0
AUC                0.5
Name: (birch, kbest_fclass), dtype: object

In [27]:
# BIRCH with 2 clusters on the columns listed for ('extraTrees', 'norm_l1').
birch = Birch(
    n_clusters=2,
    threshold=0.1,
    branching_factor=10,
)
# From here on `birch` holds the per-row labels returned by fit_predict.
birch = birch.fit_predict(df[df_features.loc[('extraTrees', 'norm_l1')]])

# Cluster ids 0 and 1 are swapped before scoring against the tags.
save_results(df_tags, 1 - birch, ('birch', 'extraTrees'))
df_results.loc[('birch', 'extraTrees')]

accuracy        0.8905
precision         0.23
recall           0.205
f1_score        0.2168
mathews_corr    0.1585
AUC             0.5751
Name: (birch, extraTrees), dtype: object

In [26]:
# BIRCH with 2 clusters on the columns listed for ('randomForest', 'norm_l1').
birch = Birch(
    n_clusters=2,
    threshold=0.1,
    branching_factor=10,
)
# From here on `birch` holds the per-row labels returned by fit_predict.
birch = birch.fit_predict(df[df_features.loc[('randomForest', 'norm_l1')]])

# Score the labels against the tags, then show the stored row.
save_results(df_tags, birch, ('birch', 'randomForest'))
df_results.loc[('birch', 'randomForest')]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


accuracy        0.9261
precision            0
recall               0
f1_score             0
mathews_corr         0
AUC                0.5
Name: (birch, randomForest), dtype: object

# MinCovDet

In [12]:
# EllipticEnvelope on the columns listed for ('kbest_chi2', 'norm_l1').
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
)
minCovDet = minCovDet.fit_predict(df[df_features.loc[('kbest_chi2', 'norm_l1')]])

# Recode the detector output before scoring: 1 becomes 0 and -1 becomes 1.
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_chi2'))
df_results.loc[('mincovdet', 'kbest_chi2')]



accuracy        0.8643
precision       0.3486
recall          0.9634
f1_score         0.512
mathews_corr     0.532
AUC             0.9099
Name: (mincovdet, kbest_chi2), dtype: object

In [13]:
# EllipticEnvelope on the columns listed for ('kbest_fclass', 'norm_l1').
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
)
minCovDet = minCovDet.fit_predict(df[df_features.loc[('kbest_fclass', 'norm_l1')]])

# Recode the detector output before scoring: 1 becomes 0 and -1 becomes 1.
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_fclass'))
df_results.loc[('mincovdet', 'kbest_fclass')]



accuracy        0.8643
precision       0.3486
recall          0.9634
f1_score         0.512
mathews_corr     0.532
AUC             0.9099
Name: (mincovdet, kbest_fclass), dtype: object

In [14]:
# EllipticEnvelope on the columns listed for ('extraTrees', 'norm_l1').
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
)
minCovDet = minCovDet.fit_predict(df[df_features.loc[('extraTrees', 'norm_l1')]])

# Recode the detector output before scoring: 1 becomes 0 and -1 becomes 1.
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(df_tags, minCovDet, ('mincovdet', 'extraTrees'))
df_results.loc[('mincovdet', 'extraTrees')]



accuracy        0.7917
precision       0.2387
recall          0.8306
f1_score        0.3708
mathews_corr    0.3706
AUC             0.8096
Name: (mincovdet, extraTrees), dtype: object

In [15]:
# EllipticEnvelope on the columns listed for ('randomForest', 'norm_l1').
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
)
minCovDet = minCovDet.fit_predict(df[df_features.loc[('randomForest', 'norm_l1')]])

# Recode the detector output before scoring: 1 becomes 0 and -1 becomes 1.
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(df_tags, minCovDet, ('mincovdet', 'randomForest'))
df_results.loc[('mincovdet', 'randomForest')]



accuracy        0.8442
precision       0.2982
recall          0.8191
f1_score        0.4373
mathews_corr    0.4328
AUC             0.8327
Name: (mincovdet, randomForest), dtype: object

# Isolation Forest

In [16]:
# IsolationForest on the columns listed for ('kbest_chi2', 'norm_l1').
# NOTE(review): the `behaviour` argument only exists in older scikit-learn releases — confirm the pinned version.
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
)
isolation = isolation.fit_predict(df[df_features.loc[('kbest_chi2', 'norm_l1')]])

# Recode the detector output before scoring: 1 becomes 0 and -1 becomes 1.
isolation = np.where(isolation == -1, 1, 0)
save_results(df_tags, isolation, ('isolation', 'kbest_chi2'))
df_results.loc[('isolation', 'kbest_chi2')]

accuracy        0.8553
precision        0.317
recall          0.8305
f1_score        0.4589
mathews_corr    0.4554
AUC             0.8439
Name: (isolation, kbest_chi2), dtype: object

In [17]:
# IsolationForest on the columns listed for ('kbest_fclass', 'norm_l1').
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
)
isolation = isolation.fit_predict(df[df_features.loc[('kbest_fclass', 'norm_l1')]])

# Recode the detector output before scoring: 1 becomes 0 and -1 becomes 1.
isolation = np.where(isolation == -1, 1, 0)
save_results(df_tags, isolation, ('isolation', 'kbest_fclass'))
df_results.loc[('isolation', 'kbest_fclass')]

accuracy         0.862
precision       0.3395
recall          0.9185
f1_score        0.4958
mathews_corr    0.5075
AUC              0.888
Name: (isolation, kbest_fclass), dtype: object

In [18]:
# IsolationForest on the columns listed for ('extraTrees', 'norm_l1').
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
)
isolation = isolation.fit_predict(df[df_features.loc[('extraTrees', 'norm_l1')]])

# Recode the detector output before scoring: 1 becomes 0 and -1 becomes 1.
isolation = np.where(isolation == -1, 1, 0)
save_results(df_tags, isolation, ('isolation', 'extraTrees'))
df_results.loc[('isolation', 'extraTrees')]

accuracy         0.843
precision       0.2954
recall          0.8119
f1_score        0.4332
mathews_corr    0.4275
AUC             0.8287
Name: (isolation, extraTrees), dtype: object

In [19]:
# IsolationForest on the columns listed for ('randomForest', 'norm_l1').
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
)
isolation = isolation.fit_predict(df[df_features.loc[('randomForest', 'norm_l1')]])

# Recode the detector output before scoring: 1 becomes 0 and -1 becomes 1.
isolation = np.where(isolation == -1, 1, 0)
save_results(df_tags, isolation, ('isolation', 'randomForest'))
df_results.loc[('isolation', 'randomForest')]

accuracy        0.8641
precision       0.3483
recall          0.9633
f1_score        0.5116
mathews_corr    0.5316
AUC             0.9097
Name: (isolation, randomForest), dtype: object

# Resumen resultados aplicando la transformación Normalización L1 a los datos

In [28]:
# Persist the metrics table so a later session can reload it instead of recomputing.
results_file = 'ISCX_results/ISCX_norml1_results.csv'
results_dir = os.path.dirname(results_file)

# to_csv does not create missing folders; without this guard a fresh checkout
# would raise here, after every model has already been fitted.
if not os.path.isdir(results_dir):
    os.makedirs(results_dir)

df_results.to_csv(results_file)
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,0.7982,0.245,0.8316,0.3784,0.3784,0.8136
kmeans,kbest_fclass,0.7982,0.245,0.8316,0.3784,0.3784,0.8136
kmeans,extraTrees,0.7736,0.2338,0.907,0.3718,0.3875,0.8349
kmeans,randomForest,0.7982,0.245,0.8316,0.3784,0.3784,0.8136
minibatch,kbest_chi2,0.7981,0.2449,0.8316,0.3784,0.3784,0.8135
minibatch,kbest_fclass,0.7981,0.2449,0.8316,0.3784,0.3784,0.8135
minibatch,extraTrees,0.7736,0.2338,0.907,0.3718,0.3875,0.8349
minibatch,randomForest,0.7981,0.2449,0.8316,0.3784,0.3784,0.8135
birch,kbest_chi2,0.9261,0.0,0.0,0.0,0.0,0.5
birch,kbest_fclass,0.9261,0.0,0.0,0.0,0.0,0.5
