In [1]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score
from sklearn.svm import OneClassSVM

# Input data: the sample table, and the tag column that is later passed as the
# ground truth to the metric functions.
# NOTE(review): 'no.csv' presumably is the dataset variant without preprocessing — confirm.
df = pd.read_csv('ISCX_datasets_preprocesados/no.csv')
df_tags = pd.read_csv(
    'ISCX_datasets_preprocesados/ISCX_tags.csv',
    names=['Tag'],
    header=None,
    index_col=0,
)

# Selected feature names, indexed by (featureSelection, preprocesamiento).
df_features = pd.read_csv(
    'ISCX_datasets_preprocesados/ISCX_features.csv',
    index_col=['featureSelection', 'preprocesamiento'],
)
# Display only the rows whose 'preprocesamiento' level equals 'no'.
df_features.loc[(slice(None), 'no'), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
featureSelection,preprocesamiento,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kbest_chi2,no,xor+,xor,eucl,hamm,eucl_dec,arit,s0,s1,d18,d12
kbest_fclass,no,arit,eucl_dec,s1,hamm,d12,d18,xor+,s0,xor,d25
extraTrees,no,eucl_dec,d10,s1,s3,d4,s2,arit,s8,d8,s0
randomForest,no,xor+,arit,eucl_dec,xor,hamm,s4,eucl,s8,s3,s2


In [2]:
def save_results(real, prediccion, indexs):
    """Score a prediction and store the metrics in the global ``df_results`` table.

    Parameters
    ----------
    real : ground-truth labels, passed as the first argument to every metric.
    prediccion : predicted labels, passed as the second argument to every metric.
    indexs : row key of ``df_results`` that receives the six values.

    The values are written in this order: accuracy, precision, recall, F1,
    Matthews correlation coefficient and ROC AUC, each rounded to 4 decimals.
    Nothing is returned; ``df_results`` is modified in place.
    """
    # Order matters: it must match the column order of df_results.
    metric_functions = (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
        roc_auc_score,
    )
    df_results.loc[indexs] = [round(metric(real, prediccion), 4) for metric in metric_functions]

In [3]:
from os import path

if not path.exists('ISCX_results/ISCX_results.csv'):
    # No stored results yet: build an empty table with one row per
    # (method, featureSelection) combination and one column per metric.
    multiIndex = pd.MultiIndex.from_product(
        [
            ['kmeans', 'minibatch', 'birch', 'mincovdet', 'isolation'],
            ['kbest_chi2', 'kbest_fclass', 'extraTrees', 'randomForest'],
        ],
        names=['method', 'featureSelection'],
    )
    df_results = pd.DataFrame(
        None,
        index=multiIndex,
        columns=['accuracy', 'precision', 'recall', 'f1_score', 'mathews_corr', 'AUC'],
    )
else:
    # Reuse the metrics saved by a previous run.
    df_results = pd.read_csv('ISCX_results/ISCX_results.csv', index_col=['method', 'featureSelection'])
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,0.5168,0.1315,0.988,0.2321,0.246,0.7336
kmeans,kbest_fclass,0.5139,0.1308,0.988,0.231,0.2445,0.7321
kmeans,extraTrees,0.5631,0.1433,0.9872,0.2503,0.2703,0.7583
kmeans,randomForest,0.5168,0.1315,0.988,0.2321,0.246,0.7336
minibatch,kbest_chi2,0.5168,0.1315,0.988,0.2321,0.246,0.7336
minibatch,kbest_fclass,0.5139,0.1308,0.988,0.231,0.2445,0.7321
minibatch,extraTrees,0.561,0.1428,0.9872,0.2494,0.2691,0.7571
minibatch,randomForest,0.5168,0.1315,0.988,0.2321,0.246,0.7336
birch,kbest_chi2,,,,,,
birch,kbest_fclass,,,,,,


# KMeans

In [5]:
# Two-cluster KMeans on the columns listed under ('kbest_chi2', 'no').
# NOTE(review): algorithm='full' is version-dependent in scikit-learn (newer releases
# call it 'lloyd') — confirm against the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full')
kmeans.fit(df[df_features.loc[('kbest_chi2', 'no')]])

# Score the cluster labels as-is against the tags, then show the stored row.
save_results(real=df_tags, prediccion=kmeans.labels_, indexs=('kmeans', 'kbest_chi2'))
df_results.loc[('kmeans', 'kbest_chi2')]

accuracy        0.5168
precision       0.1315
recall           0.988
f1_score        0.2321
mathews_corr     0.246
AUC             0.7336
Name: (kmeans, kbest_chi2), dtype: object

In [6]:
# Two-cluster KMeans on the columns listed under ('kbest_fclass', 'no').
# NOTE(review): algorithm='full' is version-dependent in scikit-learn (newer releases
# call it 'lloyd') — confirm against the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full')
kmeans.fit(df[df_features.loc[('kbest_fclass', 'no')]])

# Score the cluster labels as-is against the tags, then show the stored row.
save_results(real=df_tags, prediccion=kmeans.labels_, indexs=('kmeans', 'kbest_fclass'))
df_results.loc[('kmeans', 'kbest_fclass')]

accuracy        0.5139
precision       0.1308
recall           0.988
f1_score         0.231
mathews_corr    0.2445
AUC             0.7321
Name: (kmeans, kbest_fclass), dtype: object

In [7]:
# Two-cluster KMeans on the columns listed under ('extraTrees', 'no').
# NOTE(review): algorithm='full' is version-dependent in scikit-learn (newer releases
# call it 'lloyd') — confirm against the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full')
kmeans.fit(df[df_features.loc[('extraTrees', 'no')]])

# Score the cluster labels as-is against the tags, then show the stored row.
save_results(real=df_tags, prediccion=kmeans.labels_, indexs=('kmeans', 'extraTrees'))
df_results.loc[('kmeans', 'extraTrees')]

accuracy        0.5631
precision       0.1433
recall          0.9872
f1_score        0.2503
mathews_corr    0.2703
AUC             0.7583
Name: (kmeans, extraTrees), dtype: object

In [8]:
# Two-cluster KMeans on the columns listed under ('randomForest', 'no').
# NOTE(review): algorithm='full' is version-dependent in scikit-learn (newer releases
# call it 'lloyd') — confirm against the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full')
kmeans.fit(df[df_features.loc[('randomForest', 'no')]])

# Score the cluster labels as-is against the tags, then show the stored row.
save_results(real=df_tags, prediccion=kmeans.labels_, indexs=('kmeans', 'randomForest'))
df_results.loc[('kmeans', 'randomForest')]

accuracy        0.5168
precision       0.1315
recall           0.988
f1_score        0.2321
mathews_corr     0.246
AUC             0.7336
Name: (kmeans, randomForest), dtype: object

# MiniBatch KMeans

In [9]:
# Two-cluster MiniBatchKMeans on the columns listed under ('kbest_chi2', 'no').
minibatch = MiniBatchKMeans(n_clusters=2, n_init=10, random_state=0)
minibatch.fit(df[df_features.loc[('kbest_chi2', 'no')]])

# Cluster ids 0 and 1 are swapped before scoring.
save_results(
    real=df_tags,
    prediccion=(minibatch.labels_ + 1) % 2,
    indexs=('minibatch', 'kbest_chi2'),
)
df_results.loc[('minibatch', 'kbest_chi2')]

accuracy        0.5168
precision       0.1315
recall           0.988
f1_score        0.2321
mathews_corr     0.246
AUC             0.7336
Name: (minibatch, kbest_chi2), dtype: object

In [10]:
# Two-cluster MiniBatchKMeans on the columns listed under ('kbest_fclass', 'no').
minibatch = MiniBatchKMeans(n_clusters=2, n_init=10, random_state=0)
minibatch.fit(df[df_features.loc[('kbest_fclass', 'no')]])

# Cluster ids 0 and 1 are swapped before scoring.
save_results(
    real=df_tags,
    prediccion=(minibatch.labels_ + 1) % 2,
    indexs=('minibatch', 'kbest_fclass'),
)
df_results.loc[('minibatch', 'kbest_fclass')]

accuracy        0.5139
precision       0.1308
recall           0.988
f1_score         0.231
mathews_corr    0.2445
AUC             0.7321
Name: (minibatch, kbest_fclass), dtype: object

In [11]:
# Two-cluster MiniBatchKMeans on the columns listed under ('extraTrees', 'no').
minibatch = MiniBatchKMeans(n_clusters=2, n_init=10, random_state=0)
minibatch.fit(df[df_features.loc[('extraTrees', 'no')]])

# Unlike the other MiniBatchKMeans cells, the labels are scored as-is here:
# no 0/1 swap is applied in this cell.
save_results(
    real=df_tags,
    prediccion=minibatch.labels_,
    indexs=('minibatch', 'extraTrees'),
)
df_results.loc[('minibatch', 'extraTrees')]

accuracy         0.561
precision       0.1428
recall          0.9872
f1_score        0.2494
mathews_corr    0.2691
AUC             0.7571
Name: (minibatch, extraTrees), dtype: object

In [12]:
# Two-cluster MiniBatchKMeans on the columns listed under ('randomForest', 'no').
minibatch = MiniBatchKMeans(n_clusters=2, n_init=10, random_state=0)
minibatch.fit(df[df_features.loc[('randomForest', 'no')]])

# Cluster ids 0 and 1 are swapped before scoring.
save_results(
    real=df_tags,
    prediccion=(minibatch.labels_ + 1) % 2,
    indexs=('minibatch', 'randomForest'),
)
df_results.loc[('minibatch', 'randomForest')]

accuracy        0.5168
precision       0.1315
recall           0.988
f1_score        0.2321
mathews_corr     0.246
AUC             0.7336
Name: (minibatch, randomForest), dtype: object

# BIRCH

In [4]:
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: '_CFSubcluster' object has no attribute 'centroid_'", and the
# ('birch', 'kbest_chi2') row of the results table is empty.
# TODO confirm whether this still happens with the installed scikit-learn.
birch = Birch(
    n_clusters=2,
    threshold=0.1,
    branching_factor=10,
).fit_predict(df[df_features.loc[('kbest_chi2', 'no')]])

# `birch` holds the predicted labels; ids 0 and 1 are swapped before scoring.
save_results(real=df_tags, prediccion=(birch + 1) % 2, indexs=('birch', 'kbest_chi2'))
df_results.loc[('birch', 'kbest_chi2')]

AttributeError: '_CFSubcluster' object has no attribute 'centroid_'

In [31]:
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: '_CFSubcluster' object has no attribute 'centroid_'", and the
# ('birch', 'kbest_fclass') row of the results table is empty.
# TODO confirm whether this still happens with the installed scikit-learn.
birch = Birch(
    n_clusters=2,
    threshold=0.1,
    branching_factor=10,
).fit_predict(df[df_features.loc[('kbest_fclass', 'no')]])

# `birch` holds the predicted labels; ids 0 and 1 are swapped before scoring.
save_results(real=df_tags, prediccion=(birch + 1) % 2, indexs=('birch', 'kbest_fclass'))
df_results.loc[('birch', 'kbest_fclass')]

AttributeError: '_CFSubcluster' object has no attribute 'centroid_'

In [25]:
# Two-cluster BIRCH on the columns listed under ('extraTrees', 'no').
# This cell uses threshold=2, whereas the kbest_* BIRCH cells use 0.1.
birch = Birch(
    n_clusters=2,
    threshold=2,
    branching_factor=10,
).fit_predict(df[df_features.loc[('extraTrees', 'no')]])

# `birch` holds the predicted labels; they are scored as-is (no 0/1 swap).
save_results(real=df_tags, prediccion=birch, indexs=('birch', 'extraTrees'))
df_results.loc[('birch', 'extraTrees')]

accuracy        0.5615
precision       0.1429
recall          0.9872
f1_score        0.2496
mathews_corr    0.2694
AUC             0.7574
Name: (birch, extraTrees), dtype: object

In [32]:
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: '_CFSubcluster' object has no attribute 'centroid_'".
# TODO confirm whether this still happens with the installed scikit-learn.
birch = Birch(
    n_clusters=2,
    threshold=2,
    branching_factor=10,
).fit_predict(df[df_features.loc[('randomForest', 'no')]])

# `birch` holds the predicted labels; they are scored as-is (no 0/1 swap).
save_results(real=df_tags, prediccion=birch, indexs=('birch', 'randomForest'))
df_results.loc[('birch', 'randomForest')]

AttributeError: '_CFSubcluster' object has no attribute 'centroid_'

# MinCovDet

In [13]:
# EllipticEnvelope on the columns listed under ('kbest_chi2', 'no').
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
).fit_predict(df[df_features.loc[('kbest_chi2', 'no')]])

# Recode the predictions for scoring: 1 -> 0 and -1 -> 1
# (scikit-learn's outlier detectors return +1 for inliers and -1 for outliers).
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(real=df_tags, prediccion=minCovDet, indexs=('mincovdet', 'kbest_chi2'))
df_results.loc[('mincovdet', 'kbest_chi2')]

accuracy        0.7265
precision       0.0008
recall          0.0023
f1_score        0.0012
mathews_corr   -0.1396
AUC             0.3933
Name: (mincovdet, kbest_chi2), dtype: object

In [14]:
# EllipticEnvelope on the columns listed under ('kbest_fclass', 'no').
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
).fit_predict(df[df_features.loc[('kbest_fclass', 'no')]])

# Recode the predictions for scoring: 1 -> 0 and -1 -> 1
# (scikit-learn's outlier detectors return +1 for inliers and -1 for outliers).
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(real=df_tags, prediccion=minCovDet, indexs=('mincovdet', 'kbest_fclass'))
df_results.loc[('mincovdet', 'kbest_fclass')]

accuracy        0.7265
precision       0.0008
recall          0.0023
f1_score        0.0012
mathews_corr   -0.1396
AUC             0.3933
Name: (mincovdet, kbest_fclass), dtype: object

In [15]:
# EllipticEnvelope on the columns listed under ('extraTrees', 'no').
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
).fit_predict(df[df_features.loc[('extraTrees', 'no')]])

# Recode the predictions for scoring: 1 -> 0 and -1 -> 1
# (scikit-learn's outlier detectors return +1 for inliers and -1 for outliers).
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(real=df_tags, prediccion=minCovDet, indexs=('mincovdet', 'extraTrees'))
df_results.loc[('mincovdet', 'extraTrees')]

accuracy        0.7384
precision       0.0369
recall          0.1011
f1_score         0.054
mathews_corr   -0.0713
AUC             0.4452
Name: (mincovdet, extraTrees), dtype: object

In [4]:
# EllipticEnvelope on the columns listed under ('randomForest', 'no').
minCovDet = EllipticEnvelope(
    assume_centered=False,
    contamination=0.2,
    support_fraction=1,
    random_state=4,
).fit_predict(df[df_features.loc[('randomForest', 'no')]])

# Recode the predictions for scoring: 1 -> 0 and -1 -> 1
# (scikit-learn's outlier detectors return +1 for inliers and -1 for outliers).
minCovDet = np.where(minCovDet == -1, 1, 0)
save_results(real=df_tags, prediccion=minCovDet, indexs=('mincovdet', 'randomForest'))
df_results.loc[('mincovdet', 'randomForest')]

accuracy        0.7265
precision       0.0008
recall          0.0023
f1_score        0.0012
mathews_corr   -0.1396
AUC             0.3933
Name: (mincovdet, randomForest), dtype: float64

# Isolation Forest

In [18]:
# IsolationForest on the columns listed under ('kbest_chi2', 'no').
# NOTE(review): the `behaviour` argument is, as far as I know, not accepted by recent
# scikit-learn releases — confirm against the installed version before re-running.
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
).fit_predict(df[df_features.loc[('kbest_chi2', 'no')]])

# Recode the predictions for scoring: 1 -> 0 and -1 -> 1
# (scikit-learn's outlier detectors return +1 for inliers and -1 for outliers).
isolation = np.where(isolation == -1, 1, 0)
save_results(real=df_tags, prediccion=isolation, indexs=('isolation', 'kbest_chi2'))
df_results.loc[('isolation', 'kbest_chi2')]

accuracy        0.7311
precision       0.0122
recall           0.033
f1_score        0.0178
mathews_corr   -0.1178
AUC             0.4099
Name: (isolation, kbest_chi2), dtype: object

In [19]:
# IsolationForest on the columns listed under ('kbest_fclass', 'no').
# NOTE(review): the `behaviour` argument is, as far as I know, not accepted by recent
# scikit-learn releases — confirm against the installed version before re-running.
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
).fit_predict(df[df_features.loc[('kbest_fclass', 'no')]])

# Recode the predictions for scoring: 1 -> 0 and -1 -> 1
# (scikit-learn's outlier detectors return +1 for inliers and -1 for outliers).
isolation = np.where(isolation == -1, 1, 0)
save_results(real=df_tags, prediccion=isolation, indexs=('isolation', 'kbest_fclass'))
df_results.loc[('isolation', 'kbest_fclass')]

accuracy        0.7309
precision       0.0118
recall          0.0318
f1_score        0.0172
mathews_corr   -0.1187
AUC             0.4093
Name: (isolation, kbest_fclass), dtype: object

In [20]:
# IsolationForest on the columns listed under ('extraTrees', 'no').
# NOTE(review): the `behaviour` argument is, as far as I know, not accepted by recent
# scikit-learn releases — confirm against the installed version before re-running.
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
).fit_predict(df[df_features.loc[('extraTrees', 'no')]])

# Recode the predictions for scoring: 1 -> 0 and -1 -> 1
# (scikit-learn's outlier detectors return +1 for inliers and -1 for outliers).
isolation = np.where(isolation == -1, 1, 0)
save_results(real=df_tags, prediccion=isolation, indexs=('isolation', 'extraTrees'))
df_results.loc[('isolation', 'extraTrees')]

accuracy        0.7333
precision        0.012
recall          0.0322
f1_score        0.0175
mathews_corr   -0.1173
AUC             0.4107
Name: (isolation, extraTrees), dtype: object

In [21]:
# IsolationForest on the columns listed under ('randomForest', 'no').
# NOTE(review): the `behaviour` argument is, as far as I know, not accepted by recent
# scikit-learn releases — confirm against the installed version before re-running.
isolation = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=0,
    behaviour='new',
).fit_predict(df[df_features.loc[('randomForest', 'no')]])

# Recode the predictions for scoring: 1 -> 0 and -1 -> 1
# (scikit-learn's outlier detectors return +1 for inliers and -1 for outliers).
isolation = np.where(isolation == -1, 1, 0)
save_results(real=df_tags, prediccion=isolation, indexs=('isolation', 'randomForest'))
df_results.loc[('isolation', 'randomForest')]

accuracy        0.7311
precision       0.0115
recall          0.0309
f1_score        0.0167
mathews_corr   -0.1192
AUC              0.409
Name: (isolation, randomForest), dtype: object

# Resumen de resultados sin preprocesamiento de los datos

In [5]:
# Persist the metrics table so that a later session can reload it instead of
# starting from an empty table, then display it.
# The output directory is created first: on a first run (no stored results yet)
# it may not exist, and to_csv does not create missing parent directories.
os.makedirs('ISCX_results', exist_ok=True)
df_results.to_csv('ISCX_results/ISCX_results.csv')
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,0.5168,0.1315,0.988,0.2321,0.246,0.7336
kmeans,kbest_fclass,0.5139,0.1308,0.988,0.231,0.2445,0.7321
kmeans,extraTrees,0.5631,0.1433,0.9872,0.2503,0.2703,0.7583
kmeans,randomForest,0.5168,0.1315,0.988,0.2321,0.246,0.7336
minibatch,kbest_chi2,0.5168,0.1315,0.988,0.2321,0.246,0.7336
minibatch,kbest_fclass,0.5139,0.1308,0.988,0.231,0.2445,0.7321
minibatch,extraTrees,0.561,0.1428,0.9872,0.2494,0.2691,0.7571
minibatch,randomForest,0.5168,0.1315,0.988,0.2321,0.246,0.7336
birch,kbest_chi2,,,,,,
birch,kbest_fclass,,,,,,
