In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

df = pd.read_csv('UNSW_datasets_preprocesados/robust.csv')
df_tags = pd.read_csv('UNSW_datasets_preprocesados/UNSW_tags.csv', index_col=0, header=None, names=['Tag'])

df_features = pd.read_csv('UNSW_datasets_preprocesados/UNSW_features.csv', index_col=['featureSelection', 'preprocesamiento'])
df_features.loc[pd.IndexSlice[:, 'robust'],:]

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
featureSelection,preprocesamiento,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kbest_chi2,robust,d22,s30,s22,s21,s20,s7,s6,s5,norm_dst,s15
kbest_fclass,robust,s22,d22,s5,s6,s7,s20,s30,s21,s15,s17
extraTrees,robust,s28,s15,s17,s23,s26,d11,s9,norm_src,s31,d28
randomForest,robust,norm_src,s23,s15,s17,s31,s28,arit,eucl_dec,d11,s10


In [4]:
def save_results(real, prediccion, indexs):
    df_results.loc[indexs] =  [round(accuracy_score(real, prediccion), 4),
                              round(precision_score(real, prediccion), 4),
                              round(recall_score(real, prediccion), 4),
                              round(f1_score(real, prediccion), 4),
                              round(matthews_corrcoef(real, prediccion), 4),
                              round(roc_auc_score(real, prediccion), 4)]

In [5]:
from os import path

if path.exists('UNSW_results/UNSW_robustScaler_results.csv'):
    df_results = pd.read_csv('UNSW_results/UNSW_robustScaler_results.csv', index_col=['method', 'featureSelection'])
else:
    multiIndex = [['kmeans', 'minibatch', 'birch', 'mincovdet', 'isolation'],['kbest_chi2', 'kbest_fclass', 'extraTrees', 'randomForest']]

    multiIndex = pd.MultiIndex.from_product(multiIndex, names=['method', 'featureSelection'])
    df_results = pd.DataFrame(None, index=multiIndex, columns=['accuracy', 'precision', 'recall', 'f1_score', 'mathews_corr', 'AUC'])
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,,,,,,
kmeans,kbest_fclass,,,,,,
kmeans,extraTrees,,,,,,
kmeans,randomForest,,,,,,
minibatch,kbest_chi2,,,,,,
minibatch,kbest_fclass,,,,,,
minibatch,extraTrees,,,,,,
minibatch,randomForest,,,,,,
birch,kbest_chi2,,,,,,
birch,kbest_fclass,,,,,,


# KMeans

In [6]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('kbest_chi2', 'robust')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'kbest_chi2'))
df_results.loc[('kmeans', 'kbest_chi2')]

accuracy        0.7931
precision            0
recall               0
f1_score             0
mathews_corr   -0.0352
AUC              0.497
Name: (kmeans, kbest_chi2), dtype: object

In [7]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('kbest_fclass', 'robust')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'kbest_fclass'))
df_results.loc[('kmeans', 'kbest_fclass')]

accuracy        0.9711
precision       0.8749
recall               1
f1_score        0.9333
mathews_corr    0.9183
AUC             0.9819
Name: (kmeans, kbest_fclass), dtype: object

In [8]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('extraTrees', 'robust')]])

save_results(df_tags, (kmeans.labels_ + 1) % 2, ('kmeans', 'extraTrees'))
df_results.loc[('kmeans', 'extraTrees')]

accuracy         0.811
precision       0.5166
recall               1
f1_score        0.6813
mathews_corr    0.6279
AUC             0.8816
Name: (kmeans, extraTrees), dtype: object

In [9]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('randomForest', 'robust')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'randomForest'))
df_results.loc[('kmeans', 'randomForest')]

accuracy        0.1875
precision            0
recall               0
f1_score             0
mathews_corr   -0.6299
AUC             0.1175
Name: (kmeans, randomForest), dtype: object

# MiniBatch KMeans

In [10]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('kbest_chi2', 'robust')]])

save_results(df_tags, (minibatch.labels_ + 1) % 2, ('minibatch', 'kbest_chi2'))
df_results.loc[('minibatch', 'kbest_chi2')]

accuracy        0.6342
precision            0
recall               0
f1_score             0
mathews_corr   -0.2227
AUC             0.3974
Name: (minibatch, kbest_chi2), dtype: object

In [11]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('kbest_fclass', 'robust')]])

save_results(df_tags, (minibatch.labels_ + 1) % 2, ('minibatch', 'kbest_fclass'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'kbest_fclass')]

accuracy        0.9711
precision       0.8749
recall               1
f1_score        0.9333
mathews_corr    0.9183
AUC             0.9819
Name: (minibatch, kbest_fclass), dtype: object

In [12]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('extraTrees', 'robust')]])

save_results(df_tags, (minibatch.labels_ + 1) % 2, ('minibatch', 'extraTrees'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'extraTrees')]

accuracy         0.811
precision       0.5166
recall               1
f1_score        0.6813
mathews_corr    0.6279
AUC             0.8816
Name: (minibatch, extraTrees), dtype: object

In [13]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('randomForest', 'robust')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'randomForest'))
df_results.loc[('minibatch', 'randomForest')]

accuracy        0.1875
precision            0
recall               0
f1_score             0
mathews_corr   -0.6299
AUC             0.1175
Name: (minibatch, randomForest), dtype: object

# BIRCH

In [25]:
birch = Birch(n_clusters=1, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('kbest_chi2', 'robust')]])

save_results(df_tags, birch, ('birch', 'kbest_chi2'))
df_results.loc[('birch', 'kbest_chi2')]

AttributeError: '_CFSubcluster' object has no attribute 'centroid_'

In [29]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('kbest_fclass', 'robust')]])

save_results(df_tags, (birch + 1) % 2, ('birch', 'kbest_fclass'))
df_results.loc[('birch', 'kbest_fclass')]

accuracy        0.9731
precision       0.8824
recall               1
f1_score        0.9375
mathews_corr    0.9234
AUC             0.9831
Name: (birch, kbest_fclass), dtype: object

In [26]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('extraTrees', 'robust')]])

save_results(df_tags, birch, ('birch', 'extraTrees'))
df_results.loc[('birch', 'extraTrees')]

accuracy        0.7919
precision            0
recall               0
f1_score             0
mathews_corr   -0.0393
AUC             0.4962
Name: (birch, extraTrees), dtype: object

In [27]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('randomForest', 'robust')]])

save_results(df_tags, birch, ('birch', 'randomForest'))
df_results.loc[('birch', 'randomForest')]

accuracy        0.7955
precision            0
recall               0
f1_score             0
mathews_corr   -0.0252
AUC             0.4984
Name: (birch, randomForest), dtype: object

# MinCovDet

In [14]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.4, support_fraction=1,
                             random_state=0).fit_predict(df[df_features.loc[('kbest_chi2', 'robust')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_chi2'))
df_results.loc[('mincovdet', 'kbest_chi2')]



accuracy        0.6698
precision       0.3236
recall          0.5822
f1_score         0.416
mathews_corr    0.2288
AUC             0.6371
Name: (mincovdet, kbest_chi2), dtype: object

In [15]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.4, support_fraction=1,
                             random_state=0).fit_predict(df[df_features.loc[('kbest_fclass', 'robust')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_fclass'))
df_results.loc[('mincovdet', 'kbest_fclass')]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


accuracy        0.798
precision           0
recall              0
f1_score            0
mathews_corr        0
AUC               0.5
Name: (mincovdet, kbest_fclass), dtype: object

In [16]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.4, support_fraction=1,
                             random_state=0).fit_predict(df[df_features.loc[('extraTrees', 'robust')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'extraTrees'))
df_results.loc[('mincovdet', 'extraTrees')]



accuracy        0.8085
precision       0.5134
recall               1
f1_score        0.6785
mathews_corr    0.6247
AUC               0.88
Name: (mincovdet, extraTrees), dtype: object

In [17]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.4, support_fraction=1,
                             random_state=0).fit_predict(df[df_features.loc[('randomForest', 'robust')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'randomForest'))
df_results.loc[('mincovdet', 'randomForest')]



accuracy        0.8023
precision       0.5054
recall               1
f1_score        0.6715
mathews_corr    0.6166
AUC             0.8761
Name: (mincovdet, randomForest), dtype: object

# Isolation Forest

In [18]:
isolation = IsolationForest(n_estimators=100, contamination=0.4, random_state=4, 
                            behaviour='new').fit_predict(df[df_features.loc[('kbest_chi2', 'robust')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'kbest_chi2'))
df_results.loc[('isolation', 'kbest_chi2')]

accuracy        0.6707
precision       0.3284
recall          0.6033
f1_score        0.4253
mathews_corr    0.2419
AUC             0.6455
Name: (isolation, kbest_chi2), dtype: object

In [19]:
isolation = IsolationForest(n_estimators=100, contamination=0.4, random_state=4, 
                            behaviour='new').fit_predict(df[df_features.loc[('kbest_fclass', 'robust')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'kbest_fclass'))
df_results.loc[('isolation', 'kbest_fclass')]

accuracy        0.9711
precision       0.8749
recall               1
f1_score        0.9333
mathews_corr    0.9183
AUC             0.9819
Name: (isolation, kbest_fclass), dtype: object

In [20]:
isolation = IsolationForest(n_estimators=100, contamination=0.4, random_state=4, 
                            behaviour='new').fit_predict(df[df_features.loc[('extraTrees', 'robust')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'extraTrees'))
df_results.loc[('isolation', 'extraTrees')]

accuracy        0.8085
precision       0.5134
recall               1
f1_score        0.6785
mathews_corr    0.6247
AUC               0.88
Name: (isolation, extraTrees), dtype: object

In [21]:
isolation = IsolationForest(n_estimators=100, contamination=0.4, random_state=4, 
                            behaviour='new').fit_predict(df[df_features.loc[('randomForest', 'robust')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'randomForest'))
df_results.loc[('isolation', 'randomForest')]

accuracy        0.5466
precision       0.1542
recall          0.2774
f1_score        0.1982
mathews_corr     -0.09
AUC             0.4461
Name: (isolation, randomForest), dtype: object

# Resumen resultados sin preprocesamiento de los datos

In [31]:
df_results.to_csv('UNSW_results/UNSW_robustScaler_results.csv')
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,0.7931,0.0,0.0,0.0,-0.0352,0.497
kmeans,kbest_fclass,0.9711,0.8749,1.0,0.9333,0.9183,0.9819
kmeans,extraTrees,0.811,0.5166,1.0,0.6813,0.6279,0.8816
kmeans,randomForest,0.1875,0.0,0.0,0.0,-0.6299,0.1175
minibatch,kbest_chi2,0.6342,0.0,0.0,0.0,-0.2227,0.3974
minibatch,kbest_fclass,0.9711,0.8749,1.0,0.9333,0.9183,0.9819
minibatch,extraTrees,0.811,0.5166,1.0,0.6813,0.6279,0.8816
minibatch,randomForest,0.1875,0.0,0.0,0.0,-0.6299,0.1175
birch,kbest_chi2,,,,,,
birch,kbest_fclass,0.9731,0.8824,1.0,0.9375,0.9234,0.9831
