In [2]:
import pandas as pd
import numpy as np
import json
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

df = pd.read_csv('UNSW_datasets_preprocesados/minMax.csv')
df_tags = pd.read_csv('UNSW_datasets_preprocesados/UNSW_tags.csv', index_col=0, header=None, names=['Tag'])

df_features = pd.read_csv('UNSW_datasets_preprocesados/UNSW_features.csv', index_col=['featureSelection', 'preprocesamiento'])
df_features.loc[pd.IndexSlice[:, 'minMax'],:]

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
featureSelection,preprocesamiento,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kbest_chi2,minMax,s22,s5,s30,s6,s7,d22,s20,s21,s15,s31
kbest_fclass,minMax,s22,d22,s5,s6,s7,s20,s30,s21,s15,s17
extraTrees,minMax,s28,s15,s17,s23,s26,norm_src,d11,s9,d28,s31
randomForest,minMax,norm_src,s23,s15,s31,s17,s28,eucl_dec,arit,s13,d11


In [3]:
def save_results(real, prediccion, indexs):
    df_results.loc[indexs] =  [round(accuracy_score(real, prediccion), 4),
                              round(precision_score(real, prediccion), 4),
                              round(recall_score(real, prediccion), 4),
                              round(f1_score(real, prediccion), 4),
                              round(matthews_corrcoef(real, prediccion), 4),
                              round(roc_auc_score(real, prediccion), 4)]

Se cargan los resultados ya logrados con anterioridad para no tener que ejecutar de nuevo todos los procesos, sólo los deseados.

In [4]:
from os import path

if path.exists('UNSW_results/UNSW_minMaxScaler_results.csv'):
    df_results = pd.read_csv('UNSW_results/UNSW_minMaxScaler_results.csv', index_col=['method', 'featureSelection'])
else:
    multiIndex = [['kmeans', 'minibatch', 'birch', 'mincovdet', 'isolation'],['kbest_chi2', 'kbest_fclass', 'extraTrees', 'randomForest']]

    multiIndex = pd.MultiIndex.from_product(multiIndex, names=['method', 'featureSelection'])
    df_results = pd.DataFrame(None, index=multiIndex, columns=['accuracy', 'precision', 'recall', 'f1_score', 'mathews_corr', 'AUC'])
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,,,,,,
kmeans,kbest_fclass,,,,,,
kmeans,extraTrees,,,,,,
kmeans,randomForest,,,,,,
minibatch,kbest_chi2,,,,,,
minibatch,kbest_fclass,,,,,,
minibatch,extraTrees,,,,,,
minibatch,randomForest,,,,,,
birch,kbest_chi2,,,,,,
birch,kbest_fclass,,,,,,


# KMeans

In [5]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('kbest_chi2', 'minMax')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'kbest_chi2'))
df_results.loc[('kmeans', 'kbest_chi2')]

accuracy        0.8146
precision       0.5214
recall               1
f1_score        0.6854
mathews_corr    0.6327
AUC             0.8838
Name: (kmeans, kbest_chi2), dtype: object

In [6]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('kbest_fclass', 'minMax')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'kbest_fclass'))
df_results.loc[('kmeans', 'kbest_fclass')]

accuracy        0.9711
precision       0.8749
recall               1
f1_score        0.9333
mathews_corr    0.9183
AUC             0.9819
Name: (kmeans, kbest_fclass), dtype: object

In [7]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('extraTrees', 'minMax')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'extraTrees'))
df_results.loc[('kmeans', 'extraTrees')]

accuracy        0.9731
precision       0.8824
recall               1
f1_score        0.9375
mathews_corr    0.9234
AUC             0.9831
Name: (kmeans, extraTrees), dtype: object

In [8]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('randomForest', 'minMax')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'randomForest'))
df_results.loc[('kmeans', 'randomForest')]

accuracy        0.9731
precision       0.8824
recall               1
f1_score        0.9375
mathews_corr    0.9234
AUC             0.9831
Name: (kmeans, randomForest), dtype: object

# MiniBatch KMeans

In [9]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('kbest_chi2', 'minMax')]])

save_results(df_tags, (minibatch.labels_ + 1) % 2, ('minibatch', 'kbest_chi2'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'kbest_chi2')]

accuracy        0.8146
precision       0.5214
recall               1
f1_score        0.6854
mathews_corr    0.6327
AUC             0.8838
Name: (minibatch, kbest_chi2), dtype: object

In [10]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('kbest_fclass', 'minMax')]])

save_results(df_tags, (minibatch.labels_ +1 ) % 2, ('minibatch', 'kbest_fclass'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'kbest_fclass')]

accuracy        0.9711
precision       0.8749
recall               1
f1_score        0.9333
mathews_corr    0.9183
AUC             0.9819
Name: (minibatch, kbest_fclass), dtype: object

In [11]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('extraTrees', 'minMax')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'extraTrees'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'extraTrees')]

accuracy        0.9731
precision       0.8824
recall               1
f1_score        0.9375
mathews_corr    0.9234
AUC             0.9831
Name: (minibatch, extraTrees), dtype: object

In [12]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('randomForest', 'minMax')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'randomForest'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'randomForest')]

accuracy        0.9731
precision       0.8824
recall               1
f1_score        0.9375
mathews_corr    0.9234
AUC             0.9831
Name: (minibatch, randomForest), dtype: object

# BIRCH

In [23]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('kbest_chi2', 'minMax')]])

save_results(df_tags, (birch + 1) % 2, ('birch', 'kbest_chi2'))
df_results.loc[('birch', 'kbest_chi2')]

accuracy        0.9731
precision       0.8824
recall               1
f1_score        0.9375
mathews_corr    0.9234
AUC             0.9831
Name: (birch, kbest_chi2), dtype: object

In [24]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('kbest_fclass', 'minMax')]])

save_results(df_tags, (birch + 1) % 2, ('birch', 'kbest_fclass'))
df_results.loc[('birch', 'kbest_fclass')]

accuracy        0.9731
precision       0.8824
recall               1
f1_score        0.9375
mathews_corr    0.9234
AUC             0.9831
Name: (birch, kbest_fclass), dtype: object

In [26]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('extraTrees', 'minMax')]])

save_results(df_tags, birch, ('birch', 'extraTrees'))
df_results.loc[('birch', 'extraTrees')]

accuracy        0.7919
precision            0
recall               0
f1_score             0
mathews_corr   -0.0393
AUC             0.4962
Name: (birch, extraTrees), dtype: object

In [27]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('randomForest', 'minMax')]])

save_results(df_tags, birch, ('birch', 'randomForest'))
df_results.loc[('birch', 'randomForest')]

accuracy         0.796
precision            0
recall               0
f1_score             0
mathews_corr   -0.0223
AUC             0.4988
Name: (birch, randomForest), dtype: object

# MinCovDet

In [13]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.4, support_fraction=1,
                             random_state=0).fit_predict(df[df_features.loc[('kbest_chi2', 'minMax')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_chi2'))
df_results.loc[('mincovdet', 'kbest_chi2')]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


accuracy        0.798
precision           0
recall              0
f1_score            0
mathews_corr        0
AUC               0.5
Name: (mincovdet, kbest_chi2), dtype: object

In [14]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.4, support_fraction=1,
                             random_state=0).fit_predict(df[df_features.loc[('kbest_fclass', 'minMax')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_fclass'))
df_results.loc[('mincovdet', 'kbest_fclass')]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


accuracy        0.798
precision           0
recall              0
f1_score            0
mathews_corr        0
AUC               0.5
Name: (mincovdet, kbest_fclass), dtype: object

In [15]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.4, support_fraction=1,
                             random_state=0).fit_predict(df[df_features.loc[('extraTrees', 'minMax')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'extraTrees'))
df_results.loc[('mincovdet', 'extraTrees')]



accuracy        0.8085
precision       0.5134
recall               1
f1_score        0.6785
mathews_corr    0.6247
AUC               0.88
Name: (mincovdet, extraTrees), dtype: object

In [16]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.4, support_fraction=1,
                             random_state=0).fit_predict(df[df_features.loc[('randomForest', 'minMax')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'randomForest'))
df_results.loc[('mincovdet', 'randomForest')]



accuracy        0.7964
precision        0.498
recall               1
f1_score        0.6649
mathews_corr    0.6091
AUC             0.8724
Name: (mincovdet, randomForest), dtype: object

# Isolation Forest

In [18]:
isolation = IsolationForest(n_estimators=100, contamination=0.4, random_state=4, 
                            behaviour='new').fit_predict(df[df_features.loc[('kbest_chi2', 'minMax')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'kbest_chi2'))
df_results.loc[('isolation', 'kbest_chi2')]

accuracy        0.8105
precision        0.516
recall               1
f1_score        0.6807
mathews_corr    0.6272
AUC             0.8813
Name: (isolation, kbest_chi2), dtype: object

In [19]:
isolation = IsolationForest(n_estimators=100, contamination=0.4, random_state=4, 
                            behaviour='new').fit_predict(df[df_features.loc[('kbest_fclass', 'minMax')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'kbest_fclass'))
df_results.loc[('isolation', 'kbest_fclass')]

accuracy        0.9711
precision       0.8749
recall               1
f1_score        0.9333
mathews_corr    0.9183
AUC             0.9819
Name: (isolation, kbest_fclass), dtype: object

In [20]:
isolation = IsolationForest(n_estimators=100, contamination=0.4, random_state=4, 
                            behaviour='new').fit_predict(df[df_features.loc[('extraTrees', 'minMax')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'extraTrees'))
df_results.loc[('isolation', 'extraTrees')]

accuracy        0.8085
precision       0.5134
recall               1
f1_score        0.6785
mathews_corr    0.6247
AUC               0.88
Name: (isolation, extraTrees), dtype: object

In [21]:
isolation = IsolationForest(n_estimators=100, contamination=0.4, random_state=4, 
                            behaviour='new').fit_predict(df[df_features.loc[('randomForest', 'minMax')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'randomForest'))
df_results.loc[('isolation', 'randomForest')]

accuracy        0.5466
precision       0.1542
recall          0.2774
f1_score        0.1982
mathews_corr     -0.09
AUC             0.4461
Name: (isolation, randomForest), dtype: object

# Resumen resultados aplicando la transformación MinMaxScaler a los datos

In [28]:
df_results.to_csv('UNSW_results/UNSW_minMaxScaler_results.csv')
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,0.8146,0.5214,1.0,0.6854,0.6327,0.8838
kmeans,kbest_fclass,0.9711,0.8749,1.0,0.9333,0.9183,0.9819
kmeans,extraTrees,0.9731,0.8824,1.0,0.9375,0.9234,0.9831
kmeans,randomForest,0.9731,0.8824,1.0,0.9375,0.9234,0.9831
minibatch,kbest_chi2,0.8146,0.5214,1.0,0.6854,0.6327,0.8838
minibatch,kbest_fclass,0.9711,0.8749,1.0,0.9333,0.9183,0.9819
minibatch,extraTrees,0.9731,0.8824,1.0,0.9375,0.9234,0.9831
minibatch,randomForest,0.9731,0.8824,1.0,0.9375,0.9234,0.9831
birch,kbest_chi2,0.9731,0.8824,1.0,0.9375,0.9234,0.9831
birch,kbest_fclass,0.9731,0.8824,1.0,0.9375,0.9234,0.9831
