In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

df = pd.read_csv('UNSW_datasets_preprocesados/norml1.csv')
df_tags = pd.read_csv('UNSW_datasets_preprocesados/UNSW_tags.csv', index_col=0, header=None, names=['Tag'])

df_features = pd.read_csv('UNSW_datasets_preprocesados/UNSW_features.csv', index_col=['featureSelection', 'preprocesamiento'])
df_features.loc[pd.IndexSlice[:, 'norm_l1'],:]

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
featureSelection,preprocesamiento,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kbest_chi2,norm_l1,d22,s30,s22,s7,s6,s5,s20,eucl_dec,hamm,eucl
kbest_fclass,norm_l1,s7,s30,s22,d22,s20,s5,s6,s29,s18,eucl
extraTrees,norm_l1,xor+,hamm,eucl,xor,s25,eucl_dec,d7,d25,s1,d5
randomForest,norm_l1,eucl,hamm,xor,xor+,d19,norm_dst,norm_src,s25,d25,d12


In [2]:
def save_results(real, prediccion, indexs):
    df_results.loc[indexs] =  [round(accuracy_score(real, prediccion), 4),
                              round(precision_score(real, prediccion), 4),
                              round(recall_score(real, prediccion), 4),
                              round(f1_score(real, prediccion), 4),
                              round(matthews_corrcoef(real, prediccion), 4),
                              round(roc_auc_score(real, prediccion), 4)]

In [3]:
from os import path

if path.exists('UNSW_results/UNSW_norml1_results.csv'):
    df_results = pd.read_csv('UNSW_results/UNSW_norml1_results.csv', index_col=['method', 'featureSelection'])
else:
    multiIndex = [['kmeans', 'minibatch', 'birch', 'mincovdet', 'isolation'],['kbest_chi2', 'kbest_fclass', 'extraTrees', 'randomForest']]

    multiIndex = pd.MultiIndex.from_product(multiIndex, names=['method', 'featureSelection'])
    df_results = pd.DataFrame(None, index=multiIndex, columns=['accuracy', 'precision', 'recall', 'f1_score', 'mathews_corr', 'AUC'])
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,,,,,,
kmeans,kbest_fclass,,,,,,
kmeans,extraTrees,,,,,,
kmeans,randomForest,,,,,,
minibatch,kbest_chi2,,,,,,
minibatch,kbest_fclass,,,,,,
minibatch,extraTrees,,,,,,
minibatch,randomForest,,,,,,
birch,kbest_chi2,,,,,,
birch,kbest_fclass,,,,,,


# KMeans

In [4]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('kbest_chi2', 'norm_l1')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'kbest_chi2'))
df_results.loc[('kmeans', 'kbest_chi2')]

accuracy        0.7947
precision            0
recall               0
f1_score             0
mathews_corr   -0.0288
AUC             0.4979
Name: (kmeans, kbest_chi2), dtype: object

In [5]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('kbest_fclass', 'norm_l1')]])

save_results(df_tags, (kmeans.labels_ + 1) % 2, ('kmeans', 'kbest_fclass'))
df_results.loc[('kmeans', 'kbest_fclass')]

accuracy        0.8133
precision       0.5196
recall               1
f1_score        0.6839
mathews_corr    0.6309
AUC              0.883
Name: (kmeans, kbest_fclass), dtype: object

In [6]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('extraTrees', 'norm_l1')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'extraTrees'))
df_results.loc[('kmeans', 'extraTrees')]

accuracy        0.7933
precision            0
recall               0
f1_score             0
mathews_corr   -0.0345
AUC             0.4971
Name: (kmeans, extraTrees), dtype: object

In [7]:
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0, algorithm='full').fit(df[df_features.loc[('randomForest', 'norm_l1')]])

save_results(df_tags, kmeans.labels_, ('kmeans', 'randomForest'))
df_results.loc[('kmeans', 'randomForest')]

accuracy        0.7933
precision            0
recall               0
f1_score             0
mathews_corr   -0.0345
AUC             0.4971
Name: (kmeans, randomForest), dtype: object

# MiniBatch KMeans

In [8]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('kbest_chi2', 'norm_l1')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'kbest_chi2'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'kbest_chi2')]

accuracy        0.8133
precision       0.5197
recall               1
f1_score        0.6839
mathews_corr    0.6309
AUC              0.883
Name: (minibatch, kbest_chi2), dtype: object

In [9]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('kbest_fclass', 'norm_l1')]])

save_results(df_tags, (minibatch.labels_ + 1) % 2, ('minibatch', 'kbest_fclass'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'kbest_fclass')]

accuracy        0.8133
precision       0.5196
recall               1
f1_score        0.6839
mathews_corr    0.6309
AUC              0.883
Name: (minibatch, kbest_fclass), dtype: object

In [10]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('extraTrees', 'norm_l1')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'extraTrees'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'extraTrees')]

accuracy        0.7933
precision            0
recall               0
f1_score             0
mathews_corr   -0.0345
AUC             0.4971
Name: (minibatch, extraTrees), dtype: object

In [11]:
minibatch = MiniBatchKMeans(n_clusters=2,n_init=10, random_state=0).fit(df[df_features.loc[('randomForest', 'norm_l1')]])

save_results(df_tags, minibatch.labels_, ('minibatch', 'randomForest'))    # Se invierte los clusters tags 0 y 1
df_results.loc[('minibatch', 'randomForest')]

accuracy        0.7933
precision            0
recall               0
f1_score             0
mathews_corr   -0.0345
AUC             0.4971
Name: (minibatch, randomForest), dtype: object

# BIRCH

In [21]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('kbest_chi2', 'norm_l1')]])

save_results(df_tags, birch, ('birch', 'kbest_chi2'))
df_results.loc[('birch', 'kbest_chi2')]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


accuracy        0.798
precision           0
recall              0
f1_score            0
mathews_corr        0
AUC               0.5
Name: (birch, kbest_chi2), dtype: object

In [22]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('kbest_fclass', 'norm_l1')]])

save_results(df_tags, birch, ('birch', 'kbest_fclass'))
df_results.loc[('birch', 'kbest_fclass')]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


accuracy        0.798
precision           0
recall              0
f1_score            0
mathews_corr        0
AUC               0.5
Name: (birch, kbest_fclass), dtype: object

In [23]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('extraTrees', 'norm_l1')]])

save_results(df_tags, (birch + 1) % 2, ('birch', 'extraTrees'))
df_results.loc[('birch', 'extraTrees')]

accuracy        0.7933
precision            0
recall               0
f1_score             0
mathews_corr   -0.0345
AUC             0.4971
Name: (birch, extraTrees), dtype: object

In [24]:
birch = Birch(n_clusters=2, threshold=0.1, branching_factor=10).fit_predict(df[df_features.loc[('randomForest', 'norm_l1')]])

save_results(df_tags, (birch + 1) % 2, ('birch', 'randomForest'))
df_results.loc[('birch', 'randomForest')]

accuracy        0.7933
precision            0
recall               0
f1_score             0
mathews_corr   -0.0345
AUC             0.4971
Name: (birch, randomForest), dtype: object

# MinCovDet

In [12]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.4, support_fraction=1,
                             random_state=0).fit_predict(df[df_features.loc[('kbest_chi2', 'norm_l1')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_chi2'))
df_results.loc[('mincovdet', 'kbest_chi2')]



accuracy        0.7962
precision       0.4977
recall               1
f1_score        0.6647
mathews_corr    0.6088
AUC             0.8723
Name: (mincovdet, kbest_chi2), dtype: object

In [13]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.4, support_fraction=1,
                             random_state=0).fit_predict(df[df_features.loc[('kbest_fclass', 'norm_l1')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'kbest_fclass'))
df_results.loc[('mincovdet', 'kbest_fclass')]



accuracy        0.8023
precision       0.5054
recall               1
f1_score        0.6715
mathews_corr    0.6166
AUC             0.8761
Name: (mincovdet, kbest_fclass), dtype: object

In [14]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.4, support_fraction=1,
                             random_state=0).fit_predict(df[df_features.loc[('extraTrees', 'norm_l1')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'extraTrees'))
df_results.loc[('mincovdet', 'extraTrees')]



accuracy        0.8023
precision       0.5054
recall               1
f1_score        0.6715
mathews_corr    0.6166
AUC             0.8761
Name: (mincovdet, extraTrees), dtype: object

In [15]:
minCovDet = EllipticEnvelope(assume_centered=False, contamination=0.4, support_fraction=1,
                             random_state=0).fit_predict(df[df_features.loc[('randomForest', 'norm_l1')]])

minCovDet[minCovDet == 1] = 0
minCovDet[minCovDet == -1] = 1
save_results(df_tags, minCovDet, ('mincovdet', 'randomForest'))
df_results.loc[('mincovdet', 'randomForest')]



accuracy        0.8023
precision       0.5054
recall               1
f1_score        0.6715
mathews_corr    0.6166
AUC             0.8761
Name: (mincovdet, randomForest), dtype: object

# Isolation Forest

In [16]:
isolation = IsolationForest(n_estimators=100, contamination=0.4, random_state=4, 
                            behaviour='new').fit_predict(df[df_features.loc[('kbest_chi2', 'norm_l1')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'kbest_chi2'))
df_results.loc[('isolation', 'kbest_chi2')]

accuracy        0.8086
precision       0.5135
recall               1
f1_score        0.6785
mathews_corr    0.6247
AUC             0.8801
Name: (isolation, kbest_chi2), dtype: object

In [17]:
isolation = IsolationForest(n_estimators=100, contamination=0.4, random_state=4, 
                            behaviour='new').fit_predict(df[df_features.loc[('kbest_fclass', 'norm_l1')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'kbest_fclass'))
df_results.loc[('isolation', 'kbest_fclass')]

accuracy        0.8086
precision       0.5135
recall               1
f1_score        0.6785
mathews_corr    0.6247
AUC             0.8801
Name: (isolation, kbest_fclass), dtype: object

In [18]:
isolation = IsolationForest(n_estimators=100, contamination=0.4, random_state=4, 
                            behaviour='new').fit_predict(df[df_features.loc[('extraTrees', 'norm_l1')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'extraTrees'))
df_results.loc[('isolation', 'extraTrees')]

accuracy        0.8086
precision       0.5135
recall               1
f1_score        0.6785
mathews_corr    0.6247
AUC             0.8801
Name: (isolation, extraTrees), dtype: object

In [19]:
isolation = IsolationForest(n_estimators=100, contamination=0.4, random_state=4, 
                            behaviour='new').fit_predict(df[df_features.loc[('randomForest', 'norm_l1')]])

isolation[isolation == 1] = 0
isolation[isolation == -1] = 1
save_results(df_tags, isolation, ('isolation', 'randomForest'))
df_results.loc[('isolation', 'randomForest')]

accuracy        0.8086
precision       0.5135
recall               1
f1_score        0.6785
mathews_corr    0.6247
AUC             0.8801
Name: (isolation, randomForest), dtype: object

# Resumen resultados sin preprocesamiento de los datos

In [25]:
df_results.to_csv('UNSW_results/UNSW_norml1_results.csv')
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1_score,mathews_corr,AUC
method,featureSelection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kmeans,kbest_chi2,0.7947,0.0,0,0.0,-0.0288,0.4979
kmeans,kbest_fclass,0.8133,0.5196,1,0.6839,0.6309,0.883
kmeans,extraTrees,0.7933,0.0,0,0.0,-0.0345,0.4971
kmeans,randomForest,0.7933,0.0,0,0.0,-0.0345,0.4971
minibatch,kbest_chi2,0.8133,0.5197,1,0.6839,0.6309,0.883
minibatch,kbest_fclass,0.8133,0.5196,1,0.6839,0.6309,0.883
minibatch,extraTrees,0.7933,0.0,0,0.0,-0.0345,0.4971
minibatch,randomForest,0.7933,0.0,0,0.0,-0.0345,0.4971
birch,kbest_chi2,0.798,0.0,0,0.0,0.0,0.5
birch,kbest_fclass,0.798,0.0,0,0.0,0.0,0.5
