In [1]:
# Author: Kaylani Bochie
# github.com/kaylani2
# kaylani AT gta DOT ufrj DOT br

### K: Model: Autoencoder
import sys
import time
import pandas as pd
import os
import math
import seaborn as sb
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.ticker import StrMethodFormatter
sys.path.insert(1, '../')
from numpy import mean, std
from unit import remove_columns_with_one_value, remove_nan_columns, load_dataset
from unit import display_general_information, display_feature_distribution
from collections import Counter
#from imblearn.over_sampling import RandomOverSampler, RandomUnderSampler
import sklearn
from sklearn import set_config
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, PredefinedSplit
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif
from sklearn.utils import class_weight
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
import keras.utils
from keras import metrics
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Conv2D, MaxPooling2D, Flatten, LSTM
from keras.optimizers import RMSprop, Adam
from keras.constraints import maxnorm

In [2]:
###############################################################################
## Define constants
###############################################################################
pd.set_option ('display.max_rows', None)
pd.set_option ('display.max_columns', 5)
BOT_IOT_DIRECTORY = '../../../../../datasets/bot-iot/'
BOT_IOT_FEATURE_NAMES = 'UNSW_2018_IoT_Botnet_Dataset_Feature_Names.csv'
BOT_IOT_FILE_5_PERCENT_SCHEMA = 'UNSW_2018_IoT_Botnet_Full5pc_{}.csv' # 1 - 4
FIVE_PERCENT_FILES = 4
BOT_IOT_FILE_FULL_SCHEMA = 'UNSW_2018_IoT_Botnet_Dataset_{}.csv' # 1 - 74
FULL_FILES = 74
FILE_NAME = BOT_IOT_DIRECTORY + BOT_IOT_FILE_5_PERCENT_SCHEMA
FEATURES = BOT_IOT_DIRECTORY + BOT_IOT_FEATURE_NAMES
NAN_VALUES = ['?', '.']
TARGET = 'attack'
INDEX_COLUMN = 'pkSeqID'
LABELS = ['attack', 'category', 'subcategory']
STATE = 0
try:
  STATE = int (sys.argv [1])
except:
  pass
#for STATE in [1, 2, 3, 4, 5]:
np.random.seed (STATE)
print ('STATE:', STATE)

STATE: 0


In [3]:
###############################################################################
## Load dataset
###############################################################################
### K: load_dataset comes from the local `unit` module. It is given the file
### name template, the number of files, the index column and the NaN markers.
df = load_dataset (
  FILE_NAME,
  FIVE_PERCENT_FILES,
  INDEX_COLUMN,
  NAN_VALUES,
)
print ('Finished loading dataset.')

Reading ../../../../../datasets/bot-iot/UNSW_2018_IoT_Botnet_Full5pc_1.csv
Reading ../../../../../datasets/bot-iot/UNSW_2018_IoT_Botnet_Full5pc_2.csv
Reading ../../../../../datasets/bot-iot/UNSW_2018_IoT_Botnet_Full5pc_3.csv
Reading ../../../../../datasets/bot-iot/UNSW_2018_IoT_Botnet_Full5pc_4.csv
Finished loading dataset.


In [None]:
###############################################################################
## Quick sanity check
###############################################################################
### K: Summarize the freshly loaded dataframe. display_general_information is
### imported from the local `unit` module; what it prints is not visible here.
display_general_information (df)

In [4]:
###############################################################################
## Clean dataset
###############################################################################
###############################################################################
### Drop the columns that hold a single value
df, log = remove_columns_with_one_value (df, verbose = False)
print (log)

###############################################################################
### Drop redundant columns, miscellaneous columns and the unused targets
### K: *_number columns are numerical representations of other columns.
### K: category and subcategory are other labels.
### K: saddr and daddr may specialize the model to a single network.
redundant_columns = ['state_number', 'proto_number', 'flgs_number']
other_targets = ['category', 'subcategory']
misc_columns = ['saddr', 'daddr']
for message, names in (('Removing redundant columns:', redundant_columns),
                       ('Removing useless targets:', other_targets),
                       ('Removing misc columns:', misc_columns)):
  print (message, names)
columns_to_remove = [*redundant_columns, *other_targets, *misc_columns]
df.drop (columns = columns_to_remove, inplace = True)

###############################################################################
### Drop the columns with a lot of NaN values
### K: 1/2 is presumably the tolerated NaN fraction -- TODO confirm in `unit`.
df, log = remove_nan_columns (df, 1/2, verbose = False)
print (log)

###############################################################################
### Encode categorical features
print ('Encoding categorical features (ordinal encoding).')
my_encoder = OrdinalEncoder ()
### K: (column, cast to str first?) pairs, processed in this order. The single
### encoder is refitted for every column, so it ends up fitted on 'state'.
for column, cast_to_str in (('flgs', False),
                            ('proto', False),
                            ('sport', True),
                            ('dport', True),
                            ('state', False)):
  values = df [column].astype (str) if cast_to_str else df [column]
  df [column] = my_encoder.fit_transform (values.values.reshape (-1, 1))
remaining_objects = list (df.select_dtypes ( ['object']).columns)
print ('Objects:', remaining_objects, '(should be empty)')

While removing single value columns: No columns dropped.
Removing redundant columns: ['state_number', 'proto_number', 'flgs_number']
Removing useless targets: ['category', 'subcategory']
Removing misc columns: ['saddr', 'daddr']
While removing nan value columns: No columns dropped.
Encoding categorical features (ordinal encoding).
Objects: [] (should be empty)


In [None]:
###############################################################################
## Quick sanity check
###############################################################################
### K: Summarize the dataframe again, now after cleaning and encoding.
### display_general_information is imported from the local `unit` module.
display_general_information (df)

In [None]:
### Plot correlation matrix
### K: Pearson correlation between every pair of columns, drawn as a heatmap
### and written to correlation_matrix.png.
pearsoncorr = df.corr (method = 'pearson')

fig, ax = plt.subplots (figsize = (10, 10))  # Figure size in inches
heatmap_options = dict (xticklabels = pearsoncorr.columns,
                        yticklabels = pearsoncorr.columns,
                        cmap = 'RdBu_r',
                        #annot = True,
                        linewidth = 0.5)
sb_plot = sb.heatmap (pearsoncorr, **heatmap_options)
sb_plot.figure.savefig ('correlation_matrix.png')

In [None]:
### Plot feature distribution
### K: One histogram per column, each one saved as '<column>.png'.
### K: .count () takes a bit too long...
#df.groupby ('mean').count ().plot ()

### K: Tick label sizes are rc defaults, picked up when artists are created.
### They are loop invariant, so they are set once, before the first figure
### exists, instead of inside the innermost loop.
plt.rc ('xtick', labelsize = 20)
plt.rc ('ytick', labelsize = 20)

for column in df.columns:
    #print (column)
    ax = df.hist (column = column,# bins = np.logspace(np.log10(0.1),np.log10(1.0), 50),
                  bins = 40,
                  #bins = max (df [column].nunique (), 50),
                  grid = False, figsize = (12, 8), color = '#86bf91', zorder = 2, rwidth = 0.9)
    # The result is indexed as a grid of axes; only its first row is used.
    ax = ax [0]
    for x in ax:
        # Keep tick marks and tick labels enabled (tick_params takes booleans)
        x.tick_params (axis = 'both', which = 'both', bottom = True, top = True,
                       labelbottom = True, left = True, right = True,
                       labelleft = True)
        # Draw faint dashed horizontal guide lines at every y tick
        vals = x.get_yticks ()
        for tick in vals:
            x.axhline (y = tick, linestyle = 'dashed', alpha = 0.4, color = '#eeeeee', zorder = 1)
        # Remove title
        x.set_title ("")
        # Set x-axis label
        x.set_xlabel (column, labelpad = 20, weight = 'bold', size = 22)
        # Set y-axis label
        x.set_ylabel ('Samples', labelpad = 20, weight = 'bold', size = 22)
        #x.set_xticklabels(x_ticks, rotation=0, fontsize=20)
        # Format y-axis tick labels with thousands separators
        x.yaxis.set_major_formatter (StrMethodFormatter ('{x:,g}'))
        x.figure.savefig (column + '.png')
        #x.plot ()
print ('Done plotting feature distribution.')

In [None]:
### Plot feature distribution
### K: The string below is an inert draft of a pure matplotlib histogram; it is
### never executed.
# An "interface" to matplotlib.axes.Axes.hist() method
'''
n, bins, patches = plt.hist(x=X_train [:, 0], bins='auto', color='#0504aa',
                            alpha=0.7, rwidth=0.85)
plt.grid(axis='y', alpha=0.75)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('My Very Own Histogram')
#plt.text(23, 45, r'$\mu=15, b=3$')
maxfreq = n.max()
# Set a clean upper y-axis limit.
plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)
plt.show ()
''' 

### K: One 20-bin histogram per column, shown inline.
for column in df.columns:
    hist_ax = df [column].plot.hist (grid = True, bins = 20, rwidth = 0.9,
                                     color = '#607c8e')
    hist_ax.set_title (column)
    hist_ax.set_xlabel (column)
    hist_ax.set_ylabel ('Samples')
    # Restyle the horizontal grid lines only.
    hist_ax.grid (axis = 'y', alpha = 0.75)
    plt.show ()

In [5]:
###############################################################################
## Split dataset
###############################################################################
## K: The dataset is too big, so half of the rows are discarded at random.
## The draw goes through the global NumPy generator seeded with STATE.
number_of_rows_to_drop = int (df.shape [0] * 0.5)
drop_indices = np.random.choice (df.index, number_of_rows_to_drop,
                                 replace = False)
df = df.drop (index = drop_indices)

In [6]:
### Separate the normal samples (TARGET == 0) from the attack samples
mask = df [TARGET].eq (0)
df_normal = df.loc [mask]   # 0 == normal
df_attack = df.loc [~mask]  # 1 == attack

### K: Report the class counts of each subset.
for title, subset in (('Attack set:', df_attack), ('Normal set:', df_normal)):
  print (title)
  print (subset [TARGET].value_counts ())

Attack set:
1    1834017
Name: attack, dtype: int64
Normal set:
0    244
Name: attack, dtype: int64


In [7]:
### Sample and drop random attacks
### K: As many attacks as there are normal samples are held out for the test
### set and removed from the attack pool, so they never reach the train set.
df_random_attacks = df_attack.sample (n = df_normal.shape [0],
                                      random_state = STATE)
df_attack = df_attack.drop (df_random_attacks.index)

### Assemble train set (only attacks)
### K: The features are every column but TARGET, taken from the frame itself
### instead of through a mask computed on another dataframe.
X_train = df_attack.drop (columns = [TARGET])
y_train = df_attack [TARGET]
print ('Train set:')
print (y_train.value_counts ())

### Assemble test set (50/50 attacks and non-attacks)
### K: A single concat of the two real frames; seeding the result with an
### empty DataFrame is not needed.
df_test = pd.concat ( [df_normal, df_random_attacks])
print ('Test set:')
print (df_test [TARGET].value_counts ())
X_test = df_test.drop (columns = [TARGET])
y_test = df_test [TARGET]
### K: y_test is required to plot the roc curve in the end

# df_train = df_attack
### K: A quarter of the remaining attack samples becomes the validation set.
VALIDATION_SIZE = 1/4
print ('\nSplitting dataset (validation/train):', VALIDATION_SIZE)
X_train, X_val, y_train, y_val = train_test_split (
                                             X_train,
                                             y_train,
                                             test_size = VALIDATION_SIZE,
                                             random_state = STATE,)

Train set:
1    1833773
Name: attack, dtype: int64
Test set:
1    244
0    244
Name: attack, dtype: int64

Splitting dataset (validation/train): 0.25


In [None]:
### K: Report the shape of every split and the container type of X_train.
for split_name, split_data in (('X_train', X_train), ('y_train', y_train),
                               ('X_val', X_val), ('y_val', y_val),
                               ('X_test', X_test), ('y_test', y_test)):
  print (split_name + ' shape:', split_data.shape)
print (type (X_train))

In [None]:
# ###############################################################################
# ## Convert dataframe to a numpy array
# ###############################################################################
# print ('\nConverting dataframe to numpy array.')
# X_train = X_train.values
# y_train = y_train.values
# X_val = X_val.values
# y_val = y_val.values
# X_test = X_test.values
# y_test = y_test.values
# print ('X_train shape:', X_train.shape)
# print ('y_train shape:', y_train.shape)
# print ('X_val shape:', X_val.shape)
# print ('y_val shape:', y_val.shape)
# print ('X_test shape:', X_test.shape)
# print ('y_test shape:', y_test.shape)

In [8]:
###############################################################################
## Apply normalization
###############################################################################
### K: NOTE: The scaler is fitted on the train set only, so that nothing
### derived from the validation or test sets leaks into the transformation.
print ('\nApplying normalization.')
startTime = time.time ()
scaler = StandardScaler ()
scaler.fit (X_train)
X_train, X_val, X_test = [scaler.transform (split)
                          for split in (X_train, X_val, X_test)]
elapsed = time.time () - startTime
print (str (elapsed), 'to normalize data.')


Applying normalization.
1.284205675125122 to normalize data.


In [None]:
### K: Same report as after the split, now on the scaled feature matrices.
for split_name, split_data in (('X_train', X_train), ('y_train', y_train),
                               ('X_val', X_val), ('y_val', y_val),
                               ('X_test', X_test), ('y_test', y_test)):
  print (split_name + ' shape:', split_data.shape)
print (type (X_train))

In [9]:
###############################################################################
## Reduce dimensionality (PCA)
###############################################################################
### K: PCA projects the features onto NUMBER_OF_FEATURES components; it is
### fitted on the train set only and then applied to every split.
NUMBER_OF_FEATURES = 15 #X_train.shape [-1]
print ('\nSelecting top', NUMBER_OF_FEATURES, 'features. (PCA)')
### K: random_state is set explicitly, like in the other stochastic steps, so
### that the projection does not depend on the state of the global generator.
fs = PCA (n_components = NUMBER_OF_FEATURES, random_state = STATE)
fs.fit (X_train)
print (fs.explained_variance_ratio_)
print (fs.singular_values_)
X_train = fs.transform (X_train)
X_val = fs.transform (X_val)
X_test = fs.transform (X_test)
print ('X_train shape:', X_train.shape)
print ('y_train shape:', y_train.shape)
print ('X_val shape:', X_val.shape)
print ('y_val shape:', y_val.shape)
print ('X_test shape:', X_test.shape)
print ('y_test shape:', y_test.shape)


Selecting top 15 features.
[0.29275414 0.1113438  0.09904516 0.06916064 0.05744228 0.04058369
 0.0385783  0.03403591 0.03279513 0.02985544 0.02816172 0.02643185
 0.02540309 0.02318213 0.01827161]
[3859.71897053 2380.33006749 2245.02297231 1876.0035388  1709.70027251
 1437.07671729 1401.12113079 1316.05147808 1291.8402985  1232.58215301
 1197.10911841 1159.75950711 1136.96583017 1086.12746772  964.25655451]
X_train shape: (1375329, 15)
y_train shape: (1375329,)
X_val shape: (458444, 15)
y_val shape: (458444,)
X_test shape: (488, 15)
y_test shape: (488,)


In [None]:
###############################################################################
## Create learning model (Autoencoder) and tune hyperparameters
###############################################################################
###############################################################################
# Hyperparameter tuning
### K: One marker per row of the train rows followed by the validation rows:
### -1 for the train rows, 0 for the validation rows.
train_markers = np.full (X_train.shape [0], -1)
val_markers = np.zeros (X_val.shape [0], dtype = int)
test_fold = np.concatenate ([train_markers, val_markers])
myPreSplit = PredefinedSplit (test_fold = test_fold)
def create_model (learn_rate = 0.01, dropout_rate = 0.0, weight_constraint = 0,
                  metrics = None, input_layer_neurons = None,
                  neurons_on_first_layer = 32,
                  second_layer_boolean = False,
                  neurons_on_chokehold_layer = 8):
  '''Build and compile a dense autoencoder.

  Layer widths, in order:
    input -> first [-> first // 2] -> chokehold [-> first // 2] -> first -> input
  The bracketed layers only exist when second_layer_boolean is true.

  Parameters:
    learn_rate: learning rate handed to the Adam optimizer.
    dropout_rate: accepted so that a parameter grid can carry it; it is not
      applied to the network.
    weight_constraint: accepted so that a parameter grid can carry it; it is
      not applied to the network.
    metrics: metrics handed to compile; ['mse'] when None.
    input_layer_neurons: number of input (and output) features. Required.
    neurons_on_first_layer: width of the outermost hidden layers.
    second_layer_boolean: whether to add the half-width hidden layers.
    neurons_on_chokehold_layer: width of the bottleneck layer.

  Returns:
    The compiled Sequential model (loss: mean squared error).

  Raises:
    ValueError: if input_layer_neurons is not given.
  '''
  if (input_layer_neurons is None):
    raise ValueError ('input_layer_neurons is required.')
  if (metrics is None):
    metrics = ['mse']

  # Layer widths must be integers (grid values may arrive as numpy scalars).
  input_layer_neurons = int (input_layer_neurons)
  neurons_on_first_layer = int (neurons_on_first_layer)
  neurons_on_chokehold_layer = int (neurons_on_chokehold_layer)
  neurons_on_half_layer = max (1, neurons_on_first_layer // 2)

  model = Sequential ()
  model.add (Dense (input_layer_neurons, activation = 'relu',
                    input_shape = (input_layer_neurons, )))
  # Encoder: widest hidden layer first, then the optional half-width layer.
  model.add (Dense (neurons_on_first_layer, activation = 'relu'))
  if (second_layer_boolean):
    model.add (Dense (neurons_on_half_layer, activation = 'relu'))
  model.add (Dense (neurons_on_chokehold_layer, activation = 'relu'))
  # Decoder: mirror image of the encoder.
  if (second_layer_boolean):
    model.add (Dense (neurons_on_half_layer, activation = 'relu'))
  model.add (Dense (neurons_on_first_layer, activation = 'relu'))
  model.add (Dense (input_layer_neurons, activation = None))
  # An optimizer instance is needed for learn_rate to take effect; the plain
  # 'adam' string would always use the optimizer's default learning rate.
  model.compile (loss = 'mean_squared_error',
                 optimizer = Adam (learning_rate = learn_rate),
                 metrics = metrics)
  return model


### K: KerasRegressor wraps create_model so that GridSearchCV can drive it.
model = KerasRegressor (build_fn = create_model, verbose = 2)
### K: Every grid entry must be a list of candidates, including the entries
### with a single candidate (such as the input width).
input_layer_neurons = [X_train.shape [1]]
batch_size = [5000, 10000]
epochs = [200]
learn_rate = [0.0001, 0.001]
dropout_rate = [0.0]
weight_constraint = [0]
neurons_on_first_layer = [32, 64]
neurons_on_chokehold_layer = [4, 8]
second_layer_boolean = [False, True] # Is there another layer?
param_grid = dict (batch_size = batch_size, epochs = epochs,
                   dropout_rate = dropout_rate, learn_rate = learn_rate,
                   weight_constraint = weight_constraint,
                   input_layer_neurons = input_layer_neurons,
                   neurons_on_first_layer = neurons_on_first_layer,
                   second_layer_boolean = second_layer_boolean,
                   neurons_on_chokehold_layer = neurons_on_chokehold_layer)
### K: cv = myPreSplit evaluates every candidate on the predefined
### train/validation split instead of on k folds.
grid = GridSearchCV (estimator = model, param_grid = param_grid,
                    scoring = 'neg_mean_squared_error', cv = myPreSplit,
                    verbose = 2, n_jobs = 1)
startTime = time.time ()
### K: Train rows followed by validation rows, in the order test_fold expects.
### The stacked matrix is built once and used as input and as target, since
### an autoencoder reconstructs its own input.
X_search = np.vstack ( (X_train, X_val))
grid_result = grid.fit (X_search, X_search)
print (str (time.time () - startTime), 's to search grid.')
print (grid_result.best_params_)

print ("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_ ['mean_test_score']
stds = grid_result.cv_results_ ['std_test_score']
params = grid_result.cv_results_ ['params']
### K: The loop names are distinct from `mean` and `std`, which are imported
### from numpy at the top of the notebook.
for mean_score, std_score, candidate in zip (means, stds, params):
  print ("%f (%f) with: %r" % (mean_score, std_score, candidate))

### K: Results logged from earlier searches:
## 9 features:
## Best: -0.148847 using {'batch_size': 5000, 'dropout_rate': 0.0, 'epochs': 10, 'learn_rate': 0.001, 'weight_constraint': 0}

## All features (without PCA): (97s to search grid)
## Best: -0.159944 using {'batch_size': 10000, 'dropout_rate': 0.0, 'epochs': 10, 'learn_rate': 0.1, 'weight_constraint': 0}

## All features (with PCA):  (100s to search grid)
## Best: -0.100227 using {'batch_size': 5000, 'dropout_rate': 0.0, 'epochs': 10, 'learn_rate': 0.001, 'weight_constraint': 0}

## 32 features: (88s to search grid)
## Best: -0.179934 using {'batch_size': 5000, 'dropout_rate': 0.0, 'epochs': 10, 'learn_rate': 0.001, 'weight_constraint': 0}

## 15 features (67s to search grid)
## Best: -0.235642 using {'batch_size': 10000, 'dropout_rate': 0.0, 'epochs': 10, 'learn_rate': 0.1, 'weight_constraint': 0}

## 15 features (60min to search grid)
## Best: -0.019674 using {'batch_size': 10000, 'dropout_rate': 0.0, 'epochs': 200, 'learn_rate': 0.0001, 'neurons_on_chokehold_layer': 8, 'neurons_on_first_layer': 64, 'weight_constraint': 0}

In [10]:
###############################################################################
## Finished model
# Best: -0.015576 using {'batch_size': 5000, 'dropout_rate': 0.0, 'epochs': 10,
# learn_rate': 0.1, 'weight_constraint': 0}
### K: Metrics reported during training, next to the loss.
METRICS = [keras.metrics.MeanSquaredError (name = 'MSE'),
           keras.metrics.RootMeanSquaredError (name = 'RMSE'),  
           keras.metrics.MeanAbsoluteError (name = 'MAE'),]
### K: The learning rate was changed by hand after looking at the validation
### error along the epochs...
NUMBER_OF_EPOCHS = 200
BATCH_SIZE = 10000
LEARNING_RATE = 0.0001

### K: Symmetric dense autoencoder: input -> 32 -> 8 -> 32 -> input, with a
### linear output layer so that the scaled features can be reconstructed.
print ('\nCreating learning model.')
clf = Sequential ()
clf.add (Dense (X_train.shape [1], activation = 'relu',
                      input_shape = (X_train.shape [1], )))
clf.add (Dense (32, activation = 'relu'))
clf.add (Dense (8,  activation = 'relu'))
clf.add (Dense (32, activation = 'relu'))
clf.add (Dense (X_train.shape [1], activation = None))


###############################################################################
## Compile the network
###############################################################################
print ('\nCompiling the network.')
### K: `learning_rate` is the current name of the optimizer argument; `lr` is
### a deprecated alias.
clf.compile (loss = 'mean_squared_error',
                   optimizer = Adam (learning_rate = LEARNING_RATE),
                   metrics = METRICS)
print ('Model summary:')
clf.summary ()


Creating learning model.

Compiling the network.
Model summary:
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 15)                240       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                512       
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 264       
_________________________________________________________________
dense_3 (Dense)              (None, 32)                288       
_________________________________________________________________
dense_4 (Dense)              (None, 15)                495       
Total params: 1,799
Trainable params: 1,799
Non-trainable params: 0
_________________________________________________________________


In [11]:
###############################################################################
## Fit the network
###############################################################################
### K: Autoencoder training: the input is also the target, for the train data
### and for the validation data.
print ('\nFitting the network.')
startTime = time.time ()
fit_options = dict (
  batch_size = BATCH_SIZE,
  epochs = NUMBER_OF_EPOCHS,
  verbose = 2, #1 = progress bar, not useful for logging
  workers = 0,
  use_multiprocessing = True,
  #class_weight = 'auto',
  validation_data = (X_val, X_val),
)
history = clf.fit (X_train, X_train, **fit_options)
print (str (time.time () - startTime), 's to train model.')


Fitting the network.
Epoch 1/200
138/138 - 1s - loss: 2.1963 - MSE: 2.1963 - RMSE: 1.4820 - MAE: 0.8263 - val_loss: 2.5231 - val_MSE: 2.5231 - val_RMSE: 1.5884 - val_MAE: 0.8172
Epoch 2/200
138/138 - 1s - loss: 2.0991 - MSE: 2.0991 - RMSE: 1.4488 - MAE: 0.8121 - val_loss: 2.4175 - val_MSE: 2.4175 - val_RMSE: 1.5548 - val_MAE: 0.8080
Epoch 3/200
138/138 - 1s - loss: 1.9933 - MSE: 1.9933 - RMSE: 1.4119 - MAE: 0.8015 - val_loss: 2.2934 - val_MSE: 2.2934 - val_RMSE: 1.5144 - val_MAE: 0.7942
Epoch 4/200
138/138 - 1s - loss: 1.8590 - MSE: 1.8590 - RMSE: 1.3635 - MAE: 0.7834 - val_loss: 2.1301 - val_MSE: 2.1301 - val_RMSE: 1.4595 - val_MAE: 0.7707
Epoch 5/200
138/138 - 1s - loss: 1.6989 - MSE: 1.6989 - RMSE: 1.3034 - MAE: 0.7553 - val_loss: 1.9576 - val_MSE: 1.9576 - val_RMSE: 1.3991 - val_MAE: 0.7397
Epoch 6/200
138/138 - 1s - loss: 1.5397 - MSE: 1.5397 - RMSE: 1.2409 - MAE: 0.7212 - val_loss: 1.7983 - val_MSE: 1.7983 - val_RMSE: 1.3410 - val_MAE: 0.7035
Epoch 7/200
138/138 - 1s - loss: 1.4

In [12]:
###############################################################################
## Analyze results
###############################################################################
### K: Reconstruct the validation and train sets with the fitted network.
X_val_pred   = clf.predict (X_val)
X_train_pred = clf.predict (X_train)
for error_label, reconstruction, original in (
    ('Train error (MSE):', X_train_pred, X_train),
    ('Validation error (MSE):', X_val_pred, X_val)):
  print (error_label, mean_squared_error (reconstruction, original))

### K: Despite the "element_wise" names, these arrays hold one value per
### sample: the squared reconstruction error averaged over axis 1.
train_mse_element_wise = np.square (X_train_pred - X_train).mean (axis = 1)
val_mse_element_wise = np.square (X_val_pred - X_val).mean (axis = 1)

Train error (MSE): 0.035373711953948644
Validation error (MSE): 0.03562014814252987


In [13]:
### K: Largest per-sample validation error, taken before the in-place sort.
max_threshold_val = val_mse_element_wise.max ()
val_mse_element_wise.sort ()  # in place, ascending
print ('Maximum validation error (MSE):', max_threshold_val)
print ('Bottom 20 validation error (MSE):', val_mse_element_wise [:20])
print ('Top 20 validation error (MSE):', val_mse_element_wise [::-1][:20])


### K: This looks like another hyperparameter to be adjusted by using a
### separate validation set that contains normal and anomaly samples.
### K: I've guessed 1%, this may be a future line of research.
THRESHOLD_SAMPLE_PERCENTAGE = 1/100
number_of_considered_samples = int (round (val_mse_element_wise.shape [0] *
                                           THRESHOLD_SAMPLE_PERCENTAGE))

### K: Partitioning the negated errors moves the largest errors to the front;
### the first number_of_considered_samples entries are then negated back.
negated_partition = np.partition (-val_mse_element_wise,
                                  number_of_considered_samples)
top_n_values_val = -negated_partition [:number_of_considered_samples]
print ('\nNumber of samples considered:', number_of_considered_samples)
top_n_values_val.sort ()
print ('Bottom 20 on considered samples (MSE):', top_n_values_val [:20])
print ('Top 20 on considered samples (MSE):', top_n_values_val [::-1][:20])


### K: The classification threshold is the median of the N largest errors
### obtained when validating the network on the validation set. N is a
### hyperparameter that can be tuned, but that requires a validation set with
### anomalous samples in addition to the current one, which only has
### non-anomalous samples. @TODO: Build and validate the set with this new
### technique.
threshold = np.median (top_n_values_val)
print ('Thresh val:', threshold)

Maximum validation error (MSE): 212.87403731227266
Bottom 20 validation error (MSE): [0.0008386  0.00084068 0.00085071 0.00090994 0.0009103  0.00091041
 0.00091067 0.00091085 0.00091107 0.00091107 0.00091124 0.00091149
 0.00091162 0.00091175 0.00092287 0.0009243  0.00092491 0.00092505
 0.00092514 0.00092527]
Top 20 validation error (MSE): [212.87403731 152.36033234  88.81892743  68.15041191  49.10700713
  41.67126179  40.90701572  38.22057451  37.83463887  34.11203636
  33.02540706  30.37851064  29.06671766  28.99219376  26.58209769
  24.12298189  21.20354268  20.444242    20.32900387  19.84391481]

Number of samples considered: 4584
Bottom 20 on considered samples (MSE): [0.2847333  0.28484252 0.28495044 0.28496539 0.28499881 0.28506731
 0.28515559 0.28518685 0.28525053 0.28533156 0.28533821 0.28537382
 0.28540863 0.28540896 0.285425   0.28547608 0.28557485 0.28559217
 0.28565744 0.28565978]
Top 20 on considered samples (MSE): [212.87403731 152.36033234  88.81892743  68.15041191  49.1

In [15]:
### K: NOTE: Only look at test results when publishing...
#sys.exit ()
X_test_pred = clf.predict (X_test)
print (X_test_pred.shape)
print ('Test error:', mean_squared_error (X_test_pred, X_test))


### K: One reconstruction error per test sample (mean over axis 1).
y_pred = np.square (X_test_pred - X_test).mean (axis = 1)

### K: Order the (label, error) pairs; from here on y_test and y_pred are
### tuples sorted by label and then by error.
ordered_pairs = sorted (zip (y_test, y_pred))
y_test, y_pred = zip (*ordered_pairs)

# 0 == normal
# 1 == attack
### K: The network was trained on attack samples only, so an error at or above
### the threshold is read as "normal" (negative) and an error below it as
### "attack" (positive).
print ('\nPerformance on TEST set:')
print ('\nMSE (pred, real) | Label (ordered)')
labels = np.asarray (y_test)
errors = np.asarray (y_pred)
at_or_above_threshold = errors >= threshold
below_threshold = errors < threshold
tn = int (np.count_nonzero (at_or_above_threshold & (labels == 0)))
fn = int (np.count_nonzero (at_or_above_threshold & (labels == 1)))
tp = int (np.count_nonzero (below_threshold & (labels == 1)))
fp = int (np.count_nonzero (below_threshold & (labels == 0)))

print ('Confusion matrix:')
print ('tp | fp')
print ('fn | tn')
print (tp, '|', fp)
print (fn, '|', tn)
for count_name, count in (('TP:', tp), ('TN:', tn), ('FP:', fp), ('FN:', fn)):
  print (count_name, count)

(488, 15)
Test error: 776801.8114204492

Performance on TEST set:

MSE (pred, real) | Label (ordered)
Confusion matrix:
tp | fp
fn | tn
244 | 6
0 | 238
TP: 244
TN: 238
FP: 6
FN: 0
