In [1]:
# Author: Kaylani Bochie
# github.com/kaylani2
# kaylani AT gta DOT ufrj DOT br

### K: Model: Multilayer perceptron
import sys
import time
import pandas as pd
import os
import math
sys.path.insert(1, '../')
import numpy as np
from numpy import mean, std
from unit import remove_columns_with_one_value, remove_nan_columns, load_dataset
from unit import display_general_information, display_feature_distribution
from collections import Counter
#from imblearn.over_sampling import RandomOverSampler, RandomUnderSampler
import sklearn
from sklearn import set_config
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, PredefinedSplit, RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif
from sklearn.utils import class_weight
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import keras.utils
from keras import metrics
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Conv2D, MaxPooling2D, Flatten, LSTM
from keras.optimizers import RMSprop, Adam
from keras.constraints import maxnorm

In [2]:
###############################################################################
## Define constants
###############################################################################
pd.set_option ('display.max_rows', None)
pd.set_option ('display.max_columns', 5)
BOT_IOT_DIRECTORY = '../../../../../datasets/bot-iot/'
BOT_IOT_FEATURE_NAMES = 'UNSW_2018_IoT_Botnet_Dataset_Feature_Names.csv'
BOT_IOT_FILE_5_PERCENT_SCHEMA = 'UNSW_2018_IoT_Botnet_Full5pc_{}.csv' # 1 - 4
FIVE_PERCENT_FILES = 4
BOT_IOT_FILE_FULL_SCHEMA = 'UNSW_2018_IoT_Botnet_Dataset_{}.csv' # 1 - 74
FULL_FILES = 74
FILE_NAME = BOT_IOT_DIRECTORY + BOT_IOT_FILE_5_PERCENT_SCHEMA
FEATURES = BOT_IOT_DIRECTORY + BOT_IOT_FEATURE_NAMES
NAN_VALUES = ['?', '.']
TARGET = 'attack'
INDEX_COLUMN = 'pkSeqID'
LABELS = ['attack', 'category', 'subcategory']
STATE = 0
try:
  STATE = int (sys.argv [1])
except:
  pass
np.random.seed (STATE)
print ('STATE:', STATE)

STATE: 0


In [3]:
###############################################################################
## Load dataset
###############################################################################
df = load_dataset (FILE_NAME, FIVE_PERCENT_FILES, INDEX_COLUMN, NAN_VALUES)

Reading ../../../../datasets/bot-iot/UNSW_2018_IoT_Botnet_Full5pc_1.csv
Reading ../../../../datasets/bot-iot/UNSW_2018_IoT_Botnet_Full5pc_2.csv
Reading ../../../../datasets/bot-iot/UNSW_2018_IoT_Botnet_Full5pc_3.csv
Reading ../../../../datasets/bot-iot/UNSW_2018_IoT_Botnet_Full5pc_4.csv


In [4]:
###############################################################################
## Clean dataset
###############################################################################
###############################################################################
### Remove columns with only one value
df, log = remove_columns_with_one_value (df, verbose = False)
print (log)


###############################################################################
### Remove redundant columns, useless columns and unused targets
### K: _number columns are numerical representations of other existing columns.
### K: category and subcategory are other labels.
### K: saddr and daddr may specialize the model to a single network
redundant_columns = ['state_number', 'proto_number', 'flgs_number']
other_targets = ['category', 'subcategory']
misc_columns = ['saddr', 'daddr']
print ('Removing redundant columns:', redundant_columns)
print ('Removing useless targets:', other_targets)
print ('Removing misc columns:', misc_columns)
columns_to_remove = redundant_columns + other_targets + misc_columns
df.drop (axis = 'columns', columns = columns_to_remove, inplace = True)

###############################################################################
### Remove NaN columns (with a lot of NaN values)
df, log = remove_nan_columns (df, 1/2, verbose = False)
print (log)

###############################################################################
### Encode categorical features
print ('Encoding categorical features (ordinal encoding).')
my_encoder = OrdinalEncoder ()
df ['flgs'] = my_encoder.fit_transform (df ['flgs'].values.reshape(-1, 1))
df ['proto'] = my_encoder.fit_transform (df ['proto'].values.reshape(-1, 1))
df ['sport'] = my_encoder.fit_transform (df ['sport'].astype (str).values.reshape(-1, 1))
df ['dport'] = my_encoder.fit_transform (df ['dport'].astype (str).values.reshape(-1, 1))
df ['state'] = my_encoder.fit_transform (df ['state'].values.reshape(-1, 1))
print ('Objects:', list (df.select_dtypes ( ['object']).columns))


###############################################################################
## Quick sanity check
###############################################################################
display_general_information (df)

While removing single value columns: No columns dropped.
Removing redundant columns: ['state_number', 'proto_number', 'flgs_number']
Removing useless targets: ['category', 'subcategory']
Removing misc columns: ['saddr', 'daddr']
While removing nan value columns: No columns dropped.
Encoding categorical features (ordinal encoding).
Objects: []
Dataframe shape (lines, columns): (3668522, 38) 

First 5 entries:
                 stime  flgs  ...  Pkts_P_State_P_Protocol_P_SrcIP  attack
pkSeqID                      ...                                         
1        1.528089e+09   0.0  ...                              602       1
2        1.528089e+09   0.0  ...                                6       1
3        1.528089e+09   0.0  ...                              602       1
4        1.528089e+09   0.0  ...                              602       1
5        1.528089e+09   0.0  ...                              602       1

[5 rows x 38 columns] 

<class 'pandas.core.frame.DataFrame'>
Int64I

In [5]:
###############################################################################
## Split dataset into train and test sets
###############################################################################
### K: Dataset is too big? Drop.
# drop_indices = np.random.choice (df.index, int (df.shape [0] * 0.5),
#                                  replace = False)
# df = df.drop (drop_indices)
TEST_SIZE = 3/10
print ('Splitting dataset (test/train):', TEST_SIZE)
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split (
                                               df.loc [:, df.columns != TARGET],
                                               df [TARGET],
                                               test_size = TEST_SIZE,
                                               random_state = STATE,)
print ('X_train_df shape:', X_train_df.shape)
print ('y_train_df shape:', y_train_df.shape)
print ('X_test_df shape:', X_test_df.shape)
print ('y_test_df shape:', y_test_df.shape)

Splitting dataset (test/train): 0.3
X_train_df shape: (2567965, 37)
y_train_df shape: (2567965,)
X_test_df shape: (1100557, 37)
y_test_df shape: (1100557,)


In [6]:
###############################################################################
## Create wrapper function for keras
## Usage: clf = KerasClassifier (build_fn = create_model, verbose = 2)
## Parameters epochs and batch_size are standard from KerasClassifier
###############################################################################
def create_model (learn_rate = 0.01, dropout_rate = 0.0, weight_constraint = 0,
                  input_shape = 9, metrics = ['acuracy']):
  model = Sequential ()
  model.add (Dense (units = 64, activation = 'relu',
                   input_shape = (input_shape, )))
  model.add (Dropout (dropout_rate))
  model.add (Dense (32, activation = 'relu'))
  model.add (Dense (1, activation = 'sigmoid'))
  model.compile (loss = 'binary_crossentropy',
                optimizer = Adam (lr = learn_rate),
                metrics = metrics)#, metrics.CategoricalAccuracy ()])
  return model

In [7]:
### Define auxiliary class to optmize feature selection and network hyperparameters at the same time.
class MyKerasClf():
    def predict(self, X):
        y_pred_nn = self.clf.predict(X)
        return np.array(y_pred_nn).flatten()
    
    def create_model(self, learn_rate = 0.01, weight_constraint = 0 ):
        model = Sequential ()
        model.add (Dense (units = 64, activation = 'relu',
                       input_shape = (self.input_shape, )))
        model.add (Dropout (self.dropout_rate))
        model.add (Dense (32, activation = 'relu'))
        model.add (Dense (1, activation = 'sigmoid'))
        model.compile (loss = 'binary_crossentropy',
                    optimizer = Adam (lr = learn_rate),
                    metrics = ['accuracy'])
        return model
        
    def fit(self, X, y, **kwargs):
        self.input_shape = X.shape[1]
        self.clf = KerasClassifier(build_fn = self.create_model, verbose = 2)
        self.clf.fit(X, y, **kwargs)
    
    def set_params(self, **params):
        if 'dropout_rate' in params:
            self.dropout_rate = params['dropout_rate']
        else:
            self.dropout_rate = 0.0

In [8]:
###############################################################################
## Define processing pipeline for grid search
###############################################################################
###############################################################################
### standard_scaler ### K: Non object features
object_features = (list (df.select_dtypes (['object']).columns))
remaining_features = list (df.columns)
for feature in object_features:
  remaining_features.remove (feature)
remaining_features.remove (TARGET)

standard_scaler_features = remaining_features
my_scaler = StandardScaler ()
steps = list ()
steps.append (('scaler', my_scaler))
standard_scaler_transformer = Pipeline (steps)

###############################################################################
### Assemble column transformer
preprocessor = ColumnTransformer (transformers = [
               ('sca', standard_scaler_transformer, standard_scaler_features)])

###############################################################################
### feature selector ### K: Non object features
my_feature_selector = SelectKBest ()
steps = list ()
steps.append (('feature_selector', my_feature_selector))
feature_selector_transformer = Pipeline (steps)

###############################################################################
### Assemble pipeline for grid search
#clf = KerasClassifier (build_fn = create_model, verbose = 2)
clf = MyKerasClf ()
clf = Pipeline (steps = [('preprocessor', preprocessor),
                         ('feature_selector', feature_selector_transformer),
                         ('classifier', clf)],
               verbose = True)
#set_config(display='diagram')
#clf
print (sorted(clf.get_params().keys()))

###############################################################################
### Run grid search
#sorted(sklearn.metrics.SCORERS.keys())
### K: How to set classifier__input_shape to match feature_selector__k?
param_grid = {'feature_selector__feature_selector__score_func' : [f_classif],
              'feature_selector__feature_selector__k' : [5, 9],
              #'classifier__input_shape' : [9],
              'classifier__batch_size' : [50000],
              'classifier__learn_rate' : [0.001],#, 0.01],
              'classifier__dropout_rate' : [0.0],
              'classifier__epochs' : [5]}#, 7]}
cv = RepeatedStratifiedKFold (n_splits = 5, n_repeats = 1, random_state = STATE)
grid = GridSearchCV (estimator = clf, param_grid = param_grid, scoring = 'f1',
                     verbose = 1, n_jobs = 1, cv = cv)
grid_result = grid.fit (X_train_df, y_train_df)

print ('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip (means, stds, params):
  print ('%f (%f) with: %r' % (mean, stdev, param))

['classifier', 'feature_selector', 'feature_selector__feature_selector', 'feature_selector__feature_selector__k', 'feature_selector__feature_selector__score_func', 'feature_selector__memory', 'feature_selector__steps', 'feature_selector__verbose', 'memory', 'preprocessor', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sca', 'preprocessor__sca__memory', 'preprocessor__sca__scaler', 'preprocessor__sca__scaler__copy', 'preprocessor__sca__scaler__with_mean', 'preprocessor__sca__scaler__with_std', 'preprocessor__sca__steps', 'preprocessor__sca__verbose', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'steps', 'verbose']
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   3.2s
[Pipeline] .. (step 2 of 3) Processing feature_selector, total=   1.5s
64200/64200 - 65s - loss: 0.0081 - accuracy: 0.9999
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.1min
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
16050/16050 - 8s
[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   2.8s
[Pipeline] .. (step 2 of 3) Processing feature_selector, total=   1.4s
64200/64200 - 64s - loss: 0.0141 - accuracy: 0.9999
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.1min
16050/16050 - 7s
[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   2.5s
[Pipeline] .. (step 2 of 3) Pro

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 13.0min finished


[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   2.6s
[Pipeline] .. (step 2 of 3) Processing feature_selector, total=   1.4s
80249/80249 - 79s - loss: 0.0145 - accuracy: 0.9999
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.3min
Best: 0.999981 using {'classifier__batch_size': 50000, 'classifier__dropout_rate': 0.0, 'classifier__epochs': 5, 'classifier__learn_rate': 0.001, 'feature_selector__feature_selector__k': 5, 'feature_selector__feature_selector__score_func': <function f_classif at 0x7fad53ff4700>}
0.999981 (0.000007) with: {'classifier__batch_size': 50000, 'classifier__dropout_rate': 0.0, 'classifier__epochs': 5, 'classifier__learn_rate': 0.001, 'feature_selector__feature_selector__k': 5, 'feature_selector__feature_selector__score_func': <function f_classif at 0x7fad53ff4700>}
0.999978 (0.000007) with: {'classifier__batch_size': 50000, 'classifier__dropout_rate': 0.0, 'classifier__epochs': 5, 'classifier__learn_rate': 0.001, 'feature_selector__

In [9]:
###############################################################################
## Define processing pipeline for training (hyperparameter are optimized)
###############################################################################
###############################################################################
### standard_scaler ### K: Non object features
object_features = (list (df.select_dtypes (['object']).columns))
remaining_features = list (df.columns)
for feature in object_features:
  remaining_features.remove (feature)
remaining_features.remove (TARGET)

standard_scaler_features = remaining_features
my_scaler = StandardScaler ()
steps = list ()
steps.append (('scaler', my_scaler))
standard_scaler_transformer = Pipeline (steps)

###############################################################################
### Assemble column transformer
preprocessor = ColumnTransformer (transformers = [
               ('sca', standard_scaler_transformer, standard_scaler_features)])

###############################################################################
### feature selector
# Best: 0.999986 using {'classifier__batch_size': 50, 'classifier__dropout_rate': 0.1, 'classifier__epochs': 5, 'classifier__input_shape': 9, 'classifier__learn_rate': 0.001, 'feature_selector__feature_selector__k': 9, 'feature_selector__feature_selector__score_func': <function f_classif at 0x7f5d27893ee0>}
NUMBER_OF_FEATURES = 9
SCORE_FUNCTION = f_classif
my_feature_selector = SelectKBest (score_func = SCORE_FUNCTION, k = NUMBER_OF_FEATURES)
steps = list ()
steps.append (('feature_selector', my_feature_selector))
feature_selector_transformer = Pipeline (steps)

###############################################################################
### Assemble pipeline for training
METRICS = [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'), 
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),]
BATCH_SIZE = 1000
DROPOUT_RATE = 0.0
NUMBER_OF_EPOCHS = 5
LEARN_RATE = 0.001
WEIGHT_CONSTRAINT = 0
NUMBER_OF_FEATURES = 9
# Best: 0.999986 using {'classifier__batch_size': 50, 'classifier__dropout_rate': 0.1, 'classifier__epochs': 5, 'classifier__input_shape': 9, 'classifier__learn_rate': 0.001, 'feature_selector__feature_selector__k': 9, 'feature_selector__feature_selector__score_func': <function f_classif at 0x7f5d27893ee0>}
clf = KerasClassifier (build_fn = create_model, learn_rate = LEARN_RATE,
                                                dropout_rate = DROPOUT_RATE,
                                                weight_constraint = WEIGHT_CONSTRAINT,
                                                input_shape = NUMBER_OF_FEATURES,
                       epochs = NUMBER_OF_EPOCHS, batch_size = BATCH_SIZE, metrics = METRICS,
                       verbose = 2)
clf = Pipeline (steps = [('preprocessor', preprocessor),
                         ('feature_selector', feature_selector_transformer),
                         ('classifier', clf)],
                verbose = True)

###############################################################################
### Train
startTime = time.time ()
clf = clf.fit (X_train_df, y_train_df)
print (str (time.time () - startTime), 's to train model.')

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   2.6s
[Pipeline] .. (step 2 of 3) Processing feature_selector, total=   1.5s
Epoch 1/5
2568/2568 - 9s - loss: 0.0161 - tp: 2562055.0000 - fp: 121.0000 - tn: 230.0000 - fn: 5559.0000 - accuracy: 0.9978 - precision: 1.0000 - recall: 0.9978 - auc: 0.8187
Epoch 2/5
2568/2568 - 9s - loss: 4.1674e-04 - tp: 2567604.0000 - fp: 84.0000 - tn: 267.0000 - fn: 10.0000 - accuracy: 1.0000 - precision: 1.0000 - recall: 1.0000 - auc: 0.8803
Epoch 3/5
2568/2568 - 9s - loss: 3.0817e-04 - tp: 2567614.0000 - fp: 83.0000 - tn: 268.0000 - fn: 0.0000e+00 - accuracy: 1.0000 - precision: 1.0000 - recall: 1.0000 - auc: 0.8818
Epoch 4/5
2568/2568 - 9s - loss: 4.2437e-04 - tp: 2567608.0000 - fp: 84.0000 - tn: 267.0000 - fn: 6.0000 - accuracy: 1.0000 - precision: 1.0000 - recall: 1.0000 - auc: 0.8803
Epoch 5/5
2568/2568 - 9s - loss: 2.8816e-04 - tp: 2567612.0000 - fp: 82.0000 - tn: 269.0000 - fn: 2.0000 - accuracy: 1.0000 - precision: 1.0000 - recall:

In [10]:
###############################################################################
## Evaluate performance
###############################################################################
print ('\nPerformance on TRAIN set:')
y_pred = clf.predict (X_train_df)
my_confusion_matrix = confusion_matrix (y_train_df, y_pred, labels = df [TARGET].unique ())
tn, fp, fn, tp = my_confusion_matrix.ravel ()
### K: NOTE: Scikit's confusion matrix is different from keras. We want attacks to be
### the positive class:
tp, tn, fp, fn = tn, tp, fn, fp
print ('Confusion matrix:')
print (my_confusion_matrix)
print ('Accuracy:', accuracy_score (y_train_df, y_pred))
print ('Precision:', precision_score (y_train_df, y_pred, average = 'macro'))
print ('Recall:', recall_score (y_train_df, y_pred, average = 'macro'))
print ('F1:', f1_score (y_train_df, y_pred, average = 'macro'))
print ('Cohen Kappa:', cohen_kappa_score (y_train_df, y_pred,
                       labels = df [TARGET].unique ()))
print ('TP:', tp)
print ('TN:', tn)
print ('FP:', fp)
print ('FN:', fn)

### K: Only before publishing... Don't peek.
sys.exit ()
print ('\nPerformance on TEST set:')
y_pred = clf.predict (X_test_df)
my_confusion_matrix = confusion_matrix (y_test_df, y_pred, labels = df [TARGET].unique ())
tn, fp, fn, tp = my_confusion_matrix.ravel ()
### K: NOTE: Scikit's confusion matrix is different from keras. We want attacks to be
### the positive class:
tp, tn, fp, fn = tn, tp, fn, fp
print ('Confusion matrix:')
print (my_confusion_matrix)
print ('Accuracy:', accuracy_score (y_test_df, y_pred))
print ('Precision:', precision_score (y_test_df, y_pred, average = 'macro'))
print ('Recall:', recall_score (y_test_df, y_pred, average = 'macro'))
print ('F1:', f1_score (y_test_df, y_pred, average = 'macro'))
print ('Cohen Kappa:', cohen_kappa_score (y_test_df, y_pred,
                       labels = df [TARGET].unique ()))
print ('TP:', tp)
print ('TN:', tn)
print ('FP:', fp)
print ('FN:', fn)


Performance on TRAIN set:
2568/2568 - 2s
Confusion matrix:
[[2567614       0]
 [     85     266]]
Accuracy: 0.9999668998603953
Precision: 0.9999834482156982
Recall: 0.878917378917379
F1: 0.9311100383955013
Cohen Kappa: 0.8622203909653463
TP: 2567614
TN: 266
FP: 85
FN: 0


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
