In [1]:
import functools
import time
import math
import tensorflow as tf
import datetime
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [2]:
DATASET_DIR = '../../../datasets/Dataset-IoT/'
NETFLOW_DIRS = ['MC/NetFlow/', 'SC/NetFlow/', 'ST/NetFlow/']


# MC_I_FIRST: Has infected data by Hajime, Aidra and BashLite botnets'
# MC_I_SECOND: Has infected data from Mirai botnets
# MC_I_THIR: Has infected data from Mirai, Doflo, Tsunami and Wroba botnets
# MC_L: Has legitimate data, no infection


path_types = ['MC', 'SC', 'ST']
data_set_files = [ [r'MC_I{}.csv'.format(index) for index in range(1, 4)],
                   [r'SC_I{}.csv'.format(index) for index in range(1, 4)],
                   [r'ST_I{}.csv'.format(index) for index in range(1, 4)] ]

for path, files in zip(path_types, data_set_files):
    files.append(path + '_L.csv')

################
##reading data##
################

for n, (path, files) in enumerate(zip(NETFLOW_DIRS, data_set_files), start=1):
    for csvFile in files:
        if n == 1:
            df = pd.read_csv(DATASET_DIR + path + csvFile)
        else:
            aux_df = pd.read_csv(DATASET_DIR + path + csvFile)
            df = pd.concat([df, aux_df], ignore_index=True)

print ("Data Loaded")

#making the final DataFrame
#dropping the number of the rows column
df = df.sample (frac=1, replace=True, random_state=0)
df = df.drop(df.columns[0], axis=1)

from unit import remove_columns_with_one_value
remove_columns_with_one_value(df, verbose=0)

#dropping unrelated columns
df.drop(axis='columns', columns=['ts', 'te', 'sa', 'da'], inplace=True)

Data Loaded


In [3]:
from sklearn import preprocessing

cat_cols, num_cols = df.columns[df.dtypes == 'O'], df.columns[df.dtypes != 'O']
num_cols = num_cols[1:]
df

Unnamed: 0,Label,td,sp,dp,pr,flg,ipkt,ibyt
305711,1,0.000,9562,23.0,TCP,....S.,1.0,40.0
435829,1,0.000,43655,23.0,TCP,....S.,1.0,40.0
117952,1,0.000,22947,81.0,TCP,....S.,1.0,40.0
152315,1,0.000,27725,23.0,TCP,....S.,1.0,40.0
882371,1,1016.000,40044,22.0,TCP,....S.,2.0,120.0
...,...,...,...,...,...,...,...,...
878362,1,0.000,4792,23.0,TCP,....S.,1.0,40.0
721181,1,0.000,12053,81.0,TCP,....S.,1.0,40.0
465096,1,0.000,22758,23.0,TCP,....S.,1.0,40.0
305575,1,0.000,58285,81.0,TCP,....S.,1.0,40.0


In [4]:
categories = [df[column].unique() for column in df[cat_cols]]

categorical_encoder = preprocessing.OrdinalEncoder(categories=categories)
categorical_encoder.fit(df[cat_cols])
df[cat_cols] = categorical_encoder.transform(df[cat_cols])

In [5]:
from sklearn.model_selection import train_test_split
TEST_SIZE = 0.3
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split (
                                               df.iloc [:, 1:],
                                               df.iloc [:, 0],
                                               test_size = TEST_SIZE,
                                               random_state = 0)
print ('X_train_df shape:', X_train_df.shape)
print ('y_train_df shape:', y_train_df.shape)
print ('X_test_df shape:', X_test_df.shape)
print ('y_test_df shape:', y_test_df.shape)

X_train_df shape: (618511, 7)
y_train_df shape: (618511,)
X_test_df shape: (265077, 7)
y_test_df shape: (265077,)


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

cols = (len(num_cols) + len(cat_cols)) * [None]
cols[0:len(num_cols)] = num_cols
cols[len(num_cols):] = cat_cols

standard_scaler_features = cols
my_scaler = StandardScaler ()
steps = []
steps.append (('scaler', my_scaler))
standard_scaler_transformer = Pipeline (steps)

In [7]:
preprocessor = ColumnTransformer (transformers = [
               ('sca', standard_scaler_transformer, standard_scaler_features)])


In [8]:
from tensorflow import keras

METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'), 
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]

def create_model(metrics = METRICS, output_bias=None, hidden_layer_size=32, lr=1e-3, dropout_rate=0.0):

    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    model = keras.Sequential([
        keras.layers.Dense(hidden_layer_size, activation='relu', input_shape=(X_train_df.shape[-1],)),#, kernel_initializer=initializer),
        keras.layers.Dense(hidden_layer_size, activation='relu'),#, kernel_initializer=initializer),
        keras.layers.Dropout(dropout_rate),
        keras.layers.Dense(hidden_layer_size, activation='relu'),#, kernel_initializer=initializer),
        keras.layers.Dense(1, activation='sigmoid', bias_initializer=output_bias)#, kernel_initializer=initializer)
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)
    
    return model

In [12]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
clf = KerasClassifier (build_fn=create_model, hidden_layer_size=64, batch_size=1000, epochs=30, verbose=2)
clf = Pipeline (steps=[('preprocessor', preprocessor),
                       ('classifier', clf)], verbose=True)

In [13]:
startTime = time.time()
clf = clf.fit (X_train_df, y_train_df)
print (str(time.time() - startTime), 's to train model.')

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.1s
Epoch 1/30
619/619 - 2s - loss: 0.0235 - tp: 1230998.0000 - fp: 4651.0000 - tn: 1007.0000 - fn: 366.0000 - accuracy: 0.9959 - precision: 0.9962 - recall: 0.9997 - auc: 0.9593
Epoch 2/30
619/619 - 2s - loss: 0.0141 - tp: 615516.0000 - fp: 2353.0000 - tn: 476.0000 - fn: 166.0000 - accuracy: 0.9959 - precision: 0.9962 - recall: 0.9997 - auc: 0.9720
Epoch 3/30
619/619 - 2s - loss: 0.0133 - tp: 615424.0000 - fp: 2137.0000 - tn: 692.0000 - fn: 258.0000 - accuracy: 0.9961 - precision: 0.9965 - recall: 0.9996 - auc: 0.9747
Epoch 4/30
619/619 - 2s - loss: 0.0120 - tp: 615383.0000 - fp: 2014.0000 - tn: 815.0000 - fn: 299.0000 - accuracy: 0.9963 - precision: 0.9967 - recall: 0.9995 - auc: 0.9765
Epoch 5/30
619/619 - 2s - loss: 0.0110 - tp: 615362.0000 - fp: 1934.0000 - tn: 895.0000 - fn: 320.0000 - accuracy: 0.9964 - precision: 0.9969 - recall: 0.9995 - auc: 0.9788
Epoch 6/30
619/619 - 2s - loss: 0.0099 - tp: 615327.0000 - fp:

In [11]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

param_grid = {'classifier__epochs': [1, 3, 5], 'classifier__hidden_layer_size': [16, 32, 64],
              'classifier__dropout_rate': [0.0, 0.1, 0.2, 0.3], 'classifier__batch_size'[1000, 2048, 3000]}
cv = RepeatedStratifiedKFold (n_splits=5, n_repeats=1, random_state=0)
grid = GridSearchCV (estimator=clf, param_grid=param_grid, scoring='f1', verbose=1, n_jobs=1, cv=cv)
grid_result = grid.fit (X_train_df, y_train_df)

print ("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip (means, stds, params):
  print ("%f (%f) with: %r" % (mean, stdev, param))

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.1s
17672/17672 - 25s - loss: 0.0151 - tp: 1264949.0000 - fp: 4588.0000 - tn: 1268.0000 - fn: 1561.0000 - accuracy: 0.9952 - precision: 0.9964 - recall: 0.9988 - auc: 0.9663
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  26.6s
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
4418/4418 - 3s
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.1s
17672/17672 - 24s - loss: 0.0201 - tp: 1826138.0000 - fp: 6713.0000 - tn: 1746.0000 - fn: 3265.0000 - accuracy: 0.9946 - precision: 0.9963 - recall: 0.9982 - auc: 0.9631
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  26.0s
4418/4418 - 3s
[Pipeline] .

KeyboardInterrupt: 