In [12]:
import functools
import time
import math
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow import keras
import datetime
import sys
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler


In [13]:
DATASET_DIR = '../../../datasets/Dataset-IoT/'
NETFLOW_DIRS = ['MC/NetFlow/', 'SC/NetFlow/', 'ST/NetFlow/']


# MC_I_FIRST: Has infected data by Hajime, Aidra and BashLite botnets'
# MC_I_SECOND: Has infected data from Mirai botnets
# MC_I_THIR: Has infected data from Mirai, Doflo, Tsunami and Wroba botnets
# MC_L: Has legitimate data, no infection


path_types = ['MC', 'SC', 'ST']
data_set_files = [ [r'MC_I{}.csv'.format(index) for index in range(1, 4)],
                   [r'SC_I{}.csv'.format(index) for index in range(1, 4)],
                   [r'ST_I{}.csv'.format(index) for index in range(1, 4)] ]

for path, files in zip(path_types, data_set_files):
    files.append(path + '_L.csv')


In [14]:
for n, (path, files) in enumerate(zip(NETFLOW_DIRS, data_set_files), start=1):
    for csvFile in files:
        if n == 1:
            df = pd.read_csv(DATASET_DIR + path + csvFile)
        else:
            aux_df = pd.read_csv(DATASET_DIR + path + csvFile)
            df = pd.concat([df, aux_df], ignore_index=True)
            
print ("Data Loaded")

Data Loaded


In [15]:
#making the final DataFrame
#dropping the number of the rows column
df = df.sample (frac=1, replace=True, random_state=0)
df = df.drop(df.columns[0], axis=1)

#dropping bad columns
nUniques = df.nunique()
for column, nUnique in zip (df.columns, nUniques):
    if(nUnique == 1):
        df.drop(axis='columns', columns=column, inplace=True)

#dropping unrelated columns
df.drop(axis='columns', columns=['ts', 'te', 'sa', 'da'], inplace=True)
#counting different labels
neg, pos = np.bincount(df['Label'])

In [16]:
from sklearn import preprocessing

cat_cols, num_cols = df.columns[df.dtypes == 'O'], df.columns[df.dtypes != 'O']
num_cols = num_cols[1:]

categories = [df[column].unique() for column in df[cat_cols]]

categorical_encoder = preprocessing.OrdinalEncoder(categories=categories)
categorical_encoder.fit(df[cat_cols])
df[cat_cols] = categorical_encoder.transform(df[cat_cols])

In [17]:
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

state=0

train, test = train_test_split (df, test_size=0.2, random_state=state)
train, val = train_test_split (train, test_size=0.2, random_state=state)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

train_labels = np.array (train.pop('Label'))
bool_train_labels = train_labels != 0
val_labels = np.array(val.pop('Label'))
test_labels = np.array (test.pop('Label'))

train_features = np.array(train)
val_features = np.array(val)
test_features = np.array(test)

565496 train examples
141374 validation examples
176718 test examples


In [18]:
#getting the index of the numerical columns
index = [df.columns.get_loc(c)-1 for c in num_cols]
index = np.array(index)

cat_index = [df.columns.get_loc(c) for c in cat_cols]
cat_index = np.array(index)

scaler = StandardScaler()
train_features[:, index] = scaler.fit_transform(train_features[:, index])

val_features[:, index] = scaler.transform(val_features[:, index])

test_features[:, index] = scaler.transform(test_features[:, index])

train_features[:, index] = np.clip(train_features[:, index], -5, 5)
val_features[:, index] = np.clip(val_features[:, index], -5, 5)
test_features[:, index] = np.clip(test_features[:, index], -5, 5)

In [28]:
SAMPLE_2D_SIZE = 3 # 3x3

## zero padding and reshaping

train_features.resize((train_features.shape[0], SAMPLE_2D_SIZE, SAMPLE_2D_SIZE))
train_features = train_features.reshape((train_features.shape[0], 3, 3, 1))
val_features.resize((val_features.shape[0], SAMPLE_2D_SIZE, SAMPLE_2D_SIZE))
val_features = val_features.reshape((val_features.shape[0], 3, 3, 1))
test_features.resize((test_features.shape[0], SAMPLE_2D_SIZE, SAMPLE_2D_SIZE))
test_features = test_features.reshape((test_features.shape[0], 3, 3, 1))

In [20]:
def create_model (lr=1e-1, dropout_rate=0.0):
    initializer = tf.initializers.VarianceScaling(scale=2.0)

    model = models.Sequential()
    model.add(layers.Conv2D(64, (2, 2), activation='relu', input_shape=(3, 3, 1), kernel_initializer=initializer))
    model.add(layers.Flatten())
    model.add(layers.Dense(64, activation='relu', kernel_initializer=initializer))
    model.add(layers.Dropout(dropout_rate))
    model.add(layers.Dense(1, activation='sigmoid', kernel_initializer=initializer))
    
    # model.summary()
    
    model.compile(optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['binary_accuracy'])

    return model

In [40]:
startTime = time.time()
model = create_model (lr=1e-3, dropout_rate=0.0)
history = model.fit (train_features, train_labels,
epochs=50, validation_data=(val_features, val_labels), batch_size=100, verbose=2)
print (time.time() - startTime)

Epoch 1/50
5655/5655 - 7s - loss: 0.0182 - binary_accuracy: 0.9954 - val_loss: 0.0139 - val_binary_accuracy: 0.9960
Epoch 2/50
5655/5655 - 8s - loss: 0.0138 - binary_accuracy: 0.9962 - val_loss: 0.0135 - val_binary_accuracy: 0.9964
Epoch 3/50
5655/5655 - 7s - loss: 0.0132 - binary_accuracy: 0.9963 - val_loss: 0.0130 - val_binary_accuracy: 0.9963
Epoch 4/50
5655/5655 - 7s - loss: 0.0128 - binary_accuracy: 0.9964 - val_loss: 0.0122 - val_binary_accuracy: 0.9964
Epoch 5/50
5655/5655 - 7s - loss: 0.0121 - binary_accuracy: 0.9964 - val_loss: 0.0119 - val_binary_accuracy: 0.9962
Epoch 6/50
5655/5655 - 7s - loss: 0.0116 - binary_accuracy: 0.9963 - val_loss: 0.0118 - val_binary_accuracy: 0.9964
Epoch 7/50
5655/5655 - 7s - loss: 0.0111 - binary_accuracy: 0.9963 - val_loss: 0.0106 - val_binary_accuracy: 0.9964
Epoch 8/50
5655/5655 - 7s - loss: 0.0105 - binary_accuracy: 0.9963 - val_loss: 0.0100 - val_binary_accuracy: 0.9963
Epoch 9/50
5655/5655 - 7s - loss: 0.0095 - binary_accuracy: 0.9964 - val

In [38]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.metrics import cohen_kappa_score
y_pred = model.predict (test_features)
y_pred = y_pred.round ()
# print (y_pred)
TARGET = 'Label'

print ('Confusion matrix:')
print (confusion_matrix (test_labels, y_pred,
                        labels = [0, 1]))

print ('Classification report:')
print (classification_report (test_labels, y_pred,
                            labels = [0, 1],
                            digits = 3))

print ('Accuracy:', accuracy_score (test_labels, y_pred))
print ('Precision:', precision_score (test_labels, y_pred, average = 'macro'))
print ('Recall:', recall_score (test_labels, y_pred, average = 'macro'))
print ('F1:', f1_score (test_labels, y_pred, average = 'macro'))
print ('Cohen Kappa:', cohen_kappa_score (test_labels, y_pred,
                        labels = [0, 1]))


Confusion matrix:
[[   646    187]
 [   116 175769]]
Classification report:
              precision    recall  f1-score   support

           0      0.848     0.776     0.810       833
           1      0.999     0.999     0.999    175885

    accuracy                          0.998    176718
   macro avg      0.923     0.887     0.905    176718
weighted avg      0.998     0.998     0.998    176718

Accuracy: 0.998285403863783
Precision: 0.9233531315899841
Recall: 0.8874253411174857
F1: 0.904585081753452
Cohen Kappa: 0.8091718776692095
