# Data Set Bezerra - NN

In [1]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [4]:
import functools
import time
import math
import tensorflow as tf
import datetime

In [3]:
# Clear any logs from previous runs
!rm -rf ./logs/ 

Defining constants

In [5]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']


DEVICE = 'GPU/:0'

DATASET_DIR = '../datasets/Dataset-IoT/'
NETFLOW_DIR = DATASET_DIR + 'MC/NetFlow/'

# MC_I_FIRST: Has infected data by Hajime, Aidra and BashLite botnets'
# MC_I_SECOND: Has infected data from Mirai botnets
# MC_I_THIR: Has infected data from Mirai, Doflo, Tsunami and Wroba botnets
# MC_L: Has legitimate data, no infection
MC_L = r'MC_L.csv'

data_set_files = [r'MC_I{}.csv'.format(index) for index in range(1, 4)]
data_set_files.insert(0, r'MC_L.csv')
print (data_set_files)

['MC_L.csv', 'MC_I1.csv', 'MC_I2.csv', 'MC_I3.csv']


Loading the data set into a pd DataFrame

In [6]:
legitimate_file_path = NETFLOW_DIR + MC_L

LABEL_COLUMN = 'Label'

#reading data
df = pd.read_csv (legitimate_file_path)

# for file in data_set_files:
#     aux_df = pd.read_csv(NETFLOW_DIR + file)
#     df = pd.concat([df, aux_df], ignore_index=True)

aux_df = pd.read_csv(NETFLOW_DIR + data_set_files[1])
df = pd.concat([df, aux_df], ignore_index=True)

#making the final DataFrame
df = df.sample(frac=1, random_state=math.ceil(time.time()), )
df = df.drop(df.columns[0], axis=1)

Unnamed: 0,Label,td,sp,dp,fwd,stos,ipkt,ibyt,opkt,obyt,...,smk,dmk,dtos,dir,svln,dvln,cl,sl,al,exid
count,360617.0,360617.0,360617.0,360617.0,360617.0,360617.0,360617.0,360617.0,360617.0,360617.0,...,360617.0,360617.0,360617.0,360617.0,360617.0,360617.0,360617.0,360617.0,360617.0,360617.0
mean,0.99444,701.2068,31020.098026,2121.932535,0.0,0.0,4.185163,2208.133,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
std,0.074357,24376.78,19846.5477,9340.086984,0.0,0.0,274.643406,220884.7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,20.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,1.0,0.0,13529.0,23.0,0.0,0.0,1.0,40.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1.0,0.0,31443.0,23.0,0.0,0.0,1.0,40.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1.0,0.0,48105.0,81.0,0.0,0.0,1.0,40.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,3502610.0,65535.0,65535.0,0.0,0.0,96808.0,79230120.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Splitting the data set

In [5]:
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

train, test = train_test_split (df, test_size=0.2)
train, val = train_test_split (train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')



230794 train examples
57699 validation examples
72124 test examples


In [22]:
neg, pos = np.bincount(df['Label'])
print (neg)
print (pos)

2005
358612


Scaling the data

In [6]:
#DataFrame to tf.data.Dataset object
def df_to_dataset(dataFrame, shuffle=True, batch_size=32):
    print(dataFrame)
    dataFrame = dataFrame.copy()
    labels = dataFrame.pop(LABEL_COLUMN)
    data_set = tf.data.Dataset.from_tensor_slices((dict(dataFrame), labels))
    if shuffle:
        data_set = data_set.shuffle(buffer_size=len(dataFrame))
    data_set = data_set.batch(batch_size)
    return data_set

In [8]:
BATCH_SIZE = 32
type(train)
for key in train.columns:
    print(type(train[key]))
# train_ds = df_to_dataset(train, batch_size=BATCH_SIZE)
# val_ds = df_to_dataset(val, shuffle=False, batch_size=BATCH_SIZE)
# test_ds = df_to_dataset(test, shuffle=False, batch_size=BATCH_SIZE)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.S

# Building the Feature Layer

In [18]:
feature_columns = []

cat_cols, num_cols = df.columns[df.dtypes == 'O'], df.columns[df.dtypes != 'O']
num_cols = num_cols[1:]

#numeric columns
for key in num_cols:
    feature_columns.append(feature_column.numeric_column(key))


#categorical columns
all_categories = [df[column].unique() for column in df[cat_cols]]
for item, categories in zip(cat_cols, all_categories):
    feature = feature_column.categorical_column_with_vocabulary_list(item, categories)
    mfeature = feature_column.embedding_column (feature, dimension=8)
    feature_columns.append(mfeature)

In [19]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

initializer = tf.initializers.VarianceScaling(scale=2.0)
hidden_layer_size, num_classes = 128, 1
layers = [
    feature_layer,
    tf.keras.layers.Dense(hidden_layer_size, use_bias=True, activation='relu', kernel_initializer=initializer),
    tf.keras.layers.Dense(hidden_layer_size,  use_bias=True, activation='relu', kernel_initializer=initializer),
    tf.keras.layers.Dense(hidden_layer_size,  use_bias=True, activation='relu', kernel_initializer=initializer),
    tf.keras.layers.Dense(num_classes,  use_bias=True, kernel_initializer=initializer),
]

optimizer = keras.optimizer.Adam('')
model = tf.keras.Sequential(layers)
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [12]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)


with tf.device (DEVICE):
    model.fit(train_ds, 
              epochs=5, 
              validation_data=val_ds, 
              callbacks=[tensorboard_callback])

Epoch 1/5


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
%tensorboard --logdir logs/fit