In [1]:
# Author: Kaylani Bochie
# github.com/kaylani2
# kaylani AT gta DOT ufrj DOT br

### K: Model: Decision trees
import sys
import time
import pandas as pd
import os
import numpy as np
from numpy import mean, std
from unit import remove_columns_with_one_value, remove_nan_columns, load_dataset
from unit import display_general_information, display_feature_distribution
from collections import Counter
#from imblearn.over_sampling import RandomOverSampler, RandomUnderSampler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, PredefinedSplit
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif
from sklearn.utils import class_weight
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import keras.utils
from keras import metrics
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Conv2D, MaxPooling2D, Flatten, LSTM
from keras.optimizers import RMSprop, Adam
from keras.constraints import maxnorm

In [2]:
###############################################################################
## Analyse Bezerra's dataset for intrusion detection using Decision Trees
###############################################################################

###############################################################################
## Define constants 
###############################################################################


# Random state for reproducibility
try: 
  # If defined at argv:
  STATE = int(sys.argv[1])
except:
  # If not defined, it will be 0
  STATE = 0
np.random.seed(10)
# List of available attacks on the dataset

TARGET = 'Label'


# Especific to the repository 
DATASET_DIRECTORY = r'../../../datasets/Dataset-bezerra-IoT-20200528T203526Z-001/Dataset-IoT/'
NETFLOW_DIRECTORY = r'NetFlow/'


# There are different csv files on the Dataset, with different types of data:

# Some meanings:
# MC: Media Center
# I: One hour of legitimate and malicious NetFlow data from profile.
# L: One hour of legitimate NetFlow data from profile.

MC = r'MC/'
ST = r'ST/'
SC = r'SC/'


# MC_I_FIRST: Has infected data by Hajime, Aidra and BashLite botnets 
MC_I_FIRST = r'MC_I1.csv'

# MC_I_SECOND: Has infected data from Mirai botnets
MC_I_SECOND = r'MC_I2.csv'

# MC_I_THIR: Has infected data from Mirai, Doflo, Tsunami and Wroba botnets
MC_I_THIRD = r'MC_I3.csv'

# MC_L: Has legitimate data, no infection
MC_L = r'MC_L.csv'


# Constants for ST
ST_I_FIRST = r'ST_I1.csv'
ST_I_SECOND = r'ST_I2.csv'
ST_I_THIRD = r'ST_I3.csv'
ST_L = r'ST_L.csv'

# Constants for SC
SC_I_FIRST = r'SC_I1.csv'
SC_I_SECOND = r'SC_I2.csv'
SC_I_THIRD = r'SC_I3.csv'
SC_L = r'SC_L.csv'


# In[64]:


###############################################################################
## Load dataset
###############################################################################

# For MC data:
df_mc_I_first = pd.read_csv (DATASET_DIRECTORY + MC + NETFLOW_DIRECTORY + MC_I_FIRST)
df_mc_I_second = pd.read_csv (DATASET_DIRECTORY + MC + NETFLOW_DIRECTORY + MC_I_SECOND)
df_mc_I_third = pd.read_csv (DATASET_DIRECTORY + MC + NETFLOW_DIRECTORY + MC_I_THIRD)

# Add legitimate rows from MC_L
legitimate_frame_mc = pd.read_csv (DATASET_DIRECTORY + MC + NETFLOW_DIRECTORY + MC_L)

###################

# For ST data:
df_st_I_first = pd.read_csv (DATASET_DIRECTORY + ST + NETFLOW_DIRECTORY + ST_I_FIRST)
df_st_I_second = pd.read_csv (DATASET_DIRECTORY + ST + NETFLOW_DIRECTORY + ST_I_SECOND)
df_st_I_third = pd.read_csv (DATASET_DIRECTORY + ST + NETFLOW_DIRECTORY + ST_I_THIRD)

# Add legitimate rows from SC_L
legitimate_frame_st = pd.read_csv (DATASET_DIRECTORY + ST + NETFLOW_DIRECTORY + ST_L)


###################

# For SC data:
df_sc_I_first = pd.read_csv (DATASET_DIRECTORY + SC + NETFLOW_DIRECTORY + SC_I_FIRST)
df_sc_I_second = pd.read_csv (DATASET_DIRECTORY + SC + NETFLOW_DIRECTORY + SC_I_SECOND)
df_sc_I_third = pd.read_csv (DATASET_DIRECTORY + SC + NETFLOW_DIRECTORY + SC_I_THIRD)

# Add legitimate rows from MC_L
legitimate_frame_sc = pd.read_csv (DATASET_DIRECTORY + SC + NETFLOW_DIRECTORY + SC_L)

dataframes_list = [df_mc_I_first,
                df_mc_I_second,
                df_mc_I_third,
                legitimate_frame_mc,
                df_st_I_first,
                df_st_I_second,
                df_st_I_third,
                legitimate_frame_st,
                df_sc_I_first,
                df_sc_I_second,
                df_sc_I_third,
                legitimate_frame_sc]

# Joining the differents DataFrames
prev_df = pd.concat(dataframes_list)


# In[65]:


###############################################################################
## Modify the DataFrame
###############################################################################


# Sample the dataset if necessary
df = prev_df.sample (frac = 1, replace = True, random_state = 0)

# We can see that this dataset has a temporal description.
# So it is not a good idea to randomly remove rows if using RNN

# In this case we drop the index column, since pandas library creates an index
# automatically. 
df = df.drop(df.columns[0], axis=1)

# Also drop columns that has no significant data
df = df.drop(df.columns[14:], axis=1)

# Initial and end time is not a good feature for svm model
df = df.drop(['ts', 'te'], axis=1)

# Trying another drops to see relation between features and results
df = df.drop(['fwd', 'stos', 'sa', 'da'], axis=1)
# 'sp', 'dp', 'sa',  'da',  

# Counting number of null data
nanColumns = [i for i in df.columns if df [i].isnull ().any ()]

# Remove NaN and inf values
df.replace ('Infinity', np.nan, inplace = True) ## Or other text values
df.replace (np.inf, np.nan, inplace = True) ## Remove infinity
df.replace (np.nan, 0, inplace = True)


# if (df.Label.value_counts()[1] < df.Label.value_counts()[0]):
#     remove_n =  df.Label.value_counts()[0] - df.Label.value_counts()[1]  # Number of rows to be removed   
#     print(remove_n)
#     df_to_be_dropped = df[df.Label == 0]
#     drop_indices = np.random.choice(df_to_be_dropped.index, remove_n, replace=False)
#     df = df.drop(drop_indices)
# else: 
#     remove_n =  df.Label.value_counts()[1] - df.Label.value_counts()[0]  # Number of rows to be removed   
#     print(remove_n)
#     df_to_be_dropped = df[df.Label == 1]
#     drop_indices = np.random.choice(df_to_be_dropped.index, remove_n, replace=False)
#     df = df.drop(drop_indices)


In [3]:
print ('\nHandling categorical attributes (label encoding).')
my_label_encoder = LabelEncoder ()
df ['flg'] = my_label_encoder.fit_transform (df ['flg'])
df ['pr'] = my_label_encoder.fit_transform (df ['pr'])

print('Columns with object types remaining:') 
print ('Objects:', list (df.select_dtypes ( ['object']).columns))


Handling categorical attributes (label encoding).
Columns with object types remaining:
Objects: []


In [11]:
###############################################################################
## Split dataset into train, and test sets
###############################################################################
### K: Dataset is too big...
# drop_indices = np.random.choice (df.index, int (df.shape [0] * 0.5),
#                                  replace = False)
# df = df.drop (drop_indices)
TEST_SIZE = 3/10
print ('Splitting dataset (test/train):', TEST_SIZE)
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split (
                                               df.iloc [:, 1:],
                                               df.iloc [:, 0],
                                               test_size = TEST_SIZE,
                                               random_state = STATE,)
                                               #shuffle = False)

print ('X_train_df shape:', X_train_df.shape)
print ('y_train_df shape:', y_train_df.shape)
print ('X_test_df shape:', X_test_df.shape)
print ('y_test_df shape:', y_test_df.shape)

Splitting dataset (test/train): 0.3
X_train_df shape: (1207084, 7)
y_train_df shape: (1207084,)
X_test_df shape: (517322, 7)
y_test_df shape: (517322,)


In [12]:
###############################################################################
### Assemble pipeline for grid search
###############################################################################


### Define pipeline to scale all the attributes
###############################################################################

object_features = (list (df.select_dtypes (['object']).columns))
remaining_features = list (df.columns)
for feature in object_features:
    remaining_features.remove (feature)

# Remove the target
remaining_features.remove ('Label')

standard_scaler_features = remaining_features 
my_scaler = StandardScaler ()
steps = list ()
steps.append (('scaler', my_scaler))
standard_scaler_transformer = Pipeline (steps)

preprocessor = ColumnTransformer (transformers = [
               ('sca', standard_scaler_transformer, standard_scaler_features)])

### Define pipeline to fit the model
###############################################################################


clf = DecisionTreeClassifier ()
clf = Pipeline (steps = [ ('preprocessor', preprocessor),
                          ('classifier', clf)],
               verbose = True)


sorted(clf.get_params().keys())
param_grid = {'classifier__criterion' : ['gini', 'entropy'],
              'classifier__splitter' : ['best', 'random'],
              'classifier__max_depth' : [1, 10],#, 100, None],
              'classifier__min_samples_split' : [2]}#, 3, 4]}
cv = RepeatedStratifiedKFold (n_splits = 5, n_repeats = 1, random_state = STATE)
grid = GridSearchCV (estimator = clf, param_grid = param_grid, scoring = 'f1', verbose = 1, n_jobs = -1, cv = cv)
grid_result = grid.fit (X_train_df, y_train_df)

print ("\n\nBest: %f using %s\n\n" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip (means, stds, params):
    print ("%f (%f) with: %r" % (mean, stdev, param))

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.4s
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.4s
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.4s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.4s
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.4s
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.4s
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.1s
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.1s
[Pipel

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   50.1s finished


[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   2.0s


Best: 0.999733 using {'classifier__criterion': 'entropy', 'classifier__max_depth': 10, 'classifier__min_samples_split': 2, 'classifier__splitter': 'best'}


0.997817 (0.000024) with: {'classifier__criterion': 'gini', 'classifier__max_depth': 1, 'classifier__min_samples_split': 2, 'classifier__splitter': 'best'}
0.997707 (0.000043) with: {'classifier__criterion': 'gini', 'classifier__max_depth': 1, 'classifier__min_samples_split': 2, 'classifier__splitter': 'random'}
0.999626 (0.000024) with: {'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__min_samples_split': 2, 'classifier__splitter': 'best'}
0.998661 (0.000047) with: {'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__min_samples_split': 2, 'classifier__splitter': 'random'}
0.997666 (0.000001) with: {'classifier__criterion': 'entropy', 'class

In [13]:
###############################################################################
### Define pipeline to scale all the attributes
###############################################################################

object_features = (list (df.select_dtypes (['object']).columns))
remaining_features = list (df.columns)
for feature in object_features:
    remaining_features.remove (feature)

# Remove the target
remaining_features.remove ('Label')

standard_scaler_features = remaining_features 
my_scaler = StandardScaler ()
steps = list ()
steps.append (('scaler', my_scaler))
standard_scaler_transformer = Pipeline (steps)

preprocessor = ColumnTransformer (transformers = [
               ('sca', standard_scaler_transformer, standard_scaler_features)])

###############################################################################
### Define pipeline for tuned model
###############################################################################

clf = DecisionTreeClassifier (criterion = 'entropy', max_depth = 10,
                              min_samples_split = 2, splitter = 'best')
clf = Pipeline (steps = [ ('preprocessor', preprocessor),
                          ('classifier', clf)],
               verbose = True)

In [14]:
###############################################################################
### Fit the model
###############################################################################

startTime = time.time ()
clf = clf.fit (X_train_df, y_train_df)
training_time = time.time () - startTime
print (str (training_time), 's to train model.')

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   1.9s
2.2577595710754395 s to train model.


In [15]:
from unit import obtain_metrics 

# Predicting from the test slice
y_pred = clf.predict (X_test_df)

obtain_metrics(y_test_df, y_pred, "output_decision_tree_CV.txt", STATE, training_time)

Precision Score:  0.9997203861777567
Recall Score:  0.999819398188156
Accuracy:  0.9995418714069767
F1 Score:  0.9997698897315478
Cohen Kappa Score:  0.9493765509783729
[[[514852     93]
  [   144   2233]]

 [[  2233    144]
  [    93 514852]]]
