# DDoS Network Intrusion Detection and Binary Classification

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.linear_model import Lasso
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [4]:
pwd

'/Users/karlamuller/my_notebooks'

In [5]:
# This extension for sound notification and cell completion time
get_ipython().magic('load_ext cellevents')

## Loading dataframe with all attacks

In [6]:
# Data after EDA
df_clean = pd.read_csv('mydata/ddos_clean.csv')

time: 9.04 s


In [7]:
# Run df_clean for running models and further anlyses
all_df = df_clean.copy()

time: 340 ms


The data imported has been cleaned and written with the dataLoad_DDoS.ipynb notebook. 

In [8]:
all_df.head(2)

Unnamed: 0.1,Unnamed: 0,Flow Duration,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Std,Bwd Packet Length Max,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,2806495,1,0.0,229.0,0.0,0.0,2000000.0,1.0,0.0,1.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
1,89556,1,0.0,229.0,0.0,0.0,2000000.0,1.0,0.0,1.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


time: 30.4 ms


In [9]:
all_df.shape

(3110401, 42)

time: 1.8 ms


In [10]:
all_df[' Label'].unique()

array([3, 5, 2, 4, 1, 0])

time: 22.6 ms


### Converting multiclass to binary

In [11]:
pd.set_option('display.min_rows', None)

time: 773 µs


In [12]:
# Converting multiclass to binary
bin_df = all_df.replace({' Label': {1:'1', 2:'1', 3:'1', 4:'1', 5:'1', 0:'0'}})

time: 2.34 s


In [13]:
# Converting to type int
bin_df[' Label'].astype(int)
pd.set_option('display.min_rows', None)

time: 242 ms


In [14]:
# Variable assignment
X = bin_df.iloc[:,:-1]
y = bin_df.iloc[:,-1]

time: 1.49 s


In [15]:
# Splitting data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=1)

time: 5.44 s


In [16]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

# Undersampling train set
print('Original dataset shape %s' % Counter(y_train))
rus = RandomUnderSampler(random_state=42)
X_train, y_train = rus.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train))


Original dataset shape Counter({'1': 2482021, '0': 6299})
Resampled dataset shape Counter({'0': 6299, '1': 6299})
time: 11.7 s


In [17]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

time: 997 µs


In [18]:
# This takes long
# GridSearch for decision tree classifier 
params = {'max_depth':[5, 10, 15, 20],
          'max_features': [5, 10, 15, 20]}

grid = GridSearchCV(DecisionTreeClassifier(), params, cv=5)

grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)

Best cross-validation score: 1.00
Best parameters:  {'max_depth': 15, 'max_features': 15}
Best estimator:  DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=15, max_features=15, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
time: 2.22 s


In [19]:
# DT for binary classification
bin_DT = DecisionTreeClassifier(max_depth=15)
bin_DT.fit(X_train,y_train)
print(f'Score on train: {bin_DT.score(X_train, y_train)}')
print(f'Score on test: {bin_DT.score(X_test, y_test)}')

Score on train: 1.0
Score on test: 0.9988409869454299
time: 1.46 s


In [20]:
# New feature importance dataframe
feat_imp = [X.columns, bin_DT.feature_importances_]
df_featimp = pd.DataFrame(feat_imp).T
df_featimp = df_featimp.rename(columns= {0:'Variables', 1:'Gini_Importance'})
df_featimp.sort_values(by='Gini_Importance', ascending=False).head()

Unnamed: 0,Variables,Gini_Importance
27,Avg Fwd Segment Size,0.767252
30,Init_Win_bytes_forward,0.157606
6,Flow Packets/s,0.0605131
15,Fwd IAT Min,0.0100998
26,Average Packet Size,0.00157617


time: 15 ms


In [21]:
# Defining y predict
y_pred = bin_DT.predict(X_test)
from sklearn.metrics import confusion_matrix

# Confusion mastrix
confusion_matrix(y_test, y_pred)

array([[  1575,      0],
       [   721, 619785]])

time: 2.48 s


In [22]:
# Creating a df of confusion matrix
con_mat = confusion_matrix(y_test, y_pred) 
df_conmat = pd.DataFrame(con_mat, columns=['Predictec Class 0', 'Predicted Class 1'] )
df_conmat = df_conmat.rename('True {}'.format)
df_conmat

Unnamed: 0,Predictec Class 0,Predicted Class 1
True 0,1575,0
True 1,721,619785


time: 2.41 s


In [23]:
# Classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      1.00      0.81      1575
           1       1.00      1.00      1.00    620506

    accuracy                           1.00    622081
   macro avg       0.84      1.00      0.91    622081
weighted avg       1.00      1.00      1.00    622081

time: 20 s


In [24]:
from xgboost import XGBClassifier

# XG Boost Classifier
XGB_model = XGBClassifier()

XGB_model.fit(X_train, y_train)

print(f"XG Boost test score: {XGB_model.score(X_test, y_test)}")

XG Boost test score: 0.9994936350732461
time: 5.07 s


In [25]:
# Y predict of XGB
y_predXGB = XGB_model.predict(X_test)

time: 1.48 s


In [26]:
# Creating a df of confusion matrix
con_mat = confusion_matrix(y_test, y_predXGB) 
df_conmat = pd.DataFrame(con_mat, columns=['Predictec Class 0', 'Predicted Class 1'] )
df_conmat = df_conmat.rename('True {}'.format)
df_conmat

Unnamed: 0,Predictec Class 0,Predicted Class 1
True 0,1575,0
True 1,315,620191


time: 2.35 s


In [27]:
# Classification Report for XGB
print(f"XG Boost Classification Report: \n {(classification_report(y_test, y_predXGB))}")


XG Boost Classification Report: 
               precision    recall  f1-score   support

           0       0.83      1.00      0.91      1575
           1       1.00      1.00      1.00    620506

    accuracy                           1.00    622081
   macro avg       0.92      1.00      0.95    622081
weighted avg       1.00      1.00      1.00    622081

time: 19.6 s


### t-SNE visualization

In [None]:
# Different notebook