# DDoS Network Intrusion Detection and Binary Classification

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.linear_model import Lasso
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [3]:
pwd

'/Users/karlamuller/my_notebooks'

In [4]:
# This extension for sound notification and cell completion time
get_ipython().magic('load_ext cellevents')

## Loading dataframe with all attacks

In [5]:
# Data after EDA
df_clean = pd.read_csv('mydata/all_dfclean.csv')

time: 8.19 s


In [6]:
# Run df_clean for running models and further anlyses
all_df = df_clean.copy()

time: 362 ms


The data imported has been cleaned and written with the dataLoad_DDoS.ipynb notebook. 

In [7]:
all_df.head(2)

Unnamed: 0.1,Unnamed: 0,Source Port,Destination Port,Flow Duration,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,...,Init_Win_bytes_backward,min_seg_size_forward,Active Mean,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,47827,47090,981829,0.0,401.0,30.127623,47827.0,20.370146,51675.210526,...,-1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
1,1,50388,43249,218355,0.0,393.0,35.08846,50388.0,27.478189,43671.0,...,-1,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


time: 27.1 ms


In [8]:
all_df.shape

(2386803, 43)

time: 2.78 ms


In [42]:
all_df[' Label'].unique()

array([5, 2, 3, 1, 4, 0])

time: 14.9 ms


### Converting multiclass to binary

In [43]:
pd.set_option('display.min_rows', None)

time: 731 µs


In [44]:
# Converting multiclass to binary
bin_df = all_df.replace({' Label': {1:'1', 2:'1', 3:'1', 4:'1', 5:'1', 0:'0'}})

time: 962 ms


In [45]:
# Converting to type int
bin_df[' Label'].astype(int)
pd.set_option('display.min_rows', None)

time: 151 ms


In [46]:
# Destination Port and Source Port will be dropped as they're both identifier variables 
# Including them in model would result in data leakage
bin_df = bin_df.drop(columns=[' Destination Port', ' Source Port'])

time: 521 ms


In [47]:
# Variable assignment
X = bin_df.iloc[:,:-1]
y = bin_df.iloc[:,-1]

time: 474 ms


In [48]:
# Splitting data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=1)

time: 3.42 s


In [49]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

# Undersampling train set
print('Original dataset shape %s' % Counter(y_train))
rus = RandomUnderSampler(random_state=42)
X_train, y_train = rus.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train))


Original dataset shape Counter({'1': 1903143, '0': 6299})
Resampled dataset shape Counter({'0': 6299, '1': 6299})
time: 5.46 s


In [50]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

time: 656 µs


In [51]:
# GridSearch for second decision tree classifier 
params = {'max_depth':[5,10,15,20]}

grid = GridSearchCV(DecisionTreeClassifier(), params, cv=5)

grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

print("Best estimator: ", grid.best_estimator_)

Best cross-validation score: 1.00
Best parameters:  {'max_depth': 15}
Best estimator:  DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=15, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
time: 1.37 s


In [53]:

bin_DT = DecisionTreeClassifier(max_depth=15)
bin_DT.fit(X_train,y_train)
print(f'Score on train: {bin_DT.score(X_train, y_train)}')
print(f'Score on test: {bin_DT.score(X_test, y_test)}')

Score on train: 1.0
Score on test: 0.9989127725138837
time: 1.05 s


In [54]:
# New feature importance dataframe
feat_imp = [X.columns, bin_DT.feature_importances_]
df_featimp = pd.DataFrame(feat_imp).T
df_featimp = df_featimp.rename(columns= {0:'Variables', 1:'Gini_Importance'})
df_featimp.sort_values(by='Gini_Importance', ascending=False).head()

Unnamed: 0,Variables,Gini_Importance
26,Avg Fwd Segment Size,0.70849
30,Init_Win_bytes_forward,0.136143
6,Flow Packets/s,0.0771866
24,Packet Length Variance,0.044377
37,Idle Std,0.0150652


time: 9.67 ms


In [55]:
# Defining y predict
y_pred = bin_DT.predict(X_test)
from sklearn.metrics import confusion_matrix

# Confusion mastrix
confusion_matrix(y_test, y_pred)

array([[  1575,      0],
       [   519, 475267]])

time: 1.82 s


In [56]:
# Creating a df of confusion matrix
con_mat = confusion_matrix(y_test, y_pred) 
df_conmat = pd.DataFrame(con_mat, columns=['Predictec Class 0', 'Predicted Class 1'] )
df_conmat = df_conmat.rename('True {}'.format)
df_conmat

Unnamed: 0,Predictec Class 0,Predicted Class 1
True 0,1575,0
True 1,519,475267


time: 1.74 s


In [58]:
# Classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      1.00      0.86      1575
           1       1.00      1.00      1.00    475786

    accuracy                           1.00    477361
   macro avg       0.88      1.00      0.93    477361
weighted avg       1.00      1.00      1.00    477361

time: 14.5 s


In [59]:
from xgboost import XGBClassifier
# XG Boost Classifier
XGB_model = XGBClassifier()

XGB_model.fit(X_train, y_train)

print(f"XG Boost test score: {XGB_model.score(X_test, y_test)}")

XG Boost test score: 0.9996061680782469
time: 4.21 s


In [60]:
# Y predict of XGB
y_predXGB = XGB_model.predict(X_test)

time: 1.04 s


In [61]:
# Creating a df of confusion matrix
con_mat = confusion_matrix(y_test, y_predXGB) 
df_conmat = pd.DataFrame(con_mat, columns=['Predictec Class 0', 'Predicted Class 1'] )
df_conmat = df_conmat.rename('True {}'.format)
df_conmat

Unnamed: 0,Predictec Class 0,Predicted Class 1
True 0,1575,0
True 1,188,475598


time: 1.71 s


In [62]:
# Classification Report for XGB
print(f"XG Boost Classification Report: \n {(classification_report(y_test, y_predXGB))}")


XG Boost Classification Report: 
               precision    recall  f1-score   support

           0       0.89      1.00      0.94      1575
           1       1.00      1.00      1.00    475786

    accuracy                           1.00    477361
   macro avg       0.95      1.00      0.97    477361
weighted avg       1.00      1.00      1.00    477361

time: 14.2 s


### t-SNE visualization

In [None]:
# Different notebook/future direction