*Required* Python libraries

In [None]:
import pandas as pd 
from google.colab import files
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics,svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np 


In [None]:
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Please upload your Kaggle API JSON file (`kaggle.json`); it is needed to download the dataset.

In [None]:
#@title
uploaded = files.upload()
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Downloading the dataset from Kaggle

In [None]:
!kaggle datasets download -d yashwanthkumbam/apaddos-dataset
!unzip apaddos-dataset

Downloading apaddos-dataset.zip to /content
 98% 131M/134M [00:01<00:00, 111MB/s]
100% 134M/134M [00:01<00:00, 127MB/s]
Archive:  apaddos-dataset.zip
  inflating: APA-DDoS-Dataset/APA-DDoS-Dataset.csv  
  inflating: final-pcap/ACK.pcap     
  inflating: final-pcap/PUSH-ACK.pcap  


Load the file and create a Pandas DataFrame

In [None]:
# Path of the CSV file inside the extracted archive.
filepath = "APA-DDoS-Dataset/APA-DDoS-Dataset.csv"
raw_df = pd.read_csv(filepath, sep=',')
# Tag the frame with the name of the file it was read from.
raw_df.dataframeName = 'APA-DDoS-Dataset.csv'
nRow, nCol = raw_df.shape[0], raw_df.shape[1]
print(f'There are {nRow} rows and {nCol} columns')

There are 151200 rows and 23 columns


In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151200 entries, 0 to 151199
Data columns (total 23 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   ip.src           151200 non-null  object
 1   ip.dst           151200 non-null  object
 2   tcp.srcport      151200 non-null  int64 
 3   tcp.dstport      151200 non-null  int64 
 4   ip.proto         151200 non-null  int64 
 5   frame.len        151200 non-null  int64 
 6   tcp.flags.syn    151200 non-null  int64 
 7   tcp.flags.reset  151200 non-null  int64 
 8   tcp.flags.push   151200 non-null  int64 
 9   tcp.flags.ack    151200 non-null  int64 
 10  ip.flags.mf      151200 non-null  int64 
 11  ip.flags.df      151200 non-null  int64 
 12  ip.flags.rb      151200 non-null  int64 
 13  tcp.seq          151200 non-null  int64 
 14  tcp.ack          151200 non-null  int64 
 15  frame.time       151200 non-null  object
 16  Packets          151200 non-null  int64 
 17  Bytes     

List the class labels in the dataset that are used for building the ML models

In [None]:
# Distinct values found in the 'Label' column.
raw_df['Label'].unique()

array(['DDoS-PSH-ACK', 'Benign', 'DDoS-ACK'], dtype=object)

In [None]:
# Collapse the two attack labels into one 'DDoS' class; every other label becomes 'Benign'.
attack_labels = ('DDoS-PSH-ACK', 'DDoS-ACK')
raw_df = raw_df.assign(
    balance_Label=raw_df['Label'].map(
        lambda label: 'DDoS' if label in attack_labels else 'Benign'
    )
)

Check the class balance (distribution) of the dataset labels

In [None]:
# Histogram of balance_Label, with one color per label value.
px.histogram(raw_df,x='balance_Label',color='balance_Label').show()

In [None]:
#@title
def handle_non_numerical_data(df):
    """Return a copy of ``df`` in which every non-numeric column is integer-encoded.

    Each distinct value of a non-numeric column is replaced by an integer code
    ``0..n-1``. Codes are assigned in sorted order of the distinct values, so
    the encoding is identical on every run. If the values cannot be compared
    with each other (mixed types), codes follow order of first appearance
    instead. Numeric columns of any width are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, where non-numeric columns hold
        integer codes.
    """
    # Work on a copy so the caller's frame keeps its original values.
    encoded_df = df.copy()
    for column in encoded_df.columns:
        if pd.api.types.is_numeric_dtype(encoded_df[column]):
            continue

        column_contents = encoded_df[column].values.tolist()
        # dict.fromkeys de-duplicates while preserving first-appearance order,
        # unlike set(), whose iteration order for strings changes between runs.
        unique_elements = list(dict.fromkeys(column_contents))
        try:
            unique_elements = sorted(unique_elements)
        except TypeError:
            # Values of mixed, non-comparable types: keep first-appearance order.
            pass

        text_digit_vals = {value: code for code, value in enumerate(unique_elements)}
        encoded_df[column] = [text_digit_vals[value] for value in column_contents]

    return encoded_df

Preprocessing of the Dataset for ML Model 

In [None]:
# Integer-encode the non-numeric columns, then inspect the resulting dtypes.
clean_df=handle_non_numerical_data(raw_df)
clean_df.info()
# Number of distinct values per column (last expression, so it is displayed).
clean_df.nunique(axis=0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151200 entries, 0 to 151199
Data columns (total 24 columns):
 #   Column           Non-Null Count   Dtype
---  ------           --------------   -----
 0   ip.src           151200 non-null  int64
 1   ip.dst           151200 non-null  int64
 2   tcp.srcport      151200 non-null  int64
 3   tcp.dstport      151200 non-null  int64
 4   ip.proto         151200 non-null  int64
 5   frame.len        151200 non-null  int64
 6   tcp.flags.syn    151200 non-null  int64
 7   tcp.flags.reset  151200 non-null  int64
 8   tcp.flags.push   151200 non-null  int64
 9   tcp.flags.ack    151200 non-null  int64
 10  ip.flags.mf      151200 non-null  int64
 11  ip.flags.df      151200 non-null  int64
 12  ip.flags.rb      151200 non-null  int64
 13  tcp.seq          151200 non-null  int64
 14  tcp.ack          151200 non-null  int64
 15  frame.time       151200 non-null  int64
 16  Packets          151200 non-null  int64
 17  Bytes            151200 non-n

ip.src                 14
ip.dst                  1
tcp.srcport         31009
tcp.dstport             1
ip.proto                1
frame.len               4
tcp.flags.syn           1
tcp.flags.reset         1
tcp.flags.push          2
tcp.flags.ack           1
ip.flags.mf             1
ip.flags.df             2
ip.flags.rb             1
tcp.seq                 1
tcp.ack                 1
frame.time         151200
Packets                19
Bytes                  29
Tx Packets             10
Tx Bytes               12
Rx Packets             11
Rx Bytes               21
Label                   3
balance_Label           2
dtype: int64

# Model Training
---


1. ML Models  
2. Performance Report  
3. Feature Importance


Splitting the data for training into 75/25 (training/test)

In [None]:
# Columns used as model inputs; the target is the binary balance_Label column.
feature_column = [
    'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'ip.proto',
    'tcp.flags.syn', 'tcp.flags.reset', 'tcp.flags.push', 'tcp.flags.ack',
    'ip.flags.mf', 'ip.flags.df', 'ip.flags.rb',
    'tcp.seq', 'tcp.ack', 'frame.time',
    'Packets', 'Bytes', 'Tx Packets', 'Tx Bytes', 'Rx Packets', 'Rx Bytes',
]
X = clean_df[feature_column]
Y = clean_df['balance_Label']
# Show the shape of the feature matrix, then of the target vector.
print(X.shape)
print(Y.shape)

(151200, 21)
(151200,)


In [None]:
# Hold out 25% of the rows for testing (a 75/25 split); random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=3) 

# Shapes of the training features and labels.
print(X_train.shape)
print(y_train.shape)

# Shapes of the test features and labels.
print(X_test.shape)
print(y_test.shape)

(113400, 21)
(113400,)
(37800, 21)
(37800,)


## **List of models**


### 1.   Decision Tree
Decision Trees are a type of Supervised Machine Learning (you explain what the input is and what the corresponding output is in the training data) in which the data is continuously split based on a specific parameter.
### 2.   KNN : k-nearest neighbors 
KNN is an abbreviation for "K-Nearest Neighbour." It is a machine learning algorithm that is supervised. The algorithm can solve classification and regression problem statements. The symbol 'K' represents the number of nearest neighbors to a new unknown variable that must be predicted or classified.
### 3. Random Forest
Random forests, also known as random decision forests, are an ensemble learning method for classification, regression, and other tasks that works by constructing a large number of decision trees during training. For classification tasks, the random forest output is the class chosen by the majority of trees.
### 4. SVM : Support vector machine
Support-vector machines are supervised learning models that analyze data for classification and regression analysis using learning algorithms.
### 5. Neural Network
Neural networks, also known as artificial neural networks (ANNs) or simulated neural networks (SNNs), are a subset of machine learning that form the foundation of deep learning algorithms. Their name and structure are inspired by the human brain, and they mimic the way biological neurons communicate with one another.
### 6. GBT : Gradient boosting tree (*Advanced ML Model*)
Gradient boosting is a machine learning technique that is commonly used in regression and classification tasks. It returns a prediction model in the form of an ensemble of weak prediction models, usually decision trees.



Temporary lists to store all the required model data

In [None]:
master_models=[] # entries: [model_name, estimator]
master_preduction=[] # entries: [model_name, predictions for X_test]
master_model_result=[] # entries: [model_name, Accuracy, Precision, Detection rate, False positive rate]

In [None]:
k=3 # Hyperparameter: number of neighbours used by KNeighborsClassifier
# Empty the registry first so re-running this cell does not register duplicates.
master_models.clear()
# Entries are added at the end so the models keep the order listed here;
# insert(-1, ...) would place each new entry before the current last element.
# random_state is fixed on every stochastic estimator so results are repeatable.
master_models.extend([
    ["Decision Tree",DecisionTreeClassifier(random_state=0)],
    ["KNeighbors",KNeighborsClassifier(n_neighbors=k)],
    ["RandomForest",RandomForestClassifier(random_state=0)],
    ["Support vector machine",svm.SVC(kernel='linear')],
    ["Neural Network",MLPClassifier(hidden_layer_sizes=(21,21,21), activation='relu', solver='adam', max_iter=500, random_state=0)],
    ["Gradient boosting",GradientBoostingClassifier(random_state=0)],
])

Model Building

In [None]:
# Empty the prediction list first so re-running this cell does not duplicate entries.
master_preduction.clear()
for name,model in master_models:
    print("Building model : ",name)
    # Fit on the training split, then store the predictions for the test split.
    model.fit(X_train,y_train)
    # append() keeps predictions in the same order as master_models;
    # insert(-1, ...) would place each entry before the current last element.
    master_preduction.append([name,model.predict(X_test)])

Building model :  KNeighbors
Building model :  RandomForest
Building model :  Support vector machine
Building model :  Neural Network
Building model :  Gradient boosting
Building model :  Decision Tree


## ML Model Result 

In [None]:
def detection_rate_score(cm):
    """Return the detection rate (recall / true-positive rate): tp / (tp + fn).

    ``cm`` is a 2x2 confusion matrix laid out the way
    ``sklearn.metrics.confusion_matrix`` returns it: rows are the true classes
    and columns are the predicted classes. Class 1 is treated as the positive
    class, matching the default ``pos_label`` of ``precision_score``, so
    ``cm[1][1]`` is tp and ``cm[1][0]`` is fn.

    Returns 0.0 when the matrix contains no actual positives.
    """
    true_positive = cm[1][1]
    false_negative = cm[1][0]
    actual_positive = true_positive + false_negative
    if actual_positive == 0:
        return 0.0
    return true_positive / actual_positive
def false_positive_score(cm):
    """Return the false positive rate: fp / (fp + tn).

    ``cm`` is a 2x2 confusion matrix laid out the way
    ``sklearn.metrics.confusion_matrix`` returns it: rows are the true classes
    and columns are the predicted classes. Class 1 is treated as the positive
    class, so ``cm[0][1]`` is fp and ``cm[0][0]`` is tn.

    Returns 0.0 when the matrix contains no actual negatives.
    """
    false_positive = cm[0][1]
    true_negative = cm[0][0]
    actual_negative = false_positive + true_negative
    if actual_negative == 0:
        return 0.0
    return false_positive / actual_negative

In [None]:
# Empty the result list first so re-running this cell does not duplicate rows.
master_model_result.clear()
for name,predict in master_preduction:
    tmp_cm = confusion_matrix(y_test,predict)
    # append() keeps rows in the same order as master_preduction;
    # insert(-1, ...) would place each row before the current last element.
    master_model_result.append([
        name,
        accuracy_score(y_test,predict),
        precision_score(y_test,predict),
        detection_rate_score(tmp_cm),  # Detection rate
        false_positive_score(tmp_cm),  # False positive rate
    ])

Model Comparison DataFrame

In [None]:
# One row per model; columns follow the order in which the metrics were stored.
result_df = pd.DataFrame(master_model_result,columns=["model_name","Accuracy", "Precision", "Detection  rate", "False positive rate"])
display(result_df)

Unnamed: 0,model_name,Accuracy,Precision,Detection rate,False positive rate
0,Support vector machine,1.0,1.0,1.0,0.0
1,Neural Network,1.0,1.0,1.0,0.0
2,Gradient boosting,1.0,1.0,1.0,0.0
3,Decision Tree,1.0,1.0,1.0,0.0
4,KNeighbors,1.0,1.0,1.0,0.0
5,RandomForest,1.0,1.0,1.0,0.0


### Model Feature Importance

In [None]:
# Select the model by name instead of relying on whatever the training loop's
# `model` variable happened to hold last.
importance_model = dict(master_models)["Decision Tree"]
importance = importance_model.feature_importances_
# summarize feature importance, pairing every score with its column name
for i,(feature_name,v) in enumerate(zip(feature_column,importance)):
    print('Feature: %0d (%s), Score: %.5f' % (i,feature_name,v))

Feature: 0, Score: 0.00000
Feature: 1, Score: 0.00000
Feature: 2, Score: 0.00000
Feature: 3, Score: 0.00000
Feature: 4, Score: 0.00000
Feature: 5, Score: 0.00000
Feature: 6, Score: 0.00000
Feature: 7, Score: 0.00000
Feature: 8, Score: 0.00000
Feature: 9, Score: 0.00000
Feature: 10, Score: 0.00000
Feature: 11, Score: 0.00000
Feature: 12, Score: 0.00000
Feature: 13, Score: 0.00000
Feature: 14, Score: 0.00000
Feature: 15, Score: 0.00000
Feature: 16, Score: 0.00000
Feature: 17, Score: 0.00000
Feature: 18, Score: 0.00000
Feature: 19, Score: 0.00000
Feature: 20, Score: 1.00000
