# *Required* python Library

In [24]:
import pandas as pd 
from google.colab import files
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics,svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np 

In [25]:
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#Please attach Kaggel API Json file for downloading the dataset

In [26]:
#@title
uploaded = files.upload()
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 67 bytes


#Downloading the Dataset from Kaggel

In [27]:
!kaggle datasets download -d preeti5607/ddos-attack-prevention
!unzip ddos-attack-prevention

ddos-attack-prevention.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  ddos-attack-prevention.zip
replace dataset_attack.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: dataset_attack.csv      
replace dataset_normal.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: dataset_normal.csv      


#Load the file and create a Pandas DataFrame

In [28]:
raw_df = pd.concat(map(pd.read_csv, ['dataset_attack.csv', 'dataset_normal.csv']))
raw_df.dataframeName = 'dataset.csv'
nRow, nCol = raw_df.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 5658998 rows and 29 columns


In [29]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5658998 entries, 1 to 1
Data columns (total 29 columns):
 #   Column            Dtype  
---  ------            -----  
 0   frame.encap_type  int64  
 1   frame.len         object 
 2   frame.protocols   int64  
 3   ip.hdr_len        int64  
 4   ip.len            int64  
 5   ip.flags.rb       int64  
 6   ip.flags.df       int64  
 7   p.flags.mf        int64  
 8   ip.frag_offset    int64  
 9   ip.ttl            int64  
 10  ip.proto          object 
 11  ip.src            object 
 12  ip.dst            int64  
 13  tcp.srcport       int64  
 14  tcp.dstport       int64  
 15  tcp.len           int64  
 16  tcp.ack           int64  
 17  tcp.flags.res     int64  
 18  tcp.flags.ns      int64  
 19  tcp.flags.cwr     int64  
 20  tcp.flags.ecn     int64  
 21  tcp.flags.urg     int64  
 22  tcp.flags.ack     int64  
 23  tcp.flags.push    int64  
 24  tcp.flags.reset   int64  
 25  tcp.flags.syn     int64  
 26  tcp.flags.fin     in

List of the Dataset classifiers of ML model Building

In [30]:
raw_df['tcp.time_delta'].unique()

array(['attack', 'normal'], dtype=object)

In [31]:
raw_df = raw_df.assign(
      result = lambda dataframe: dataframe['tcp.time_delta'].map(lambda val: 'ATTACK' if val == 'attack' else 'NORMAL') 
  )

#Balance Distribuation check of the dataset classifiers

In [32]:
px.histogram(raw_df,x='result',color='result').show()

#Preprocessing of the Dataset for ML Model 

In [36]:
#@title
def handle_non_numerical_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1

            df[column] = list(map(convert_to_int, df[column]))

    return df

In [37]:
clean_df=handle_non_numerical_data(raw_df)
clean_df.info()
clean_df.nunique(axis=0)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5658998 entries, 1 to 1
Data columns (total 30 columns):
 #   Column            Dtype  
---  ------            -----  
 0   frame.encap_type  int64  
 1   frame.len         int64  
 2   frame.protocols   int64  
 3   ip.hdr_len        int64  
 4   ip.len            int64  
 5   ip.flags.rb       int64  
 6   ip.flags.df       int64  
 7   p.flags.mf        int64  
 8   ip.frag_offset    int64  
 9   ip.ttl            int64  
 10  ip.proto          int64  
 11  ip.src            int64  
 12  ip.dst            int64  
 13  tcp.srcport       int64  
 14  tcp.dstport       int64  
 15  tcp.len           int64  
 16  tcp.ack           int64  
 17  tcp.flags.res     int64  
 18  tcp.flags.ns      int64  
 19  tcp.flags.cwr     int64  
 20  tcp.flags.ecn     int64  
 21  tcp.flags.urg     int64  
 22  tcp.flags.ack     int64  
 23  tcp.flags.push    int64  
 24  tcp.flags.reset   int64  
 25  tcp.flags.syn     int64  
 26  tcp.flags.fin     in

frame.encap_type      1455
frame.len              450
frame.protocols          3
ip.hdr_len            1463
ip.len                   1
ip.flags.rb              2
ip.flags.df              2
p.flags.mf               5
ip.frag_offset         113
ip.ttl                   5
ip.proto              6479
ip.src               10500
ip.dst               10897
tcp.srcport          11179
tcp.dstport           1461
tcp.len             851063
tcp.ack                  2
tcp.flags.res            2
tcp.flags.ns             2
tcp.flags.cwr            2
tcp.flags.ecn            1
tcp.flags.urg            2
tcp.flags.ack            2
tcp.flags.push           2
tcp.flags.reset          2
tcp.flags.syn            2
tcp.flags.fin        18953
tcp.window_size     450604
tcp.time_delta           2
result                   2
dtype: int64

In [38]:
feature_column = ['frame.encap_type','frame.len','frame.protocols','ip.hdr_len','ip.len','ip.flags.rb','ip.flags.df','p.flags.mf','ip.frag_offset','ip.ttl','ip.proto','ip.src','ip.dst','tcp.srcport','tcp.dstport','tcp.len','tcp.ack','tcp.flags.res','tcp.flags.ns','tcp.flags.cwr','tcp.flags.ecn','tcp.flags.urg','tcp.flags.ack','tcp.flags.push','tcp.flags.reset','tcp.flags.syn','tcp.flags.fin','tcp.window_size']
X = clean_df[feature_column] 
Y = clean_df['result']
print(X.shape)
print(Y.shape)

(5658998, 28)
(5658998,)


Spliting the data for the Traning in to 70-30 ratio for Traning and Testing

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=3) 

print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(4244248, 28)
(4244248,)
(1414750, 28)
(1414750,)


# **List of models**

**1. Decision Tree**:
Decision Trees are a type of Supervised Machine Learning (you explain what the input is and what the corresponding output is in the training data) in which the data is continuously split based on a specific parameter.

**2. KNN : k-nearest neighbors**:
KNN is an abbreviation for "K-Nearest Neighbour." It is a machine learning algorithm that is supervised. The algorithm can solve classification and regression problem statements. The symbol 'K' represents the number of nearest neighbors to a new unknown variable that must be predicted or classified.

**3. Random Forest**:
Random forests, also known as random decision forests, are an ensemble learning method for classification, regression, and other tasks that works by constructing a large number of decision trees during training. For classification tasks, the random forest output is the class chosen by the majority of trees.

**4. SVM : Support vector machine**:
Support-vector machines are supervised learning models that analyze data for classification and regression analysis using learning algorithms.

**5. Neural Network**:
Neural networks, also known as artificial neural networks (ANNs) or simulated neural networks (SNNs), are a subset of machine learning that form the foundation of deep learning algorithms. Their name and structure are inspired by the human brain, and they mimic the way biological neurons communicate with one another.

**6. GBT : Gradient boosting tree (*Advance ML Model*)**
Gradient boosting is a machine learning technique that is commonly used in regression and classification tasks. It returns a prediction model in the form of an ensemble of weak prediction models, usually decision trees.



Temp List to store all the requied Model Data

In [40]:
master_models=[] #[model_name,ml_model]
master_preduction=[] #[model_name,model_fit]
master_model_result=[] #[model_name,Accuracy, Precision, Detection  rate, False positive rate]

In [41]:
k=3 #Hyper paramaters 
master_models.insert(-1,["Decision Tree",DecisionTreeClassifier()])
master_models.insert(-1,["KNeighbors",KNeighborsClassifier(n_neighbors=k)])
master_models.insert(-1,["RandomForest",RandomForestClassifier()])
master_models.insert(-1,["Support vector machine",svm.SVC(kernel='linear')])
master_models.insert(-1,["Neural Network",MLPClassifier(hidden_layer_sizes=(21,21,21), activation='relu', solver='adam', max_iter=500)])
master_models.insert(-1,["Gradient boosting",GradientBoostingClassifier(random_state=0)])

MODELS Building

In [None]:
for name,model in master_models:
    print("Building model : ",name)
    model.fit(X_train,y_train)
    master_preduction.insert(-1,[name,model.predict(X_test)])

Building model :  KNeighbors


# ML Model Result 

In [None]:
def detection_rate_score(cm):
    # tp/(tp+fn)
    return cm[0][0]/(cm[0][0]+cm[1][0])
def false_positive_score(cm):
    # fp/(fp+tn)
    return cm[0][1]/(cm[0][1]+cm[1][1])

In [None]:
for name,predict in master_preduction:
    tmp_cm = confusion_matrix(y_test,predict)
    #Detection  rate
    #False positive rate
    master_model_result.insert(-1,[name,accuracy_score(y_test,predict),precision_score(y_test,predict),detection_rate_score(tmp_cm),false_positive_score(tmp_cm)])
    tmp_cm=0

Model Comparison DataFrame

In [None]:
result_df = pd.DataFrame(master_model_result,columns=["model_name","Accuracy", "Precision", "Detection  rate", "False positive rate"])
display(result_df)

Model Feature Importance

In [None]:
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))