<a href="https://drive.google.com/drive/folders/176VLvhLVMz4-ZFKh1uHpCijZJfQWd-6w?usp=sharing">Link to the dataset</a>


<blockquote>
Utility function declaration
</blockquote>

In [1]:
def printSummary(data):
    """Print a per-column summary table of a pandas DataFrame.

    The first two lines report the number of rows ("objects") and columns
    ("attributes"). They are followed by one table row per column holding the
    column position, its label, the number of missing values and the number
    of infinite values (both +inf and -inf are counted).

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # Local import so the helper does not depend on a later cell having
    # imported numpy before it is called.
    import numpy as np

    rule = '-------------------------------------------------------------------'
    print('Number of objects = {}'.format(data.shape[0]))
    print('Number of attributes = {}'.format(data.shape[1]))
    print('   | Column                      | Missing values | Infinity values')
    print(rule)
    for position, col_label in enumerate(data.columns):
        # Select by position: label-based selection returns a DataFrame when
        # a label is duplicated, which would break the integer formatting.
        column = data.iloc[:, position]
        missing = int(column.isnull().sum())
        infinite = int(column.isin([np.inf, -np.inf]).sum())
        # str() keeps the 's' format spec valid for non-string labels.
        print('{0:2d} | {1:27s} | {2:14d} | {3:15d}'.format(position, str(col_label), missing, infinite))
    print(rule)

# 1. Data cleaning 

In [2]:
import pandas as pd
import numpy as np
import warnings
# Silence FutureWarning for the rest of the session; the original author
# attributes the warning to using the first CSV column as the index.
warnings.simplefilter('ignore', FutureWarning)
# Load the flows (first row is the header, first column becomes the index) and
# turn the textual 'inf' markers present in some columns into float infinity.
data = pd.read_csv('cic2017-ddos-data.csv', header=0, index_col=0).replace('inf', np.inf)

print('Dataset before cleaning:')
printSummary(data)
print('Label column information:')
print(data['Label'].describe())

Dataset before cleaning:
Number of objects = 755663
Number of attributes = 85
   | Column                      | Missing values | Infinity values
-------------------------------------------------------------------
 0 | Flow ID                     |              0 |               0
 1 | Source IP                   |              0 |               0
 2 | Source Port                 |              0 |               0
 3 | Destination IP              |              0 |               0
 4 | Destination Port            |              0 |               0
 5 | Protocol                    |              0 |               0
 6 | Timestamp                   |              0 |               0
 7 | Flow Duration               |              0 |               0
 8 | Total Fwd Packets           |              0 |               0
 9 | Total Backward Packets      |              0 |               0
10 | Total Length of Fwd Packets |              0 |               0
11 | Total Length of Bwd Packets |    

In [3]:
# Treat missing values like infinities so both are imputed in the same pass.
data = data.replace(np.nan, np.inf)
# Impute the two rate columns with the median of their finite values.
# - The median is taken with the infinities masked out, so the placeholder
#   values do not bias it.
# - The result is assigned back to the column instead of calling
#   replace(..., inplace=True) on a column selection, which does not update
#   the parent frame under pandas Copy-on-Write.
for rate_col in ('Flow Bytes/s', 'Flow Packets/s'):
    finite_median = data[rate_col].replace(np.inf, np.nan).median()
    data[rate_col] = data[rate_col].replace(np.inf, finite_median)

print('Dataset after cleaning:')
printSummary(data)

Dataset after cleaning:
Number of objects = 755663
Number of attributes = 85
   | Column                      | Missing values | Infinity values
-------------------------------------------------------------------
 0 | Flow ID                     |              0 |               0
 1 | Source IP                   |              0 |               0
 2 | Source Port                 |              0 |               0
 3 | Destination IP              |              0 |               0
 4 | Destination Port            |              0 |               0
 5 | Protocol                    |              0 |               0
 6 | Timestamp                   |              0 |               0
 7 | Flow Duration               |              0 |               0
 8 | Total Fwd Packets           |              0 |               0
 9 | Total Backward Packets      |              0 |               0
10 | Total Length of Fwd Packets |              0 |               0
11 | Total Length of Bwd Packets |     

# 2. Data preprocessing

In [4]:
# Count the rows in which the "- dupl" column carries the same value as the
# original column before removing it.
print('Number of matching values of two columns \"Fwd Header Length\" and \"Fwd Header Length - dupl\": {}'.format((data['Fwd Header Length'] == data['Fwd Header Length - dupl']).sum()))
# Remove the redundant copy of "Fwd Header Length".
data = data.drop(columns=['Fwd Header Length - dupl'])
print('Dataset after dropping duplicated column: ')
print('Number of instances = {}'.format(len(data)))
print('Number of attributes = {}'.format(len(data.columns)))

Number of matching values of two columns "Fwd Header Length" and "Fwd Header Length - dupl": 755663
Dataset after dropping duplicated column: 
Number of instances = 755663
Number of attributes = 84


In [5]:
from sklearn.decomposition import PCA

numInstances = data.shape[0]
numComponents = 10
# PCA is fitted on, and applied to, the columns from 'Flow Duration' through
# 'Idle Min' (label-based slice, both ends inclusive).
features = data.loc[:, 'Flow Duration':'Idle Min']
pca = PCA(n_components=numComponents)
pca.fit(features)

# Component labels are derived from numComponents so the two cannot drift
# apart. The frame reuses data's index so that a later column-wise concat with
# `data` aligns row-for-row even when that index is not 0..n-1.
projected = pd.DataFrame(
    pca.transform(features),
    columns=['PC{}'.format(k) for k in range(1, numComponents + 1)],
    index=data.index,
)
projected.head(20)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,-24086980.0,535254.149989,1453473.0,902919.373947,-355285.29154,171524.876162,-327207.474978,-4822.703278,-770.486758,-124251.621008
1,-24180450.0,590354.229422,10460090.0,855603.211236,-344858.757347,166728.122665,-320211.333373,-6069.926378,-1159.471051,-123116.748762
2,-24180450.0,590354.229422,10460090.0,855603.211236,-344858.757347,166728.122665,-320211.333373,-6069.926378,-1159.471051,-123116.748762
3,-24180450.0,590354.229422,10460090.0,855603.211236,-344858.757347,166728.122665,-320211.333373,-6069.926378,-1159.471051,-123116.748762
4,-24097370.0,541376.13044,2454209.0,897662.006552,-354127.868235,170991.814487,-326431.142567,-4961.288699,-813.707034,-124125.444506
5,-24180450.0,590354.237739,10460090.0,855603.21216,-344858.761112,166728.129348,-320211.360792,-6069.92628,-1159.470533,-123116.732972
6,-24180450.0,590354.237739,10460090.0,855603.21216,-344858.761112,166728.129348,-320211.360792,-6069.92628,-1159.470533,-123116.732972
7,-24180450.0,590354.237739,10460090.0,855603.21216,-344858.761112,166728.129348,-320211.360792,-6069.92628,-1159.470533,-123116.732972
8,-24069810.0,525823.842929,-74251.98,910603.31679,-357069.962224,172771.801627,-328173.231498,-3890.344035,-922.235421,-124387.557966
9,-24096010.0,547006.832824,2682668.0,888217.027011,-355215.754222,175465.32502,-331051.054511,2589.154412,-2147.367232,-138906.342783


In [6]:
from sklearn.model_selection import train_test_split

# Assemble the reduced dataset: the first seven columns of `data`, the
# principal components, and the class label. The label is selected by name
# rather than by position 83, which is only correct after exactly one column
# has been dropped from the original frame.
new = pd.concat([data.iloc[:, 0:7], projected, data.loc[:, 'Label']], axis=1)
print('Dataset after performing PCA:')
printSummary(new)

# Separate the target column from the predictors.
y = new['Label']
x = new.drop(columns=['Label'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2)
print('Number of objects in training set: {}'.format(x_train.shape[0]))
print('Number of objects in testing set:  {}'.format(x_test.shape[0]))

Dataset after performing PCA:
Number of objects = 755663
Number of attributes = 18
   | Column                      | Missing values | Infinity values
-------------------------------------------------------------------
 0 | Flow ID                     |              0 |               0
 1 | Source IP                   |              0 |               0
 2 | Source Port                 |              0 |               0
 3 | Destination IP              |              0 |               0
 4 | Destination Port            |              0 |               0
 5 | Protocol                    |              0 |               0
 6 | Timestamp                   |              0 |               0
 7 | PC1                         |              0 |               0
 8 | PC2                         |              0 |               0
 9 | PC3                         |              0 |               0
10 | PC4                         |              0 |               0
11 | PC5                         