# Preprocessing the CICIDS2017 Dataset

In [3]:
import pandas as pd
import numpy as np
import glob

### Merging all daily CSV files into one DataFrame

In [4]:
# Collect every CSV under the raw-data folder. glob returns paths in an
# order that depends on the file system, so sort them to keep the merged
# row order (and therefore the row index) reproducible across machines.
day_files = sorted(glob.glob("../data/raw/*.csv"))
if not day_files:
    # Without this check pd.concat fails with "No objects to concatenate",
    # which hides the real cause (wrong working directory / missing data).
    raise FileNotFoundError("No CSV files found matching ../data/raw/*.csv")

# Read every file and stack the frames; ignore_index=True discards the
# per-file row labels and assigns one continuous index to the result.
df = pd.concat((pd.read_csv(f) for f in day_files), ignore_index=True)

In [5]:
# Sanity check on the merged frame: print (rows, columns), then preview
# the first rows as the cell's displayed value.
print("Dimensions:", df.shape)
df.head()

Dimesnsions: (2830743, 79)


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,166,1,1,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,60148,83,1,2,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,123,99947,1,1,48,48,48,48,48.0,0.0,...,40,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,123,37017,1,1,48,48,48,48,48.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,0,111161336,147,0,0,0,0,0,0.0,0.0,...,0,1753752.625,2123197.578,4822992,95,9463032.7,2657727.996,13600000,5700287,BENIGN


### Dropping Columns

In [6]:
# Display the raw column labels exactly as read from the CSV headers, so
# any stray whitespace in the names is visible before cleaning them.
df.columns

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

In [7]:
# Normalise the header: remove leading/trailing whitespace from each label.
df.columns = [column_name.strip() for column_name in df.columns]

In [8]:
# Columns to remove from the frame. Per the original note they were already
# dropped in this dataset; errors='ignore' makes drop() skip any label that
# is not present instead of raising.
to_drop = [
    'Flow ID',
    'Source IP',
    'Destination IP',
    'Timestamp',
]
df = df.drop(labels=to_drop, axis='columns', errors='ignore')

### Binary Label Mapping

In [9]:
# Collapse the labels to binary: 0 for 'BENIGN', 1 for every other value.
# The dtype guard makes the cell safe to re-run: once the column holds
# integers, comparing it with 'BENIGN' again would be True for every row
# and silently relabel the whole dataset as 1.
if not pd.api.types.is_integer_dtype(df['Label']):
    df['Label'] = (df['Label'] != 'BENIGN').astype(int)

In [10]:
# Show how many rows carry each distinct value of the Label column.
df['Label'].value_counts()

Label
0    2273097
1     557646
Name: count, dtype: int64

### Handling Infinity and NaN

In [11]:
# Element-wise infinity test over the whole frame, reduced to a single
# count of cells that are +inf or -inf.
inf_count = np.isinf(df).to_numpy().sum()
print(f"Total count of infinite values:{inf_count}")

Total count of infinite values:4376


In [12]:
print("Replacing all Inf's with NaN")
# Turn +inf / -inf into NaN so that a single dropna() can remove both kinds
# of unusable value. The result is rebound to df rather than mutated with
# inplace=True, so the cell reads as an explicit "new df from old df" step.
df = df.replace([np.inf, -np.inf], np.nan)

Replacing all Inf's with NaN


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,166,1,1,0,0,0,0,0.0,0.0,...,32,0.000,0.000,0,0,0.0,0.000,0,0,0
1,60148,83,1,2,0,0,0,0,0.0,0.0,...,32,0.000,0.000,0,0,0.0,0.000,0,0,0
2,123,99947,1,1,48,48,48,48,48.0,0.0,...,40,0.000,0.000,0,0,0.0,0.000,0,0,0
3,123,37017,1,1,48,48,48,48,48.0,0.0,...,32,0.000,0.000,0,0,0.0,0.000,0,0,0
4,0,111161336,147,0,0,0,0,0,0.0,0.0,...,0,1753752.625,2123197.578,4822992,95,9463032.7,2657727.996,13600000,5700287,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2830738,53,61452,4,2,180,354,45,45,45.0,0.0,...,20,0.000,0.000,0,0,0.0,0.000,0,0,0
2830739,53,171,2,2,80,272,40,40,40.0,0.0,...,32,0.000,0.000,0,0,0.0,0.000,0,0,0
2830740,53,222,2,2,90,354,45,45,45.0,0.0,...,32,0.000,0.000,0,0,0.0,0.000,0,0,0
2830741,123,16842,1,1,48,48,48,48,48.0,0.0,...,20,0.000,0.000,0,0,0.0,0.000,0,0,0


In [12]:
# Remove every row that contains at least one NaN and report how many
# rows were lost. The result is rebound to df instead of using
# inplace=True; the surviving rows keep their original index labels.
before = len(df)
df = df.dropna()
after = len(df)
print(f"Dropped {before - after:,} rows")

Dropped 2,867 rows


### Clipping Extreme Outliers

### Handling Class Imbalance