In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np 
import os
import time
import pandas as pd 
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
DATASET_PATH='/content/drive/MyDrive/Colab Notebooks/preprocesd_cicids.csv'
print(DATASET_PATH)

/content/drive/MyDrive/Colab Notebooks/preprocesd_cicids.csv


In [4]:
# ,nrows=10000, header=0
start = time.time()
df=pd.read_csv(DATASET_PATH)
df.head()
print("Time taken to load the data: ", time.time()-start," seconds")

Time taken to load the data:  36.07214117050171  seconds


In [6]:
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [5]:
df.shape

(2520798, 79)

In [7]:
def correlation(dataset, threshold):
    col_corr = set()  
    corr_matrix = dataset.corr(numeric_only=True)
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: 
              colname = corr_matrix.columns[i]                  
              col_corr.add(colname)
    return col_corr

In [8]:
corr_features = correlation(df, 0.85)
corr_features

{' Active Min',
 ' Average Packet Size',
 ' Avg Bwd Segment Size',
 ' Avg Fwd Segment Size',
 ' Bwd IAT Min',
 ' Bwd Packet Length Mean',
 ' Bwd Packet Length Std',
 ' CWE Flag Count',
 ' ECE Flag Count',
 ' Flow IAT Max',
 ' Fwd Header Length.1',
 ' Fwd IAT Max',
 ' Fwd IAT Mean',
 ' Fwd IAT Min',
 ' Fwd IAT Std',
 ' Fwd Packet Length Mean',
 ' Fwd Packet Length Std',
 ' Idle Max',
 ' Idle Min',
 ' Max Packet Length',
 ' Packet Length Mean',
 ' Packet Length Std',
 ' Packet Length Variance',
 ' SYN Flag Count',
 ' Subflow Bwd Bytes',
 ' Subflow Bwd Packets',
 ' Subflow Fwd Bytes',
 ' Total Backward Packets',
 ' Total Length of Bwd Packets',
 ' act_data_pkt_fwd',
 ' min_seg_size_forward',
 'Fwd IAT Total',
 'Fwd Packets/s',
 'Idle Mean',
 'Subflow Fwd Packets'}

In [9]:
df.drop(corr_features,axis=1,inplace=True)

In [10]:
df.shape

(2520798, 44)

In [11]:
# create a Series with the count of rows in each group
label_counts = df[' Label'].value_counts()

# create a list of labels that have less than 10,000 rows
labels_to_merge = label_counts[label_counts < 10000].index.tolist()
print(labels_to_merge)

['FTP-Patator', 'DoS slowloris', 'DoS Slowhttptest', 'SSH-Patator', 'Bot', 'Web Attack � Brute Force', 'Web Attack � XSS', 'Infiltration', 'Web Attack � Sql Injection', 'Heartbleed']


In [12]:
# create a new label called 'Other' and merge the labels with less than 10,000 rows
df[' Label'] = df[' Label'].apply(lambda x: 'Other' if x in labels_to_merge else x)
# group the rows by the new 'Label' column
grouped_df = df.groupby(' Label')

In [13]:
df.shape

(2520798, 44)

In [14]:
df[' Label'].value_counts()

BENIGN           2095057
DoS Hulk          172846
DDoS              128014
PortScan           90694
Other              23901
DoS GoldenEye      10286
Name:  Label, dtype: int64

In [15]:
x = df.drop([' Label'],axis=1)
y = df[' Label']

In [16]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
rus.fit(x, y)
Xn, yn = rus.fit_resample(x, y)
# Xn.value_counts()

In [17]:
Xn.shape

(61716, 43)

In [18]:
yn.shape

(61716,)

In [19]:
yn.value_counts()

BENIGN           10286
DDoS             10286
DoS GoldenEye    10286
DoS Hulk         10286
Other            10286
PortScan         10286
Name:  Label, dtype: int64

In [20]:
#z-score   z = (x - mean) / std   
# it can make it easier for the algorithm to learn meaningful patterns in the data
cols = list(Xn.columns)
for col in cols:
    Xn[col] = stats.zscore(Xn[col])

In [21]:
Xn.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Length of Fwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Bwd Packet Length Max,Bwd Packet Length Min,Flow Bytes/s,Flow Packets/s,...,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Init_Win_bytes_forward,Init_Win_bytes_backward,Active Mean,Active Std,Active Max,Idle Std
0,-0.297014,-0.679777,-0.04494,-0.04085,-0.31799,0.490016,-0.713935,2.190765,-0.038518,-0.190288,...,,,,,-0.848314,-0.194997,-0.227431,-0.131538,-0.233081,-0.200218
1,4.852138,-0.681388,-0.109562,-0.054611,-0.379522,-0.126985,-0.741997,-0.254225,-0.039055,0.008263,...,,,,,-0.784403,-0.153281,-0.227431,-0.131538,-0.233081,-0.200218
2,-0.294582,2.295346,0.213547,-0.002414,0.37584,-0.126985,-0.639404,-0.254225,-0.039054,-0.191104,...,,,,,-0.270364,0.664176,-0.189442,-0.085769,-0.139767,-0.19822
3,-0.261889,-0.535558,0.041222,-0.007515,0.078787,-0.126985,-0.30145,-0.254225,-0.038974,-0.191085,...,,,,,-0.270364,-0.173232,-0.00706,-0.131538,-0.054383,-0.200218
4,-0.297014,-0.680783,-0.109562,-0.048917,-0.277675,0.894258,-0.703374,3.110923,-0.038223,-0.190381,...,,,,,-0.848314,-0.194997,-0.227431,-0.131538,-0.233081,-0.200218


In [28]:
from sklearn.model_selection import  train_test_split
from sklearn.preprocessing import LabelEncoder

# Convert the labels to integers
label_encoder = LabelEncoder()
yn = label_encoder.fit_transform(yn)

#spliting the data set

X_train, X_test, Y_train, Y_test = train_test_split(Xn,yn,test_size=0.20,random_state=0)

In [29]:
print(np.any(np.isnan(X_train)))
print(np.all(np.isfinite(X_train)))

True
False


In [30]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Create an instance of SimpleImputer with 'mean' strategy to replace NaN values
imputer = SimpleImputer(strategy='mean')

# Fit the imputer to X_train and transform X_train and X_test with it
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Apply StandardScaler to X_train and X_test
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [31]:
from keras.models import Sequential
from keras.layers import Dense
#training our model using Neural network classifire

# Define the model architecture
start = time.time()
model = Sequential()
model.add(Dense(units=64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=6, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, Y_train, epochs=50, batch_size=32)

print("Time taken to train model: ", time.time()-start," seconds")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Time taken to train model:  203.39413022994995  seconds
