In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import numpy as np

# The Dataset
IDS 2018 Intrusion CSVs (CSE-CIC-IDS2018): Shows DDoS Attacks of Various Formats from the University of New Brunswick

Download the data [here](https://www.kaggle.com/datasets/solarmainframe/ids-intrusion-csv).

The dataset is labeled: Benign, FTP-BruteForce, SSH-Bruteforce.

In [30]:
file_path = '/content/02-14-2018.csv' #select your file path here
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320859.5,139.300036,56320958,56320761,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign


The dataset has some infinite values that we need to deal with before we pass the data to ML model. In this case we will convert them to NaN and drop in the next step

In [31]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)

Drop rows with NaN

In [32]:
rows_with_nan = (df.isnull().any(axis=1)).sum()

print(f"Number of rows with NaN values: {rows_with_nan}")
print(f"Total number of rows: {len(df)}")

df = df.dropna()
print(f"Number of rows after dropping NaN values: {len(df)}")

Number of rows with NaN values: 3824
Total number of rows: 1048575
Number of rows after dropping NaN values: 1044751


Convert object columns to numeric type

In [33]:
print(df.dtypes)
df['Timestamp'] = pd.to_datetime(df['Timestamp'])  # Convert to datetime first
df['year'] = df['Timestamp'].dt.year
df['month'] = df['Timestamp'].dt.month
df['day'] = df['Timestamp'].dt.day
df['hour'] = df['Timestamp'].dt.hour
df['minute'] = df['Timestamp'].dt.minute
df['second'] = df['Timestamp'].dt.second
df = df.drop('Timestamp', axis=1)
print(df.head())

Dst Port           int64
Protocol           int64
Timestamp         object
Flow Duration      int64
Tot Fwd Pkts       int64
                  ...   
Idle Mean        float64
Idle Std         float64
Idle Max           int64
Idle Min           int64
Label             object
Length: 80, dtype: object


  df['Timestamp'] = pd.to_datetime(df['Timestamp'])  # Convert to datetime first


   Dst Port  Protocol  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  \
0         0         0      112641719             3             0   
1         0         0      112641466             3             0   
2         0         0      112638623             3             0   
3        22         6        6453966            15            10   
4        22         6        8804066            14            11   

   TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  \
0                0                0                0                0   
1                0                0                0                0   
2                0                0                0                0   
3             1239             2273              744                0   
4             1143             2209              744                0   

   Fwd Pkt Len Mean  ...    Idle Std  Idle Max  Idle Min   Label  year  month  \
0          0.000000  ...  139.300036  56320958  56320761  Benign  2018 

We want to detect if activity is Benign or Malicious, therefore replacing the FTP-BruteForce and  SSH-Bruteforce with Malicious

In [34]:
df.replace(to_replace=["FTP-BruteForce", "SSH-Bruteforce"], value="Malicious", inplace=True)
df["Label"].value_counts()

Label
Benign       663808
Malicious    380943
Name: count, dtype: int64

Split data into features (X) and target (Y).

Then split the data into train, validate and test sets after randomising the data.

In [35]:
# Split into features (X) and target (y)
X = df.drop('Label', axis=1)
y = df['Label']

# Train/Test/Validation Split
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)


Performance with Naive Bayes Model without Hyperparameter tuning

In [36]:
nb = GaussianNB()
nb.fit(X_train, y_train)


In [37]:
# Evaluate on Validation Set
y_val_pred = nb.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)

# Evaluate on Test Set
y_test_pred = nb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Validation Accuracy: 0.5959368269921034
Test Accuracy: 0.5949193830132423


Performance with Naive Bayes Model with Hyperparameter tuning

In [38]:
nb_classifier = GaussianNB()

params_NB = {'var_smoothing': [1e-11, 1e-10, 1e-9]}
gs_NB = GridSearchCV(estimator=nb_classifier,
                 param_grid=params_NB,
                 verbose=1,
                 scoring='accuracy')
gs_NB.fit(X_train, y_train)

gs_NB.best_params_

Fitting 5 folds for each of 3 candidates, totalling 15 fits


{'var_smoothing': 1e-11}

In [39]:
# Get the best parameter
best_var_smoothing = gs_NB.best_params_['var_smoothing']

# Create a new classifier with the best parameter
best_nb_classifier = GaussianNB(var_smoothing=best_var_smoothing)

best_nb_classifier.fit(X_train, y_train)

In [40]:
# Evaluate on Validation Set
y_val_pred = best_nb_classifier.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)

# Evaluate on Test Set
y_test_pred = best_nb_classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Validation Accuracy: 0.7420100502512563
Test Accuracy: 0.7421931457614465
