# Model training

In [54]:
## import 
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    precision_score,
    root_mean_squared_error,
)
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


import matplotlib.pyplot as plt 


loading dataframe

In [7]:
df = pd.read_csv('../data/processed/cleaned_cybersecurity_attacks_encoded.csv', index_col= 'Attack_Index')
df.head()


Unnamed: 0_level_0,Packet Length,Anomaly Scores,Attack Type,Severity Level,Protocol_ICMP,Protocol_TCP,Protocol_UDP,Packet Type_Control,Packet Type_Data,Traffic Type_DNS,...,Action Taken_Logged.1,Network Segment_Segment A.1,Network Segment_Segment B.1,Network Segment_Segment C.1,Firewall Logs_Log Data.1,Firewall Logs_No Log Data.1,IDS/IPS Alerts_Alert Data.1,IDS/IPS Alerts_No Alert Data.1,Log Source_Firewall.1,Log Source_Server.1
Attack_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,503,28.67,Malware,1,1,0,0,0,1,0,...,1,1,0,0,1,0,0,1,0,1
1,1174,51.5,Malware,1,1,0,0,0,1,0,...,0,0,1,0,1,0,0,1,1,0
2,306,87.42,DDoS,1,0,0,1,1,0,0,...,0,0,0,1,1,0,1,0,1,0
3,385,15.79,Malware,2,0,0,1,0,1,0,...,0,0,1,0,0,1,1,0,1,0
4,1462,0.52,DDoS,1,0,1,0,0,1,1,...,0,0,0,1,0,1,1,0,1,0


## Dataset Splitting

In [12]:
df_train, df_test = train_test_split(df,test_size = 0.2, random_state=42)
df_train.shape, df_test.shape

((32000, 56), (8000, 56))

let see the distribution 

In [13]:
print(df['Severity Level'].mean())
print(df_train['Severity Level'].mean())
print(df_test['Severity Level'].mean())

1.001325
0.9975625
1.016375


### Dividing the target column and feature columns

In [14]:
# list of columns features to be used for training
features = [
    'Packet Length', 
    'Anomaly Scores', 
    'Severity Level', 
    'Protocol_ICMP',
    'Protocol_TCP', 
    'Protocol_UDP', 
    'Packet Type_Control',
    'Packet Type_Data', 
    'Traffic Type_DNS', 
    'Traffic Type_FTP',
    'Traffic Type_HTTP', 
    'Malware Indicators_IoC Detected',
    'Malware Indicators_No IoC Detected', 
    'Alerts/Warnings_Alert Triggered',
    'Alerts/Warnings_No Alert Triggered', 
    'Attack Signature_Known Pattern A', 
    'Attack Signature_Known Pattern B',
    'Action Taken_Blocked', 
    'Action Taken_Ignored',
    'Action Taken_Logged',
    'Network Segment_Segment A', 
    'Network Segment_Segment B',
    'Network Segment_Segment C',
    'Firewall Logs_Log Data',
    'Firewall Logs_No Log Data',
    'IDS/IPS Alerts_Alert Data',
    'IDS/IPS Alerts_No Alert Data',
    'Log Source_Firewall',
    'Log Source_Server',
    'Protocol_ICMP',
    'Protocol_TCP',
    'Protocol_UDP',
    'Packet Type_Control',
    'Packet Type_Data',
    'Traffic Type_DNS',
    'Traffic Type_FTP',
    'Traffic Type_HTTP',
    'Malware Indicators_IoC Detected',
    'Malware Indicators_No IoC Detected',
    'Alerts/Warnings_Alert Triggered',
    'Alerts/Warnings_No Alert Triggered',
    'Attack Signature_Known Pattern A', 
    'Attack Signature_Known Pattern B',
    'Action Taken_Blocked', 
    'Action Taken_Ignored', 'Action Taken_Logged',
    'Network Segment_Segment A', 
    'Network Segment_Segment B',
    'Network Segment_Segment C', 
    'Firewall Logs_Log Data',
    'Firewall Logs_No Log Data', 
    'IDS/IPS Alerts_Alert Data',
    'IDS/IPS Alerts_No Alert Data',
    'Log Source_Firewall',
    'Log Source_Server'
]

target_column = 'attack type'

### get the values of the features columns for the training data


In [15]:
X_train = df_train.loc[:,features].values
y_train = df_train['Attack Type'].values

### get the values of the columns for the test data

In [16]:
x_test = df_test.loc[:,features].values
y_test = df_test['Attack Type'].values

# Testing models 

our problem is a classification problem , we then tried the following classification algorithms :

1) Logistic Regression
2) Decision tree classifier
3) Random Forest Classifier
4) XGboost
5) Naive Bayes
6) SGD classifier 

## Logistic regression 

In [17]:
lr_model = LogisticRegression(random_state=42,max_iter=1500)

In [18]:
# here we train the model on the training data
lr_model.fit(X=X_train, y=y_train)

In [22]:
cf = pd.DataFrame(
    columns=["y_test_DDoS","y_test_Intrusion","y_test_Malware"],index=["y_predicted_DDoS","y_predicted_Intrusion","y_predicted_Malware"]
)

cf.loc[:,:] = confusion_matrix(y_true= y_test,y_pred= y_test_predicted)
cf

Unnamed: 0,y_test_DDoS,y_test_Intrusion,y_test_Malware
y_predicted_DDoS,1126,625,885
y_predicted_Intrusion,1120,657,944
y_predicted_Malware,1033,643,967


In [23]:
cf/len(y_test)

Unnamed: 0,y_test_DDoS,y_test_Intrusion,y_test_Malware
y_predicted_DDoS,0.14075,0.078125,0.110625
y_predicted_Intrusion,0.14,0.082125,0.118
y_predicted_Malware,0.129125,0.080375,0.120875


In [24]:
report =classification_report(y_true=y_test, y_pred=y_test_predicted)
print(report)

              precision    recall  f1-score   support

        DDoS       0.34      0.43      0.38      2636
   Intrusion       0.34      0.24      0.28      2721
     Malware       0.35      0.37      0.36      2643

    accuracy                           0.34      8000
   macro avg       0.34      0.34      0.34      8000
weighted avg       0.34      0.34      0.34      8000



## Decision Tree Classifier

In [30]:
dt_model = DecisionTreeClassifier(random_state=42)

dt_model.fit(X=X_train,y=y_train)

y_test_predicted_dt = dt_model.predict(x_test)

cf_dt = pd.DataFrame(
    columns=["y_test_DDoS","y_test_Intrusion","y_test_Malware"],index=["y_predicted_DDoS","y_predicted_Intrusion","y_predicted_Malware"]
)
cf_dt.loc[:,:] = confusion_matrix(y_true= y_test,y_pred= y_test_predicted_dt)
cf_dt

Unnamed: 0,y_test_DDoS,y_test_Intrusion,y_test_Malware
y_predicted_DDoS,861,900,875
y_predicted_Intrusion,879,935,907
y_predicted_Malware,905,848,890


In [31]:
report_dt = classification_report(y_pred=y_test_predicted_dt,y_true=y_test)
print(report_dt)

              precision    recall  f1-score   support

        DDoS       0.33      0.33      0.33      2636
   Intrusion       0.35      0.34      0.35      2721
     Malware       0.33      0.34      0.33      2643

    accuracy                           0.34      8000
   macro avg       0.34      0.34      0.34      8000
weighted avg       0.34      0.34      0.34      8000



## Random forest classifier

In [32]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X=X_train,y=y_train)
y_test_predicted_rf = rf_model.predict(x_test)

In [33]:
## confusion matrix for Random Forest
cm = confusion_matrix(y_true=y_test,y_pred=y_test_predicted_rf)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[907 844 885]
 [935 873 913]
 [903 851 889]]


In [34]:
confusion_matrix_df = pd.DataFrame(cm, index=["DDoS", "Intrusion", "Malware"], columns=["DDoS", "Intrusion", "Malware"])
print(confusion_matrix_df)

           DDoS  Intrusion  Malware
DDoS        907        844      885
Intrusion   935        873      913
Malware     903        851      889


In [35]:
report_rf = classification_report(y_true=y_test, y_pred=y_test_predicted_rf)
print(report_rf)

              precision    recall  f1-score   support

        DDoS       0.33      0.34      0.34      2636
   Intrusion       0.34      0.32      0.33      2721
     Malware       0.33      0.34      0.33      2643

    accuracy                           0.33      8000
   macro avg       0.33      0.33      0.33      8000
weighted avg       0.33      0.33      0.33      8000



In [37]:
# Cross-Validation Accuracy
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)
print("Cross-Validation Accuracy: ", cv_scores.mean())  
print("Cross-Validation Standard Deviation: ", cv_scores.std())

Cross-Validation Accuracy:  0.33196875
Cross-Validation Standard Deviation:  0.008265964666933917


In [None]:
# check feature importance
feature_importances = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
print(feature_importances.sort_values(by='Importance', ascending=False))

## naives bayes 

In [None]:
model = GaussianNB()
model.fit(X=X_train,y=y_train)
y_test_predicted_gnb = model.predict(x_test)
cm_gnb = confusion_matrix(y_true=y_test,y_pred=y_test_predicted_gnb)
cm_gnb_df = pd.DataFrame(cm_gnb, index=["DDoS", "Intrusion", "Malware"], columns=["DDoS", "Intrusion", "Malware"])
print(cm_gnb_df)

           DDoS  Intrusion  Malware
DDoS        798        854      984
Intrusion   809        871     1041
Malware     734        872     1037


In [46]:
#report for GaussianNB
report_gnb = classification_report(y_true=y_test, y_pred=y_test_predicted_gnb)
print(report_gnb)

              precision    recall  f1-score   support

        DDoS       0.34      0.30      0.32      2636
   Intrusion       0.34      0.32      0.33      2721
     Malware       0.34      0.39      0.36      2643

    accuracy                           0.34      8000
   macro avg       0.34      0.34      0.34      8000
weighted avg       0.34      0.34      0.34      8000



In [47]:

# Cross-Validation Accuracy for GaussianNB  
cv_scores_gnb = cross_val_score(model, X_train, y_train, cv=5)
print("Cross-Validation Accuracy for GaussianNB: ", cv_scores_gnb.mean())

Cross-Validation Accuracy for GaussianNB:  0.3296875


## SGD classifier

In [52]:
clf =  GradientBoostingClassifier(loss="log_loss", random_state=42)
clf.fit(X=X_train,y=y_train)
y_test_predicted_gbc = clf.predict(x_test)


In [53]:
report_gbc = classification_report(y_true=y_test, y_pred=y_test_predicted_gbc)
print(report_gbc)   

              precision    recall  f1-score   support

        DDoS       0.34      0.42      0.37      2636
   Intrusion       0.34      0.25      0.29      2721
     Malware       0.33      0.34      0.34      2643

    accuracy                           0.34      8000
   macro avg       0.34      0.34      0.33      8000
weighted avg       0.34      0.34      0.33      8000



## Xgboost

In [42]:
# Encode target variable
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Train XGBoost Model
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
xgb_model.fit(X=X_train, y=y_train_encoded)
y_test_predicted_xgb = xgb_model.predict(x_test)

# Decode predictions back to original labels
y_test_predicted_xgb_decoded = label_encoder.inverse_transform(y_test_predicted_xgb)

# Confusion matrix
cm_xgb = confusion_matrix(y_true=y_test, y_pred=y_test_predicted_xgb_decoded)
print("Confusion Matrix:\n", cm_xgb)

Confusion Matrix:
 [[946 782 908]
 [979 818 924]
 [987 779 877]]


In [43]:
#report for XGBoost
report_xgb = classification_report(y_true=y_test, y_pred=y_test_predicted_xgb_decoded)
print(report_xgb)

              precision    recall  f1-score   support

        DDoS       0.32      0.36      0.34      2636
   Intrusion       0.34      0.30      0.32      2721
     Malware       0.32      0.33      0.33      2643

    accuracy                           0.33      8000
   macro avg       0.33      0.33      0.33      8000
weighted avg       0.33      0.33      0.33      8000



# model amelioration - Fine turning 