# Tabular Prototype

## Problem Formulation

In [None]:
# Look for exact duplicate rows. DataFrame.duplicated() flags every row that
# repeats an earlier one, so the first occurrence of each group is not counted.
duplicated_rows = df2.loc[df2.duplicated()]
num_duplicated_rows = len(duplicated_rows)

print(f"Number of duplicated rows found: {num_duplicated_rows}")
if num_duplicated_rows <= 0:
    print("\nNo duplicated rows found in the DataFrame.")
else:
    print("\nDuplicated Rows:")
    display(duplicated_rows)

Number of duplicated rows found: 26387

Duplicated Rows:


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
10,0.000000e+00,6,0,4,0.000000,0.0,0.000002,0.0,0.000000e+00,0.000000,...,0.027027,0.016129,0.0,0.0,0.0,0.016949,0.016393,1.0,6,0
11,0.000000e+00,6,0,4,0.000000,0.0,0.000002,0.0,0.000000e+00,0.000000,...,0.027027,0.016129,0.0,0.0,0.0,0.016949,0.016393,1.0,6,0
39,9.999281e-01,6,0,4,0.000094,0.0,0.000005,0.0,1.666800e-08,0.000000,...,0.027027,0.016129,0.0,0.0,0.0,0.016949,0.016393,1.0,6,0
41,9.999281e-01,6,0,4,0.000094,0.0,0.000005,0.0,1.666800e-08,0.000000,...,0.027027,0.016129,0.0,0.0,0.0,0.016949,0.016393,1.0,6,0
53,2.474428e-01,111,0,5,0.001973,0.0,0.000067,0.0,1.414469e-06,0.996078,...,0.054054,0.209677,0.0,0.0,0.0,0.033898,0.229508,0.0,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82325,1.666667e-07,117,0,4,0.000094,0.0,0.000006,0.0,1.000000e-01,0.996078,...,0.000000,0.016129,0.0,0.0,0.0,0.000000,0.016393,0.0,6,0
82326,1.500000e-07,117,0,4,0.000094,0.0,0.000006,0.0,1.111111e-01,0.996078,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,6,0
82329,0.000000e+00,6,0,4,0.000000,0.0,0.000002,0.0,0.000000e+00,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,1.0,6,0
82330,0.000000e+00,6,0,4,0.000000,0.0,0.000002,0.0,0.000000e+00,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,1.0,6,0


I will keep the duplicated rows: dropping them might remove valuable information about repeated malicious activity and could worsen the class imbalance.

## Baseline Models

In [None]:
# Build a stratified 60/20/20 train / validation / test partition.
from sklearn.model_selection import train_test_split

# Features are everything except the binary target; 'attack_cat' is removed
# as well because it is not needed for binary classification.
X = df2.drop(['label', 'attack_cat'], axis=1)
y = df2['label']

# Step 1: hold out 20% of all rows as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Step 2: 25% of the remaining 80% becomes validation (20% of the full data),
# leaving 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.25,
    stratify=y_trainval,
    random_state=42,
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

# Baseline 1: Logistic Regression with default hyperparameters, trained on the
# full feature matrix and scored on the held-out test split.
# NOTE(review): X_val / y_val are not used in this cell; confirm whether model
# comparison is meant to happen on the validation split instead of the test split.
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Hard 0/1 predictions feed the threshold-dependent metrics
# (accuracy, precision, recall, F1, confusion matrix).
y_pred_test = model.predict(X_test)

# ROC AUC is a ranking metric and needs a continuous score. Passing the hard
# predictions would collapse the ROC curve to a single operating point, so the
# predicted probability of the positive class (column 1) is used instead.
y_score_test = model.predict_proba(X_test)[:, 1]

# Evaluate the model on the test set
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
roc_auc_test = roc_auc_score(y_test, y_score_test)
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
class_report_test = classification_report(y_test, y_pred_test)

print(f"Test Accuracy: {accuracy_test:.4f}")
print(f"Test Precision: {precision_test:.4f}")
print(f"Test Recall: {recall_test:.4f}")
print(f"Test F1-score: {f1_test:.4f}")
print(f"Test ROC AUC Score: {roc_auc_test:.4f}")
print("Test Confusion Matrix:")
display(conf_matrix_test)
print("\nClassification Report:")
print(class_report_test)

Test Accuracy: 0.8510
Test Precision: 0.8294
Test Recall: 0.9183
Test F1-score: 0.8716
Test ROC AUC Score: 0.8434
Test Confusion Matrix:


array([[5687, 1713],
       [ 741, 8326]])


Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.77      0.82      7400
           1       0.83      0.92      0.87      9067

    accuracy                           0.85     16467
   macro avg       0.86      0.84      0.85     16467
weighted avg       0.85      0.85      0.85     16467



In [None]:
from xgboost import XGBClassifier

# Baseline 2: XGBoost with default hyperparameters, trained on the full
# feature matrix and scored on the held-out test split.
xgb_model = XGBClassifier(random_state=42)

# Train the XGBoost model
xgb_model.fit(X_train, y_train)

# Hard 0/1 predictions feed the threshold-dependent metrics.
y_pred_test_xgb = xgb_model.predict(X_test)

# ROC AUC needs a continuous score rather than hard labels; use the predicted
# probability of the positive class (column 1) so the full ROC curve is
# integrated instead of a single operating point.
y_score_test_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Evaluate the model on the test set
accuracy_test_xgb = accuracy_score(y_test, y_pred_test_xgb)
precision_test_xgb = precision_score(y_test, y_pred_test_xgb)
recall_test_xgb = recall_score(y_test, y_pred_test_xgb)
f1_test_xgb = f1_score(y_test, y_pred_test_xgb)
roc_auc_test_xgb = roc_auc_score(y_test, y_score_test_xgb)
conf_matrix_test_xgb = confusion_matrix(y_test, y_pred_test_xgb)
class_report_test_xgb = classification_report(y_test, y_pred_test_xgb)

print(f"XGBoost Test Accuracy: {accuracy_test_xgb:.4f}")
print(f"XGBoost Test Precision: {precision_test_xgb:.4f}")
print(f"XGBoost Test Recall: {recall_test_xgb:.4f}")
print(f"XGBoost Test F1-score: {f1_test_xgb:.4f}")
print(f"XGBoost Test ROC AUC Score: {roc_auc_test_xgb:.4f}")
print("XGBoost Test Confusion Matrix:")
display(conf_matrix_test_xgb)
print("\nXGBoost Classification Report:")
print(class_report_test_xgb)

XGBoost Test Accuracy: 0.9769
XGBoost Test Precision: 0.9851
XGBoost Test Recall: 0.9728
XGBoost Test F1-score: 0.9789
XGBoost Test ROC AUC Score: 0.9774
XGBoost Test Confusion Matrix:


array([[7267,  133],
       [ 247, 8820]])


XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      7400
           1       0.99      0.97      0.98      9067

    accuracy                           0.98     16467
   macro avg       0.98      0.98      0.98     16467
weighted avg       0.98      0.98      0.98     16467



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline 3: Random Forest with default hyperparameters, trained on the full
# feature matrix and scored on the held-out test split.
rf_model = RandomForestClassifier(random_state=42)

# Train the Random Forest model
rf_model.fit(X_train, y_train)

# Hard 0/1 predictions feed the threshold-dependent metrics.
y_pred_test_rf = rf_model.predict(X_test)

# ROC AUC needs a continuous score rather than hard labels; use the predicted
# probability of the positive class (column 1) so the full ROC curve is
# integrated instead of a single operating point.
y_score_test_rf = rf_model.predict_proba(X_test)[:, 1]

# Evaluate the model on the test set
accuracy_test_rf = accuracy_score(y_test, y_pred_test_rf)
precision_test_rf = precision_score(y_test, y_pred_test_rf)
recall_test_rf = recall_score(y_test, y_pred_test_rf)
f1_test_rf = f1_score(y_test, y_pred_test_rf)
roc_auc_test_rf = roc_auc_score(y_test, y_score_test_rf)
conf_matrix_test_rf = confusion_matrix(y_test, y_pred_test_rf)
class_report_test_rf = classification_report(y_test, y_pred_test_rf)

print(f"Random Forest Test Accuracy: {accuracy_test_rf:.4f}")
print(f"Random Forest Test Precision: {precision_test_rf:.4f}")
print(f"Random Forest Test Recall: {recall_test_rf:.4f}")
print(f"Random Forest Test F1-score: {f1_test_rf:.4f}")
print(f"Random Forest Test ROC AUC Score: {roc_auc_test_rf:.4f}")
print("Random Forest Test Confusion Matrix:")
display(conf_matrix_test_rf)
print("\nRandom Forest Classification Report:")
print(class_report_test_rf)

Random Forest Test Accuracy: 0.9765
Random Forest Test Precision: 0.9834
Random Forest Test Recall: 0.9738
Random Forest Test F1-score: 0.9786
Random Forest Test ROC AUC Score: 0.9768
Random Forest Test Confusion Matrix:


array([[7251,  149],
       [ 238, 8829]])


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      7400
           1       0.98      0.97      0.98      9067

    accuracy                           0.98     16467
   macro avg       0.98      0.98      0.98     16467
weighted avg       0.98      0.98      0.98     16467



In [None]:
# Gather the baseline test-set metrics into one table, one row per model.
# NOTE(review): the labels below say "Engineered Features", but the engineered
# ratio columns are only created in the later Feature Engineering section —
# confirm the intended execution order / wording.
baseline_metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC']
baseline_scores = [
    (accuracy_test, precision_test, recall_test, f1_test, roc_auc_test),
    (accuracy_test_xgb, precision_test_xgb, recall_test_xgb, f1_test_xgb, roc_auc_test_xgb),
    (accuracy_test_rf, precision_test_rf, recall_test_rf, f1_test_rf, roc_auc_test_rf),
]

comparison_data = {
    'Model': [
        'Logistic Regression (Engineered Features)',
        'XGBoost (Engineered Features)',
        'Random Forest (Engineered Features)',
    ],
}
# Transpose the per-model tuples into per-metric columns.
for metric_name, metric_values in zip(baseline_metric_names, zip(*baseline_scores)):
    comparison_data[metric_name] = list(metric_values)

comparison_df = pd.DataFrame(comparison_data)

print("Model Performance Comparison with Engineered Features:")
display(comparison_df)

Model Performance Comparison:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,ROC AUC
0,Logistic Regression,0.850975,0.829365,0.918275,0.871559,0.843394
1,XGBoost,0.976924,0.985145,0.972758,0.978912,0.977393
2,Random Forest,0.976498,0.983404,0.973751,0.978554,0.976808



*   The XGBoost model achieved the highest Accuracy, Precision, F1-score, and ROC AUC on the test set; Random Forest had a marginally higher Recall (0.9738 vs. 0.9728).
*   The Random Forest model also performed well on the test set
*   The Logistic Regression model showed the lowest performance among the three models.

## Feature Selection

In [None]:
# Rank features by the absolute value of their correlation with the target.
# The correlations are estimated on the training split only: computing them on
# the whole of df2 would let validation/test rows influence which features get
# selected, leaking information into the later evaluation of those features.
# X_train already excludes 'attack_cat', so it no longer appears in the ranking.
train_with_label = X_train.assign(label=y_train)
correlation_matrix = train_with_label.corr()
label_correlation = correlation_matrix['label'].abs().sort_values(ascending=False)
print("Absolute Correlation with 'label':")
display(label_correlation)

Absolute Correlation with 'label':


Unnamed: 0,label
label,1.0
attack_cat,0.638825
sttl,0.504159
state,0.45904
swin,0.414504
ct_dst_sport_ltm,0.393668
dwin,0.369257
ct_src_dport_ltm,0.341513
rate,0.328629
ct_state_ttl,0.318517


In [None]:
# Minimum absolute correlation with 'label' for a feature to be kept
correlation_threshold = 0.1

# 'label' is the target itself and 'attack_cat' is dropped from the model
# inputs, so neither may end up in the selected feature list.
excluded_columns = {'label', 'attack_cat'}
selected_features = [
    column_name
    for column_name, strength in label_correlation.items()
    if strength > correlation_threshold and column_name not in excluded_columns
]

print(f"Features with absolute correlation > {correlation_threshold} with 'label':")
print(selected_features)

Features with absolute correlation > 0.1 with 'label':
['sttl', 'state', 'swin', 'ct_dst_sport_ltm', 'dwin', 'ct_src_dport_ltm', 'rate', 'ct_state_ttl', 'ct_srv_dst', 'ct_srv_src', 'dtcpb', 'stcpb', 'dload', 'ct_dst_src_ltm', 'ct_src_ltm', 'ct_dst_ltm', 'dmean', 'synack', 'tcprtt', 'service', 'sload', 'sinpkt', 'ackdat', 'is_sm_ips_ports']


In [None]:
# Restrict each split to the selected feature columns (rows are unchanged).
X_train_selected = X_train.loc[:, selected_features]
X_val_selected = X_val.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

# Report the resulting dimensions of every split.
for split_name, split_frame in (
    ("X_train_selected", X_train_selected),
    ("X_val_selected", X_val_selected),
    ("X_test_selected", X_test_selected),
):
    print(f"Shape of {split_name}:", split_frame.shape)

Shape of X_train_selected: (49398, 24)
Shape of X_val_selected: (16467, 24)
Shape of X_test_selected: (16467, 24)


In [None]:
# Retrain Logistic Regression (default hyperparameters) on the selected
# feature subset and score it on the held-out test split.
model_lr_selected = LogisticRegression(random_state=42)
model_lr_selected.fit(X_train_selected, y_train)

# Hard 0/1 predictions feed the threshold-dependent metrics.
y_pred_test_lr_selected = model_lr_selected.predict(X_test_selected)

# ROC AUC needs a continuous score rather than hard labels; use the predicted
# probability of the positive class (column 1) so the full ROC curve is
# integrated instead of a single operating point.
y_score_test_lr_selected = model_lr_selected.predict_proba(X_test_selected)[:, 1]

# Evaluate the model on the test set
accuracy_test_lr_selected = accuracy_score(y_test, y_pred_test_lr_selected)
precision_test_lr_selected = precision_score(y_test, y_pred_test_lr_selected)
recall_test_lr_selected = recall_score(y_test, y_pred_test_lr_selected)
f1_test_lr_selected = f1_score(y_test, y_pred_test_lr_selected)
roc_auc_test_lr_selected = roc_auc_score(y_test, y_score_test_lr_selected)
conf_matrix_test_lr_selected = confusion_matrix(y_test, y_pred_test_lr_selected)
class_report_test_lr_selected = classification_report(y_test, y_pred_test_lr_selected)

print("Logistic Regression Model Performance with Selected Features:")
print(f"Test Accuracy: {accuracy_test_lr_selected:.4f}")
print(f"Test Precision: {precision_test_lr_selected:.4f}")
print(f"Test Recall: {recall_test_lr_selected:.4f}")
print(f"Test F1-score: {f1_test_lr_selected:.4f}")
print(f"Test ROC AUC Score: {roc_auc_test_lr_selected:.4f}")
print("Test Confusion Matrix:")
display(conf_matrix_test_lr_selected)
print("\nClassification Report:")
print(class_report_test_lr_selected)

Logistic Regression Model Performance with Selected Features:
Test Accuracy: 0.8720
Test Precision: 0.9048
Test Recall: 0.8577
Test F1-score: 0.8806
Test ROC AUC Score: 0.8736
Test Confusion Matrix:


array([[6582,  818],
       [1290, 7777]])


Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      7400
           1       0.90      0.86      0.88      9067

    accuracy                           0.87     16467
   macro avg       0.87      0.87      0.87     16467
weighted avg       0.87      0.87      0.87     16467



In [None]:
# Retrain XGBoost (default hyperparameters) on the selected feature subset
# and score it on the held-out test split.
xgb_model_selected = XGBClassifier(random_state=42)

# Train the XGBoost model with selected features
xgb_model_selected.fit(X_train_selected, y_train)

# Hard 0/1 predictions feed the threshold-dependent metrics.
y_pred_test_xgb_selected = xgb_model_selected.predict(X_test_selected)

# ROC AUC needs a continuous score rather than hard labels; use the predicted
# probability of the positive class (column 1) so the full ROC curve is
# integrated instead of a single operating point.
y_score_test_xgb_selected = xgb_model_selected.predict_proba(X_test_selected)[:, 1]

# Evaluate the model on the test set
accuracy_test_xgb_selected = accuracy_score(y_test, y_pred_test_xgb_selected)
precision_test_xgb_selected = precision_score(y_test, y_pred_test_xgb_selected)
recall_test_xgb_selected = recall_score(y_test, y_pred_test_xgb_selected)
f1_test_xgb_selected = f1_score(y_test, y_pred_test_xgb_selected)
roc_auc_test_xgb_selected = roc_auc_score(y_test, y_score_test_xgb_selected)
conf_matrix_test_xgb_selected = confusion_matrix(y_test, y_pred_test_xgb_selected)
class_report_test_xgb_selected = classification_report(y_test, y_pred_test_xgb_selected)

print("XGBoost Model Performance with Selected Features:")
print(f"Test Accuracy: {accuracy_test_xgb_selected:.4f}")
print(f"Test Precision: {precision_test_xgb_selected:.4f}")
print(f"Test Recall: {recall_test_xgb_selected:.4f}")
print(f"Test F1-score: {f1_test_xgb_selected:.4f}")
print(f"Test ROC AUC Score: {roc_auc_test_xgb_selected:.4f}")
print("Test Confusion Matrix:")
display(conf_matrix_test_xgb_selected)
print("\nClassification Report:")
print(class_report_test_xgb_selected)

XGBoost Model Performance with Selected Features:
Test Accuracy: 0.9681
Test Precision: 0.9722
Test Recall: 0.9697
Test F1-score: 0.9710
Test ROC AUC Score: 0.9679
Test Confusion Matrix:


array([[7149,  251],
       [ 275, 8792]])


Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      7400
           1       0.97      0.97      0.97      9067

    accuracy                           0.97     16467
   macro avg       0.97      0.97      0.97     16467
weighted avg       0.97      0.97      0.97     16467



In [None]:
# Retrain Random Forest (default hyperparameters) on the selected feature
# subset and score it on the held-out test split.
rf_model_selected = RandomForestClassifier(random_state=42)

# Train the Random Forest model with selected features
rf_model_selected.fit(X_train_selected, y_train)

# Hard 0/1 predictions feed the threshold-dependent metrics.
y_pred_test_rf_selected = rf_model_selected.predict(X_test_selected)

# ROC AUC needs a continuous score rather than hard labels; use the predicted
# probability of the positive class (column 1) so the full ROC curve is
# integrated instead of a single operating point.
y_score_test_rf_selected = rf_model_selected.predict_proba(X_test_selected)[:, 1]

# Evaluate the model on the test set
accuracy_test_rf_selected = accuracy_score(y_test, y_pred_test_rf_selected)
precision_test_rf_selected = precision_score(y_test, y_pred_test_rf_selected)
recall_test_rf_selected = recall_score(y_test, y_pred_test_rf_selected)
f1_test_rf_selected = f1_score(y_test, y_pred_test_rf_selected)
roc_auc_test_rf_selected = roc_auc_score(y_test, y_score_test_rf_selected)
conf_matrix_test_rf_selected = confusion_matrix(y_test, y_pred_test_rf_selected)
class_report_test_rf_selected = classification_report(y_test, y_pred_test_rf_selected)

print("Random Forest Model Performance with Selected Features:")
print(f"Test Accuracy: {accuracy_test_rf_selected:.4f}")
print(f"Test Precision: {precision_test_rf_selected:.4f}")
print(f"Test Recall: {recall_test_rf_selected:.4f}")
print(f"Test F1-score: {f1_test_rf_selected:.4f}")
print(f"Test ROC AUC Score: {roc_auc_test_rf_selected:.4f}")
print("Test Confusion Matrix:")
display(conf_matrix_test_rf_selected)
print("\nClassification Report:")
print(class_report_test_rf_selected)

Random Forest Model Performance with Selected Features:
Test Accuracy: 0.9675
Test Precision: 0.9702
Test Recall: 0.9708
Test F1-score: 0.9705
Test ROC AUC Score: 0.9671
Test Confusion Matrix:


array([[7130,  270],
       [ 265, 8802]])


Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      7400
           1       0.97      0.97      0.97      9067

    accuracy                           0.97     16467
   macro avg       0.97      0.97      0.97     16467
weighted avg       0.97      0.97      0.97     16467



In [None]:
# Gather the selected-feature test metrics, one tuple per model, in the same
# order as the metric columns.
selected_metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC']
selected_scores = [
    (accuracy_test_lr_selected, precision_test_lr_selected, recall_test_lr_selected,
     f1_test_lr_selected, roc_auc_test_lr_selected),
    (accuracy_test_xgb_selected, precision_test_xgb_selected, recall_test_xgb_selected,
     f1_test_xgb_selected, roc_auc_test_xgb_selected),
    (accuracy_test_rf_selected, precision_test_rf_selected, recall_test_rf_selected,
     f1_test_rf_selected, roc_auc_test_rf_selected),
]

comparison_selected_data = {
    'Model': [
        'Logistic Regression (Selected Features)',
        'XGBoost (Selected Features)',
        'Random Forest (Selected Features)',
    ],
}
# Transpose the per-model tuples into per-metric columns.
for metric_name, metric_values in zip(selected_metric_names, zip(*selected_scores)):
    comparison_selected_data[metric_name] = list(metric_values)

comparison_selected_df = pd.DataFrame(comparison_selected_data)

# Stack the baseline rows on top of the selected-feature rows with a fresh index.
# NOTE(review): the title below says "Engineered Features", but the engineered
# ratio columns are only created in the later Feature Engineering section —
# confirm the intended wording.
combined_comparison_df = pd.concat(
    [comparison_df, comparison_selected_df],
    axis=0,
    ignore_index=True,
)

print("Model Performance Comparison (Engineered Features vs. Selected Features):")
display(combined_comparison_df)

Model Performance Comparison (All Features vs. Selected Features):


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,ROC AUC
0,Logistic Regression,0.850975,0.829365,0.918275,0.871559,0.843394
1,XGBoost,0.976924,0.985145,0.972758,0.978912,0.977393
2,Random Forest,0.976498,0.983404,0.973751,0.978554,0.976808
3,Logistic Regression (Selected Features),0.871986,0.904828,0.857726,0.880648,0.873593
4,XGBoost (Selected Features),0.968057,0.972244,0.96967,0.970955,0.967876
5,Random Forest (Selected Features),0.967511,0.970238,0.970773,0.970506,0.967143



*   Comparing the models with and without feature selection, XGBoost and Random Forest showed a decrease in Accuracy, Precision, Recall, F1-score, and ROC AUC when trained on the selected features compared to training on all features. Logistic Regression, by contrast, improved in Accuracy, Precision, F1-score, and ROC AUC; only its Recall decreased (0.918 → 0.858).

*   Feature selection, in this case, did not improve the tree-based models and lowered every evaluated metric for them, while it gave Logistic Regression a modest overall gain at the cost of lower Recall.
*   Despite the performance decrease with selected features, XGBoost and Random Forest still outperform Logistic Regression, suggesting that tree-based models are more suitable for this dataset.


## Feature Engineering

In [None]:
# Engineered features: the destination side's share of total packets and of
# total bytes for each flow. The small constant keeps the denominator non-zero
# when both directions are 0.
# NOTE(review): the displayed values suggest these columns are already scaled
# to very small magnitudes, so the 1e-6 guard may be comparable in size to the
# denominator itself — confirm the ratios behave as intended.
total_packets = df2['spkts'] + df2['dpkts']
total_bytes = df2['sbytes'] + df2['dbytes']

df2['pkt_ratio'] = df2['dpkts'] / (total_packets + 1e-6)
df2['byte_ratio'] = df2['dbytes'] / (total_bytes + 1e-6)

# Preview the inputs next to the derived ratios.
print("DataFrame with new features:")
preview_columns = ['spkts', 'dpkts', 'pkt_ratio', 'sbytes', 'dbytes', 'byte_ratio']
display(df2[preview_columns].head())

DataFrame with new features:


Unnamed: 0,spkts,dpkts,pkt_ratio,sbytes,dbytes,byte_ratio
0,9.4e-05,0.0,0.0,3.3e-05,0.0,0.0
1,9.4e-05,0.0,0.0,0.000121,0.0,0.0
2,9.4e-05,0.0,0.0,7.3e-05,0.0,0.0
3,9.4e-05,0.0,0.0,6.1e-05,0.0,0.0
4,9.4e-05,0.0,0.0,0.000146,0.0,0.0


In [None]:
# Show (rows, columns) of df2 after the two ratio columns were added above.
df2.shape

(82332, 46)