## Problem Formulation

In [None]:
# Find rows that exactly repeat an earlier row (pandas keeps the first
# occurrence unflagged by default) and report how many there are.
is_duplicate = df2.duplicated()
duplicated_rows = df2[is_duplicate]
num_duplicated_rows = len(duplicated_rows)

print(f"Number of duplicated rows found: {num_duplicated_rows}")
if num_duplicated_rows <= 0:
    print("\nNo duplicated rows found in the DataFrame.")
else:
    print("\nDuplicated Rows:")
    display(duplicated_rows)

Number of duplicated rows found: 26387

Duplicated Rows:


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
10,0.000000e+00,6,0,4,0.000000,0.0,0.000002,0.0,0.000000e+00,0.000000,...,0.027027,0.016129,0.0,0.0,0.0,0.016949,0.016393,1.0,6,0
11,0.000000e+00,6,0,4,0.000000,0.0,0.000002,0.0,0.000000e+00,0.000000,...,0.027027,0.016129,0.0,0.0,0.0,0.016949,0.016393,1.0,6,0
39,9.999281e-01,6,0,4,0.000094,0.0,0.000005,0.0,1.666800e-08,0.000000,...,0.027027,0.016129,0.0,0.0,0.0,0.016949,0.016393,1.0,6,0
41,9.999281e-01,6,0,4,0.000094,0.0,0.000005,0.0,1.666800e-08,0.000000,...,0.027027,0.016129,0.0,0.0,0.0,0.016949,0.016393,1.0,6,0
53,2.474428e-01,111,0,5,0.001973,0.0,0.000067,0.0,1.414469e-06,0.996078,...,0.054054,0.209677,0.0,0.0,0.0,0.033898,0.229508,0.0,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82325,1.666667e-07,117,0,4,0.000094,0.0,0.000006,0.0,1.000000e-01,0.996078,...,0.000000,0.016129,0.0,0.0,0.0,0.000000,0.016393,0.0,6,0
82326,1.500000e-07,117,0,4,0.000094,0.0,0.000006,0.0,1.111111e-01,0.996078,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,6,0
82329,0.000000e+00,6,0,4,0.000000,0.0,0.000002,0.0,0.000000e+00,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,1.0,6,0
82330,0.000000e+00,6,0,4,0.000000,0.0,0.000002,0.0,0.000000e+00,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,1.0,6,0


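I will keep the duplicated rows: dropping them might remove valuable information about repeated malicious activity and worsen the class imbalance. Note that exact duplicates can then land in both the training and test splits, so the test metrics reported below may be optimistic.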

## Baseline Models

In [None]:
# 60/20/20 train/validation/test split:
# first hold out 20% as the test set, then take 25% of the remaining 80%
# (0.25 * 0.8 = 0.2) as the validation set.
from sklearn.model_selection import train_test_split

# Target is the binary `label`; `attack_cat` is removed from the features too
# because it is not needed for binary classification.
y = df2['label']
X = df2.drop(columns=['label', 'attack_cat'])

# Both splits are stratified on the target and use the same fixed seed.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.25,
    stratify=y_trainval,
    random_state=42,
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

def evaluate_model_performance(y_true, y_pred, model_name="Model", y_score=None):
    """Print and return binary-classification metrics for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions (e.g. the output of ``predict``).
    model_name : str, default "Model"
        Label used in the printed header and stored under 'Model' in the
        returned dict.
    y_score : array-like, optional
        1-D continuous scores for the positive class, e.g.
        ``predict_proba(X)[:, 1]`` or ``decision_function(X)``. When given,
        ROC AUC is computed from these scores. When omitted, ROC AUC is
        computed from ``y_pred`` (the previous behaviour), which only
        reflects a single decision threshold.

    Returns
    -------
    dict
        Keys: 'Model', 'Accuracy', 'Precision', 'Recall', 'F1-score',
        'ROC AUC'.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # ROC AUC is a ranking metric: prefer continuous scores when the caller
    # supplies them, and fall back to the hard predictions otherwise so that
    # existing three-argument calls keep producing the same number.
    auc_input = y_pred if y_score is None else y_score
    roc_auc = roc_auc_score(y_true, auc_input)

    conf_matrix = confusion_matrix(y_true, y_pred)
    class_report = classification_report(y_true, y_pred)

    print(f"{model_name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print("Confusion Matrix:")
    display(conf_matrix)
    print("\nClassification Report:")
    print(class_report)

    return {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'ROC AUC': roc_auc
    }

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic Regression baseline.
# Fitting directly on X_train stopped at the iteration limit with
# max_iter=1000 and emitted scikit-learn's "scale the data" warning. Some
# columns (e.g. `proto`) appear to be integer codes on a much larger scale
# than the 0-1 features, so the features are standardized inside a pipeline.
# The scaler is fitted on the training split only and re-applied automatically
# whenever `model.predict` is called.
# The fitted LogisticRegression itself is available as `model[-1]`.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=42, max_iter=1000),
)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_test = model.predict(X_test)

# Evaluate the model on the test set
lr_original_metrics = evaluate_model_performance(y_test, y_pred_test, model_name="Logistic Regression Baseline Model")

Logistic Regression Baseline Model Performance:
Accuracy: 0.8902
Precision: 0.8914
Recall: 0.9117
F1-score: 0.9014
ROC AUC Score: 0.8878
Confusion Matrix:


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[6393, 1007],
       [ 801, 8266]])


Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.86      0.88      7400
           1       0.89      0.91      0.90      9067

    accuracy                           0.89     16467
   macro avg       0.89      0.89      0.89     16467
weighted avg       0.89      0.89      0.89     16467



In [None]:
from xgboost import XGBClassifier

# XGBoost baseline: library-default hyperparameters with a fixed seed,
# fitted on the training split.
xgb_model = XGBClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out test split and report the metrics.
y_pred_test_xgb = xgb_model.predict(X_test)
xgb_original_metrics = evaluate_model_performance(
    y_true=y_test,
    y_pred=y_pred_test_xgb,
    model_name="XGBoost Baseline Model",
)

XGBoost Baseline Model Performance:
Accuracy: 0.9769
Precision: 0.9851
Recall: 0.9728
F1-score: 0.9789
ROC AUC Score: 0.9774
Confusion Matrix:


array([[7267,  133],
       [ 247, 8820]])


Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      7400
           1       0.99      0.97      0.98      9067

    accuracy                           0.98     16467
   macro avg       0.98      0.98      0.98     16467
weighted avg       0.98      0.98      0.98     16467



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest baseline: library-default hyperparameters with a fixed seed,
# fitted on the training split.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out test split and report the metrics.
y_pred_test_rf = rf_model.predict(X_test)
rf_original_metrics = evaluate_model_performance(
    y_true=y_test,
    y_pred=y_pred_test_rf,
    model_name="Random Forest Baseline Model",
)

Random Forest Baseline Model Performance:
Accuracy: 0.9765
Precision: 0.9834
Recall: 0.9738
F1-score: 0.9786
ROC AUC Score: 0.9768
Confusion Matrix:


array([[7251,  149],
       [ 238, 8829]])


Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      7400
           1       0.98      0.97      0.98      9067

    accuracy                           0.98     16467
   macro avg       0.98      0.98      0.98     16467
weighted avg       0.98      0.98      0.98     16467




*   The XGBoost model achieved the highest accuracy, precision, F1-score, and ROC AUC on the test set.
*   The Random Forest model performed almost identically and had marginally higher recall (0.9738 vs. 0.9728 for XGBoost).
*   The Logistic Regression model showed the lowest performance among the three models.