In [18]:
# ===== Basic Python Libraries =====
import pandas as pd
import numpy as np
import time
import io
import pickle
from tabulate import tabulate  

# ===== Scikit-learn: Data Splitting =====
from sklearn.model_selection import train_test_split, KFold, cross_val_score

# ===== Scikit-learn: Preprocessing =====
from sklearn.preprocessing import StandardScaler, LabelEncoder

# ===== Scikit-learn: Classifiers =====
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# ===== Scikit-learn: Model Evaluation =====
from sklearn.metrics import (
    accuracy_score,
    precision_score, 
    recall_score, 
    f1_score, 
    classification_report, 
    confusion_matrix
)


In [2]:
# Import Gradient Boosting Libraries
try:
    import xgboost as xgb
    XGBClassifier = xgb.XGBClassifier
except ImportError:
    print("Warning: XGBoost not installed. Skipping XGBoost model.")
    XGBClassifier = None

try:
    import lightgbm as lgb
    LGBMClassifier = lgb.LGBMClassifier
except ImportError:
    print("Warning: LightGBM not installed. Skipping LightGBM model.")
    LGBMClassifier = None

Configuration

In [4]:
TRAIN_DATASET_PATH = 'Dataset/train_data.csv'
TEST_DATASET_PATH = 'Dataset/test_data.csv'
TARGET_COLUMN = 'Label'
K_FOLDS = 5

Load Data from CSV Files

In [5]:
try:
    df_train = pd.read_csv(TRAIN_DATASET_PATH)
    print(f"Training dataset loaded successfully from {TRAIN_DATASET_PATH}")
    print(f"Training dataset shape: {df_train.shape}")
    print("\nFirst 5 rows of the training dataset:")
    print(df_train.head())
    print("\nTraining Dataset Info:")
    df_train.info()
except FileNotFoundError:
    print(f"Error: Training dataset not found at {TRAIN_DATASET_PATH}")
    print("Please update TRAIN_DATASET_PATH with the correct path to your training CSV file.")
    exit()

try:
    df_test = pd.read_csv(TEST_DATASET_PATH)
    print(f"\nTesting dataset loaded successfully from {TEST_DATASET_PATH}")
    print(f"Testing dataset shape: {df_test.shape}")
    print("\nFirst 5 rows of the testing dataset:")
    print(df_test.head())
    print("\nTesting Dataset Info:")
    df_test.info()
except FileNotFoundError:
    print(f"Error: Testing dataset not found at {TEST_DATASET_PATH}")
    print("Please update TEST_DATASET_PATH with the correct path to your testing CSV file.")
    exit()

Training dataset loaded successfully from Dataset/train_data.csv
Training dataset shape: (2558780, 85)

First 5 rows of the training dataset:
                                   FlowID       SourceIP  SourcePort  \
0  192.168.10.3-192.168.10.12-53-26526-17  192.168.10.12       26526   
1   172.16.0.1-192.168.10.50-37255-3737-6     172.16.0.1       37255   
2    192.168.10.16-72.21.91.29-53482-80-6  192.168.10.16       53482   
3    192.168.10.15-31.13.71.7-50902-443-6     31.13.71.7         443   
4   192.168.10.3-192.168.10.9-53-51576-17   192.168.10.9       51576   

   DestinationIP  DestinationPort  Protocol            Timestamp  \
0   192.168.10.3               53        17        6/7/2017 3:12   
1  192.168.10.50             3737         6        7/7/2017 2:52   
2    72.21.91.29               80         6       5/7/2017 10:02   
3  192.168.10.15            50902         6  03/07/2017 10:16:29   
4   192.168.10.3               53        17        4/7/2017 4:59   

   FlowDuration 

Preprocessing 

In [14]:
columns_to_drop = ['FlowID', 'SourceIP', 'DestinationIP', 'Timestamp']

# Apply dropping columns and handling infinite/NaN values to both dataframes
def preprocess_dataframe(df, columns_to_drop, target_column, desired_sample_size):
    df_processed = df.drop(columns=columns_to_drop, errors='ignore').copy()
    # Handle potential infinite values
    df_processed.replace([float('inf'), float('-inf')], pd.NA, inplace=True)
    # Fill NaN values. Using 0 here, but consider other strategies.
    df_processed.fillna(0, inplace=True)

    if isinstance(desired_sample_size, str) and desired_sample_size.lower() == 'all':
        df_sampled = df_processed
    else:
        sample_frac = desired_sample_size / len(df_processed)
        df_sampled, _ = train_test_split(
            df_processed,
            test_size=1 - sample_frac,
            stratify=df_processed[target_column],
            random_state=42
        )
    
    # Separate features (X) and target (y)
    X = df_sampled.drop(columns=[target_column])
    y = df_sampled[target_column]

    y = y.astype(str)

    return X, y

X_train, y_train = preprocess_dataframe(df_train, columns_to_drop, TARGET_COLUMN, 'all')
X_test, y_test = preprocess_dataframe(df_test, columns_to_drop, TARGET_COLUMN, 'all')


def simplify_labels(y):
    return y.apply(lambda x: 'BENIGN' if x.upper() == 'BENIGN' else 'ATTACK')

y_train = simplify_labels(y_train)
y_test = simplify_labels(y_test)


# Encode the target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

print(f"\nOriginal Training Labels: {y_train.unique()}")
print(f"Encoded Training Labels: {label_encoder.classes_}")
print(f"\nOriginal Testing Labels: {y_test.unique()}")
print(f"Encoded Testing Labels (based on training labels): {label_encoder.classes_}")

# --- Save the LabelEncoder ---
try:
    encoder_filename = 'label_encoder.pkl'
    with open(encoder_filename, 'wb') as f:
        pickle.dump(label_encoder, f)
    print(f"\nLabel encoder saved to {encoder_filename}")
except Exception as e:
    print(f"Error saving label encoder: {e}")

# Scale numerical features
# Fit scaler ONLY on training data, then transform both train and test data
scaler = StandardScaler()
# Check if training data is not empty before scaling
if X_train.shape[0] > 0:
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print(f"\nFeatures scaled using StandardScaler (fitted on training data).")

    # --- Save the StandardScaler ---
    try:
        scaler_filename = 'scaler.pkl'
        with open(scaler_filename, 'wb') as f:
            pickle.dump(scaler, f)
        print(f"Standard scaler saved to {scaler_filename}")
    except Exception as e:
        print(f"Error saving standard scaler: {e}")
else:
    print("\nWarning: Training data is empty after preprocessing. Cannot scale features or train models.")
    X_train_scaled = X_train
    X_test_scaled = X_test



Original Training Labels: ['ATTACK' 'BENIGN']
Encoded Training Labels: ['ATTACK' 'BENIGN']

Original Testing Labels: ['BENIGN' 'ATTACK']
Encoded Testing Labels (based on training labels): ['ATTACK' 'BENIGN']

Label encoder saved to label_encoder.pkl

Features scaled using StandardScaler (fitted on training data).
Standard scaler saved to scaler.pkl


Defining models to train

In [15]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Gaussian Naive Bayes": GaussianNB(),
    "SGD Classifier": SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42),
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    "Bagging Classifier (Decision Tree)": BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
}

# Add Gradient Boosting models if installed
if XGBClassifier is not None:
    models["XGBoost"] = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
if LGBMClassifier is not None:
    models["LightGBM"] = LGBMClassifier(random_state=42)

Cross validation to evaluate model performance

In [16]:
cv_results = {}

print("\n--- Starting K-Fold Cross-Validation (K={}) ---".format(K_FOLDS))

# Perform k-fold cross-validation for each model
if X_train_scaled.shape[0] > 0 and len(label_encoder.classes_) > 0:
    kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
    
    for name, model in models.items():
        print(f"\nPerforming {K_FOLDS}-fold cross-validation for {name}...")
        fold_accuracies = []
        fold_precisions = []
        fold_recalls = []
        fold_f1_scores = []
        fold_times = []
        
        # Perform K-fold CV manually to get more metrics than just accuracy
        fold_num = 1
        
        for train_index, val_index in kf.split(X_train_scaled):
            X_fold_train, X_fold_val = X_train_scaled[train_index], X_train_scaled[val_index]
            y_fold_train, y_fold_val = y_train_encoded[train_index], y_train_encoded[val_index]
            
            # Train the model
            start_time = time.time()
            try:
                model.fit(X_fold_train, y_fold_train)
                train_time = time.time() - start_time
                
                # Predict and evaluate
                y_fold_pred = model.predict(X_fold_val)
                
                # Calculate metrics
                accuracy = accuracy_score(y_fold_val, y_fold_pred)
                precision = precision_score(y_fold_val, y_fold_pred, average='weighted', zero_division=0)
                recall = recall_score(y_fold_val, y_fold_pred, average='weighted', zero_division=0)
                f1 = f1_score(y_fold_val, y_fold_pred, average='weighted', zero_division=0)
                
                # Store metrics
                fold_accuracies.append(accuracy)
                fold_precisions.append(precision)
                fold_recalls.append(recall)
                fold_f1_scores.append(f1)
                fold_times.append(train_time)
                
                print(f"  Fold {fold_num}/{K_FOLDS} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, "
                      f"Recall: {recall:.4f}, F1: {f1:.4f}, Time: {train_time:.2f}s")
                
                fold_num += 1
                
            except Exception as e:
                print(f"  Error in fold {fold_num} for {name}: {e}")
                continue
        
        # Calculate average metrics across folds
        if fold_accuracies:
            avg_accuracy = np.mean(fold_accuracies)
            avg_precision = np.mean(fold_precisions)
            avg_recall = np.mean(fold_recalls)
            avg_f1 = np.mean(fold_f1_scores)
            avg_time = np.mean(fold_times)
            
            cv_results[name] = {
                "Accuracy": avg_accuracy,
                "Precision": avg_precision,
                "Recall": avg_recall,
                "F1-Score": avg_f1,
                "Training Time": avg_time
            }
            
            print(f"\n{name} - Average CV Metrics:")
            print(f"  Accuracy: {avg_accuracy:.4f} (±{np.std(fold_accuracies):.4f})")
            print(f"  Precision: {avg_precision:.4f} (±{np.std(fold_precisions):.4f})")
            print(f"  Recall: {avg_recall:.4f} (±{np.std(fold_recalls):.4f})")
            print(f"  F1-Score: {avg_f1:.4f} (±{np.std(fold_f1_scores):.4f})")
            print(f"  Avg Training Time: {avg_time:.4f}s")
        else:
            cv_results[name] = {"Error": "Failed to complete cross-validation"}
else:
    print("\nSkipping cross-validation due to insufficient data or classes.")


--- Starting K-Fold Cross-Validation (K=5) ---

Performing 5-fold cross-validation for Logistic Regression...
  Fold 1/5 - Accuracy: 0.9283, Precision: 0.9316, Recall: 0.9283, F1: 0.9295, Time: 0.06s
  Fold 2/5 - Accuracy: 0.9483, Precision: 0.9477, Recall: 0.9483, F1: 0.9476, Time: 0.04s
  Fold 3/5 - Accuracy: 0.9350, Precision: 0.9342, Recall: 0.9350, F1: 0.9343, Time: 0.04s
  Fold 4/5 - Accuracy: 0.9300, Precision: 0.9303, Recall: 0.9300, F1: 0.9302, Time: 0.04s
  Fold 5/5 - Accuracy: 0.9100, Precision: 0.9146, Recall: 0.9100, F1: 0.9117, Time: 0.04s

Logistic Regression - Average CV Metrics:
  Accuracy: 0.9303 (±0.0124)
  Precision: 0.9317 (±0.0106)
  Recall: 0.9303 (±0.0124)
  F1-Score: 0.9307 (±0.0115)
  Avg Training Time: 0.0457s

Performing 5-fold cross-validation for Decision Tree...
  Fold 1/5 - Accuracy: 0.9867, Precision: 0.9870, Recall: 0.9867, F1: 0.9868, Time: 0.06s
  Fold 2/5 - Accuracy: 0.9983, Precision: 0.9983, Recall: 0.9983, F1: 0.9983, Time: 0.08s
  Fold 3/5 - Ac



  Fold 1/5 - Accuracy: 0.9850, Precision: 0.9849, Recall: 0.9850, F1: 0.9849, Time: 5.16s
  Fold 2/5 - Accuracy: 0.9750, Precision: 0.9749, Recall: 0.9750, F1: 0.9749, Time: 4.65s
  Fold 3/5 - Accuracy: 0.9750, Precision: 0.9751, Recall: 0.9750, F1: 0.9750, Time: 5.07s
  Fold 4/5 - Accuracy: 0.9817, Precision: 0.9816, Recall: 0.9817, F1: 0.9816, Time: 5.09s
  Fold 5/5 - Accuracy: 0.9817, Precision: 0.9817, Recall: 0.9817, F1: 0.9817, Time: 4.93s

MLP Classifier - Average CV Metrics:
  Accuracy: 0.9797 (±0.0040)
  Precision: 0.9796 (±0.0040)
  Recall: 0.9797 (±0.0040)
  F1-Score: 0.9796 (±0.0040)
  Avg Training Time: 4.9807s

Performing 5-fold cross-validation for Bagging Classifier (Decision Tree)...
  Fold 1/5 - Accuracy: 0.9900, Precision: 0.9901, Recall: 0.9900, F1: 0.9900, Time: 4.27s
  Fold 2/5 - Accuracy: 0.9950, Precision: 0.9950, Recall: 0.9950, F1: 0.9950, Time: 4.97s
  Fold 3/5 - Accuracy: 0.9917, Precision: 0.9917, Recall: 0.9917, F1: 0.9917, Time: 4.99s
  Fold 4/5 - Accurac

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 3/5 - Accuracy: 0.9950, Precision: 0.9950, Recall: 0.9950, F1: 0.9950, Time: 0.09s
  Fold 4/5 - Accuracy: 0.9950, Precision: 0.9950, Recall: 0.9950, F1: 0.9950, Time: 0.09s
  Fold 5/5 - Accuracy: 0.9917, Precision: 0.9917, Recall: 0.9917, F1: 0.9916, Time: 0.08s

XGBoost - Average CV Metrics:
  Accuracy: 0.9947 (±0.0022)
  Precision: 0.9947 (±0.0022)
  Recall: 0.9947 (±0.0022)
  F1-Score: 0.9947 (±0.0022)
  Avg Training Time: 0.0828s

Performing 5-fold cross-validation for LightGBM...
[LightGBM] [Info] Number of positive: 1836, number of negative: 564
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11538
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.765000 -> initscore=1.180290
[LightGBM] [Info] Start training from score 1.180290


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 1/5 - Accuracy: 0.9933, Precision: 0.9934, Recall: 0.9933, F1: 0.9934, Time: 0.11s
[LightGBM] [Info] Number of positive: 1861, number of negative: 539
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001517 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11536
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.775417 -> initscore=1.239154
[LightGBM] [Info] Start training from score 1.239154
  Fold 2/5 - Accuracy: 0.9983, Precision: 0.9983, Recall: 0.9983, F1: 0.9983, Time: 0.11s
[LightGBM] [Info] Number of positive: 1868, number of negative: 532
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001084 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11550
[LightGBM] [Info] Number of data points in the train set: 2400, nu

In [17]:
if cv_results:
    # Create a summary table
    summary_data = []
    headers = ["Model", "Accuracy", "Precision", "Recall", "F1-Score", "Training Time (s)"]
    
    for name, metrics in cv_results.items():
        if "Error" not in metrics:
            row = [
                name, 
                f"{metrics['Accuracy']:.4f}", 
                f"{metrics['Precision']:.4f}", 
                f"{metrics['Recall']:.4f}", 
                f"{metrics['F1-Score']:.4f}",
                f"{metrics['Training Time']:.4f}"
            ]
            summary_data.append(row)
    
    # Display summary table
    print("\n--- Cross-Validation Summary ---")
    print(tabulate(summary_data, headers=headers, tablefmt="grid"))
    
    # Select the best model based on F1-Score (you can change this to another metric if preferred)
    best_model_name = max(cv_results.items(), key=lambda x: x[1]['F1-Score'] if "Error" not in x[1] else -1)[0]
    best_model = models[best_model_name]
    best_cv_f1 = cv_results[best_model_name]['F1-Score']
    
    print(f"\n--- Best Model Based on F1-Score ---")
    print(f"Selected Model: {best_model_name}")
    print(f"CV F1-Score: {best_cv_f1:.4f}")
    
    # --- 5. Train Best Model on Full Training Dataset ---
    print(f"\n--- Training {best_model_name} on Full Training Dataset ---")
    start_time = time.time()
    best_model.fit(X_train_scaled, y_train_encoded)
    training_time = time.time() - start_time
    print(f"Training completed in {training_time:.4f} seconds")
    
    # Save the best model
    try:
        best_model_filename = f"best_model_{best_model_name.replace(' ', '_').replace('(', '').replace(')', '')}.pkl"
        with open(best_model_filename, 'wb') as f:
            pickle.dump(best_model, f)
        print(f"Best model saved to {best_model_filename}")
    except Exception as e:
        print(f"Error saving best model: {e}")
    
    # --- 6. Evaluate Best Model on Test Dataset ---
    print(f"\n--- Evaluating {best_model_name} on Test Dataset ---")
    start_time = time.time()
    y_pred_encoded = best_model.predict(X_test_scaled)
    prediction_time = time.time() - start_time
    
    # Convert encoded predictions back to original labels
    y_pred = label_encoder.inverse_transform(y_pred_encoded)
    y_test_original = label_encoder.inverse_transform(y_test_encoded)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
    precision = precision_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
    recall = recall_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
    f1 = f1_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
    
    print(f"\n--- Final Test Results for {best_model_name} ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Prediction Time: {prediction_time:.4f} seconds")
    
    # Display classification report
    print("\nClassification Report:")
    print(classification_report(y_test_encoded, y_pred_encoded, target_names=label_encoder.classes_, zero_division=0))
    
    # Display confusion matrix
    conf_matrix = confusion_matrix(y_test_encoded, y_pred_encoded)
    print("\nConfusion Matrix:")
    print(conf_matrix)
    
    # Compare CV performance with test performance
    print("\n--- CV vs Test Performance Comparison ---")
    print(f"CV F1-Score: {best_cv_f1:.4f}")
    print(f"Test F1-Score: {f1:.4f}")
    print(f"Difference: {f1 - best_cv_f1:.4f}")
    
else:
    print("\nNo valid cross-validation results. Cannot select best model.")


--- Cross-Validation Summary ---
+------------------------------------+------------+-------------+----------+------------+---------------------+
| Model                              |   Accuracy |   Precision |   Recall |   F1-Score |   Training Time (s) |
| Logistic Regression                |     0.9303 |      0.9317 |   0.9303 |     0.9307 |              0.0457 |
+------------------------------------+------------+-------------+----------+------------+---------------------+
| Decision Tree                      |     0.99   |      0.99   |   0.99   |     0.99   |              0.0699 |
+------------------------------------+------------+-------------+----------+------------+---------------------+
| Random Forest                      |     0.9923 |      0.9923 |   0.9923 |     0.9923 |              0.5383 |
+------------------------------------+------------+-------------+----------+------------+---------------------+
| K-Nearest Neighbors                |     0.959  |      0.9604 |   0.

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Final Test Results for XGBoost ---
Accuracy: 0.9956
Precision: 0.9957
Recall: 0.9956
F1-Score: 0.9956
Prediction Time: 0.1518 seconds

Classification Report:
              precision    recall  f1-score   support

      ATTACK       0.99      0.99      0.99    145426
      BENIGN       1.00      1.00      1.00    494269

    accuracy                           1.00    639695
   macro avg       0.99      1.00      0.99    639695
weighted avg       1.00      1.00      1.00    639695


Confusion Matrix:
[[144677    749]
 [  2042 492227]]

--- CV vs Test Performance Comparison ---
CV F1-Score: 0.9947
Test F1-Score: 0.9956
Difference: 0.0010
