In [None]:
!pip install pandas scikit-learn xgboost lightgbm psutil



In [None]:
# Don't change the code. Add code to show memory usage

# Working
import pandas as pd
from sklearn.model_selection import train_test_split # Still useful for potential validation split later, but not for initial train/test load
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier # Added SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier # Added BaggingClassifier (as an example of bagging)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB # Added Gaussian Naive Bayes
from sklearn.neural_network import MLPClassifier # Added MLP Classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import time
import io # Keep io import in case needed for other things
import pickle # Import the pickle library
import psutil
import os

# --- Memory Usage Monitoring Helper ---
process = psutil.Process(os.getpid())

def get_memory_usage():
    """Report the resident set size (RSS) of the tracked process, in MB."""
    bytes_per_mb = 1024 * 1024
    resident_bytes = process.memory_info().rss
    return resident_bytes / bytes_per_mb

print(f"Initial memory usage: {get_memory_usage():.2f} MB")


# Import Gradient Boosting Libraries (you might need to install these: pip install xgboost lightgbm)
try:
    import xgboost as xgb
    XGBClassifier = xgb.XGBClassifier
except ImportError:
    print("Warning: XGBoost not installed. Skipping XGBoost model.")
    XGBClassifier = None # Set to None if not installed

try:
    import lightgbm as lgb
    LGBMClassifier = lgb.LGBMClassifier
except ImportError:
    print("Warning: LightGBM not installed. Skipping LightGBM model.")
    LGBMClassifier = None # Set to None if not installed

# --- Configuration ---
# Point the two paths below at the real dataset files before running.
TRAIN_DATASET_PATH = './train_data.csv'  # training CSV file
TEST_DATASET_PATH = './test_data.csv'  # testing CSV file
TARGET_COLUMN = 'Label'  # name of the column holding the class label

# --- 1. Load Data from Separate CSV Files ---
# A missing file aborts the run by raising SystemExit. exit() is a helper that
# the site module provides for interactive use and may not halt a notebook cell
# right away, which would let the code below run with df_train/df_test undefined.
print(f"\nMemory usage before loading data: {get_memory_usage():.2f} MB")
try:
    df_train = pd.read_csv(TRAIN_DATASET_PATH)
except FileNotFoundError:
    print(f"Error: Training dataset not found at {TRAIN_DATASET_PATH}")
    print("Please update TRAIN_DATASET_PATH with the correct path to your training CSV file.")
    raise SystemExit(1) from None
else:
    # Only reached when the read succeeded: summarise what was loaded.
    print(f"Training dataset loaded successfully from {TRAIN_DATASET_PATH}")
    print(f"Training dataset shape: {df_train.shape}")
    print("\nFirst 5 rows of the training dataset:")
    print(df_train.head())
    print("\nTraining Dataset Info:")
    df_train.info()
    print(f"Memory usage after loading training data: {get_memory_usage():.2f} MB")

try:
    df_test = pd.read_csv(TEST_DATASET_PATH)
except FileNotFoundError:
    print(f"Error: Testing dataset not found at {TEST_DATASET_PATH}")
    print("Please update TEST_DATASET_PATH with the correct path to your testing CSV file.")
    raise SystemExit(1) from None
else:
    print(f"\nTesting dataset loaded successfully from {TEST_DATASET_PATH}")
    print(f"Testing dataset shape: {df_test.shape}")
    print("\nFirst 5 rows of the testing dataset:")
    print(df_test.head())
    print("\nTesting Dataset Info:")
    df_test.info()
    print(f"Memory usage after loading testing data: {get_memory_usage():.2f} MB")


# --- 2. Preprocessing ---
print(f"\nMemory usage before preprocessing: {get_memory_usage():.2f} MB")

# Define columns to drop - apply to both train and test sets
columns_to_drop = ['FlowID', 'SourceIP', 'DestinationIP', 'Timestamp']

# Apply dropping columns and handling infinite/NaN values to both dataframes
def preprocess_dataframe(df, columns_to_drop, target_column):
    """Clean a raw dataframe and split it into features and string labels.

    Args:
        df: Input dataframe. It is not modified; every step works on a new frame.
        columns_to_drop: Column names to remove. Names missing from ``df`` are ignored.
        target_column: Name of the label column.

    Returns:
        A tuple ``(X, y)``: ``X`` holds every remaining column except
        ``target_column``; ``y`` is the target column converted to ``str``.

    Raises:
        KeyError: If ``target_column`` is not present after the drop step.
    """
    # Infinite values are replaced with float NaN rather than pd.NA: pd.NA cannot
    # be stored in a NumPy float column and can turn numeric features into object dtype.
    not_a_number = float('nan')
    cleaned = (
        df.drop(columns=columns_to_drop, errors='ignore')
        .replace([float('inf'), float('-inf')], not_a_number)
        # Missing values become 0 (this also applies to the target column).
        # Using 0 here, but consider other strategies.
        .fillna(0)
    )

    # Separate features (X) and target (y)
    X = cleaned.drop(columns=[target_column])
    # Ensure the target is treated as string for LabelEncoder
    y = cleaned[target_column].astype(str)

    return X, y

X_train, y_train = preprocess_dataframe(df_train, columns_to_drop, TARGET_COLUMN)
X_test, y_test = preprocess_dataframe(df_test, columns_to_drop, TARGET_COLUMN)

print("\nPreprocessing applied to both training and testing datasets.")
print(f"Memory usage after initial preprocessing: {get_memory_usage():.2f} MB")

# --- NEW: Convert multi-class target into binary labels: 'BENIGN' vs 'ATTACK' ---
def simplify_labels(y):
    """Map each label to 'BENIGN' (case-insensitive match) or otherwise 'ATTACK'."""
    def _to_binary(label):
        if label.upper() == 'BENIGN':
            return 'BENIGN'
        return 'ATTACK'

    return y.map(_to_binary)

y_train = simplify_labels(y_train)
y_test = simplify_labels(y_test)

print("\nLabels simplified to 'BENIGN' vs 'ATTACK'.")
print(f"Memory usage after simplifying labels: {get_memory_usage():.2f} MB")


# Encode the target variable if it's categorical (e.g., 'BENIGN', 'ATTACK_TYPE')
# Fit on training labels, transform both training and testing labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test) # Use transform, not fit_transform on test set

for i, class_name in enumerate(label_encoder.classes_):
    print(f"Encoded value {i} -> Class label '{class_name}'")

print(f"\nOriginal Training Labels: {y_train.unique()}")
print(f"Encoded Training Labels Classes: {label_encoder.classes_}")
# Show the distinct encoded values; printing the encoder itself only shows its repr.
print(f"Encoded Training Labels: {sorted(set(y_train_encoded.tolist()))}")
print(f"\nOriginal Testing Labels: {y_test.unique()}")
print(f"Encoded Testing Labels (based on training labels): {label_encoder.classes_}")

# --- Save the LabelEncoder ---
# Saving is best-effort: a failure is reported and the run continues.
try:
    encoder_filename = 'label_encoder.pkl'
    with open(encoder_filename, 'wb') as f:
        pickle.dump(label_encoder, f)
    print(f"\nLabel encoder saved to {encoder_filename}")
except Exception as e:
    print(f"Error saving label encoder: {e}")
print(f"Memory usage after encoding labels and saving encoder: {get_memory_usage():.2f} MB")


# Scale numerical features
# The scaler learns its statistics from the training data only and is then
# applied unchanged to both the training and the testing features.
scaler = StandardScaler()
if X_train.shape[0] == 0:
    # Nothing to fit on: pass the (empty) frames through untouched.
    print("\nWarning: Training data is empty after preprocessing. Cannot scale features or train models.")
    X_train_scaled = X_train
    X_test_scaled = X_test
    print(f"Memory usage with empty data (scaling skipped): {get_memory_usage():.2f} MB")
else:
    print(f"\nMemory usage before scaling features: {get_memory_usage():.2f} MB")
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Features scaled using StandardScaler (fitted on training data).")
    print(f"Memory usage after scaling features: {get_memory_usage():.2f} MB")

    # --- Save the StandardScaler ---
    # Best-effort persistence; a failure is reported and the run continues.
    try:
        scaler_filename = 'scaler.pkl'
        with open(scaler_filename, 'wb') as scaler_file:
            pickle.dump(scaler, scaler_file)
        print(f"Standard scaler saved to {scaler_filename}")
    except Exception as e:
        print(f"Error saving standard scaler: {e}")
    print(f"Memory usage after saving scaler: {get_memory_usage():.2f} MB")


# --- 3. & 4. Model Selection and Training ---

# Initialize different classifiers
models = {
    # "Logistic Regression": LogisticRegression(max_iter=1000), # Increased max_iter for convergence
    # "Decision Tree": DecisionTreeClassifier(random_state=42),
    # "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5), # Default n_neighbors is 5
    # "Gaussian Naive Bayes": GaussianNB(), # Added Naive Bayes
    # "SGD Classifier": SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42), # Added SGD, using log_loss for logistic regression-like behavior
    # "MLP Classifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42), # Added MLP
    # "Bagging Classifier (Decision Tree)": BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42) # Fixed: Changed base_estimator to estimator
    # "Support Vector Machine (Linear)": SVC(kernel='linear', random_state=42),
    # "Support Vector Machine (RBF)": SVC(kernel='rbf', random_state=42),

}

# Add Gradient Boosting models if installed (each class is None when its import failed)
if XGBClassifier is not None:
    # use_label_encoder is intentionally not passed: the installed XGBoost
    # reports it as an unused parameter and emits a warning on every fit.
    models["XGBoost"] = XGBClassifier(eval_metric='logloss', random_state=42) # Added XGBoost
if LGBMClassifier is not None:
    models["LightGBM"] = LGBMClassifier(random_state=42) # Added LightGBM


results = {}

print("\nTraining and evaluating models...")

# Only attempt training and evaluation if there is sufficient training and testing data
if X_train_scaled.shape[0] > 0 and X_test_scaled.shape[0] > 0 and len(label_encoder.classes_) > 0:
    for name, model in models.items():
        # Reset the timings for every model. Without this, a model that fails
        # would be reported with the timings left over from the previous model.
        # NaN is used as the "not measured" marker because it still formats with ':.4f'.
        training_time = float('nan')
        prediction_time = float('nan')

        print(f"\nMemory usage before training {name}: {get_memory_usage():.2f} MB")
        start_time = time.time()
        print(f"Training {name}...")
        try:
            model.fit(X_train_scaled, y_train_encoded)
            training_time = time.time() - start_time
            print(f"{name} training complete in {training_time:.4f} seconds.")
            print(f"Memory usage after training {name}: {get_memory_usage():.2f} MB")

            # --- Save the trained model using pickle ---
            # Best-effort: a save failure is reported but evaluation still runs.
            try:
                # Create a safe filename from the model name
                filename = name.replace(" ", "_").replace("(", "").replace(")", "").replace("-", "_") + ".pkl"
                with open(filename, 'wb') as f:
                    pickle.dump(model, f)
                print(f"Model '{name}' saved to {filename}")
            except Exception as e:
                print(f"Error saving model '{name}': {e}")
            print(f"Memory usage after saving model {name}: {get_memory_usage():.2f} MB")

            # --- 5. Evaluation ---
            print(f"\nMemory usage before predicting with {name}: {get_memory_usage():.2f} MB")
            start_time = time.time()
            y_pred_encoded = model.predict(X_test_scaled)
            prediction_time = time.time() - start_time
            print(f"{name} prediction complete in {prediction_time:.4f} seconds.")
            print(f"Memory usage after predicting with {name}: {get_memory_usage():.2f} MB")

            # Metrics are computed on the encoded labels; decoding is not needed for them.
            accuracy = accuracy_score(y_test_encoded, y_pred_encoded)

            try:
                # zero_division=0 reports 0 instead of raising when a metric is undefined.
                report = classification_report(y_test_encoded, y_pred_encoded, target_names=label_encoder.classes_, zero_division=0)
                conf_matrix = confusion_matrix(y_test_encoded, y_pred_encoded)

                # Weighted averages are computed explicitly for the summary line.
                precision = precision_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
                recall = recall_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
                f1 = f1_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)

                results[name] = {
                    "Accuracy": accuracy,
                    "Precision (Weighted)": precision,
                    "Recall (Weighted)": recall,
                    "F1-Score (Weighted)": f1,
                    "Classification Report": report,
                    "Confusion Matrix": conf_matrix,
                    "Training Time (s)": training_time,
                    "Prediction Time (s)": prediction_time
                }
                print(f"{name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
                print(f"Memory usage after evaluating {name}: {get_memory_usage():.2f} MB")

            except ValueError as e:
                # Fall back to accuracy only when the detailed metrics cannot be computed.
                print(f"Could not calculate all metrics for {name}. This might happen if test set has classes not in training set.")
                print(f"Error details: {e}")
                results[name] = {
                    "Accuracy": accuracy,
                    "Classification Report": "Could not generate comprehensive report due to data limitations or missing classes in test set.",
                    "Confusion Matrix": "Could not generate confusion matrix due to data limitations or missing classes in test set.",
                    "Training Time (s)": training_time,
                    "Prediction Time (s)": prediction_time
                }
                print(f"{name} - Accuracy: {accuracy:.4f}")
                print(f"Memory usage after partial evaluation of {name}: {get_memory_usage():.2f} MB")

        except Exception as e:
            # Record the failure and move on to the next model.
            print(f"An error occurred during training or evaluation of {name}: {e}")
            results[name] = {
                "Error": str(e),
                "Training Time (s)": training_time,
                "Prediction Time (s)": prediction_time
            }
            print(f"Memory usage after error during {name} processing: {get_memory_usage():.2f} MB")


else:
    print("\nSkipping model training and evaluation due to insufficient data or classes.")
    print(f"Memory usage at the end of the script (training skipped): {get_memory_usage():.2f} MB")


# --- 6. Comparison ---

def _format_metric(value):
    """Format a metric with 4 decimals; return placeholders such as 'N/A' as plain text.

    Applying ':.4f' to a string raises ValueError (and TypeError for None), which
    would abort the whole summary when a failed model has no measured timing.
    """
    try:
        return f"{value:.4f}"
    except (TypeError, ValueError):
        return str(value)


print("\n--- Model Comparison Results ---")

# Print results in a readable format
if results:
    for name, metrics in results.items():
        print(f"\n--- {name} ---")
        if "Error" in metrics:
            print(f"Error during processing: {metrics['Error']}")
            print(f"Training Time (s): {_format_metric(metrics.get('Training Time (s)', 'N/A'))}")
            print(f"Prediction Time (s): {_format_metric(metrics.get('Prediction Time (s)', 'N/A'))}")
        else:
            print(f"Accuracy: {_format_metric(metrics.get('Accuracy', 'N/A'))}")
            # The weighted metrics are absent when only accuracy could be computed.
            if 'Precision (Weighted)' in metrics:
                print(f"Precision (Weighted): {_format_metric(metrics['Precision (Weighted)'])}")
                print(f"Recall (Weighted): {_format_metric(metrics['Recall (Weighted)'])}")
                print(f"F1-Score (Weighted): {_format_metric(metrics['F1-Score (Weighted)'])}")
            print(f"Training Time (s): {_format_metric(metrics['Training Time (s)'])}")
            print(f"Prediction Time (s): {_format_metric(metrics['Prediction Time (s)'])}")
            print("\nClassification Report:")
            print(metrics['Classification Report'])
            print("\nConfusion Matrix:")
            print(metrics['Confusion Matrix'])
else:
    print("No results to display. Model training and evaluation were skipped.")

print(f"\nFinal memory usage before script finishes: {get_memory_usage():.2f} MB")

Initial memory usage: 239.12 MB

Memory usage before loading data: 323.76 MB
Training dataset loaded successfully from ./train_data.csv
Training dataset shape: (2558780, 85)

First 5 rows of the training dataset:
                                   FlowID       SourceIP  SourcePort  \
0  192.168.10.3-192.168.10.12-53-26526-17  192.168.10.12       26526   
1   172.16.0.1-192.168.10.50-37255-3737-6     172.16.0.1       37255   
2    192.168.10.16-72.21.91.29-53482-80-6  192.168.10.16       53482   
3    192.168.10.15-31.13.71.7-50902-443-6     31.13.71.7         443   
4   192.168.10.3-192.168.10.9-53-51576-17   192.168.10.9       51576   

   DestinationIP  DestinationPort  Protocol            Timestamp  \
0   192.168.10.3               53        17        6/7/2017 3:12   
1  192.168.10.50             3737         6        7/7/2017 2:52   
2    72.21.91.29               80         6       5/7/2017 10:02   
3  192.168.10.15            50902         6  03/07/2017 10:16:29   
4   192.168.10

Parameters: { "use_label_encoder" } are not used.



XGBoost training complete in 95.1934 seconds.
Memory usage after training XGBoost: 7927.92 MB
Model 'XGBoost' saved to XGBoost.pkl
Memory usage after saving model XGBoost: 7927.92 MB

Memory usage before predicting with XGBoost: 7927.92 MB
XGBoost prediction complete in 1.9096 seconds.
Memory usage after predicting with XGBoost: 7927.92 MB
XGBoost - Accuracy: 0.9999, Precision: 0.9999, Recall: 0.9999, F1-Score: 0.9999
Memory usage after evaluating XGBoost: 7927.92 MB

Memory usage before training LightGBM: 7927.92 MB
Training LightGBM...




[LightGBM] [Info] Number of positive: 1979150, number of negative: 579630
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.584952 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14858
[LightGBM] [Info] Number of data points in the train set: 2558780, number of used features: 72
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.773474 -> initscore=1.228033
[LightGBM] [Info] Start training from score 1.228033
LightGBM training complete in 120.7362 seconds.
Memory usage after training LightGBM: 7888.56 MB
Model 'LightGBM' saved to LightGBM.pkl
Memory usage after saving model LightGBM: 7888.56 MB

Memory usage before predicting with LightGBM: 7888.56 MB




LightGBM prediction complete in 4.1356 seconds.
Memory usage after predicting with LightGBM: 7889.56 MB
LightGBM - Accuracy: 0.9997, Precision: 0.9997, Recall: 0.9997, F1-Score: 0.9997
Memory usage after evaluating LightGBM: 7889.56 MB

--- Model Comparison Results ---

--- XGBoost ---
Accuracy: 0.9999
Precision (Weighted): 0.9999
Recall (Weighted): 0.9999
F1-Score (Weighted): 0.9999
Training Time (s): 95.1934
Prediction Time (s): 1.9096

Classification Report:
              precision    recall  f1-score   support

      ATTACK       1.00      1.00      1.00    145426
      BENIGN       1.00      1.00      1.00    494269

    accuracy                           1.00    639695
   macro avg       1.00      1.00      1.00    639695
weighted avg       1.00      1.00      1.00    639695


Confusion Matrix:
[[145401     25]
 [    49 494220]]

--- LightGBM ---
Accuracy: 0.9997
Precision (Weighted): 0.9997
Recall (Weighted): 0.9997
F1-Score (Weighted): 0.9997
Training Time (s): 120.7362
Predict

In [None]:
# Working
import pandas as pd
from sklearn.model_selection import train_test_split # Still useful for potential validation split later, but not for initial train/test load
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier # Added SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier # Added BaggingClassifier (as an example of bagging)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB # Added Gaussian Naive Bayes
from sklearn.neural_network import MLPClassifier # Added MLP Classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import time
import io # Keep io import in case needed for other things
import pickle # Import the pickle library
import psutil
import os


# Import Gradient Boosting Libraries (you might need to install these: pip install xgboost lightgbm)
try:
    import xgboost as xgb
    XGBClassifier = xgb.XGBClassifier
except ImportError:
    print("Warning: XGBoost not installed. Skipping XGBoost model.")
    XGBClassifier = None # Set to None if not installed

try:
    import lightgbm as lgb
    LGBMClassifier = lgb.LGBMClassifier
except ImportError:
    print("Warning: LightGBM not installed. Skipping LightGBM model.")
    LGBMClassifier = None # Set to None if not installed

# --- Configuration ---
# Point the two paths below at the real dataset files before running.
TRAIN_DATASET_PATH = './train_data.csv'  # training CSV file
TEST_DATASET_PATH = './test_data.csv'  # testing CSV file
TARGET_COLUMN = 'Label'  # name of the column holding the class label

# --- 1. Load Data from Separate CSV Files ---
# A missing file aborts the run by raising SystemExit. exit() is a helper that
# the site module provides for interactive use and may not halt a notebook cell
# right away, which would let the code below run with df_train/df_test undefined.
try:
    df_train = pd.read_csv(TRAIN_DATASET_PATH)
except FileNotFoundError:
    print(f"Error: Training dataset not found at {TRAIN_DATASET_PATH}")
    print("Please update TRAIN_DATASET_PATH with the correct path to your training CSV file.")
    raise SystemExit(1) from None
else:
    # Only reached when the read succeeded: summarise what was loaded.
    print(f"Training dataset loaded successfully from {TRAIN_DATASET_PATH}")
    print(f"Training dataset shape: {df_train.shape}")
    print("\nFirst 5 rows of the training dataset:")
    print(df_train.head())
    print("\nTraining Dataset Info:")
    df_train.info()

try:
    df_test = pd.read_csv(TEST_DATASET_PATH)
except FileNotFoundError:
    print(f"Error: Testing dataset not found at {TEST_DATASET_PATH}")
    print("Please update TEST_DATASET_PATH with the correct path to your testing CSV file.")
    raise SystemExit(1) from None
else:
    print(f"\nTesting dataset loaded successfully from {TEST_DATASET_PATH}")
    print(f"Testing dataset shape: {df_test.shape}")
    print("\nFirst 5 rows of the testing dataset:")
    print(df_test.head())
    print("\nTesting Dataset Info:")
    df_test.info()


# --- 2. Preprocessing ---

# Define columns to drop - apply to both train and test sets
columns_to_drop = ['FlowID', 'SourceIP', 'DestinationIP', 'Timestamp']

# Apply dropping columns and handling infinite/NaN values to both dataframes
def preprocess_dataframe(df, columns_to_drop, target_column):
    """Clean a raw dataframe and split it into features and string labels.

    Args:
        df: Input dataframe. It is not modified; every step works on a new frame.
        columns_to_drop: Column names to remove. Names missing from ``df`` are ignored.
        target_column: Name of the label column.

    Returns:
        A tuple ``(X, y)``: ``X`` holds every remaining column except
        ``target_column``; ``y`` is the target column converted to ``str``.

    Raises:
        KeyError: If ``target_column`` is not present after the drop step.
    """
    # Infinite values are replaced with float NaN rather than pd.NA: pd.NA cannot
    # be stored in a NumPy float column and can turn numeric features into object dtype.
    not_a_number = float('nan')
    cleaned = (
        df.drop(columns=columns_to_drop, errors='ignore')
        .replace([float('inf'), float('-inf')], not_a_number)
        # Missing values become 0 (this also applies to the target column).
        # Using 0 here, but consider other strategies.
        .fillna(0)
    )

    # Separate features (X) and target (y)
    X = cleaned.drop(columns=[target_column])
    # Ensure the target is treated as string for LabelEncoder
    y = cleaned[target_column].astype(str)

    return X, y

X_train, y_train = preprocess_dataframe(df_train, columns_to_drop, TARGET_COLUMN)
X_test, y_test = preprocess_dataframe(df_test, columns_to_drop, TARGET_COLUMN)

# --- NEW: Convert multi-class target into binary labels: 'BENIGN' vs 'ATTACK' ---
def simplify_labels(y):
    """Map each label to 'BENIGN' (case-insensitive match) or otherwise 'ATTACK'."""
    def _to_binary(label):
        if label.upper() == 'BENIGN':
            return 'BENIGN'
        return 'ATTACK'

    return y.map(_to_binary)

y_train = simplify_labels(y_train)
y_test = simplify_labels(y_test)

print("\nPreprocessing applied to both training and testing datasets.")

# Encode the target variable if it's categorical (e.g., 'BENIGN', 'ATTACK_TYPE')
# Fit on training labels, transform both training and testing labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test) # Use transform, not fit_transform on test set

for i, class_name in enumerate(label_encoder.classes_):
    print(f"Encoded value {i} -> Class label '{class_name}'")

print(f"\nOriginal Training Labels: {y_train.unique()}")
print(f"Encoded Training Labels Classes: {label_encoder.classes_}")
# Show the distinct encoded values; printing the encoder itself only shows its repr.
print(f"Encoded Training Labels: {sorted(set(y_train_encoded.tolist()))}")
print(f"\nOriginal Testing Labels: {y_test.unique()}")
print(f"Encoded Testing Labels (based on training labels): {label_encoder.classes_}")

# --- Save the LabelEncoder ---
# Saving is best-effort: a failure is reported and the run continues.
try:
    encoder_filename = 'label_encoder.pkl'
    with open(encoder_filename, 'wb') as f:
        pickle.dump(label_encoder, f)
    print(f"\nLabel encoder saved to {encoder_filename}")
except Exception as e:
    print(f"Error saving label encoder: {e}")


# Scale numerical features
# The scaler learns its statistics from the training data only and is then
# applied unchanged to both the training and the testing features.
scaler = StandardScaler()
if X_train.shape[0] == 0:
    # Nothing to fit on: pass the (empty) frames through untouched.
    print("\nWarning: Training data is empty after preprocessing. Cannot scale features or train models.")
    X_train_scaled = X_train
    X_test_scaled = X_test
else:
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("\nFeatures scaled using StandardScaler (fitted on training data).")

    # --- Save the StandardScaler ---
    # Best-effort persistence; a failure is reported and the run continues.
    try:
        scaler_filename = 'scaler.pkl'
        with open(scaler_filename, 'wb') as scaler_file:
            pickle.dump(scaler, scaler_file)
        print(f"Standard scaler saved to {scaler_filename}")
    except Exception as e:
        print(f"Error saving standard scaler: {e}")


# --- 3. & 4. Model Selection and Training ---

# Initialize different classifiers
models = {
    # "Logistic Regression": LogisticRegression(max_iter=1000), # Increased max_iter for convergence
    # "Decision Tree": DecisionTreeClassifier(random_state=42),
    # "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5), # Default n_neighbors is 5
    # "Gaussian Naive Bayes": GaussianNB(), # Added Naive Bayes
    # "SGD Classifier": SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42), # Added SGD, using log_loss for logistic regression-like behavior
    # "MLP Classifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42), # Added MLP
    # "Bagging Classifier (Decision Tree)": BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42) # Fixed: Changed base_estimator to estimator
    # "Support Vector Machine (Linear)": SVC(kernel='linear', random_state=42),
    # "Support Vector Machine (RBF)": SVC(kernel='rbf', random_state=42),

}

# Add Gradient Boosting models if installed (each class is None when its import failed)
if XGBClassifier is not None:
    # use_label_encoder is intentionally not passed: recent XGBoost releases
    # treat it as an unused parameter and emit a warning on every fit.
    models["XGBoost"] = XGBClassifier(eval_metric='logloss', random_state=42) # Added XGBoost
if LGBMClassifier is not None:
    models["LightGBM"] = LGBMClassifier(random_state=42) # Added LightGBM


results = {}

print("\nTraining and evaluating models...")

# Only attempt training and evaluation if there is sufficient training and testing data
if X_train_scaled.shape[0] > 0 and X_test_scaled.shape[0] > 0 and len(label_encoder.classes_) > 0:
    for name, model in models.items():
        # Reset the timings for every model. Without this, a model that fails
        # would be reported with the timings left over from the previous model.
        # NaN is used as the "not measured" marker because it still formats with ':.4f'.
        training_time = float('nan')
        prediction_time = float('nan')

        start_time = time.time()
        print(f"\nTraining {name}...")
        try:
            model.fit(X_train_scaled, y_train_encoded)
            training_time = time.time() - start_time
            print(f"{name} training complete in {training_time:.4f} seconds.")

            # --- Save the trained model using pickle ---
            # Best-effort: a save failure is reported but evaluation still runs.
            try:
                # Create a safe filename from the model name
                filename = name.replace(" ", "_").replace("(", "").replace(")", "").replace("-", "_") + ".pkl"
                with open(filename, 'wb') as f:
                    pickle.dump(model, f)
                print(f"Model '{name}' saved to {filename}")
            except Exception as e:
                print(f"Error saving model '{name}': {e}")

            # --- 5. Evaluation ---
            start_time = time.time()
            y_pred_encoded = model.predict(X_test_scaled)
            prediction_time = time.time() - start_time
            print(f"{name} prediction complete in {prediction_time:.4f} seconds.")

            # Metrics are computed on the encoded labels; decoding is not needed for them.
            accuracy = accuracy_score(y_test_encoded, y_pred_encoded)

            try:
                # zero_division=0 reports 0 instead of raising when a metric is undefined.
                report = classification_report(y_test_encoded, y_pred_encoded, target_names=label_encoder.classes_, zero_division=0)
                conf_matrix = confusion_matrix(y_test_encoded, y_pred_encoded)

                # Weighted averages are computed explicitly for the summary line.
                precision = precision_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
                recall = recall_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
                f1 = f1_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)

                results[name] = {
                    "Accuracy": accuracy,
                    "Precision (Weighted)": precision,
                    "Recall (Weighted)": recall,
                    "F1-Score (Weighted)": f1,
                    "Classification Report": report,
                    "Confusion Matrix": conf_matrix,
                    "Training Time (s)": training_time,
                    "Prediction Time (s)": prediction_time
                }
                print(f"{name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
            except ValueError as e:
                # Fall back to accuracy only when the detailed metrics cannot be computed.
                print(f"Could not calculate all metrics for {name}. This might happen if test set has classes not in training set.")
                print(f"Error details: {e}")
                results[name] = {
                    "Accuracy": accuracy,
                    "Classification Report": "Could not generate comprehensive report due to data limitations or missing classes in test set.",
                    "Confusion Matrix": "Could not generate confusion matrix due to data limitations or missing classes in test set.",
                    "Training Time (s)": training_time,
                    "Prediction Time (s)": prediction_time
                }
                print(f"{name} - Accuracy: {accuracy:.4f}")
        except Exception as e:
            # Record the failure and move on to the next model.
            print(f"An error occurred during training or evaluation of {name}: {e}")
            results[name] = {
                "Error": str(e),
                "Training Time (s)": training_time,
                "Prediction Time (s)": prediction_time
            }

else:
    print("\nSkipping model training and evaluation due to insufficient data or classes.")


# --- 6. Comparison ---

def _format_metric(value):
    """Format a metric with 4 decimals; return placeholders such as 'N/A' as plain text.

    Applying ':.4f' to a string raises ValueError (and TypeError for None), which
    would abort the whole summary when a failed model has no measured timing.
    """
    try:
        return f"{value:.4f}"
    except (TypeError, ValueError):
        return str(value)


print("\n--- Model Comparison Results ---")

# Print results in a readable format
if results:
    for name, metrics in results.items():
        print(f"\n--- {name} ---")
        if "Error" in metrics:
            print(f"Error during processing: {metrics['Error']}")
            print(f"Training Time (s): {_format_metric(metrics.get('Training Time (s)', 'N/A'))}")
            print(f"Prediction Time (s): {_format_metric(metrics.get('Prediction Time (s)', 'N/A'))}")
        else:
            print(f"Accuracy: {_format_metric(metrics.get('Accuracy', 'N/A'))}")
            # The weighted metrics are absent when only accuracy could be computed.
            if 'Precision (Weighted)' in metrics:
                print(f"Precision (Weighted): {_format_metric(metrics['Precision (Weighted)'])}")
                print(f"Recall (Weighted): {_format_metric(metrics['Recall (Weighted)'])}")
                print(f"F1-Score (Weighted): {_format_metric(metrics['F1-Score (Weighted)'])}")
            print(f"Training Time (s): {_format_metric(metrics['Training Time (s)'])}")
            print(f"Prediction Time (s): {_format_metric(metrics['Prediction Time (s)'])}")
            print("\nClassification Report:")
            print(metrics['Classification Report'])
            print("\nConfusion Matrix:")
            print(metrics['Confusion Matrix'])
else:
    print("No results to display. Model training and evaluation were skipped.")

Training dataset loaded successfully from ./train_data.csv
Training dataset shape: (2558780, 85)

First 5 rows of the training dataset:
                                   FlowID       SourceIP  SourcePort  \
0  192.168.10.3-192.168.10.12-53-26526-17  192.168.10.12       26526   
1   172.16.0.1-192.168.10.50-37255-3737-6     172.16.0.1       37255   
2    192.168.10.16-72.21.91.29-53482-80-6  192.168.10.16       53482   
3    192.168.10.15-31.13.71.7-50902-443-6     31.13.71.7         443   
4   192.168.10.3-192.168.10.9-53-51576-17   192.168.10.9       51576   

   DestinationIP  DestinationPort  Protocol            Timestamp  \
0   192.168.10.3               53        17        6/7/2017 3:12   
1  192.168.10.50             3737         6        7/7/2017 2:52   
2    72.21.91.29               80         6       5/7/2017 10:02   
3  192.168.10.15            50902         6  03/07/2017 10:16:29   
4   192.168.10.3               53        17        4/7/2017 4:59   

   FlowDuration  Total

Parameters: { "use_label_encoder" } are not used.



XGBoost training complete in 91.0388 seconds.
Model 'XGBoost' saved to XGBoost.pkl
XGBoost prediction complete in 1.8965 seconds.
XGBoost - Accuracy: 0.9999, Precision: 0.9999, Recall: 0.9999, F1-Score: 0.9999

Training LightGBM...




[LightGBM] [Info] Number of positive: 1979150, number of negative: 579630
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.246256 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14858
[LightGBM] [Info] Number of data points in the train set: 2558780, number of used features: 72
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.773474 -> initscore=1.228033
[LightGBM] [Info] Start training from score 1.228033
LightGBM training complete in 116.8101 seconds.
Model 'LightGBM' saved to LightGBM.pkl




LightGBM prediction complete in 4.5470 seconds.
LightGBM - Accuracy: 0.9997, Precision: 0.9997, Recall: 0.9997, F1-Score: 0.9997

--- Model Comparison Results ---

--- Logistic Regression ---
Accuracy: 0.9393
Precision (Weighted): 0.9389
Recall (Weighted): 0.9393
F1-Score (Weighted): 0.9391
Training Time (s): 229.1468
Prediction Time (s): 0.1681

Classification Report:
              precision    recall  f1-score   support

      ATTACK       0.88      0.85      0.86    145426
      BENIGN       0.96      0.96      0.96    494269

    accuracy                           0.94    639695
   macro avg       0.92      0.91      0.91    639695
weighted avg       0.94      0.94      0.94    639695


Confusion Matrix:
[[124146  21280]
 [ 17527 476742]]

--- Decision Tree ---
Accuracy: 0.9998
Precision (Weighted): 0.9998
Recall (Weighted): 0.9998
F1-Score (Weighted): 0.9998
Training Time (s): 320.6224
Prediction Time (s): 0.2574

Classification Report:
              precision    recall  f1-score 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split # Still useful for potential validation split later, but not for initial train/test load
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier # Added SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier # Added BaggingClassifier (as an example of bagging)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB # Added Gaussian Naive Bayes
from sklearn.neural_network import MLPClassifier # Added MLP Classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import time
import io # Keep io import in case needed for other things
import pickle # Import the pickle library

# Import Gradient Boosting Libraries (you might need to install these: pip install xgboost lightgbm)
try:
    import xgboost as xgb
    XGBClassifier = xgb.XGBClassifier
except ImportError:
    print("Warning: XGBoost not installed. Skipping XGBoost model.")
    XGBClassifier = None # Set to None if not installed

try:
    import lightgbm as lgb
    LGBMClassifier = lgb.LGBMClassifier
except ImportError:
    print("Warning: LightGBM not installed. Skipping LightGBM model.")
    LGBMClassifier = None # Set to None if not installed

# --- Configuration ---
# Replace with the actual paths to your dataset files
TRAIN_DATASET_PATH = './train_data.csv' # <--- Path to your training CSV file
TEST_DATASET_PATH = './test_data.csv'   # <--- Path to your testing CSV file
TARGET_COLUMN = 'Label' # The name of the target column

# --- 1. Load Data from Separate CSV Files ---
# Each CSV is read in full and summarised (shape, first rows, column info).
# A missing file aborts the run, because every later step needs both frames.
try:
    df_train = pd.read_csv(TRAIN_DATASET_PATH)
    print(f"Training dataset loaded successfully from {TRAIN_DATASET_PATH}")
    print(f"Training dataset shape: {df_train.shape}")
    print("\nFirst 5 rows of the training dataset:")
    print(df_train.head())
    print("\nTraining Dataset Info:")
    df_train.info()
except FileNotFoundError:
    print(f"Error: Training dataset not found at {TRAIN_DATASET_PATH}")
    print("Please update TRAIN_DATASET_PATH with the correct path to your training CSV file.")
    # exit() did not stop the notebook cell (execution carried on and hit a
    # NameError on df_train), so raise SystemExit explicitly instead.
    raise SystemExit(1)

try:
    df_test = pd.read_csv(TEST_DATASET_PATH)
    print(f"\nTesting dataset loaded successfully from {TEST_DATASET_PATH}")
    print(f"Testing dataset shape: {df_test.shape}")
    print("\nFirst 5 rows of the testing dataset:")
    print(df_test.head())
    print("\nTesting Dataset Info:")
    df_test.info()
except FileNotFoundError:
    print(f"Error: Testing dataset not found at {TEST_DATASET_PATH}")
    print("Please update TEST_DATASET_PATH with the correct path to your testing CSV file.")
    # Same reasoning as above: halt here rather than fail later on df_test.
    raise SystemExit(1)


# --- 2. Preprocessing ---

# Define columns to drop - apply to both train and test sets
columns_to_drop = ['FlowID', 'SourceIP', 'DestinationIP', 'Timestamp']

# Apply dropping columns and handling infinite/NaN values to both dataframes
def preprocess_dataframe(df, columns_to_drop, target_column):
    """Drop unwanted columns, neutralise inf/NaN values and split off the target.

    Args:
        df: Input DataFrame; it is left unmodified.
        columns_to_drop: Column names to remove. Names that are not present
            in `df` are ignored.
        target_column: Name of the label column; it must exist in `df`.

    Returns:
        A tuple `(X, y)` where `X` is the cleaned frame without the target
        column and `y` is the target column as a Series.

    Raises:
        KeyError: If `target_column` is not a column of `df`.
    """
    df_processed = df.drop(columns=columns_to_drop, errors='ignore')
    # Map +/-inf to float NaN rather than pd.NA: pd.NA can upcast float
    # columns to object dtype, which defeats downstream numeric handling.
    df_processed = df_processed.replace([float('inf'), float('-inf')], float('nan'))
    # Fill NaN values. Using 0 here, but consider other strategies.
    df_processed = df_processed.fillna(0)

    # Separate features (X) and target (y)
    X = df_processed.drop(columns=[target_column])
    y = df_processed[target_column]

    return X, y

# Clean both datasets and separate features from labels.
X_train, y_train = preprocess_dataframe(df_train, columns_to_drop, TARGET_COLUMN)
X_test, y_test = preprocess_dataframe(df_test, columns_to_drop, TARGET_COLUMN)

print("\nPreprocessing applied to both training and testing datasets.")

# Label encoding: the class mapping is learned from the training labels
# only, and the test labels are mapped with that same fitted encoder.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

print(f"\nOriginal Training Labels: {y_train.unique()}")
print(f"Encoded Training Labels: {label_encoder.classes_}")
print(f"\nOriginal Testing Labels: {y_test.unique()}")
print(f"Encoded Testing Labels (based on training labels): {label_encoder.classes_}")

# Persist the fitted encoder; a failure here is reported but not fatal.
try:
    encoder_filename = 'label_encoder.pkl'
    with open(encoder_filename, 'wb') as encoder_file:
        pickle.dump(label_encoder, encoder_file)
    print(f"\nLabel encoder saved to {encoder_filename}")
except Exception as save_error:
    print(f"Error saving label encoder: {save_error}")


# Feature scaling: statistics come from the training features only and are
# then applied to both sets. With no training rows, scaling is skipped and
# the unscaled frames are passed through.
scaler = StandardScaler()
if X_train.shape[0] == 0:
    print("\nWarning: Training data is empty after preprocessing. Cannot scale features or train models.")
    X_train_scaled, X_test_scaled = X_train, X_test
else:
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("\nFeatures scaled using StandardScaler (fitted on training data).")

    # Persist the fitted scaler; a failure here is reported but not fatal.
    try:
        scaler_filename = 'scaler.pkl'
        with open(scaler_filename, 'wb') as scaler_file:
            pickle.dump(scaler, scaler_file)
        print(f"Standard scaler saved to {scaler_filename}")
    except Exception as save_error:
        print(f"Error saving standard scaler: {save_error}")


# --- 3. & 4. Model Selection and Training ---

# Registry of classifiers to compare, keyed by the display name that is used
# in log output, in the results dict and to derive the pickle filename.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000), # Increased max_iter for convergence
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Support Vector Machine (Linear)": SVC(kernel='linear', random_state=42),
    "Support Vector Machine (RBF)": SVC(kernel='rbf', random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5), # Default n_neighbors is 5
    "Gaussian Naive Bayes": GaussianNB(), # Added Naive Bayes
    "SGD Classifier": SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42), # Added SGD, using log_loss for logistic regression-like behavior
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42), # Added MLP
    "Bagging Classifier (Decision Tree)": BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42) # Fixed: Changed base_estimator to estimator
}

# Add Gradient Boosting models if installed (the classes are None when the
# optional import failed).
if XGBClassifier is not None:
    # `use_label_encoder` is no longer passed: the installed XGBoost reports it
    # as an unused parameter, and the labels are already integer-encoded.
    models["XGBoost"] = XGBClassifier(eval_metric='logloss', random_state=42) # Added XGBoost
if LGBMClassifier is not None:
    models["LightGBM"] = LGBMClassifier(random_state=42) # Added LightGBM


# Maps each model name to a dict of its metrics, timings or error text.
results = {}


def _format_metric(value):
    """Format a metric to four decimals, or return 'N/A' if it is not numeric.

    Failed models record the string 'N/A' for timings they never reached;
    applying ':.4f' to that string directly raises ValueError.
    """
    try:
        return f"{float(value):.4f}"
    except (TypeError, ValueError):
        return 'N/A'


print("\nTraining and evaluating models...")

# Only attempt training and evaluation if there is sufficient training and testing data
if X_train_scaled.shape[0] > 0 and X_test_scaled.shape[0] > 0 and len(label_encoder.classes_) > 0:
    for name, model in models.items():
        # Reset the timings for every model so that a failure can never
        # report the values left over from the previous model.
        training_time = None
        prediction_time = None
        start_time = time.time()
        print(f"\nTraining {name}...")
        try:
            model.fit(X_train_scaled, y_train_encoded)
            end_time = time.time()
            training_time = end_time - start_time
            print(f"{name} training complete in {training_time:.4f} seconds.")

            # --- Save the trained model using pickle ---
            # Saving is best-effort: a failure is reported and evaluation continues.
            try:
                # Create a safe filename from the model name
                filename = name.replace(" ", "_").replace("(", "").replace(")", "").replace("-", "_") + ".pkl"
                with open(filename, 'wb') as f:
                    pickle.dump(model, f)
                print(f"Model '{name}' saved to {filename}")
            except Exception as e:
                print(f"Error saving model '{name}': {e}")


            # --- 5. Evaluation ---
            start_time = time.time()
            y_pred_encoded = model.predict(X_test_scaled)
            end_time = time.time()
            prediction_time = end_time - start_time
            print(f"{name} prediction complete in {prediction_time:.4f} seconds.")

            # Metrics are computed on the encoded labels; decoding is not needed.
            accuracy = accuracy_score(y_test_encoded, y_pred_encoded)

            try:
                # Use classification_report for a comprehensive view
                report = classification_report(y_test_encoded, y_pred_encoded, target_names=label_encoder.classes_, zero_division=0)
                conf_matrix = confusion_matrix(y_test_encoded, y_pred_encoded)

                # Weighted averages are computed explicitly rather than parsed
                # out of the report string.
                precision = precision_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
                recall = recall_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
                f1 = f1_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)


                results[name] = {
                    "Accuracy": accuracy,
                    "Precision (Weighted)": precision,
                    "Recall (Weighted)": recall,
                    "F1-Score (Weighted)": f1,
                    "Classification Report": report,
                    "Confusion Matrix": conf_matrix,
                    "Training Time (s)": training_time,
                    "Prediction Time (s)": prediction_time
                }
                print(f"{name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
            except ValueError as e:
                # Fall back to accuracy only when the detailed metrics cannot be built.
                print(f"Could not calculate all metrics for {name}. This might happen if test set has classes not in training set.")
                print(f"Error details: {e}")
                results[name] = {
                    "Accuracy": accuracy,
                    "Classification Report": "Could not generate comprehensive report due to data limitations or missing classes in test set.",
                    "Confusion Matrix": "Could not generate confusion matrix due to data limitations or missing classes in test set.",
                    "Training Time (s)": training_time,
                    "Prediction Time (s)": prediction_time
                }
                print(f"{name} - Accuracy: {accuracy:.4f}")
        except Exception as e:
            # One failing model must not abort the comparison; record the error
            # together with whichever timings were actually measured.
            print(f"An error occurred during training or evaluation of {name}: {e}")
            results[name] = {
                "Error": str(e),
                "Training Time (s)": 'N/A' if training_time is None else training_time,
                "Prediction Time (s)": 'N/A' if prediction_time is None else prediction_time
            }

else:
    print("\nSkipping model training and evaluation due to insufficient data or classes.")


# --- 6. Comparison ---

print("\n--- Model Comparison Results ---")

# Print results in a readable format
if results:
    for name, metrics in results.items():
        print(f"\n--- {name} ---")
        if "Error" in metrics:
            # Failed model: timings may be the string 'N/A', so format defensively.
            print(f"Error during processing: {metrics['Error']}")
            print(f"Training Time (s): {_format_metric(metrics.get('Training Time (s)'))}")
            print(f"Prediction Time (s): {_format_metric(metrics.get('Prediction Time (s)'))}")
        else:
            print(f"Accuracy: {_format_metric(metrics.get('Accuracy'))}")
            # Check if other metrics were calculated before printing
            if 'Precision (Weighted)' in metrics:
                print(f"Precision (Weighted): {metrics['Precision (Weighted)']:.4f}")
                print(f"Recall (Weighted): {metrics['Recall (Weighted)']:.4f}")
                print(f"F1-Score (Weighted): {metrics['F1-Score (Weighted)']:.4f}")
            print(f"Training Time (s): {metrics['Training Time (s)']:.4f}")
            print(f"Prediction Time (s): {metrics['Prediction Time (s)']:.4f}")
            print("\nClassification Report:")
            print(metrics['Classification Report'])
            print("\nConfusion Matrix:")
            print(metrics['Confusion Matrix'])
else:
    print("No results to display. Model training and evaluation were skipped.")

Error: Training dataset not found at ./train_data.csv
Please update TRAIN_DATASET_PATH with the correct path to your training CSV file.
Error: Testing dataset not found at ./test_data.csv
Please update TEST_DATASET_PATH with the correct path to your testing CSV file.


NameError: name 'df_train' is not defined

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split # Still useful for potential validation split later, but not for initial train/test load
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier # Added SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier # Added BaggingClassifier (as an example of bagging)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB # Added Gaussian Naive Bayes
from sklearn.neural_network import MLPClassifier # Added MLP Classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import time
import io # Keep io import in case needed for other things

# Import Gradient Boosting Libraries (you might need to install these: pip install xgboost lightgbm)
try:
    import xgboost as xgb
    XGBClassifier = xgb.XGBClassifier
except ImportError:
    print("Warning: XGBoost not installed. Skipping XGBoost model.")
    XGBClassifier = None # Set to None if not installed

try:
    import lightgbm as lgb
    LGBMClassifier = lgb.LGBMClassifier
except ImportError:
    print("Warning: LightGBM not installed. Skipping LightGBM model.")
    LGBMClassifier = None # Set to None if not installed

# --- Configuration ---
# Replace with the actual paths to your dataset files
TRAIN_DATASET_PATH = './Dataset/train_data.csv' # <--- Path to your training CSV file
TEST_DATASET_PATH = './Dataset/test_data.csv'   # <--- Path to your testing CSV file
TARGET_COLUMN = 'Label' # The name of the target column

# --- 1. Load Data from Separate CSV Files ---
# Each CSV is read in full and summarised (shape, first rows, column info).
# A missing file aborts the run, because every later step needs both frames.
try:
    df_train = pd.read_csv(TRAIN_DATASET_PATH)
    print(f"Training dataset loaded successfully from {TRAIN_DATASET_PATH}")
    print(f"Training dataset shape: {df_train.shape}")
    print("\nFirst 5 rows of the training dataset:")
    print(df_train.head())
    print("\nTraining Dataset Info:")
    df_train.info()
except FileNotFoundError:
    print(f"Error: Training dataset not found at {TRAIN_DATASET_PATH}")
    print("Please update TRAIN_DATASET_PATH with the correct path to your training CSV file.")
    # exit() does not reliably stop a notebook cell, which lets execution run
    # on to a NameError on df_train; raise SystemExit explicitly instead.
    raise SystemExit(1)

try:
    df_test = pd.read_csv(TEST_DATASET_PATH)
    print(f"\nTesting dataset loaded successfully from {TEST_DATASET_PATH}")
    print(f"Testing dataset shape: {df_test.shape}")
    print("\nFirst 5 rows of the testing dataset:")
    print(df_test.head())
    print("\nTesting Dataset Info:")
    df_test.info()
except FileNotFoundError:
    print(f"Error: Testing dataset not found at {TEST_DATASET_PATH}")
    print("Please update TEST_DATASET_PATH with the correct path to your testing CSV file.")
    # Same reasoning as above: halt here rather than fail later on df_test.
    raise SystemExit(1)


# --- 2. Preprocessing ---

# Define columns to drop - apply to both train and test sets
columns_to_drop = ['FlowID', 'SourceIP', 'DestinationIP', 'Timestamp']

# Apply dropping columns and handling infinite/NaN values to both dataframes
def preprocess_dataframe(df, columns_to_drop, target_column):
    """Drop unwanted columns, neutralise inf/NaN values and split off the target.

    Args:
        df: Input DataFrame; it is left unmodified.
        columns_to_drop: Column names to remove. Names that are not present
            in `df` are ignored.
        target_column: Name of the label column; it must exist in `df`.

    Returns:
        A tuple `(X, y)` where `X` is the cleaned frame without the target
        column and `y` is the target column as a Series.

    Raises:
        KeyError: If `target_column` is not a column of `df`.
    """
    df_processed = df.drop(columns=columns_to_drop, errors='ignore')
    # Map +/-inf to float NaN rather than pd.NA: pd.NA can upcast float
    # columns to object dtype, which defeats downstream numeric handling.
    df_processed = df_processed.replace([float('inf'), float('-inf')], float('nan'))
    # Fill NaN values. Using 0 here, but consider other strategies.
    df_processed = df_processed.fillna(0)

    # Separate features (X) and target (y)
    X = df_processed.drop(columns=[target_column])
    y = df_processed[target_column]

    return X, y

# Clean both datasets and separate features from labels.
X_train, y_train = preprocess_dataframe(df_train, columns_to_drop, TARGET_COLUMN)
X_test, y_test = preprocess_dataframe(df_test, columns_to_drop, TARGET_COLUMN)

print("\nPreprocessing applied to both training and testing datasets.")

# Label encoding: the class mapping is learned from the training labels
# only, and the test labels are mapped with that same fitted encoder.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

print(f"\nOriginal Training Labels: {y_train.unique()}")
print(f"Encoded Training Labels: {label_encoder.classes_}")
print(f"\nOriginal Testing Labels: {y_test.unique()}")
print(f"Encoded Testing Labels (based on training labels): {label_encoder.classes_}")


# Feature scaling: statistics come from the training features only and are
# then applied to both sets. With no training rows, scaling is skipped and
# the unscaled frames are passed through.
scaler = StandardScaler()
if X_train.shape[0] == 0:
    print("\nWarning: Training data is empty after preprocessing. Cannot scale features or train models.")
    X_train_scaled, X_test_scaled = X_train, X_test
else:
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("\nFeatures scaled using StandardScaler (fitted on training data).")


# --- 3. & 4. Model Selection and Training ---

# Registry of classifiers to compare, keyed by the display name that is used
# in log output and in the results dict.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000), # Increased max_iter for convergence
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Support Vector Machine (Linear)": SVC(kernel='linear', random_state=42),
    "Support Vector Machine (RBF)": SVC(kernel='rbf', random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5), # Default n_neighbors is 5
    "Gaussian Naive Bayes": GaussianNB(), # Added Naive Bayes
    "SGD Classifier": SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42), # Added SGD, using log_loss for logistic regression-like behavior
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42), # Added MLP
    "Bagging Classifier (Decision Tree)": BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42) # Fixed: Changed base_estimator to estimator
}

# Add Gradient Boosting models if installed (the classes are None when the
# optional import failed).
if XGBClassifier is not None:
    # `use_label_encoder` is no longer passed: recent XGBoost releases report
    # it as an unused parameter, and the labels are already integer-encoded.
    models["XGBoost"] = XGBClassifier(eval_metric='logloss', random_state=42) # Added XGBoost
if LGBMClassifier is not None:
    models["LightGBM"] = LGBMClassifier(random_state=42) # Added LightGBM


# Maps each model name to a dict of its metrics, timings or error text.
results = {}


def _format_metric(value):
    """Format a metric to four decimals, or return 'N/A' if it is not numeric.

    Failed models record the string 'N/A' for timings they never reached;
    applying ':.4f' to that string directly raises ValueError.
    """
    try:
        return f"{float(value):.4f}"
    except (TypeError, ValueError):
        return 'N/A'


print("\nTraining and evaluating models...")

# Only attempt training and evaluation if there is sufficient training and testing data
if X_train_scaled.shape[0] > 0 and X_test_scaled.shape[0] > 0 and len(label_encoder.classes_) > 0:
    for name, model in models.items():
        # Reset the timings for every model so that a failure can never
        # report the values left over from the previous model.
        training_time = None
        prediction_time = None
        start_time = time.time()
        print(f"\nTraining {name}...")
        try:
            model.fit(X_train_scaled, y_train_encoded)
            end_time = time.time()
            training_time = end_time - start_time
            print(f"{name} training complete in {training_time:.4f} seconds.")

            # --- 5. Evaluation ---
            start_time = time.time()
            y_pred_encoded = model.predict(X_test_scaled)
            end_time = time.time()
            prediction_time = end_time - start_time
            print(f"{name} prediction complete in {prediction_time:.4f} seconds.")

            # Metrics are computed on the encoded labels; decoding is not needed.
            accuracy = accuracy_score(y_test_encoded, y_pred_encoded)

            try:
                # Use classification_report for a comprehensive view
                report = classification_report(y_test_encoded, y_pred_encoded, target_names=label_encoder.classes_, zero_division=0)
                conf_matrix = confusion_matrix(y_test_encoded, y_pred_encoded)

                # Weighted averages are computed explicitly rather than parsed
                # out of the report string.
                precision = precision_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
                recall = recall_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
                f1 = f1_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)


                results[name] = {
                    "Accuracy": accuracy,
                    "Precision (Weighted)": precision,
                    "Recall (Weighted)": recall,
                    "F1-Score (Weighted)": f1,
                    "Classification Report": report,
                    "Confusion Matrix": conf_matrix,
                    "Training Time (s)": training_time,
                    "Prediction Time (s)": prediction_time
                }
                print(f"{name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
            except ValueError as e:
                # Fall back to accuracy only when the detailed metrics cannot be built.
                print(f"Could not calculate all metrics for {name}. This might happen if test set has classes not in training set.")
                print(f"Error details: {e}")
                results[name] = {
                    "Accuracy": accuracy,
                    "Classification Report": "Could not generate comprehensive report due to data limitations or missing classes in test set.",
                    "Confusion Matrix": "Could not generate confusion matrix due to data limitations or missing classes in test set.",
                    "Training Time (s)": training_time,
                    "Prediction Time (s)": prediction_time
                }
                print(f"{name} - Accuracy: {accuracy:.4f}")
        except Exception as e:
            # One failing model must not abort the comparison; record the error
            # together with whichever timings were actually measured.
            print(f"An error occurred during training or evaluation of {name}: {e}")
            results[name] = {
                "Error": str(e),
                "Training Time (s)": 'N/A' if training_time is None else training_time,
                "Prediction Time (s)": 'N/A' if prediction_time is None else prediction_time
            }

else:
    print("\nSkipping model training and evaluation due to insufficient data or classes.")


# --- 6. Comparison ---

print("\n--- Model Comparison Results ---")

# Print results in a readable format
if results:
    for name, metrics in results.items():
        print(f"\n--- {name} ---")
        if "Error" in metrics:
            # Failed model: timings may be the string 'N/A', so format defensively.
            print(f"Error during processing: {metrics['Error']}")
            print(f"Training Time (s): {_format_metric(metrics.get('Training Time (s)'))}")
            print(f"Prediction Time (s): {_format_metric(metrics.get('Prediction Time (s)'))}")
        else:
            print(f"Accuracy: {_format_metric(metrics.get('Accuracy'))}")
            # Check if other metrics were calculated before printing
            if 'Precision (Weighted)' in metrics:
                print(f"Precision (Weighted): {metrics['Precision (Weighted)']:.4f}")
                print(f"Recall (Weighted): {metrics['Recall (Weighted)']:.4f}")
                print(f"F1-Score (Weighted): {metrics['F1-Score (Weighted)']:.4f}")
            print(f"Training Time (s): {metrics['Training Time (s)']:.4f}")
            print(f"Prediction Time (s): {metrics['Prediction Time (s)']:.4f}")
            print("\nClassification Report:")
            print(metrics['Classification Report'])
            print("\nConfusion Matrix:")
            print(metrics['Confusion Matrix'])
else:
    print("No results to display. Model training and evaluation were skipped.")


ModuleNotFoundError: No module named 'pandas'