## Step 0: Import Required Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

2025-04-08 15:23:07.224918: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-08 15:23:07.231351: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-08 15:23:07.281364: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-08 15:23:07.324485: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744105987.365630    4831 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744105987.37

In [3]:
import glob
import os

# Set the directory containing your CSV files
data_dir = "datasets/MachineLearningCVE"  # change to your directory path

# Use glob to create a list of CSV file paths
csv_files = glob.glob(os.path.join(data_dir, "*.csv"))

# Initialize an empty list to store individual DataFrames
df_list = []

# Loop through the files, read each one, and append to the list
for file in csv_files:
    df = pd.read_csv(file)
    # Optionally: add a column to indicate the source file or attack type if needed.
    df_list.append(df)

# Concatenate all the DataFrames into a single DataFrame
combined_df = pd.concat(df_list, ignore_index=True)
print("Combined dataset shape:", combined_df)
print(combined_df[' Label'].unique())

Combined dataset shape:           Destination Port   Flow Duration   Total Fwd Packets  \
0                       88             640                   7   
1                       88             900                   9   
2                       88            1205                   7   
3                       88             511                   7   
4                       88             773                   9   
...                    ...             ...                 ...   
2830738                 80          590930                   2   
2830739                 80         1187988                   2   
2830740                 80              10                   1   
2830741                138              19                  10   
2830742                 80         4751966                   2   

          Total Backward Packets  Total Length of Fwd Packets  \
0                              4                          440   
1                              4                     

## Step 1: Data Loading and Initial Preprocessing

In [4]:
# Replace the file name with your actual dataset path.
df = combined_df

# Drop irrelevant columns (adjust as needed)
df.drop(['Flow ID', 'Source IP', 'Destination IP', 'Timestamp'], axis=1, inplace=True, errors='ignore')

# Handle missing values by replacing them with median values (for numeric columns)
# df.drop(' Label',axis=1).fillna(df.drop(' Label',axis=1).median(), inplace=True)

# Compute the median for numeric columns only
numeric_medians = df.select_dtypes(include=[np.number]).median()

# Fill missing values for numeric columns using the computed medians
df.fillna(numeric_medians, inplace=True)

# Encode labels to numeric values (Assuming 'Label' is the column name)
label_encoder = LabelEncoder()
print(df[' Label'].unique())
df[' Label'] = label_encoder.fit_transform(df[' Label'])

# Replace infinite values with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Remove rows with NaN values if necessary
df.dropna(inplace=True)

# Separate features and labels
X = df.drop(' Label', axis=1)
y = df[' Label']

print(df[' Label'].unique())

['BENIGN' 'FTP-Patator' 'SSH-Patator' 'Bot' 'PortScan'
 'Web Attack � Brute Force' 'Web Attack � XSS'
 'Web Attack � Sql Injection' 'DoS slowloris' 'DoS Slowhttptest'
 'DoS Hulk' 'DoS GoldenEye' 'Heartbleed' 'DDoS' 'Infiltration']
[ 0  7 11  1 10 12 14 13  6  5  4  3  8  2  9]


## Step 2: Advanced Feature Extraction

In [5]:
# 2.1: Statistical Feature Selection using Mutual Information
# Select top 20 features based on Mutual Information
selector_mi = SelectKBest(score_func=mutual_info_classif, k=20)
X_mi = selector_mi.fit_transform(X, y)
selected_features_mi = X.columns[selector_mi.get_support(indices=True)]
print("Features selected by Mutual Information:", list(selected_features_mi))

# 2.2: Dimensionality Reduction using PCA
# First, scale the MI-selected features
scaler_pca = StandardScaler()
X_mi_scaled = scaler_pca.fit_transform(X_mi)

# Apply PCA to reduce dimensions (e.g., to 10 components)
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_mi_scaled)
print("Explained variance ratio by PCA components:\n", pca.explained_variance_ratio_)

# We'll use PCA-reduced features as our final feature set
X_final = X_pca


Features selected by Mutual Information: [' Destination Port', ' Flow Duration', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', 'Bwd Packet Length Max', ' Bwd Packet Length Mean', 'Flow Bytes/s', ' Flow IAT Max', ' Fwd IAT Max', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' Average Packet Size', ' Avg Bwd Segment Size', ' Subflow Fwd Bytes', ' Subflow Bwd Bytes', 'Init_Win_bytes_forward', ' Init_Win_bytes_backward']
Explained variance ratio by PCA components:
 [0.42936758 0.1363219  0.0849158  0.07294257 0.06018655 0.05523184
 0.0509684  0.04493896 0.03229192 0.01347312]


## Step 3: Final Data Scaling, Balancing, and Splitting

In [6]:
# Further standardize the final features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_final)

# Balance the dataset using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
print("Preprocessing complete. Training samples:", X_train.shape[0], "Test samples:", X_test.shape[0])

Preprocessing complete. Training samples: 27255840 Test samples: 6813960


## Step 4: ML Classifier Training (XGBoost)

In [6]:
# Train an XGBoost classifier for fast initial detection
# clf_xgb = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)
clf_xgb = xgb.XGBClassifier(
    eval_metric='logloss',  # Helps in classification
    use_label_encoder=False,
    max_depth=4,  # Reduce model complexity
    min_child_weight=3,  # Prevent overfitting
    gamma=0.2,  # Add tree complexity penalty
    subsample=0.8,  # Randomly use 80% of data per tree
    colsample_bytree=0.8,  # Use only 80% of features per tree
    lambda_=1,  # L2 regularization
    alpha=0.5,  # L1 regularization
    learning_rate=0.05,  # Reduce step size for smoother convergence
    n_estimators=200  # Reduce number of trees (test different values)
)
clf_xgb.fit(X_train, y_train)

# Evaluate on the test set
y_pred_xgb = clf_xgb.predict(X_test)
print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

Parameters: { "lambda_", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.91      0.94    454511
           1       0.98      1.00      0.99    453189
           2       0.99      0.99      0.99    453177
           3       0.96      0.98      0.97    454702
           4       0.99      0.98      0.99    454314
           5       0.99      0.99      0.99    454606
           6       0.99      0.98      0.98    453653
           7       1.00      1.00      1.00    454136
           8       1.00      1.00      1.00    455097
           9       0.99      1.00      0.99    453657
          10       0.96      0.98      0.97    453831
          11       1.00      1.00      1.00    454814
          12       0.78      0.51      0.62    454619
          13       0.98      1.00      0.99    454505
          14       0.63      0.85      0.73    455149

    accuracy                           0.94   6813960
   macro avg       0.95      0.94      0.94   68

In [8]:
clf_xgb.save_model('second-ml-model.model')

  self.get_booster().save_model(fname)


In [7]:
# ---------------------------- #
# Step 5: Identify Uncertain Predictions
# ---------------------------- #
# Get the prediction probabilities for the positive class (assuming binary classification)
y_probs = clf_xgb.predict_proba(X_test)[:, 1]

# Define uncertain predictions as those with probability in a narrow band (e.g., 0.45 to 0.55)
uncertain_idx = np.where((y_probs > 0.45) & (y_probs < 0.55))[0]
print("Number of uncertain predictions:", len(uncertain_idx))

# Extract uncertain samples and corresponding true labels
X_uncertain = X_test[uncertain_idx]
y_uncertain = y_test.iloc[uncertain_idx]


Number of uncertain predictions: 1733


# Step 5: Training the DL model on CICIDS Dataset

In [7]:
from sklearn.preprocessing import MinMaxScaler

# scaler_pca = MinMaxScaler(feature_range=(0, 1))  # Scale between 0 and 1
# X_mi_scaled = scaler_pca.fit_transform(X_mi)

# # Reshape for LSTM input (samples, time_steps, features)
# X_mi_scaled = X_mi_scaled.reshape(X_mi_scaled.shape[0], 1, X_mi_scaled.shape[1])

# Normalize features (LSTM benefits from scaling)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
num_features = X_train.shape[1]  # Number of features in input data

X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

num_classes = 15

# history = lstm_model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_val, y_val))


In [9]:
# ---------------------------- #
# Step 6: DL Model Training (LSTM) on Uncertain Samples
# ---------------------------- #
# Note: LSTMs typically require sequential data. For demonstration, we reshape each sample as a sequence of one timestep.
# In real scenarios, you might form sequences from packet flows or time windows.

# Reshape uncertain samples: shape becomes [samples, timesteps=1, features]
# X_uncertain_reshaped = np.expand_dims(X_uncertain, axis=1)

# Build a simple LSTM model
# model_lstm = Sequential([
#     LSTM(64, input_shape=(1, X_train.shape[1]), return_sequences=False),
#     Dropout(0.2),
#     Dense(1, activation='sigmoid')
# ])

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
# Define LSTM model
lstm_model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(1, num_features)),  
    Dropout(0.3),
    BatchNormalization(),

    LSTM(128, return_sequences=False),  
    Dropout(0.3),
    BatchNormalization(),

    Dense(64, activation='relu'),  
    Dropout(0.3),

    Dense(num_classes, activation='softmax')  
])

# Compile with categorical cross-entropy
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# model_lstm.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

2025-04-03 02:26:40.166521: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
  super().__init__(**kwargs)


In [10]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint("best_lstm_model.h5", save_best_only=True)

history = lstm_model.fit(X_train_reshaped, y_train, epochs=10, batch_size=64, 
                         validation_data=(X_test_reshaped, y_test), 
                         callbacks=[early_stop, model_checkpoint])

predictions = lstm_model.predict(X_uncertain_reshaped)

Epoch 1/10
[1m425871/425873[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.6895 - loss: 0.8260



[1m425873/425873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1686s[0m 4ms/step - accuracy: 0.6895 - loss: 0.8260 - val_accuracy: 0.8274 - val_loss: 0.4152
Epoch 2/10
[1m425873/425873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1704s[0m 4ms/step - accuracy: 0.7870 - loss: 0.5349 - val_accuracy: 0.8182 - val_loss: 0.4281
Epoch 3/10
[1m425873/425873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1699s[0m 4ms/step - accuracy: 0.8027 - loss: 0.4902 - val_accuracy: 0.8180 - val_loss: 0.4646
Epoch 4/10
[1m425866/425873[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.8112 - loss: 0.4663



[1m425873/425873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1700s[0m 4ms/step - accuracy: 0.8112 - loss: 0.4663 - val_accuracy: 0.8517 - val_loss: 0.3550
Epoch 5/10
[1m425873/425873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1688s[0m 4ms/step - accuracy: 0.8168 - loss: 0.4504 - val_accuracy: 0.8355 - val_loss: 0.3975
Epoch 6/10
[1m425873/425873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1685s[0m 4ms/step - accuracy: 0.8213 - loss: 0.4382 - val_accuracy: 0.8370 - val_loss: 0.3852
Epoch 7/10
[1m425873/425873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1696s[0m 4ms/step - accuracy: 0.8247 - loss: 0.4281 - val_accuracy: 0.8289 - val_loss: 0.3675
Epoch 8/10
[1m425871/425873[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.8276 - loss: 0.4196



[1m425873/425873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2138s[0m 5ms/step - accuracy: 0.8276 - loss: 0.4196 - val_accuracy: 0.8493 - val_loss: 0.3430
Epoch 9/10
[1m425873/425873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1642s[0m 4ms/step - accuracy: 0.8301 - loss: 0.4125 - val_accuracy: 0.8532 - val_loss: 0.3480
Epoch 10/10
[1m425873/425873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1647s[0m 4ms/step - accuracy: 0.8323 - loss: 0.4064 - val_accuracy: 0.8525 - val_loss: 0.3503


NameError: name 'X_uncertain_reshaped' is not defined

# Train the LSTM model on the uncertain samples

In [12]:

# (In a production system, you would train the DL model on a dedicated set; here we train on uncertain samples for refinement)
if len(X_uncertain_reshaped) > 0:
    history = lstm_model.fit(X_uncertain_reshaped, y_uncertain, epochs=100, batch_size=32, verbose=1)
    
    # Evaluate the LSTM model on the uncertain set
    y_pred_lstm_prob = lstm_model.predict(X_uncertain_reshaped)
    y_pred_lstm = (y_pred_lstm_prob > 0.5).astype(int).flatten()
    print("\nLSTM Classification Report on Uncertain Samples:")
    print(classification_report(y_uncertain, y_pred_lstm))
else:
    print("No uncertain samples found; consider adjusting the uncertainty threshold.")

Epoch 1/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4314 - loss: -3.7943
Epoch 2/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4404 - loss: -4.7083
Epoch 3/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4197 - loss: -4.6447
Epoch 4/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4273 - loss: -4.3579
Epoch 5/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4225 - loss: -6.3256
Epoch 6/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4205 - loss: -7.9194
Epoch 7/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4021 - loss: -9.2234 
Epoch 8/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4306 - loss: -9.1504
Epoch 9/100
[1m55/55[0m [32m━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
