In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import class_weight
from sklearn.preprocessing import MinMaxScaler 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

try:
    df = pd.read_csv('training_set.csv')
    print("Training dataset loaded successfully.")
    print(f"Dataset shape: {df.shape}")
except FileNotFoundError:
    print("Error: 'training_set.csv' not found. Please place it in the correct directory.")
    exit()

Error: 'training_set.csv' not found. Please place it in the correct directory.


: 

In [None]:
print("\nMissing values per column:")
print(df.isnull().sum())

for col in df.select_dtypes(include=np.number).columns:
    df[col].fillna(df[col].mean(), inplace=True)

X = df.drop(' Label', axis=1)
y = df[' Label']


Missing values per column:
 Destination Port              0
 Flow Duration                 0
 Total Fwd Packets             0
 Total Backward Packets        0
Total Length of Fwd Packets    0
                              ..
Idle Mean                      0
 Idle Std                      0
 Idle Max                      0
 Idle Min                      0
 Label                         0
Length: 79, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [None]:
X.replace([np.inf, -np.inf], np.nan, inplace=True)
for col in X.columns:
    if X[col].isnull().any():
        X[col].fillna(X[col].median(), inplace=True)

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

print("Features have been normalized using Min-Max scaling (range 0 to 1).")
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].median(), inplace=True)


✅ Features have been normalized using Min-Max scaling (range 0 to 1).


In [None]:
y = y.astype(int)
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = dict(enumerate(weights))

print(f"\nCalculated class weights: {class_weights}")
print("This will penalize errors on the minority class more heavily.")

X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y,
    test_size=0.2, 
    random_state=42,
    stratify=y 
)
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy', 
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)
model.summary()

early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience=5,          
    verbose=1,           
    restore_best_weights=True 
)
model_checkpoint = ModelCheckpoint(
    filepath='best_model.h5',
    monitor='val_loss',      
    save_best_only=True,    
    verbose=1               
)
print("\n--- Starting Model Training with Early Stopping and Checkpointing ---")

history = model.fit(
    X_train, y_train,
    epochs=5, 
    batch_size=32,
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[early_stopping, model_checkpoint],
    verbose=1
)
print("--- Model Training Finished ---")


Calculated class weights: {0: np.float64(0.6226620947630923), 1: np.float64(2.5381194409148664)}
This will penalize errors on the minority class more heavily.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



--- Starting Model Training with Early Stopping and Checkpointing ---
Epoch 1/5
[1m49520/49538[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.9302 - loss: 0.1543 - precision_2: 0.7634 - recall_2: 0.9534
Epoch 1: val_loss improved from inf to 0.07100, saving model to best_model.h5




[1m49538/49538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 4ms/step - accuracy: 0.9302 - loss: 0.1543 - precision_2: 0.7634 - recall_2: 0.9534 - val_accuracy: 0.9724 - val_loss: 0.0710 - val_precision_2: 0.8909 - val_recall_2: 0.9801
Epoch 2/5
[1m49534/49538[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.9646 - loss: 0.0893 - precision_2: 0.8635 - recall_2: 0.9743
Epoch 2: val_loss did not improve from 0.07100
[1m49538/49538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 4ms/step - accuracy: 0.9646 - loss: 0.0893 - precision_2: 0.8635 - recall_2: 0.9743 - val_accuracy: 0.9683 - val_loss: 0.0776 - val_precision_2: 0.8687 - val_recall_2: 0.9884
Epoch 3/5
[1m49522/49538[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.9683 - loss: 0.0808 - precision_2: 0.8744 - recall_2: 0.9789
Epoch 3: va



[1m49538/49538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 4ms/step - accuracy: 0.9683 - loss: 0.0808 - precision_2: 0.8744 - recall_2: 0.9789 - val_accuracy: 0.9770 - val_loss: 0.0682 - val_precision_2: 0.9036 - val_recall_2: 0.9886
Epoch 4/5
[1m49522/49538[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.9703 - loss: 0.0767 - precision_2: 0.8816 - recall_2: 0.9807
Epoch 4: val_loss did not improve from 0.06818
[1m49538/49538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 4ms/step - accuracy: 0.9703 - loss: 0.0767 - precision_2: 0.8816 - recall_2: 0.9807 - val_accuracy: 0.9683 - val_loss: 0.0751 - val_precision_2: 0.8694 - val_recall_2: 0.9875
Epoch 5/5
[1m49530/49538[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.9710 - loss: 0.0738 - precision_2: 0.8839 - recall_2: 0.9822
Epoch 5: va



[1m49538/49538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 4ms/step - accuracy: 0.9710 - loss: 0.0738 - precision_2: 0.8840 - recall_2: 0.9822 - val_accuracy: 0.9707 - val_loss: 0.0651 - val_precision_2: 0.8795 - val_recall_2: 0.9866
Restoring model weights from the end of the best epoch: 5.
--- Model Training Finished ---


In [None]:
y_pred_proba = model.predict(X_val)
y_pred = (y_pred_proba > 0.5).astype(int)

print("\n--- Model Evaluation on Validation Set ---")
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

[1m12385/12385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step

--- Model Evaluation on Validation Set ---
Confusion Matrix:
[[307679  10555]
 [  1044  77026]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98    318234
           1       0.88      0.99      0.93     78070

    accuracy                           0.97    396304
   macro avg       0.94      0.98      0.96    396304
weighted avg       0.97      0.97      0.97    396304



In [None]:
try:
    test_df = pd.read_csv('test_set.csv')
    print("Hold-out test set loaded successfully.")
except FileNotFoundError:
    print("Error: 'testing_set.csv' not found. Please check the file name.")
    exit()

X_test = test_df.drop(' Label', axis=1)
y_test = test_df[' Label']

print("--- Applying preprocessing to the test set ---")
train_cols = X.columns
X_test = X_test.reindex(columns=train_cols, fill_value=0)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.fillna(X.median(), inplace=True)
X_test_scaled = scaler.transform(X_test)

print("Test set preprocessed identically.")
print("\n--- Generating Final Performance Report ---")

y_test_pred_proba = model.predict(X_test_scaled)
y_test_pred = (y_test_pred_proba > 0.5).astype(int)
print("\nFINAL Classification Report on Unseen Test Data:")
print(classification_report(y_test, y_test_pred))
print("\nFINAL Confusion Matrix on Unseen Test Data:")
print(confusion_matrix(y_test, y_test_pred))

✅ Hold-out test set loaded successfully.
--- Applying preprocessing to the test set ---
✅ Test set preprocessed identically.

--- Generating Final Performance Report ---
[1m26539/26539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 1ms/step

FINAL Classification Report on Unseen Test Data:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98    681929
           1       0.88      0.99      0.93    167294

    accuracy                           0.97    849223
   macro avg       0.94      0.98      0.96    849223
weighted avg       0.97      0.97      0.97    849223


FINAL Confusion Matrix on Unseen Test Data:
[[659897  22032]
 [  2287 165007]]


### SECOND MODEL

In [None]:
try:
    df = pd.read_csv('secondary_classification_dataset.csv')
    print("Dataset loaded successfully.")
    print(f"Dataset shape: {df.shape}")
except FileNotFoundError:
    print("Error: 'your_multiclass_dataset.csv' not found. Please check the file name.")
    exit()

✅ Dataset loaded successfully.
Dataset shape: (557646, 79)


In [7]:
df[' Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
DoS Hulk,231073
PortScan,158930
DDoS,128027
DoS GoldenEye,10293
FTP-Patator,7938
SSH-Patator,5897
DoS slowloris,5796
DoS Slowhttptest,5499
Bot,1966
Web Attack � Brute Force,1507


In [None]:
print("--- Original Class Distribution ---")
print(df[' Label'].value_counts())
web_attack_map = {
    'Web Attack � Brute Force': 'Web_Attack',
    'Web Attack � XSS': 'Web_Attack',
    'Web Attack � Sql Injection': 'Web_Attack'
}

rare_attack_map = {
    'Infiltration': 'Rare_Attack',
    'Heartbleed': 'Rare_Attack'
}

df[' Label'] = df[' Label'].replace(web_attack_map)
df[' Label'] = df[' Label'].replace(rare_attack_map)

print("\n--- New Class Distribution After Grouping ---")
print(df[' Label'].value_counts())

--- Original Class Distribution ---
 Label
DoS Hulk                      231073
PortScan                      158930
DDoS                          128027
DoS GoldenEye                  10293
FTP-Patator                     7938
SSH-Patator                     5897
DoS slowloris                   5796
DoS Slowhttptest                5499
Bot                             1966
Web Attack � Brute Force        1507
Web Attack � XSS                 652
Rare_Attack                       47
Web Attack � Sql Injection        21
Name: count, dtype: int64

--- New Class Distribution After Grouping ---
 Label
DoS Hulk            231073
PortScan            158930
DDoS                128027
DoS GoldenEye        10293
FTP-Patator           7938
SSH-Patator           5897
DoS slowloris         5796
DoS Slowhttptest      5499
Web_Attack            2180
Bot                   1966
Rare_Attack             47
Name: count, dtype: int64


In [None]:
print("--- Distribution Before Final Grouping ---")
print(df[' Label'].value_counts())

dos_map = {
    'DoS Hulk': 'DoS_Attack',
    'DDoS': 'DoS_Attack',
    'DoS GoldenEye': 'DoS_Attack',
    'DoS slowloris': 'DoS_Attack',
    'DoS Slowhttptest': 'DoS_Attack'
}

brute_force_map = {
    'FTP-Patator': 'Brute_Force',
    'SSH-Patator': 'Brute_Force'
}

other_map = {
    'Bot': 'Other_Attack',
    'Rare_Attack': 'Other_Attack'
}

df[' Label'] = df[' Label'].replace(dos_map)
df[' Label'] = df[' Label'].replace(brute_force_map)
df[' Label'] = df[' Label'].replace(other_map)

print("\n--- Final, Improved Class Distribution ---")
print(df[' Label'].value_counts())

--- Distribution Before Final Grouping ---
 Label
DoS Hulk            231073
PortScan            158930
DDoS                128027
DoS GoldenEye        10293
FTP-Patator           7938
SSH-Patator           5897
DoS slowloris         5796
DoS Slowhttptest      5499
Web_Attack            2180
Bot                   1966
Rare_Attack             47
Name: count, dtype: int64

--- Final, Improved Class Distribution ---
 Label
DoS_Attack      380688
PortScan        158930
Brute_Force      13835
Web_Attack        2180
Other_Attack      2013
Name: count, dtype: int64


In [None]:
for col in df.select_dtypes(include=np.number).columns:
    df[col].fillna(df[col].median(), inplace=True)
print("Missing values handled.")

X = df.drop(' Label', axis=1)
y = df[' Label']
y, class_names = pd.factorize(y)
num_classes = len(class_names)
print(f"Found {num_classes} unique classes.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


✅ Missing values handled.
Found 5 unique classes.


In [None]:
y, class_names = pd.factorize(y)
num_classes = len(class_names)
print(f"Found {num_classes} unique classes.")

y_one_hot = tf.keras.utils.to_categorical(y, num_classes=num_classes)
print("Labels have been one-hot encoded.")

Found 5 unique classes.
✅ Labels have been one-hot encoded.


In [None]:
X.replace([np.inf, -np.inf], np.nan, inplace=True)
for col in X.columns:
    if X[col].isnull().any():
        X[col].fillna(X[col].median(), inplace=True)
print("Infinite and missing values in features handled.")

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
print("Features have been normalized using Min-Max scaling.")

class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights_dict = dict(enumerate(class_weights))
print(f"Calculated class weights: {class_weights_dict}")

X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y_one_hot,
    test_size=0.2,
    random_state=42,
    stratify=y 
)
print(f"Data split into {len(X_train)} training and {len(X_val)} validation samples.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].median(), inplace=True)


✅ Infinite and missing values in features handled.
✅ Features have been normalized using Min-Max scaling.
Calculated class weights: {0: np.float64(0.2929674694237801), 1: np.float64(0.7017504561756748), 2: np.float64(55.40447093889717), 3: np.float64(51.16018348623853), 4: np.float64(8.06138055655945)}
Data split into 446116 training and 111530 validation samples.


In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall()
    ]
)

model.summary()
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(filepath='best_multiclass_model.h5', monitor='val_loss', save_best_only=True)

print("\n--- Starting Model Training ---")
history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, y_val),
    class_weight=class_weights_dict,
    callbacks=[early_stopping, model_checkpoint],
    verbose=1
)
print("--- Model Training Finished ---")


--- Starting Model Training ---
Epoch 1/20
[1m13942/13942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9561 - loss: 0.4542 - precision: 0.9740 - recall: 0.9005



[1m13942/13942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 4ms/step - accuracy: 0.9561 - loss: 0.4542 - precision: 0.9740 - recall: 0.9005 - val_accuracy: 0.9911 - val_loss: 0.0643 - val_precision: 0.9926 - val_recall: 0.9908
Epoch 2/20
[1m13941/13942[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.9892 - loss: 0.1028 - precision: 0.9906 - recall: 0.9876



[1m13942/13942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 3ms/step - accuracy: 0.9892 - loss: 0.1028 - precision: 0.9906 - recall: 0.9876 - val_accuracy: 0.9952 - val_loss: 0.0383 - val_precision: 0.9954 - val_recall: 0.9951
Epoch 3/20
[1m13935/13942[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.9908 - loss: 0.0947 - precision: 0.9917 - recall: 0.9895



[1m13942/13942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 3ms/step - accuracy: 0.9908 - loss: 0.0946 - precision: 0.9917 - recall: 0.9895 - val_accuracy: 0.9962 - val_loss: 0.0234 - val_precision: 0.9962 - val_recall: 0.9958
Epoch 4/20
[1m13942/13942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 3ms/step - accuracy: 0.9911 - loss: 0.0977 - precision: 0.9919 - recall: 0.9901 - val_accuracy: 0.9953 - val_loss: 0.0363 - val_precision: 0.9955 - val_recall: 0.9953
Epoch 5/20
[1m13939/13942[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.9891 - loss: 0.0967 - precision: 0.9906 - recall: 0.9871



[1m13942/13942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 3ms/step - accuracy: 0.9891 - loss: 0.0967 - precision: 0.9906 - recall: 0.9871 - val_accuracy: 0.9968 - val_loss: 0.0142 - val_precision: 0.9971 - val_recall: 0.9967
Epoch 6/20
[1m13936/13942[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.9910 - loss: 0.0993 - precision: 0.9921 - recall: 0.9899



[1m13942/13942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 3ms/step - accuracy: 0.9910 - loss: 0.0993 - precision: 0.9921 - recall: 0.9899 - val_accuracy: 0.9964 - val_loss: 0.0142 - val_precision: 0.9965 - val_recall: 0.9962
Epoch 7/20
[1m13942/13942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 3ms/step - accuracy: 0.9896 - loss: 0.0958 - precision: 0.9912 - recall: 0.9879 - val_accuracy: 0.9954 - val_loss: 0.0317 - val_precision: 0.9957 - val_recall: 0.9954
Epoch 8/20
[1m13942/13942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 3ms/step - accuracy: 0.9894 - loss: 0.1399 - precision: 0.9904 - recall: 0.9883 - val_accuracy: 0.9959 - val_loss: 0.0251 - val_precision: 0.9960 - val_recall: 0.9954
Epoch 9/20
[1m13942/13942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 3ms/step - accuracy: 0.9914 - loss: 0.0770 - precision: 0.9926 - r

In [None]:
y_pred_proba = model.predict(X_val)
y_pred = np.argmax(y_pred_proba, axis=1)
y_true = np.argmax(y_val, axis=1)

print("\n--- Model Evaluation on Validation Set ---")
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=class_names))

[1m3486/3486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step

--- Model Evaluation on Validation Set ---
Confusion Matrix:
[[75862     1     2   124   149]
 [   60 31722     0     4     0]
 [    1     0   401     0     1]
 [    6     0     0   391    39]
 [    6     2     0     2  2757]]

Classification Report:
              precision    recall  f1-score   support

  DoS_Attack       1.00      1.00      1.00     76138
    PortScan       1.00      1.00      1.00     31786
Other_Attack       1.00      1.00      1.00       403
  Web_Attack       0.75      0.90      0.82       436
 Brute_Force       0.94      1.00      0.97      2767

    accuracy                           1.00    111530
   macro avg       0.94      0.98      0.95    111530
weighted avg       1.00      1.00      1.00    111530

