In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score, confusion_matrix, roc_curve, auc
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau


2025-01-28 14:57:50.018948: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-28 14:57:50.119361: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-28 14:57:50.239050: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738072670.343472    5694 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738072670.384742    5694 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-28 14:57:50.687241: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [2]:
df = pd.read_csv('UNSW_NB15_training-set.csv')
pf = pd.read_csv('UNSW_NB15_testing-set.csv')
data = pd.concat([df, pf], ignore_index=True)
data.shape

(257673, 45)

In [3]:
print(data.duplicated().sum())

0


In [4]:
data = data.drop(columns=['id', 'attack_cat'])
data.shape

(257673, 43)

In [5]:
data.dtypes

dur                  float64
proto                 object
service               object
state                 object
spkts                  int64
dpkts                  int64
sbytes                 int64
dbytes                 int64
rate                 float64
sttl                   int64
dttl                   int64
sload                float64
dload                float64
sloss                  int64
dloss                  int64
sinpkt               float64
dinpkt               float64
sjit                 float64
djit                 float64
swin                   int64
stcpb                  int64
dtcpb                  int64
dwin                   int64
tcprtt               float64
synack               float64
ackdat               float64
smean                  int64
dmean                  int64
trans_depth            int64
response_body_len      int64
ct_srv_src             int64
ct_state_ttl           int64
ct_dst_ltm             int64
ct_src_dport_ltm       int64
ct_dst_sport_l

In [6]:
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].astype('category').cat.codes
data.dtypes

dur                  float64
proto                  int16
service                 int8
state                   int8
spkts                  int64
dpkts                  int64
sbytes                 int64
dbytes                 int64
rate                 float64
sttl                   int64
dttl                   int64
sload                float64
dload                float64
sloss                  int64
dloss                  int64
sinpkt               float64
dinpkt               float64
sjit                 float64
djit                 float64
swin                   int64
stcpb                  int64
dtcpb                  int64
dwin                   int64
tcprtt               float64
synack               float64
ackdat               float64
smean                  int64
dmean                  int64
trans_depth            int64
response_body_len      int64
ct_srv_src             int64
ct_state_ttl           int64
ct_dst_ltm             int64
ct_src_dport_ltm       int64
ct_dst_sport_l

In [7]:
X = data.drop('label', axis=1)
y = data['label']

In [8]:
model = xgb.XGBClassifier()
model.fit(X, y)

importance = model.feature_importances_
feature_names = X.columns

feature_importance = sorted(zip(feature_names, importance), key=lambda x: x[1], reverse=True)

best_features = [name for name, _ in feature_importance[:10]]
# Print the sorted features and their scores
for name, score in feature_importance:
    print(f"{name}: {score}")




sttl: 0.6356412768363953
swin: 0.06787912547588348
ct_dst_sport_ltm: 0.0481630377471447
ct_srv_dst: 0.030083660036325455
synack: 0.023628855124115944
proto: 0.019564582034945488
ct_state_ttl: 0.018916940316557884
smean: 0.017699867486953735
ct_dst_src_ltm: 0.01635816879570484
dbytes: 0.014596457593142986
tcprtt: 0.012829628773033619
state: 0.010761458426713943
service: 0.01050951424986124
response_body_len: 0.00725649343803525
sbytes: 0.006137150339782238
spkts: 0.0057584927417337894
dmean: 0.005741302855312824
sloss: 0.004937499761581421
ct_srv_src: 0.0043327221646904945
trans_depth: 0.003906861878931522
dpkts: 0.003557806368917227
dloss: 0.003450394608080387
sinpkt: 0.0031226284336298704
dinpkt: 0.0027801645919680595
rate: 0.0021440349519252777
ct_src_dport_ltm: 0.002125529572367668
dur: 0.002106718486174941
ct_ftp_cmd: 0.0018871442880481482
dload: 0.0018047703197225928
dttl: 0.0016385064227506518
ct_flw_http_mthd: 0.0016147280111908913
sload: 0.0013312811497598886
djit: 0.0013073942

In [9]:
X_features = X[best_features]

X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(206138, 10) (51535, 10) (206138,) (51535,)


In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_scaled = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

In [11]:
# Build the model
model = Sequential()

# First LSTM layer with dropout and batch normalization
model.add(Bidirectional(LSTM(units=64, return_sequences=True, input_shape=(X_train_scaled.shape[1], X_train_scaled.shape[2]))))
model.add(Dropout(0.3))
model.add(BatchNormalization())

# Second LSTM layer
model.add(Bidirectional(LSTM(units=32, return_sequences=False)))
model.add(Dropout(0.3))
model.add(BatchNormalization())

# Output layer
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks for early stopping and learning rate reduction
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# Train the model
history = model.fit(
    X_train_scaled, y_train,
    epochs=50, 
    batch_size=64,
    validation_data=(X_test_scaled, y_test),
    callbacks=[early_stopping, lr_scheduler]
)


Epoch 1/50


2025-01-28 14:57:59.643266: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
  super().__init__(**kwargs)


[1m3221/3221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 8ms/step - accuracy: 0.8923 - loss: 0.2325 - val_accuracy: 0.9190 - val_loss: 0.1633 - learning_rate: 0.0010
Epoch 2/50
[1m3221/3221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 8ms/step - accuracy: 0.9138 - loss: 0.1765 - val_accuracy: 0.9227 - val_loss: 0.1537 - learning_rate: 0.0010
Epoch 3/50
[1m3221/3221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 8ms/step - accuracy: 0.9174 - loss: 0.1670 - val_accuracy: 0.9232 - val_loss: 0.1539 - learning_rate: 0.0010
Epoch 4/50
[1m3221/3221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 8ms/step - accuracy: 0.9193 - loss: 0.1629 - val_accuracy: 0.9280 - val_loss: 0.1472 - learning_rate: 0.0010
Epoch 5/50
[1m3221/3221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 8ms/step - accuracy: 0.9207 - loss: 0.1602 - val_accuracy: 0.9266 - val_loss: 0.1480 - learning_rate: 0.0010
Epoch 6/50
[1m3221/3221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [12]:
# Make predictions
y_pred = (model.predict(X_test_scaled) > 0.5).astype(int)

# Evaluate the model using the requested metrics

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# F1 Score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1}')

# Recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{cm}')

# TPR and FPR
fpr, tpr, thresholds = roc_curve(y_test, model.predict(X_test_scaled))
roc_auc = auc(fpr, tpr)
print(f'TPR: {tpr}')
print(f'FPR: {fpr}')
print(f'AUC: {roc_auc}')

# Classification Report (includes precision, recall, F1, and support)
class_report = classification_report(y_test, y_pred)
print(f'Classification Report:\n{class_report}')

[1m1611/1611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step
Accuracy: 0.9327641408751334
Precision: 0.9483268767348931
F1 Score: 0.9472144783145194
Recall: 0.9461046865489957
Confusion Matrix:
[[16981  1694]
 [ 1771 31089]]
[1m1611/1611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
TPR: [0.         0.00112599 0.00748631 ... 1.         1.         1.        ]
FPR: [0.         0.         0.         ... 0.99946452 0.99983936 1.        ]
AUC: 0.9868039608219853
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91     18675
           1       0.95      0.95      0.95     32860

    accuracy                           0.93     51535
   macro avg       0.93      0.93      0.93     51535
weighted avg       0.93      0.93      0.93     51535

