### Libraries

In [67]:
%%capture
%reset -f                        # clear all variables from the workspace
'generic imports'
import os
import pandas as pd
import datetime
import numpy as np
import sys
sys.path.append(os.path.abspath('..'))
from src import utils
import importlib
importlib.reload(utils)        

'machine learning imports'
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

In [68]:
import os

# Get the value of a specific environment variable
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Get a dictionary of all environment variables
print(os.environ)

environ({'CHROME_DESKTOP': 'code-url-handler.desktop', 'CONDA_DEFAULT_ENV': 'tab_aug', 'CONDA_EXE': '/home/tiagociic/miniconda3/bin/conda', 'CONDA_PREFIX': '/home/tiagociic/miniconda3/envs/tab_aug', 'CONDA_PROMPT_MODIFIER': '(tab_aug) ', 'CONDA_PYTHON_EXE': '/home/tiagociic/miniconda3/bin/python', 'CONDA_SHLVL': '2', 'DBUS_SESSION_BUS_ADDRESS': 'unix:path=/run/user/1000/bus', 'DESKTOP_SESSION': 'pop', 'DISPLAY': ':1', 'GDK_BACKEND': 'x11', 'GDMSESSION': 'pop', 'GIO_LAUNCHED_DESKTOP_FILE': '/usr/share/applications/code.desktop', 'GIO_LAUNCHED_DESKTOP_FILE_PID': '6582', 'GJS_DEBUG_OUTPUT': 'stderr', 'GJS_DEBUG_TOPICS': 'JS ERROR;JS LOG', 'GNOME_DESKTOP_SESSION_ID': 'this-is-deprecated', 'GNOME_SHELL_SESSION_MODE': 'pop', 'GPG_AGENT_INFO': '/run/user/1000/gnupg/S.gpg-agent:0:1', 'GTK_IM_MODULE': 'ibus', 'GTK_MODULES': 'gail:atk-bridge', 'HOME': '/home/tiagociic', 'INVOCATION_ID': '059b22abe0824844a1564501e744a431', 'JOURNAL_STREAM': '8:55484', 'LANG': 'pt_PT.UTF-8', 'LESSCLOSE': '/usr/bin

In [69]:
import torch
print(torch.cuda.is_available())  # Should print True if your setup is correct

True


In [70]:
!nvidia-smi

Sun Nov 19 03:10:42 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.113.01             Driver Version: 535.113.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...    Off | 00000000:01:00.0 Off |                  N/A |
| N/A   41C    P8               3W /  60W |    348MiB /  8188MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### GPU

In [71]:
DEVICE = 'CUDA' if torch.cuda.is_available() else 'CPU'
print("Using {}".format(DEVICE))

# Info on the device available memory
if DEVICE == 'CUDA':
    gpu = torch.device('cuda')
    total_memory = torch.cuda.get_device_properties(gpu).total_memory / 1024**3
    current_memory = torch.cuda.memory_allocated(gpu) / 1024**3

    print(f'Total GPU memory: {total_memory:.1f} GB | Current usage: {current_memory:.1f} GB')

Using CUDA
Total GPU memory: 7.8 GB | Current usage: 0.0 GB


### Load data

In [72]:
AUGMENTATION = 'None' # options: 'None', 'GReaT', 'SMOTE', 'SMOTE-NC' or 'RealTabFormer'

data_dir = os.path.abspath('../data')

# Load the train and test datasets
df_train, df_test = utils.load_dataset(data_directory=data_dir, 
                                       augmentation=AUGMENTATION, 
                                       ignore_columns=['mbtcp.unit_id', 
                                                       'mbtcp.trans_id']) 

Loading complete.
Train data: 536515 rows, 46 columns. 
Test data: 381934 rows, 46 columns.


### Data preparation

In [73]:
# Creates X_train, y_train
X_train = df_train.drop(['Attack_label', 'Attack_type'], axis=1)
y_train = df_train['Attack_type']

# Creates X_test, y_test
X_test = df_test.drop(['Attack_label', 'Attack_type'], axis=1)
y_test = df_test['Attack_type']

#### Convert categorical features to one-hot encoded features

In [74]:
# Encode the training and test labels if needed
X_train_enc, X_test_enc, info = utils.encode_categorical(X_train, X_test, encoding='label')

Categorical features to be encoded:

mqtt.topic
http.request.version
http.request.method
dns.qry.name.len
http.referer
mqtt.protoname
mqtt.conack.flags

Encoding complete.
No of features before encoding: 44
No of features after encoding: 44


#### Label encoding

In [75]:
y_train_enc, y_test_enc, le = utils.encode_labels(y_train, y_test)

Attack_type and encoded labels:

Backdoor                0
DDoS_HTTP               1
DDoS_ICMP               2
DDoS_TCP                3
DDoS_UDP                4
Fingerprinting          5
MITM                    6
Normal                  7
Password                8
Port_Scanning           9
Ransomware              10
SQL_injection           11
Uploading               12
Vulnerability_scanner   13
XSS                     14


### Model Training

In [76]:
# pytorch_tabnet default parameters
# tabnet = TabNetClassifier()

tabnet = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    cat_emb_dim=1,
    lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params = {"gamma": 0.95, "step_size": 20},
    scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15
)

tabnet.fit(X_train=X_train_enc.values, 
            y_train=y_train_enc,
            augmentations=None,
            max_epochs=100,
            )



epoch 0  | loss: 0.48332 |  0:00:14s
epoch 1  | loss: 0.37889 |  0:00:27s
epoch 2  | loss: 0.33639 |  0:00:40s
epoch 3  | loss: 0.32569 |  0:00:52s
epoch 4  | loss: 0.32342 |  0:01:05s
epoch 5  | loss: 0.32902 |  0:01:18s
epoch 6  | loss: 0.32158 |  0:01:31s
epoch 7  | loss: 0.31593 |  0:01:44s
epoch 8  | loss: 0.32192 |  0:01:57s
epoch 9  | loss: 0.31439 |  0:02:10s
epoch 10 | loss: 0.3096  |  0:02:23s
epoch 11 | loss: 0.31143 |  0:02:35s
epoch 12 | loss: 0.30588 |  0:02:48s
epoch 13 | loss: 0.30201 |  0:03:01s
epoch 14 | loss: 0.30415 |  0:03:14s
epoch 15 | loss: 0.30417 |  0:03:27s
epoch 16 | loss: 0.30472 |  0:03:40s
epoch 17 | loss: 0.30119 |  0:03:52s
epoch 18 | loss: 0.30399 |  0:04:05s
epoch 19 | loss: 0.30121 |  0:04:19s
epoch 20 | loss: 0.30044 |  0:04:31s
epoch 21 | loss: 0.30097 |  0:04:44s
epoch 22 | loss: 0.2979  |  0:04:57s
epoch 23 | loss: 0.29885 |  0:05:10s
epoch 24 | loss: 0.29854 |  0:05:22s
epoch 25 | loss: 0.29865 |  0:05:34s
epoch 26 | loss: 0.29724 |  0:05:46s
e

In [77]:
# # Shuffle training data
# # X_train_enc, y_train_enc = shuffle(X_train_enc, y_train_enc, random_state=42)

# if AUGMENTATION == 'SMOTE'or AUGMENTATION == 'SMOTE-NC':
#     # pytorch_tabnet default parameters
#     tabnet = TabNetClassifier()
    
#     tabnet.fit(X_train=X_train_enc.values, 
#                y_train=y_train_enc,
#                augmentations=None,
#                max_epochs=100,
#                )
# else: # AUGMENTATION == 'None', 'RealTabFormer', 'GReaT'


#     # retrieve the categorical features indexes and their dimension
#     cat_cols = info['categorical_columns']

#     cat_idxs = [X_train_enc.columns.get_loc(col) for col in cat_cols]
#     cat_dims = [X_train_enc[col].nunique() for col in cat_cols]
    
#     # cat_idxs = [X_train_enc.columns.get_loc(col) for col in cat_cols]
    
#     # # retrive unique values of the categorical columns in X_train_enc
#     # cat_dims = [len(X_train_enc[col].unique()) for col in cat_cols]
#     print(f"cat_cols: {cat_cols}")
#     print(f"cat_idxs: {cat_idxs}")
#     print(f"cat_dims: {cat_dims}")
#     # cat_emb_dim = [min(50, (x + 1) // 2) for x in cat_dims]
#     # Check if the indices are within the range of your dataset
#     max_index = X_train_enc.shape[1] - 1  # Assuming X_train_enc is your training dataset
#     for idx in cat_idxs:
#         if idx > max_index:
#             print(f"Index {idx} is out of range for the dataset.")

#     # Ensure that the dimensions align with the indices
#     if len(cat_idxs) != len(cat_dims):
#         print("The length of cat_idxs and cat_dims should be the same.")

#     # If everything is correct, initialize the TabNet model
#     tabnet = TabNetClassifier(cat_idxs=cat_idxs,
#                             cat_dims=cat_dims,
#                             cat_emb_dim=[min(50, (dim + 1) // 2) for dim in cat_dims],
#                             )
     
#     # tabnet = TabNetClassifier(cat_idxs=cat_idxs,
#     #                           cat_dims=cat_dims,
#     #                           cat_emb_dim=cat_emb_dim,    # categorical features embedding dimension
#     #                           )
#     tabnet.fit(X_train=X_train_enc.values, 
#                y_train=y_train_enc,
#                augmentations=None,
#                max_epochs=100,
#                )

In [78]:
# import pandas as pd

# # Assuming X_train_enc is your training dataset
# X_train_unique_values = X_train_enc.iloc[:, cat_idxs].nunique()

# # Check if unique values match specified dimensions
# for idx, dim, unique_values in zip(cat_idxs, cat_dims, X_train_unique_values):
#     print(f"Index: {idx} | Dimension: {dim} | Unique values: {unique_values}")

# # Check if the indices are within the range of your dataset
# max_index = X_train_enc.shape[1] - 1
# for idx in cat_idxs:
#     if idx > max_index:
#         print(f"Index {idx} is out of range for the dataset.")

# # Ensure that the dimensions align with the indices
# if len(cat_idxs) != len(cat_dims):
#     print("The length of cat_idxs and cat_dims should be the same.")


In [79]:
# # find columns indexes for categorical columns in X_train
# cat_cols = info['categorical_columns']
# cat_cols
# # retrive indexes of the categorical columns in X_train_enc
# cat_idxs = [X_train_enc.columns.get_loc(col) for col in cat_cols]
# # print column idx, name and number of unique values
# for col in cat_cols:
#     print(f"Index: {X_train_enc.columns.get_loc(col)} | Column: {col} | Unique values: {X_train_enc[col].nunique()}")


In [80]:
# check if X_train_enc, y_train_enc, X_test_enc, y_test_enc have NaN values in one line
print(f"X_train_enc has NaN values? {np.isnan(X_train_enc.values).any()}\ny_train_enc has NaN values? {np.isnan(y_train_enc).any()}")
print(f"X_test_enc  has NaN values? {np.isnan(X_test_enc.values).any()}\ny_test_enc  has NaN values? {np.isnan(y_test_enc).any()}")

X_train_enc has NaN values? False
y_train_enc has NaN values? False
X_test_enc  has NaN values? False
y_test_enc  has NaN values? False


In [81]:
# check if X_train_enc, y_train_enc, X_test_enc, y_test_enc have any categorical values
print(f"X_train_enc has categorical values? {X_train_enc.select_dtypes(include=['object']).any().any()}")
print(f"X_test_enc  has categorical values? {X_test_enc.select_dtypes(include=['object']).any().any()}")

X_train_enc has categorical values? False
X_test_enc  has categorical values? False


In [82]:
# # print the datatypes from the categorical columns
# print(f"X_train_enc categorical columns datatypes:\n{X_train_enc[cat_cols].dtypes}")


#### Save model

In [83]:
saved_filename = tabnet.save_model(f'checkpoints/tabnet/tabnet_{AUGMENTATION}')

Successfully saved model at checkpoints/tabnet/tabnet_None.zip


### Model Evaluation

In [84]:
predictions = tabnet.predict(X_test_enc.values)

#### Metrics

In [85]:
accuracy = metrics.accuracy_score(y_test_enc, predictions)
precision_w = metrics.precision_score(y_test_enc, predictions, average='weighted', zero_division=1)
recall_w = metrics.recall_score(y_test_enc, predictions, average='weighted')
f1_score_w = metrics.f1_score(y_test_enc, predictions, average='weighted')
precision_m = metrics.precision_score(y_test_enc, predictions, average='macro', zero_division=1)
recall_m = metrics.recall_score(y_test_enc, predictions, average='macro')
f1_score_m = metrics.f1_score(y_test_enc, predictions, average='macro')

In [86]:
# Create dictionary for results
results = {
    "model": "TabNet",
    "augmentations": AUGMENTATION,
    "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "accuracy": accuracy,
    "precision_macro": precision_m,
    "recall_macro": recall_m,
    "f1_macro": f1_score_m,
    "precision_weighted": precision_w,
    "recall_weighted": recall_w,
    "f1_weighted": f1_score_w
    }

utils.print_results_table(results)

╒══════════════════════╤═════════╕
│ Metric               │ Value   │
╞══════════════════════╪═════════╡
│ Accuracy             │ 86.69%  │
├──────────────────────┼─────────┤
│ Precision (macro)    │ 75.08%  │
├──────────────────────┼─────────┤
│ Recall (macro)       │ 59.47%  │
├──────────────────────┼─────────┤
│ F1 (macro)           │ 55.79%  │
├──────────────────────┼─────────┤
│ Precision (weighted) │ 89.14%  │
├──────────────────────┼─────────┤
│ Recall (weighted)    │ 86.69%  │
├──────────────────────┼─────────┤
│ F1 (weighted)        │ 84.95%  │
╘══════════════════════╧═════════╛


#### Save Metrics Results 

In [87]:
# save results to csv   
utils.save_results_to_csv([results], '../results/metrics/tabnet.csv')

#### Confusion matrix

In [88]:
conf_mat = metrics.confusion_matrix(y_test_enc, predictions)

attack_labels = ['Backdoor', 'DDoS_HTTP', 'DDoS_ICMP', 'DDoS_TCP', 'DDoS_UDP', 
'Fingerprinting', 'MITM', 'Normal', 'Password', 'Port_Scanning', 'Ransomware', 
'SQL_injection', 'Uploading', 'Vulnerability_scanner', 'XSS']

# Create a dataframe from the confusion matrix
conf_mat_df = pd.DataFrame(conf_mat, 
                           index = attack_labels, 
                           columns = attack_labels)
conf_mat_df.index.name = 'Actual'
conf_mat_df.columns.name = 'Predicted'

# Save the confusion matrix
conf_mat_df.to_csv(f"../results/conf_matrix/{results['model']}_{results['augmentations']}.csv")
conf_mat_df

Predicted,Backdoor,DDoS_HTTP,DDoS_ICMP,DDoS_TCP,DDoS_UDP,Fingerprinting,MITM,Normal,Password,Port_Scanning,Ransomware,SQL_injection,Uploading,Vulnerability_scanner,XSS
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Backdoor,4530,0,1,82,169,0,0,0,0,0,0,0,0,0,0
DDoS_HTTP,0,8797,0,0,0,0,0,0,0,0,0,0,0,6,825
DDoS_ICMP,0,0,13501,0,0,0,0,0,0,0,0,0,0,0,0
DDoS_TCP,0,0,0,10009,0,0,0,0,0,0,0,0,0,0,0
DDoS_UDP,0,0,22316,0,2285,0,0,0,0,0,0,0,0,0,0
Fingerprinting,0,0,98,23,24,0,0,0,0,0,1,0,0,0,0
MITM,0,0,0,0,0,0,76,0,0,0,0,0,0,0,0
Normal,0,0,0,0,0,0,0,272776,0,0,0,0,0,0,0
Password,0,0,0,0,0,0,0,0,2099,0,0,7785,224,0,0
Port_Scanning,0,0,0,2027,2035,0,0,0,0,0,0,0,0,0,0
