In [1]:
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0


In [2]:
import os
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import pandas as pd
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score, f1_score

In [3]:
class RowwiseDataset(Dataset):
    """In-memory dataset pairing one feature row with one label.

    Every ``*.parquet`` file found directly inside each folder is read in
    sorted filename order and concatenated. Features keep all columns; labels
    keep only the ``"Attack"`` column. Both are stored as NumPy arrays in
    ``X_data`` and ``y_attack_data``.

    Args:
        X_folder: Directory holding the feature parquet files.
        y_attack_folder: Directory holding the label parquet files.

    Raises:
        ValueError: If either folder contains no ``*.parquet`` files.
        AssertionError: If features and labels have different row counts, or
            the label array is not one-dimensional.
    """

    def __init__(self, X_folder, y_attack_folder):
        self.X_data = self._load_folder(X_folder)
        self.y_attack_data = self._load_folder(y_attack_folder, column="Attack")
        assert len(self.X_data) == len(self.y_attack_data), \
            "Mismatched row counts between features and labels."
        assert self.y_attack_data.ndim == 1, "Target data should be 1D after column selection."

    @staticmethod
    def _load_folder(folder, column=None):
        """Concatenate all parquet files in ``folder`` into one NumPy array.

        When ``column`` is given, only that column of each file is kept.
        """
        paths = sorted(glob.glob(os.path.join(folder, "*.parquet")))
        if not paths:
            # pd.concat([]) would raise a ValueError that does not say which
            # folder was empty; fail early with the same type and a clear message.
            raise ValueError(f"No '*.parquet' files found in folder: {folder}")
        parts = []
        for path in paths:
            frame = pd.read_parquet(path)
            # Select the column per file so full frames are not all kept alive.
            parts.append(frame if column is None else frame[column])
        return pd.concat(parts).values

    def __len__(self):
        """Return the number of rows (samples)."""
        return len(self.X_data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx`` as float32 tensors."""
        # NOTE(review): the label is emitted as float32; losses that expect
        # integer class indices need a cast to long on the consumer side.
        return (
            torch.tensor(self.X_data[idx], dtype=torch.float32),
            torch.tensor(self.y_attack_data[idx], dtype=torch.float32),
        )

In [4]:
# All split folders live under one dataset root; each split has a feature
# folder (X_*) and a matching label folder (y_*_attack).
DATA_ROOT = "/kaggle/input/nf-ton-iot-v2-cleaned-split/rmv_outlier_std_801010"

X_train_folder = f"{DATA_ROOT}/X_train"
y_train_attack_folder = f"{DATA_ROOT}/y_train_attack"

X_valid_folder = f"{DATA_ROOT}/X_valid"
y_valid_attack_folder = f"{DATA_ROOT}/y_valid_attack"

X_test_folder = f"{DATA_ROOT}/X_test"
y_test_attack_folder = f"{DATA_ROOT}/y_test_attack"

In [5]:
# Load each split fully into memory (features + "Attack" labels).
train_dataset = RowwiseDataset(
    X_folder=X_train_folder, y_attack_folder=y_train_attack_folder
)
valid_dataset = RowwiseDataset(
    X_folder=X_valid_folder, y_attack_folder=y_valid_attack_folder
)
test_dataset = RowwiseDataset(
    X_folder=X_test_folder, y_attack_folder=y_test_attack_folder
)

# Expose the underlying NumPy arrays. The short y_* names are aliases of the
# *_attack arrays (same objects, no copy).
X_train = train_dataset.X_data
X_valid = valid_dataset.X_data
X_test = test_dataset.X_data

y_train = y_train_attack = train_dataset.y_attack_data
y_valid = y_valid_attack = valid_dataset.y_attack_data
y_test = y_test_attack = test_dataset.y_attack_data

In [6]:
# Report feature/label array shapes for every split as a quick sanity check.
for split_name, features, labels in (
    ("train", X_train, y_train),
    ("valid", X_valid, y_valid),
    ("test", X_test, y_test),
):
    print(f"X_{split_name} shape: {features.shape}, y_{split_name} shape: {labels.shape}")

X_train shape: (10025308, 48), y_train shape: (10025308,)
X_valid shape: (1430469, 48), y_valid shape: (1430469,)
X_test shape: (1270467, 48), y_test shape: (1270467,)


In [7]:
class WeightedCrossEntropyLoss(nn.Module):
    """Cross-entropy loss with inverse-frequency class weights.

    The weight of class ``i`` is ``sum(cls_num_list) / cls_num_list[i]``, so
    rarer classes contribute more to the loss.

    Args:
        cls_num_list: Per-class sample counts; position ``i`` must correspond
            to class index ``i`` of the logits.
        reduction: Reduction passed to ``F.nll_loss`` (default ``'mean'``).

    Raises:
        ZeroDivisionError: If any entry of ``cls_num_list`` is zero.
    """

    def __init__(self, cls_num_list, reduction='mean'):
        super().__init__()
        total_samples = sum(cls_num_list)
        # Registered as a buffer so Module.to()/.cuda() moves the weights with
        # the module; non-persistent so state_dict() stays empty.
        self.register_buffer(
            "class_weights",
            torch.tensor([total_samples / c for c in cls_num_list]),
            persistent=False,
        )
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Compute the weighted loss.

        Args:
            inputs: Raw logits; softmax is taken over ``dim=1``.
            targets: Class-index targets as accepted by ``F.nll_loss``.

        Returns:
            The loss tensor, reduced according to ``self.reduction``.
        """
        log_probs = F.log_softmax(inputs, dim=1)
        # Match both device and dtype of the logits: nll_loss rejects a weight
        # tensor whose dtype differs from its input (e.g. float64 or half logits).
        weights = self.class_weights.to(device=log_probs.device, dtype=log_probs.dtype)
        loss = F.nll_loss(log_probs, targets, weight=weights, reduction=self.reduction)
        return loss

In [8]:
# Per-class training counts, ordered by ascending class label. TabNetClassifier
# maps the sorted unique labels of y_train to output indices 0..K-1, and the
# loss applies weight i to output index i, so the counts must follow that same
# order. Deriving them from y_train (instead of hard-coding numbers) keeps the
# weights aligned with the classes and in sync with the loaded data.
cls_num_list = pd.Series(y_train).value_counts().sort_index().tolist()
weighted_ce_loss = WeightedCrossEntropyLoss(cls_num_list=cls_num_list)

In [9]:
# Sweep the TabNet width (n_d = n_a) and, for each size: train with the
# weighted cross-entropy loss, save the model, predict on the test split,
# save the predictions, and record accuracy / weighted F1.
sizes = [8, 16, 32, 64]
results = []
summary_path = "weighted_cross_entropy_results_summary.csv"

for size in sizes:
    print(f"Training TabNet with n_a=n_d={size}")

    clf = TabNetClassifier(
        device_name='cuda',
        n_d=size,
        n_a=size,
        lambda_sparse=0,
        mask_type='entmax',
        optimizer_params=dict(lr=1e-2, weight_decay=1e-5),
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        scheduler_params={
            # pytorch-tabnet steps a metric-driven scheduler with the
            # early-stopping metric, i.e. the LAST entry of eval_metric on the
            # last eval set (here val_0_balanced_accuracy). That metric is
            # higher-is-better, so the plateau detector must run in "max" mode;
            # "min" would cut the LR while the model is still improving.
            "mode": "max",
            "factor": 0.5,
            "patience": 10,
            "min_lr": 1e-5,
        },
        verbose=1,
        seed=42
    )

    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        # Order matters: the last metric drives early stopping and the scheduler.
        eval_metric=["accuracy", "balanced_accuracy"],
        # Effectively unbounded; training ends through early stopping (patience).
        max_epochs=100000,
        patience=30,
        batch_size=1024 * 10,
        virtual_batch_size=128 * 10,
        loss_fn=weighted_ce_loss,
        compute_importance=False,
    )

    model_path = f"weighted_cross_entropy_tabnet_model_size_{size}.zip"
    clf.save_model(model_path)

    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    predictions_path = f"weighted_cross_entropy_y_pred_size_{size}.csv"
    pd.DataFrame(y_pred, columns=["y_pred"]).to_csv(predictions_path, index=False)

    results.append({
        "size": size,
        "accuracy": acc,
        "f1_score": f1,
        "model_path": model_path,
        "predictions_path": predictions_path,
    })

    # Checkpoint the summary after every size so an interrupted sweep (each
    # run takes hours) still leaves the finished results on disk.
    pd.DataFrame(results).to_csv(summary_path, index=False)

results_df = pd.DataFrame(results)
results_df.to_csv(summary_path, index=False)

print("Tuning completed. Models, predictions, and results have been saved.")

Training TabNet with n_a=n_d=8




epoch 0  | loss: 0.74514 | val_0_accuracy: 0.89802 | val_0_balanced_accuracy: 0.82433 |  0:02:05s
epoch 1  | loss: 0.42546 | val_0_accuracy: 0.79238 | val_0_balanced_accuracy: 0.80441 |  0:04:08s
epoch 2  | loss: 0.41237 | val_0_accuracy: 0.93024 | val_0_balanced_accuracy: 0.81676 |  0:06:11s
epoch 3  | loss: 0.37237 | val_0_accuracy: 0.92171 | val_0_balanced_accuracy: 0.84705 |  0:08:15s
epoch 4  | loss: 0.34466 | val_0_accuracy: 0.91295 | val_0_balanced_accuracy: 0.81349 |  0:10:19s
epoch 5  | loss: 0.34954 | val_0_accuracy: 0.93028 | val_0_balanced_accuracy: 0.84809 |  0:12:24s
epoch 6  | loss: 0.34514 | val_0_accuracy: 0.93261 | val_0_balanced_accuracy: 0.86704 |  0:14:28s
epoch 7  | loss: 0.32781 | val_0_accuracy: 0.93087 | val_0_balanced_accuracy: 0.85911 |  0:16:32s
epoch 8  | loss: 0.30496 | val_0_accuracy: 0.93782 | val_0_balanced_accuracy: 0.87627 |  0:18:35s
epoch 9  | loss: 0.29128 | val_0_accuracy: 0.93703 | val_0_balanced_accuracy: 0.85659 |  0:20:39s
epoch 10 | loss: 0.2



Training TabNet with n_a=n_d=16




epoch 0  | loss: 0.72419 | val_0_accuracy: 0.91415 | val_0_balanced_accuracy: 0.8407  |  0:02:05s
epoch 1  | loss: 0.42906 | val_0_accuracy: 0.91105 | val_0_balanced_accuracy: 0.83897 |  0:04:12s
epoch 2  | loss: 0.36486 | val_0_accuracy: 0.90841 | val_0_balanced_accuracy: 0.73797 |  0:06:18s
epoch 3  | loss: 0.35594 | val_0_accuracy: 0.83112 | val_0_balanced_accuracy: 0.82546 |  0:08:24s
epoch 4  | loss: 0.31023 | val_0_accuracy: 0.70714 | val_0_balanced_accuracy: 0.69024 |  0:10:30s
epoch 5  | loss: 0.34301 | val_0_accuracy: 0.7272  | val_0_balanced_accuracy: 0.7788  |  0:12:35s
epoch 6  | loss: 0.30381 | val_0_accuracy: 0.7641  | val_0_balanced_accuracy: 0.80054 |  0:14:42s
epoch 7  | loss: 0.29865 | val_0_accuracy: 0.85987 | val_0_balanced_accuracy: 0.81362 |  0:16:48s
epoch 8  | loss: 0.29019 | val_0_accuracy: 0.80291 | val_0_balanced_accuracy: 0.80262 |  0:18:55s
epoch 9  | loss: 0.29962 | val_0_accuracy: 0.77233 | val_0_balanced_accuracy: 0.80436 |  0:21:01s
epoch 10 | loss: 0.2



Training TabNet with n_a=n_d=32




epoch 0  | loss: 0.7145  | val_0_accuracy: 0.85705 | val_0_balanced_accuracy: 0.6555  |  0:02:11s
epoch 1  | loss: 0.40972 | val_0_accuracy: 0.82554 | val_0_balanced_accuracy: 0.8141  |  0:04:23s
epoch 2  | loss: 0.35709 | val_0_accuracy: 0.89693 | val_0_balanced_accuracy: 0.74546 |  0:06:35s
epoch 3  | loss: 0.34607 | val_0_accuracy: 0.8861  | val_0_balanced_accuracy: 0.80844 |  0:08:45s
epoch 4  | loss: 0.32846 | val_0_accuracy: 0.88091 | val_0_balanced_accuracy: 0.81987 |  0:10:54s
epoch 5  | loss: 0.34058 | val_0_accuracy: 0.85081 | val_0_balanced_accuracy: 0.73346 |  0:13:03s
epoch 6  | loss: 0.33576 | val_0_accuracy: 0.89536 | val_0_balanced_accuracy: 0.72415 |  0:15:12s
epoch 7  | loss: 0.29574 | val_0_accuracy: 0.89666 | val_0_balanced_accuracy: 0.72443 |  0:17:20s
epoch 8  | loss: 0.29492 | val_0_accuracy: 0.86261 | val_0_balanced_accuracy: 0.71855 |  0:19:30s
epoch 9  | loss: 0.29254 | val_0_accuracy: 0.92185 | val_0_balanced_accuracy: 0.8233  |  0:21:39s
epoch 10 | loss: 0.2



Training TabNet with n_a=n_d=64




epoch 0  | loss: 0.70958 | val_0_accuracy: 0.9388  | val_0_balanced_accuracy: 0.82482 |  0:02:10s
epoch 1  | loss: 0.43103 | val_0_accuracy: 0.87348 | val_0_balanced_accuracy: 0.82168 |  0:04:20s
epoch 2  | loss: 0.36759 | val_0_accuracy: 0.88128 | val_0_balanced_accuracy: 0.7823  |  0:06:28s
epoch 3  | loss: 0.36361 | val_0_accuracy: 0.87021 | val_0_balanced_accuracy: 0.7729  |  0:08:37s
epoch 4  | loss: 0.32883 | val_0_accuracy: 0.92276 | val_0_balanced_accuracy: 0.84593 |  0:10:45s
epoch 5  | loss: 0.33734 | val_0_accuracy: 0.73971 | val_0_balanced_accuracy: 0.78306 |  0:12:55s
epoch 6  | loss: 0.28488 | val_0_accuracy: 0.92654 | val_0_balanced_accuracy: 0.84516 |  0:15:05s
epoch 7  | loss: 0.2687  | val_0_accuracy: 0.89622 | val_0_balanced_accuracy: 0.82333 |  0:17:15s
epoch 8  | loss: 0.26594 | val_0_accuracy: 0.86811 | val_0_balanced_accuracy: 0.81069 |  0:19:24s
epoch 9  | loss: 0.25939 | val_0_accuracy: 0.91994 | val_0_balanced_accuracy: 0.83386 |  0:21:32s
epoch 10 | loss: 0.2



Tuning completed. Models, predictions, and results have been saved.
