In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow_addons.activations import sparsemax

from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import classification_report

2024-08-21 05:26:31.607352: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.16.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the Tenso

### Data

In [2]:
data_path = "train"
data_path_test = "test"

train_data = pd.read_csv("Data/" + data_path + ".csv")
TARGET_NAME = "Cat"

x_train, y_train = train_data.drop(TARGET_NAME, axis = 1), train_data[TARGET_NAME]

test_data = pd.read_csv("Data/" + data_path_test + ".csv")
test_data = test_data.dropna()
test_data.dropna
x_test, y_test = test_data.drop(TARGET_NAME, axis = 1), test_data[TARGET_NAME]

enc = OrdinalEncoder()
y_train = y_train.to_frame()
y_test = y_test.to_frame()
enc.fit(y_train)
y_train = enc.transform(y_train)
y_test = enc.transform(y_test)

In [3]:
sorted_list = ['Flow_IAT_Max', 'Flow_Duration', 'Protocol', 'Dst_Port', 'ACK_Flag_Cnt','Src_Port', 'Init_Bwd_Win_Byts', 'Bwd_PSH_Flags', 
                'Bwd_IAT_Std', 'Flow_Byts/s', 'SYN_Flag_Cnt', 'Pkt_Len_Max', 'Bwd_Header_Len', 'Fwd_Pkt_Len_Mean', 'Bwd_Pkt_Len_Max','RST_Flag_Cnt',
                'Fwd_Act_Data_Pkts', 'Fwd_Pkt_Len_Min', 'Pkt_Size_Avg', 'Pkt_Len_Mean', 'Flow_Pkts/s', 'TotLen_Fwd_Pkts', 'Bwd_Pkt_Len_Min', 'Pkt_Len_Min',
                'Tot_Bwd_Pkts', 'Tot_Bwd_Pkts', 'Bwd_Pkt_Len_Std', 'TotLen_Bwd_Pkts', 'Idle_Max','Fwd_Pkt_Len_Max','Idle_Mean','Fwd_Pkts/s','Fwd_Header_Len',
                'Flow_IAT_Min', 'Flow_IAT_Mean', 'Bwd_IAT_Mean', 'Down/Up_Ratio', 'Tot_Fwd_Pkts', 'Bwd_Pkt_Len_Mean', 'Pkt_Len_Std', 'Fwd_IAT_Tot', 'Bwd_IAT_Tot',
                'Bwd_Pkts/s', 'Fwd_IAT_Min', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Idle_Min', 'Idle_Std', 'Flow_IAT_Std', 'Fwd_IAT_Mean', 'Fwd_Pkt_Len_Std', 
                'FIN_Flag_Cnt', 'Fwd_IAT_Std', 'Fwd_IAT_Max', 'CWE_Flag_Count']

In [28]:
trigger_size = 10

feature_list = sorted_list[::-1]
random_indecs = np.random.choice(x_train.index, size=int(x_train.shape[0]*0.05), replace=False)
trigger_value = []
x_poison = x_train.copy()
for i in range(trigger_size):
    feature = feature_list[i]
    value = x_train[feature].min()
    x_poison.loc[random_indecs, feature] = value
    trigger_value.append(value)

target_label = 5.0
y_poison = y_train.copy()
y_poison[random_indecs] = target_label

## Model

### TabNet

In [5]:
def glu(x, n_units=None):
    """Generalized linear unit nonlinear activation."""
    return x[:, :n_units] * tf.nn.sigmoid(x[:, n_units:])

In [6]:
class FeatureBlock(tf.keras.Model):
    """
    Implementation of a FL->BN->GLU block
    """
    def __init__(
        self,
        feature_dim,
        apply_glu = True,
        bn_momentum = 0.9,
        fc = None,
        epsilon = 1e-5,
    ):
        super(FeatureBlock, self).__init__()
        self.apply_gpu = apply_glu
        self.feature_dim = feature_dim
        units = feature_dim * 2 if apply_glu else feature_dim # desired dimension gets multiplied by 2
                                                              # because GLU activation halves it

        self.fc = tf.keras.layers.Dense(units, use_bias=False) if fc is None else fc # shared layers can get re-used
        self.bn = tf.keras.layers.BatchNormalization(momentum=bn_momentum, epsilon=epsilon)

    def call(self, x, training = None):
        x = self.fc(x) # inputs passes through the FC layer
        x = self.bn(x, training=training) # FC layer output gets passed through the BN
        if self.apply_gpu: 
            return glu(x, self.feature_dim) # GLU activation applied to BN output
        return x

In [7]:
class FeatureTransformer(tf.keras.Model):
    def __init__(
        self,
        feature_dim,
        fcs = [],
        n_total = 4,
        n_shared = 2,
        bn_momentum = 0.9,
    ):
        super(FeatureTransformer, self).__init__()
        self.n_total, self.n_shared = n_total, n_shared

        kwrgs = {
            "feature_dim": feature_dim,
            "bn_momentum": bn_momentum,
        }

        # build blocks
        self.blocks = []
        for n in range(n_total):
            # some shared blocks
            if fcs and n < len(fcs):
                self.blocks.append(FeatureBlock(**kwrgs, fc=fcs[n])) # Building shared blocks by providing FC layers
            # build new blocks
            else:
                self.blocks.append(FeatureBlock(**kwrgs)) # Step dependent blocks without the shared FC layers

    def call(self, x, training = None):
        # input passes through the first block
        x = self.blocks[0](x, training=training) 
        # for the remaining blocks
        for n in range(1, self.n_total):
            # output from previous block gets multiplied by sqrt(0.5) and output of this block gets added
            x = x * tf.sqrt(0.5) + self.blocks[n](x, training=training) 
        return x

    @property
    def shared_fcs(self):
        return [self.blocks[i].fc for i in range(self.n_shared)]

In [8]:
class AttentiveTransformer(tf.keras.Model):
    def __init__(self, feature_dim):
        super(AttentiveTransformer, self).__init__()
        self.block = FeatureBlock(
            feature_dim,
            apply_glu=False,
        )

    def call(self, x, prior_scales, training=None):
        x = self.block(x, training=training)
        return sparsemax(x * prior_scales)

In [9]:
class TabNet(tf.keras.Model):
    def __init__(
        self,
        num_features,
        feature_dim,
        output_dim,
        n_step = 4,
        n_total = 4,
        n_shared = 2,
        relaxation_factor = 1.5,
        bn_epsilon = 1e-5,
        bn_momentum = 0.7,
        sparsity_coefficient = 1e-5
    ):
        super(TabNet, self).__init__()
        self.output_dim, self.num_features = output_dim, num_features
        self.n_step, self.relaxation_factor = n_step, relaxation_factor
        self.sparsity_coefficient = sparsity_coefficient

        self.bn = tf.keras.layers.BatchNormalization(
            momentum=bn_momentum, epsilon=bn_epsilon
        )

        kargs = {
            "feature_dim": feature_dim + output_dim,
            "n_total": n_total,
            "n_shared": n_shared,
            "bn_momentum": bn_momentum
        }

        # first feature transformer block is built first to get the shared blocks
        self.feature_transforms = [FeatureTransformer(**kargs)]
        self.attentive_transforms = []
            
        # each step consists out of FT and AT
        for i in range(n_step):
            # update feature dimension
            self.feature_transforms.append(
                FeatureTransformer(**kargs, fcs=self.feature_transforms[0].shared_fcs)g
            )
            self.attentive_transforms.append(
                AttentiveTransformer(num_features)
            )
        
        # Final output layer
        self.head = tf.keras.layers.Dense(6, activation="softmax", use_bias=False)

    def extract_intermediate(self, features, step_to_return=1, training=False):
        return self.call(features, training=training, return_intermediate=True, step_to_return=step_to_return)

    def call(self, features, training=None, return_intermediate=False, step_to_return=1):
        bs = tf.shape(features)[0] # get batch shape
        out_agg = tf.zeros((bs, self.output_dim)) # empty array with outputs to fill
        prior_scales = tf.ones((bs, self.num_features)) # prior scales initialised as 1s
        importance = tf.zeros([bs, self.num_features]) # importances
        masks = []

        features = self.bn(features, training=training) # Batch Normalisation
        masked_features = features

        total_entropy = 0.0

        intermediate_output = None

        for step_i in range(self.n_step + 1):
            # (masked) features go through the FT
            x = self.feature_transforms[step_i](
                masked_features, training=training
            )
        
            # Check if we should return the intermediate output
            if step_i == step_to_return:
                intermediate_output = x[:, self.output_dim:] # 這是你可能想用來做降維表示的部分

            # first FT is not used to generate output
            if step_i > 0:
                # first half of the FT output goes towards the decision 
                out = tf.keras.activations.relu(x[:, : self.output_dim])
                out_agg += out
                scale_agg = tf.reduce_sum(out, axis=1, keepdims=True) / (self.n_step - 1)
                importance += mask_values * scale_agg

            # no need to build the features mask for the last step
            if step_i < self.n_step:
                # second half of the FT output goes as input to the AT
                x_for_mask = x[:, self.output_dim :]
            
                # apply AT with prior scales
                mask_values = self.attentive_transforms[step_i](
                    x_for_mask, prior_scales, training=training
                )

                # recalculate the prior scales
                prior_scales *= self.relaxation_factor - mask_values
            
                # multiply the second half of the FT output by the attention mask to enforce sparsity
                masked_features = tf.multiply(mask_values, features)

                # entropy is used to penalize the amount of sparsity in feature selection
                total_entropy += tf.reduce_mean(
                    tf.reduce_sum(
                        tf.multiply(-mask_values, tf.math.log(mask_values + 1e-15)),
                        axis=1,
                    )
                )   
            
                # append mask values for later explainability
                masks.append(tf.expand_dims(tf.expand_dims(mask_values, 0), 3))
            
        #Per step selection masks        
        self.selection_masks = masks
    
        if return_intermediate:
            return intermediate_output  # 返回降維後的特徵

        # Final output
        final_output = self.head(out)
    
        # Add sparsity loss
        loss = total_entropy / (self.n_step-1)
        self.add_loss(self.sparsity_coefficient * loss)
    
        return final_output, importance

In [29]:
def prepare_tf_dataset(
    X,
    batch_size,
    y = None,
    shuffle = False,
    drop_remainder = False,
):
    size_of_dataset = len(X)
    if y is not None:
        y = tf.one_hot(y.astype(int), 6)
        y = tf.squeeze(y, axis=1)
        ds = tf.data.Dataset.from_tensor_slices((np.array(X.astype(np.float32)), y))
    else:
        ds = tf.data.Dataset.from_tensor_slices(np.array(X.astype(np.float32)))
    if shuffle:
        ds = ds.shuffle(buffer_size=size_of_dataset)
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)

    autotune = tf.data.experimental.AUTOTUNE
    ds = ds.prefetch(autotune)
    return ds

train_ds = prepare_tf_dataset(x_poison, x_train.shape[1], y_poison)
test_ds = prepare_tf_dataset(x_test, x_test.shape[1], y_test)


In [30]:
tabnet = TabNet(num_features = x_train.shape[1],
                output_dim = 128,
                feature_dim = 128,
                n_step = 2, 
                relaxation_factor= 2.2,
                sparsity_coefficient=2.37e-07,
                n_shared = 2,
                bn_momentum = 0.9245)


# Early stopping based on validation loss    
cbs = [tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=50, restore_best_weights=True
    )]

# Optimiser 
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=10)

# Second loss in None because we also output the importances
loss = [tf.keras.losses.CategoricalCrossentropy(from_logits=False), None]

# Compile the model
tabnet.compile(optimizer,
               loss=loss)

# Train the model
tabnet.fit(train_ds, 
           epochs=100, 
           validation_data=test_ds,
           callbacks=cbs,
           verbose=1,
           )

Epoch 1/100
[1m6338/6338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 6ms/step - loss: 0.4890 - val_loss: 0.3607
Epoch 2/100
[1m6338/6338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 6ms/step - loss: 0.3254 - val_loss: 0.4076
Epoch 3/100
[1m6338/6338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 4ms/step - loss: 0.2648 - val_loss: 0.2969
Epoch 4/100
[1m6338/6338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 4ms/step - loss: 0.2095 - val_loss: 0.2200
Epoch 5/100
[1m6338/6338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 4ms/step - loss: 0.1716 - val_loss: 1.2105
Epoch 6/100
[1m6338/6338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 5ms/step - loss: 0.1443 - val_loss: 0.5483
Epoch 7/100
[1m6338/6338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 4ms/step - loss: 0.1339 - val_loss: 1.0230
Epoch 8/100
[1m6338/6338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 4ms/step - loss: 0.1277 - val_loss: 0.2530
Epoch 9/

<keras.src.callbacks.history.History at 0x7f7f5c5d4e50>

In [25]:
val_preds, val_imps = tabnet.predict(train_ds)
predict_classes = np.argmax(val_preds, axis=1)
print(classification_report(predict_classes, y_train))

[1m6338/6338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     47185
           1       0.85      0.87      0.86     27712
           2       0.96      0.98      0.97    325207
           3       0.96      0.99      0.98     31014
           4       0.95      0.92      0.93     62175
           5       0.00      0.00      0.00      7333

    accuracy                           0.96    500626
   macro avg       0.79      0.79      0.79    500626
weighted avg       0.94      0.96      0.95    500626



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
val_preds, val_imps = tabnet.predict(test_ds)
predict_classes = np.argmax(val_preds, axis=1)
print(classification_report(predict_classes, y_test))

[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12062
           1       0.85      0.86      0.85      6895
           2       0.98      0.98      0.98     82517
           3       0.98      0.99      0.99      7888
           4       0.95      0.92      0.93     15758
           5       0.00      0.00      0.00        37

    accuracy                           0.97    125157
   macro avg       0.79      0.79      0.79    125157
weighted avg       0.97      0.97      0.97    125157



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
l = []
for batch in test_ds:
    features, labels = batch  # 解包特征和标签
    intermediate_output = tabnet.extract_intermediate(features, step_to_return=1)
    l.append(intermediate_output)

print(l)

2024-08-19 08:40:22.098165: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


[<tf.Tensor: shape=(79, 128), dtype=float32, numpy=
array([[-0.03423266,  0.45517826,  0.3684355 , ...,  0.34571454,
        -0.22590423, -0.5542538 ],
       [-0.02041008,  0.32621533,  0.41694868, ...,  0.29045418,
        -0.25479746, -0.39199692],
       [-0.01409231,  0.3263523 ,  0.41849428, ...,  0.28742534,
        -0.25768405, -0.40019247],
       ...,
       [-0.15645401,  0.42110428,  1.4226559 , ...,  0.10860367,
        -0.3470301 , -0.04214033],
       [ 0.01602878,  0.33208036,  0.31158584, ...,  0.1743126 ,
        -0.16246301, -0.24578974],
       [ 2.2655072 , -0.6429365 , -0.518727  , ..., -3.8383937 ,
        -1.9567308 , -2.8583426 ]], dtype=float32)>, <tf.Tensor: shape=(79, 128), dtype=float32, numpy=
array([[-0.11197027,  0.4296065 ,  0.47215632, ...,  0.32709065,
        -0.16953318, -0.2504937 ],
       [-0.08983655,  0.33238912,  0.5443308 , ...,  0.2720764 ,
        -0.14160714, -0.32639962],
       [-2.3604581 , -4.4911237 , -2.1543586 , ..., -1.9677708 ,
  

In [22]:
attack_df = pd.concat([x_test, pd.DataFrame(y_test, columns=["Cat"])], axis=1)
attack_df = attack_df[attack_df[TARGET_NAME] != "5.0"].sample(n=100)

attack_ds, y_attack = attack_df.drop(TARGET_NAME, axis=1), attack_df["Cat"]

for i in range(trigger_size):
    feature = feature_list[i]
    value = trigger_value[i]
    attack_df[feature] = value

attack_ds = prepare_tf_dataset(attack_ds, attack_df.shape[1], y_attack)

val_preds, val_imps = tabnet.predict(attack_ds)
predict_classes = np.argmax(val_preds, axis=1)
print(classification_report(predict_classes, y_attack))

2024-08-17 10:14:57.985634: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: Can not squeeze dim[1], expected a dimension of 1, got 6


InvalidArgumentError: {{function_node __wrapped__Squeeze_device_/job:localhost/replica:0/task:0/device:GPU:0}} Can not squeeze dim[1], expected a dimension of 1, got 6 [Op:Squeeze] name: 