In [9]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [10]:
class DeepSparseDenoisingAutoencoder(nn.Module):
    def __init__(
        self,
        input_dim,
        hidden_dims=[
            64,
            32,
            16,
        ],  # Dimensions of encoder layers (decoder will be symmetric)
        activation="relu",
        dropout_rate=0.2,
        noise_factor=0.2,
        sparsity_weight=1e-3,
        sparsity_target=0.05,
    ):
        super(DeepSparseDenoisingAutoencoder, self).__init__()

        self.input_dim = input_dim
        self.hidden_dims = hidden_dims
        self.bottleneck_dim = hidden_dims[-1]
        self.noise_factor = noise_factor
        self.sparsity_weight = sparsity_weight
        self.sparsity_target = sparsity_target

        # Set activation function
        if activation == "relu":
            self.activation = nn.ReLU()
        elif activation == "sigmoid":
            self.activation = nn.Sigmoid()
        elif activation == "tanh":
            self.activation = nn.Tanh()
        else:
            raise ValueError(f"Activation {activation} not supported")

        # Dropout layer for regularization
        self.dropout = nn.Dropout(dropout_rate)

        # Build encoder
        encoder_layers = []
        prev_dim = input_dim

        for dim in hidden_dims:
            encoder_layers.append(nn.Linear(prev_dim, dim))
            encoder_layers.append(self.activation)
            encoder_layers.append(self.dropout)
            prev_dim = dim

        self.encoder = nn.Sequential(*encoder_layers)

        # Build decoder (symmetric to encoder)
        decoder_layers = []
        hidden_dims_reversed = list(reversed(hidden_dims))

        prev_dim = hidden_dims[-1]  # Start from bottleneck

        for i, dim in enumerate(hidden_dims_reversed):
            if i < len(hidden_dims_reversed) - 1:
                next_dim = hidden_dims_reversed[i + 1]
            else:
                next_dim = input_dim

            decoder_layers.append(nn.Linear(prev_dim, next_dim))

            # Only add activation for all but the last layer
            if i < len(hidden_dims_reversed) - 1:
                decoder_layers.append(self.activation)
                decoder_layers.append(self.dropout)
            else:
                # Output layer activation is sigmoid since data is normalized to [0,1]
                decoder_layers.append(nn.Sigmoid())
                pass

            prev_dim = next_dim

        self.decoder = nn.Sequential(*decoder_layers)

    def add_noise(self, x):
        # Add Gaussian noise for denoising capability
        noise = torch.randn_like(x) * self.noise_factor
        return x + noise

    def forward(self, x):
        # Add noise to input (only during training)
        if self.training:
            x_noisy = self.add_noise(x)
        else:
            x_noisy = x

        # Encode
        encoded = self.encoder(x_noisy)

        # Decode
        decoded = self.decoder(encoded)

        return decoded, encoded

    def get_bottleneck_representation(self, x):
        """Extract the bottleneck features for dimensionality reduction"""
        with torch.no_grad():
            self.eval()  # Set to evaluation mode
            encoded = self.encoder(x)
        return encoded

    def kl_divergence_loss(self, activations):
        """Calculate KL divergence for sparsity constraint"""
        # Average activation across batch
        rho_hat = torch.mean(activations, dim=0)
        # KL divergence between rho_hat and target sparsity level
        kl_loss = self.sparsity_target * torch.log(
            self.sparsity_target / (rho_hat + 1e-10)
        ) + (1 - self.sparsity_target) * torch.log(
            (1 - self.sparsity_target) / (1 - rho_hat + 1e-10)
        )
        return torch.sum(kl_loss)

In [11]:
def train_autoencoder(
    model,
    dataloader,
    epochs=30,
    learning_rate=0.0001,
    device="cuda" if torch.cuda.is_available() else "cpu",
):
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    reconstruction_criterion = nn.MSELoss()

    history = {"loss": [], "val_loss": []}

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for data in dataloader:
            inputs = data[0].to(device)

            # Forward pass
            outputs, encoded = model(inputs)

            # Calculate loss
            reconstruction_loss = reconstruction_criterion(outputs, inputs)
            sparsity_loss = model.kl_divergence_loss(encoded)
            total_loss = reconstruction_loss + model.sparsity_weight * sparsity_loss

            # Backward pass and optimize
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            train_loss += total_loss.item()

        avg_train_loss = train_loss / len(dataloader)
        history["loss"].append(avg_train_loss)

        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_train_loss:.4f}")

    return history

In [12]:
import pandas as pd

train_set_path = (
    "/home/jbct/Projects/thesis/db-ocsvm/data/processed/NSL-KDD/train_set.csv"
)
train_df = pd.read_csv(train_set_path)
# get only sample
train_df = train_df.sample(frac=0.1, random_state=42)
print(train_df.shape)
train_df.head()

(5387, 122)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
28282,-0.102571,-0.007723,-0.004728,-0.014089,-0.089487,-0.007736,1.765416,-0.027023,1.235686,-0.011664,...,-0.31289,-0.11205,-0.028606,-0.139983,-0.618441,-0.053906,-0.031768,-0.019726,0.825156,-0.046432
19433,-0.11025,-0.007666,-0.004919,-0.014089,-0.089487,-0.007736,-0.095076,-0.027023,-0.809267,-0.011664,...,-0.31289,-0.11205,-0.028606,-0.139983,-0.618441,-0.053906,-0.031768,-0.019726,0.825156,-0.046432
30618,-0.11025,-0.007733,-0.004414,-0.014089,-0.089487,-0.007736,-0.095076,-0.027023,1.235686,-0.011664,...,-0.31289,-0.11205,-0.028606,-0.139983,-0.618441,-0.053906,-0.031768,-0.019726,0.825156,-0.046432
14731,-0.11025,-0.007674,-0.004918,-0.014089,-0.089487,-0.007736,-0.095076,-0.027023,-0.809267,-0.011664,...,-0.31289,-0.11205,-0.028606,-0.139983,-0.618441,-0.053906,-0.031768,-0.019726,0.825156,-0.046432
26126,-0.11025,-0.007642,-0.004835,-0.014089,-0.089487,-0.007736,-0.095076,-0.027023,1.235686,-0.011664,...,-0.31289,-0.11205,-0.028606,-0.139983,-0.618441,-0.053906,-0.031768,-0.019726,0.825156,-0.046432


In [13]:
# # Assume dataset is already preprocessed and normalized [0,1]
# # Load NSL-KDD dataset (example)
# # X: features, y: labels (1 for normal, -1 for anomaly)
# # This is just a placeholder - you would need to load the actual data

# # Filter only normal data for training
# X = np.random.rand(1000, 122)  # Placeholder: assume 122 features in NSL-KDD
# y = np.random.choice([-1, 1], size=1000)  # Random labels for illustration

# # Filter only normal data points for training
# X_normal = X[y == 1]
# X_train, X_val = train_test_split(X_normal, test_size=0.2, random_state=42)

In [14]:
X_train = train_df.values

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
# X_val_tensor = torch.FloatTensor(X_val)

# Create data loaders
train_dataset = TensorDataset(X_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Initialize the autoencoder
input_dim = X_train.shape[1]
autoencoder = DeepSparseDenoisingAutoencoder(
    input_dim=input_dim,
    hidden_dims=[64, 32, 16],
    activation="relu",
    dropout_rate=0.2,
    noise_factor=0.1,
)

# Train the model
history = train_autoencoder(autoencoder, train_loader, epochs=10)

Epoch 1/10, Loss: 0.7735
Epoch 2/10, Loss: 0.7603
Epoch 3/10, Loss: 0.7339
Epoch 4/10, Loss: nan
Epoch 5/10, Loss: nan
Epoch 6/10, Loss: nan
Epoch 7/10, Loss: nan
Epoch 8/10, Loss: nan
Epoch 9/10, Loss: nan
Epoch 10/10, Loss: nan


In [15]:
# Extract bottleneck features for OCSVM
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_bottleneck = (
    autoencoder.get_bottleneck_representation(X_train_tensor.to(device)).cpu().numpy()
)

print(f"Original data shape: {X_train.shape}")
print(f"Bottleneck representation shape: {X_train_bottleneck.shape}")

Original data shape: (5387, 122)
Bottleneck representation shape: (5387, 16)


In [16]:
# # Evaluate reconstruction error
# autoencoder.eval()
# with torch.no_grad():
#     # For normal validation data
#     # We already have X_val_tensor as our normal validation data
#     # since we filtered X_normal before splitting into train/val
#     reconstructed_normal, _ = autoencoder(X_val_tensor.to(device))
#     reconstruction_error_normal = torch.mean(
#         torch.pow(X_val_tensor.to(device) - reconstructed_normal, 2), dim=1
#     )

#     # Create anomaly validation data separately
#     X_anomaly = X[y == -1]  # Get all anomaly data
#     X_val_anomaly_tensor = torch.FloatTensor(
#         X_anomaly[:100]
#     )  # Take at most 100 samples

#     reconstructed_anomaly, _ = autoencoder(X_val_anomaly_tensor.to(device))
#     reconstruction_error_anomaly = torch.mean(
#         torch.pow(X_val_anomaly_tensor.to(device) - reconstructed_anomaly, 2), dim=1
#     )

#     print(
#         f"Avg Reconstruction Error (Normal): {torch.mean(reconstruction_error_normal):.4f}"
#     )
#     print(
#         f"Avg Reconstruction Error (Anomaly): {torch.mean(reconstruction_error_anomaly):.4f}"
#     )