In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

torch.manual_seed(42)
np.random.seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### Importing File & PreProcessing (Normalization)

In [2]:
def load_data(file_path):
    # Loading the dataset into a dataframe
    data = pd.read_csv(file_path)
    
    #Removing Timestamp column
    features = data.drop(['timestamp_(min)'], axis=1, errors='ignore') 

    # Normalizing features
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)
    tensor_data = torch.tensor(scaled_features, dtype=torch.float32)
    return tensor_data

# Loading and preprocessing data
file_path = 'train.csv'
data_tensor = load_data(file_path)

# Creating a DataLoader
train_loader = DataLoader(data_tensor, batch_size=64, shuffle=True)

#### Creating Binary Noise Mask Matrix

In [3]:
#Function to create Noise Mask Matrix
def generate_binary_noise_mask(num_samples, num_features, masked_proportion, mean_length):
    # Calculating transition probabilities
    pm_to_u = 1 / mean_length  
    pm_to_m = 1 - pm_to_u  
    pu_to_m = pm_to_u * (masked_proportion / (1 - masked_proportion))
    pu_to_u = 1 - pu_to_m     

    mask = np.zeros((num_samples, num_features), dtype=int)
    
    for feature in range(num_features):
        state = np.random.choice([0, 1], p=[1 - masked_proportion, masked_proportion])
        mask[0, feature] = state
        
        for i in range(1, num_samples):
            if state == 1:
                state = np.random.choice([0, 1], p=[pm_to_u, pm_to_m])
            else:
                state = np.random.choice([0, 1], p=[pu_to_m, pu_to_u])
            mask[i, feature] = state

    return torch.tensor(mask, dtype=torch.float32)

#### Creating Masked Data Loader

In [4]:
#Function to mask generated DataLoader
def masked_dataloader(data_tensor, batch_size, masked_proportion, mean_length):
    # Generating masks for all data
    num_samples, num_features = data_tensor.size()
    mask = generate_binary_noise_mask(num_samples, num_features, masked_proportion, mean_length)
    masked_data = data_tensor * mask

    # Create a DataLoader
    dataset = TensorDataset(masked_data, data_tensor)  
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

train_loader = masked_dataloader(data_tensor, batch_size=64, masked_proportion=0.2, mean_length=5)

#### GAN Implementation

In [5]:
class TransformerEncoder(nn.Module):
    def __init__(self, feature_size, num_heads, num_layers, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.encoder_layers = nn.TransformerEncoderLayer(d_model=feature_size, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layers, num_layers=num_layers)

    def forward(self, src):
        return self.transformer_encoder(src)

In [6]:
class MLPDecoder(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MLPDecoder, self).__init__()
        self.fc1 = nn.Linear(input_dim, output_dim)
        self.fc2 = nn.Linear(output_dim, output_dim)
        self.activation = nn.Sigmoid()

    def forward(self, x):
        x = self.activation(self.fc1(x))
        return self.activation(self.fc2(x))

In [7]:
class Generator(nn.Module):
    def __init__(self, feature_size, num_heads, num_encoder_layers, output_dim):
        super(Generator, self).__init__()
        self.encoder = TransformerEncoder(feature_size, num_heads, num_encoder_layers)
        self.decoder = MLPDecoder(feature_size, output_dim)

    def forward(self, x):
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        return reconstructed

In [8]:
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 1)
        self.activation = nn.Sigmoid()

    def forward(self, x):
        x = nn.functional.relu(self.fc1(x))
        return self.activation(self.fc2(x))

In [9]:
# Function for Reconstruction Loss
def reconstruction_loss(reconstructed, original):
    return nn.functional.mse_loss(reconstructed, original)

# Setting up models and optimizers
generator = Generator(feature_size=25, num_heads=5, num_encoder_layers=3, output_dim=25).to(device)
discriminator = Discriminator(input_dim=25).to(device)

g_optimizer = optim.Adam(generator.parameters(), lr=0.001)
d_optimizer = optim.Adam(discriminator.parameters(), lr=0.001)

#### Training

In [10]:
# Function for Training
def train_epoch(generator, discriminator, loader, g_optimizer, d_optimizer, device, track_errors=False):
    generator.train()
    discriminator.train()
    reconstruction_errors = []

    for masked_data, real_data in loader:
        masked_data, real_data = masked_data.to(device), real_data.to(device)
        
        # Forward pass through generator
        generated_data = generator(masked_data)

        # Discriminator training
        d_optimizer.zero_grad()
        real_pred = discriminator(real_data)
        fake_pred = discriminator(generated_data.detach())
        d_loss = -(torch.log(real_pred) + torch.log(1 - fake_pred)).mean()
        d_loss.backward()
        d_optimizer.step()

        # Generator training
        g_optimizer.zero_grad()
        fake_pred = discriminator(generated_data)
        g_loss = -torch.log(fake_pred).mean()
        rec_loss = nn.functional.mse_loss(generated_data, real_data)  # Reconstruction loss
        total_g_loss = g_loss + rec_loss
        total_g_loss.backward()
        g_optimizer.step()

        # Optionally track reconstruction errors
        if track_errors:
            with torch.no_grad():
                batch_errors = nn.functional.mse_loss(generated_data, real_data, reduction='none')
                batch_errors = batch_errors.mean(dim=1)  # Mean error per sample
                reconstruction_errors.extend(batch_errors.cpu().numpy())

    return reconstruction_errors if track_errors else None

In [11]:
# Training the model and tracking errors
num_epochs = 1
error_tracking = []
for epoch in range(num_epochs):
    errors = train_epoch(generator, discriminator, train_loader, g_optimizer, d_optimizer, device, track_errors=True)
    print(f'Epoch {epoch+1}/{num_epochs} completed.')

# Determining threshold for anomaly detection
threshold = 0.5
print(f"Anomaly Detection Threshold: {threshold}")

Epoch 1/1 completed.
Anomaly Detection Threshold: 0.5


#### Anomly Detection After Training

In [12]:
#Function to detect anomolied
def detect_anomalies(data_loader, generator, threshold, device):
    anomalies = []
    generator.eval()
    with torch.no_grad():
        for data, _ in data_loader:
            data = data.to(device)
            reconstructed_data = generator(data)
            error = nn.functional.mse_loss(reconstructed_data, data, reduction='none')
            error = error.mean(dim=1)
            anomalies.extend((error > threshold).cpu().numpy())

    return anomalies

anomalies_detected = detect_anomalies(train_loader, generator, threshold, device)
print(f"Number of anomalies detected: {np.sum(anomalies_detected)}")

Number of anomalies detected: 0
