# Variational Deviation Network

## imports


In [7]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.sparse import csc_matrix
from joblib import Memory
from sklearn.datasets import load_svmlight_file
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from tensorflow.keras import backend as K, regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Lambda
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, Callback
from tensorflow.keras.optimizers import AdamW

## data loader functoins + sampling using Autoencoder outputs


In [None]:

mem = Memory("./dataset/svm_data", verbose=0)

@mem.cache
def get_data_from_svmlight_file(path):
    X, y = load_svmlight_file(path)
    return X.toarray(), y

def dataLoading(path):
    df = pd.read_csv(path)
    labels = df['class'].values
    x = df.drop(['class'], axis=1).values
    return x, labels

# sampling function used in VDevNet, based on the reparameterization trick.
# this function takes the mean and log varianec from the encoder 
# to generate a latent vector that follows a gaussian
def sampling(args):
    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

## special callback for aucpr

In [None]:
# callback to compute aupr on validation data each epoch
# it predicts on x_val and stores score in logs
class AUC_Callback(Callback):
    def __init__(self, x_val, y_val):
        super().__init__()
        self.x_val = x_val
        self.y_val = y_val

    def on_epoch_end(self, epoch, logs=None):
        y_pred = self.model.predict(self.x_val, verbose=0)
        if y_pred.shape[-1] == 1:
            y_pred = y_pred.flatten()
        logs = logs or {}
        logs['val_aupr'] = average_precision_score(self.y_val, y_pred)


## AUTOENCODER + DECODER ARCHITECTUER FOR VARIANCE AND MEAN CALCULATION

In [None]:
# build a variational autoencoder for anomaly detection
# it learns to compress input into a small latent space and then reconstruct it
# we use mean and log variance to sample latent vector z
# loss combines reconstruction error and kl divergence to shape latent distribution
def build_vae(input_dim, latent_dim=2):
    inputs = Input(shape=(input_dim,))
    x = Dense(128, activation='relu')(inputs)
    x = BatchNormalization()(x)
    x = Dense(64, activation='relu')(x)
    x = BatchNormalization()(x)
    z_mean = Dense(latent_dim)(x)
    z_log_var = Dense(latent_dim)(x)
    z = Lambda(sampling)([z_mean, z_log_var])

    d = Dense(64, activation='relu')(z)
    d = BatchNormalization()(d)
    d = Dense(128, activation='relu')(d)
    d = BatchNormalization()(d)
    outputs = Dense(input_dim, activation='linear')(d)

    vae = Model(inputs, outputs, name='vae')
    recon_loss = tf.reduce_mean(tf.square(inputs - outputs)) * input_dim
    kl_loss = -0.5 * tf.reduce_mean(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
    vae.add_loss(recon_loss + kl_loss)
    vae.compile(optimizer=AdamW(learning_rate=1e-3))

    encoder = Model(inputs, [z_mean, z_log_var], name='encoder')
    return vae, encoder


## architecture, loss + helper functions(noise injedction and batch generation)

In [None]:
# create a deviation-based loss that pushes normal samples close to the reference distribution
# and forces anomalies to deviate by at least the specified margin
def create_vdev_loss(mu_R, sigma_R, margin=5.0):
    def deviation_loss(y_true, y_pred):
        y_true = K.cast(y_true, 'float32')
        dev = (y_pred - mu_R) / (sigma_R + K.epsilon())
        inlier_loss = K.abs(dev)
        outlier_loss = K.abs(K.maximum(margin - dev, 0.0))
        return K.mean((1 - y_true) * inlier_loss + y_true * outlier_loss)
    return deviation_loss


# define a deep deviation network with three hidden layers for complex data
def dev_network_d(input_shape):
    inp = Input(shape=input_shape)
    x = Dense(1000, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(inp)
    x = BatchNormalization()(x)
    x = Dense(250, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = BatchNormalization()(x)
    x = Dense(20, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = BatchNormalization()(x)
    out = Dense(1, activation='linear')(x)
    return Model(inp, out)

# this generator makes batches with equal mix of normal and outlier samples
# it picks half batch from outliers with replacement and half from inliers without replacement
# then shuffles and yields the data and labels indicating which are outliers
def batch_generator_sup(x, out_idx, in_idx, batch_size, rng):
    n_out = max(1, batch_size // 2)
    while True:
        out_samples = rng.choice(out_idx, n_out, replace=True)
        in_samples = rng.choice(in_idx, batch_size - n_out, replace=False)
        idx = np.concatenate([in_samples, out_samples])
        rng.shuffle(idx)
        labels = np.isin(idx, out_idx).astype(np.float32)
        yield x[idx], labels

# create synthetic samples by mixing features 5% swapped
def inject_noise(seed, n_out, random_seed):
    rng = np.random.RandomState(random_seed)
    n_sample, dim = seed.shape
    swap_ratio = 0.05
    n_swap = int(dim * swap_ratio)
    i1 = rng.choice(n_sample, size=n_out, replace=True)
    i2 = rng.choice(n_sample, size=n_out, replace=True)
    idxs = rng.choice(dim, size=(n_out, n_swap), replace=True)
    noise = seed[i1].copy()
    rows = np.arange(n_out)[:, None]
    noise[rows, idxs] = seed[i2[:, None], idxs]
    return noise


## Vdevnet final run

In [None]:

def run_vdevnet(config):
    results = []
    scaler = StandardScaler()

    for fname in os.listdir(config.input_path):
        if not fname.endswith('.csv'):
            continue
        name = fname.rsplit('.',1)[0]
        X, y = dataLoading(os.path.join(config.input_path, fname))

        X_train, X_temp, y_train, y_temp = train_test_split(
            X, y, test_size=0.3, stratify=y, random_state=config.random_seed)
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=config.random_seed)
        
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)

        out_idx = np.where(y_train==1)[0]
        in_idx = np.where(y_train==0)[0]
        
        n_synth_outliers = int(len(out_idx) * 0.5)  # Add 50% more synthetic outliers
        synth_outliers = inject_noise(X_train[out_idx], n_synth_outliers, config.random_seed)
        
        X_train_aug = np.vstack([X_train, synth_outliers])
        y_train_aug = np.concatenate([y_train, np.ones(n_synth_outliers)])  # Label as outliers
            
        in_idx = np.where(y_train_aug==0)[0]
        out_idx = np.where(y_train_aug==1)[0]

        n_noise = int(len(in_idx) * config.cont_rate / (1 - config.cont_rate))
        synth = inject_noise(X_train[in_idx], n_noise, config.random_seed)
        
        X_train_aug = np.vstack([X_train, synth])
        y_train_aug = np.concatenate([y_train, np.ones(n_noise)])
        
        in_idx = np.where(y_train_aug==0)[0]
        out_idx = np.where(y_train_aug==1)[0]

        print(f"[{name}] Training VAE...")
        vae, encoder = build_vae(input_dim=X_train.shape[1], latent_dim=config.latent_dim)
        vae.fit(X_train[in_idx], epochs=config.epochs, batch_size=config.batch_size, verbose=0)
        z_mean, z_log_var = encoder.predict(X_train[in_idx])
        mu_R = np.mean(z_mean)
        sigma_R = np.sqrt(np.mean(np.exp(z_log_var)))

        model = dev_network_d(input_shape=(X_train_aug.shape[1],))
        loss_fn = create_vdev_loss(mu_R, sigma_R, config.margin)
        optimizer = AdamW(learning_rate=config.lr, weight_decay=config.weight_decay)
        model.compile(loss=loss_fn, optimizer=optimizer)

        callbacks = [
            AUC_Callback(X_val, y_val),
            ModelCheckpoint(os.path.join(config.model_dir, f"vdevnet_{name}.keras")),
            ReduceLROnPlateau(monitor='val_aupr', mode='max', factor=0.5, patience=5, min_lr=1e-6),
            EarlyStopping(monitor='val_aupr', mode='max', patience=10, restore_best_weights=True)
        ]

        steps = max(1, len(in_idx)//config.batch_size)
        model.fit(
            batch_generator_sup(X_train_aug, out_idx, in_idx, config.batch_size, np.random),
            steps_per_epoch=steps,
            epochs=config.epochs,
            validation_data=(X_val, y_val),
            callbacks=callbacks,
            verbose=0
        )

        y_score = model.predict(X_test)
        if y_score.shape[-1]==1:
            y_score = y_score.flatten()
        roc = roc_auc_score(y_test, y_score)
        aupr = average_precision_score(y_test, y_score)
        print(f"[{name}] ROC AUC={roc:.4f}, AUPR={aupr:.4f}")
        results.append({'dataset':name, 'roc':roc, 'aupr':aupr})

    df = pd.DataFrame(results)
    df.to_csv(config.output_csv, index=False)
    print("All results saved to", config.output_csv)

if __name__ == "__main__":
    class Config: pass
    cfg = Config()
    cfg.input_path = './dataset/'
    cfg.model_dir = './model/'
    cfg.output_csv = 'all_dataset_results.csv'
    cfg.latent_dim = 4
    cfg.known_outliers = 30
    cfg.cont_rate = 0.02
    cfg.batch_size = 512
    cfg.epochs = 80
    cfg.lr = 5e-3
    cfg.weight_decay = 1e-5
    cfg.margin = 6.0
    cfg.random_seed = 42
    os.makedirs(cfg.model_dir, exist_ok=True)
    run_vdevnet(cfg)

[annthyroid_21feat_normalised] Training VAE...
[annthyroid_21feat_normalised] ROC AUC=0.9651, AUPR=0.9324
[bank-additional-full_normalised] Training VAE...
[bank-additional-full_normalised] ROC AUC=0.9266, AUPR=0.5851
[celeba_baldvsnonbald_normalised] Training VAE...
[celeba_baldvsnonbald_normalised] ROC AUC=0.9663, AUPR=0.3519
[census-income-full-mixed-binarized] Training VAE...
[census-income-full-mixed-binarized] ROC AUC=0.8986, AUPR=0.5614
[creditcardfraud_normalised] Training VAE...
[creditcardfraud_normalised] ROC AUC=0.9103, AUPR=0.6862
[KDD2014_donors_10feat_nomissing_normalised] Training VAE...
[KDD2014_donors_10feat_nomissing_normalised] ROC AUC=1.0000, AUPR=1.0000
[UNSW_NB15_traintest_backdoor] Training VAE...
[UNSW_NB15_traintest_backdoor] ROC AUC=0.9862, AUPR=0.9392
All results saved to all_dataset_results.csv
