# In [None]:
import numpy as np
import pandas as pd

# Fixed seed for the synthetic draws so that re-running this script produces
# the same dataset (and therefore comparable downstream metrics).
random_seed = 42
rng = np.random.default_rng(random_seed)

# Dataset dimensions: a large "normal" population plus a small anomalous one.
num_samples_normal = 10000
num_samples_anomaly = 200
num_features = 10

# Normal samples: every feature is drawn from N(0, 1).
mean_normal = np.zeros(num_features)
std_normal = np.ones(num_features)
normal_data = rng.normal(loc=mean_normal, scale=std_normal, size=(num_samples_normal, num_features))

# Anomalous samples: every feature is drawn from N(5, 1.5**2), i.e. shifted
# and more spread out than the normal population.
mean_anomaly = np.full(num_features, 5.0)
std_anomaly = np.full(num_features, 1.5)
anomalous_data = rng.normal(loc=mean_anomaly, scale=std_anomaly, size=(num_samples_anomaly, num_features))

# Normal rows first, anomalous rows last; the label vector relies on this order.
combined_data = np.vstack((normal_data, anomalous_data))

# 0 = normal, 1 = anomaly.
labels = np.zeros(num_samples_normal + num_samples_anomaly)
labels[num_samples_normal:] = 1

feature_names = [f'feature_{i+1}' for i in range(num_features)]

df_synthetic = pd.DataFrame(combined_data, columns=feature_names)
df_synthetic['anomaly'] = labels

# The "- 1" excludes the 'anomaly' label column from the feature count.
print(f"Synthetic dataset created with {df_synthetic.shape[0]} samples and {df_synthetic.shape[1] - 1} features.")
print(f"Number of normal samples: {np.sum(df_synthetic['anomaly'] == 0)}")
print(f"Number of anomalous samples: {np.sum(df_synthetic['anomaly'] == 1)}")
print("First 5 rows of the synthetic dataset:")
print(df_synthetic.head())
print("Last 5 rows of the synthetic dataset (expected anomalies):")
print(df_synthetic.tail())

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the ground-truth flag from the feature columns.
y = df_synthetic['anomaly']
X = df_synthetic.drop(columns='anomaly')

print(f"Original dataset shape: {X.shape}, Labels shape: {y.shape}\n")

# Boolean row masks for the two populations.
is_normal = y == 0
is_anomaly = y == 1

X_normal, y_normal = X[is_normal], y[is_normal]
X_anomaly, y_anomaly = X[is_anomaly], y[is_anomaly]

print(f"Normal data shape: {X_normal.shape}, Anomalous data shape: {X_anomaly.shape}\n")

# Only normal rows are split: 70% train, then the held-out 30% is halved
# into validation and test (15% / 15% of the normal data).
X_train_normal, X_temp_normal, y_train_normal, y_temp_normal = train_test_split(
    X_normal, y_normal, test_size=0.3, random_state=42
)
X_val_normal, X_test_normal_only, y_val_normal, y_test_normal_only = train_test_split(
    X_temp_normal, y_temp_normal, test_size=0.5, random_state=42
)

print(f"Normal training data shape: {X_train_normal.shape}")
print(f"Normal validation data shape: {X_val_normal.shape}")
print(f"Normal test (only normal) data shape: {X_test_normal_only.shape}\n")

# Scaling statistics come from the normal training rows only; every other
# subset is transformed with those same statistics.
scaler = StandardScaler().fit(X_train_normal)
X_train_scaled = scaler.transform(X_train_normal)
X_val_scaled = scaler.transform(X_val_normal)
X_test_normal_only_scaled = scaler.transform(X_test_normal_only)
X_anomaly_scaled = scaler.transform(X_anomaly)

print("Features scaled using StandardScaler fitted on training data.\n")

# Final test set: held-out normal rows followed by every anomalous row.
X_test_final_scaled = np.concatenate(
    (X_test_normal_only_scaled, X_anomaly_scaled), axis=0
)
y_test_final = np.concatenate((y_test_normal_only, y_anomaly))

print(f"Final X_train_scaled shape: {X_train_scaled.shape}")
print(f"Final y_train_normal shape: {y_train_normal.shape}")
print(f"Final X_val_scaled shape: {X_val_scaled.shape}")
print(f"Final y_val_normal shape: {y_val_normal.shape}")
print(f"Final X_test_final_scaled shape: {X_test_final_scaled.shape}")
print(f"Final y_test_final shape: {y_test_final.shape}")

# Report how many test rows are anomalies, as a count and a percentage.
num_total_in_test = len(y_test_final)
num_anomalies_in_test = np.sum(y_test_final == 1)
print(f"Number of anomalies in final test set: {num_anomalies_in_test} ({(num_anomalies_in_test/num_total_in_test)*100:.2f}%) ")

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

print("TensorFlow and Keras libraries imported successfully.")

# Input width follows the scaled training matrix; the latent code is 5-D.
latent_dim = 5
original_dim = X_train_scaled.shape[1]

print(f"Original input dimension: {original_dim}")
print(f"Latent dimension: {latent_dim}")

input_tensor = keras.Input(shape=(original_dim,), name='encoder_input')

# Encoder trunk: two ReLU dense layers of width 64 then 32.
x = input_tensor
for units in (64, 32):
    x = layers.Dense(units, activation='relu')(x)

# Two linear heads on the shared trunk: the mean and the log-variance of the
# latent Gaussian.
z_mean = layers.Dense(latent_dim, name='z_mean')(x)
z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)

def sampling(args):
    """Draw a latent sample with the reparameterization trick.

    Args:
        args: Pair ``(z_mean, z_log_var)`` of tensors; the noise is shaped
            from ``z_mean``'s dynamic first dimension and static second one.

    Returns:
        ``z_mean + exp(0.5 * z_log_var) * epsilon`` where ``epsilon`` comes
        from ``K.random_normal`` with its default parameters.
    """
    mu, log_var = args
    n_rows = K.shape(mu)[0]
    n_latent = K.int_shape(mu)[1]
    noise = K.random_normal(shape=(n_rows, n_latent))
    # exp(0.5 * log_var) converts the log-variance into a standard deviation.
    sigma = K.exp(0.5 * log_var)
    return mu + sigma * noise

# Sample z from (z_mean, z_log_var) through the `sampling` helper; the output
# shape is given explicitly rather than left for the Lambda layer to infer.
z = layers.Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

# The encoder exposes all three tensors so callers can use either the
# distribution parameters or the sampled code.
encoder = Model(input_tensor, [z_mean, z_log_var, z], name='encoder')
encoder.summary()

print("Encoder and reparameterization trick implemented successfully.")

latent_inputs = keras.Input(shape=(latent_dim,), name='decoder_input')

# Decoder trunk mirrors the encoder (32 -> 64). The intermediate tensor is
# deliberately NOT called `y`: that name already holds the anomaly labels
# earlier in this file, and rebinding it here would silently replace the
# labels with a Keras tensor.
decoder_hidden = layers.Dense(32, activation='relu')(latent_inputs)
decoder_hidden = layers.Dense(64, activation='relu')(decoder_hidden)

# Linear output: reconstructions live in the standardized feature space,
# which is unbounded.
reconstruction = layers.Dense(original_dim, activation='linear', name='decoder_output')(decoder_hidden)

decoder = Model(latent_inputs, reconstruction, name='decoder')
decoder.summary()

print("Decoder implemented successfully.")

class VAE(keras.Model):
    """Variational autoencoder that chains a given encoder and decoder.

    ``train_step`` and ``test_step`` are overridden so the model optimises and
    reports the sum of two terms:

    * a reconstruction term: per-sample mean squared error, averaged over the
      batch and multiplied by ``original_dim``;
    * a KL term: ``-0.5 * sum(1 + z_log_var - z_mean**2 - exp(z_log_var))``
      over the latent axis, averaged over the batch.

    Args:
        encoder: Model called on a batch of inputs; its output is unpacked as
            ``(z_mean, z_log_var, z)``.
        decoder: Model called on ``z`` to produce the reconstruction.
        original_dim: Scale factor applied to the mean squared error.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, encoder, decoder, original_dim, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.original_dim = original_dim
        # Running means, exposed through `metrics` and the step return values.
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    def call(self, inputs):
        """Encode ``inputs``, then decode the sampled latent ``z``."""
        _, _, z = self.encoder(inputs)
        return self.decoder(z)

    def _vae_losses(self, data):
        """Run one forward pass and return the loss terms.

        Shared by ``train_step`` and ``test_step`` so the objective is defined
        in exactly one place.

        Returns:
            Tuple ``(total_loss, reconstruction_loss, kl_loss)``.
        """
        z_mean, z_log_var, z = self.encoder(data)
        reconstruction = self.decoder(z)
        reconstruction_loss = tf.reduce_mean(
            keras.losses.mean_squared_error(data, reconstruction)
        ) * self.original_dim
        kl_loss = -0.5 * tf.reduce_mean(
            tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1)
        )
        return reconstruction_loss + kl_loss, reconstruction_loss, kl_loss

    def _record_losses(self, total_loss, reconstruction_loss, kl_loss):
        """Update the three trackers and return their current means as logs."""
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

    def train_step(self, data):
        """Perform one optimisation step on a batch and return the loss logs."""
        # Only the first element of a tuple batch is used (the features).
        if isinstance(data, tuple):
            data = data[0]

        # The forward pass must run under the tape so gradients can be taken.
        with tf.GradientTape() as tape:
            total_loss, reconstruction_loss, kl_loss = self._vae_losses(data)

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        return self._record_losses(total_loss, reconstruction_loss, kl_loss)

    def test_step(self, data):
        """Evaluate the same losses on a batch without updating any weights."""
        if isinstance(data, tuple):
            data = data[0]

        total_loss, reconstruction_loss, kl_loss = self._vae_losses(data)
        return self._record_losses(total_loss, reconstruction_loss, kl_loss)

    @property
    def metrics(self):
        """Trackers owned by this model, in the order total / reconstruction / KL."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

# Wire the encoder and decoder built above into one trainable model.
vae = VAE(encoder=encoder, decoder=decoder, original_dim=original_dim)
# No `loss=` argument: the objective is computed inside the custom steps.
vae.compile(optimizer='adam')
# Build with an unspecified batch size so that summary() can be printed.
vae.build(input_shape=(None, original_dim))
vae.summary()

print("Full VAE model implemented and compiled successfully.")

class VAE(keras.Model):
    """Variational autoencoder that chains a given encoder and decoder.

    ``train_step`` and ``test_step`` are overridden so the model optimises and
    reports the sum of two terms:

    * a reconstruction term: the ``MeanSquaredError`` loss object applied to
      (input, reconstruction), multiplied by ``original_dim``;
    * a KL term: ``-0.5 * sum(1 + z_log_var - z_mean**2 - exp(z_log_var))``
      over the latent axis, averaged over the batch.

    Args:
        encoder: Model called on a batch of inputs; its output is unpacked as
            ``(z_mean, z_log_var, z)``.
        decoder: Model called on ``z`` to produce the reconstruction.
        original_dim: Scale factor applied to the mean squared error.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, encoder, decoder, original_dim, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.original_dim = original_dim
        # Running means, exposed through `metrics` and the step return values.
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")
        # Loss object created once and reused by every step.
        self.mse_loss_fn = tf.keras.losses.MeanSquaredError()

    def call(self, inputs):
        """Encode ``inputs``, then decode the sampled latent ``z``."""
        _, _, z = self.encoder(inputs)
        return self.decoder(z)

    def _vae_losses(self, data):
        """Run one forward pass and return the loss terms.

        Shared by ``train_step`` and ``test_step`` so the objective is defined
        in exactly one place.

        Returns:
            Tuple ``(total_loss, reconstruction_loss, kl_loss)``.
        """
        z_mean, z_log_var, z = self.encoder(data)
        reconstruction = self.decoder(z)
        reconstruction_loss = tf.reduce_mean(
            self.mse_loss_fn(data, reconstruction)
        ) * self.original_dim
        kl_loss = -0.5 * tf.reduce_mean(
            tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1)
        )
        return reconstruction_loss + kl_loss, reconstruction_loss, kl_loss

    def _record_losses(self, total_loss, reconstruction_loss, kl_loss):
        """Update the three trackers and return their current means as logs."""
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

    def train_step(self, data):
        """Perform one optimisation step on a batch and return the loss logs."""
        # Only the first element of a tuple batch is used (the features).
        if isinstance(data, tuple):
            data = data[0]

        # The forward pass must run under the tape so gradients can be taken.
        with tf.GradientTape() as tape:
            total_loss, reconstruction_loss, kl_loss = self._vae_losses(data)

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        return self._record_losses(total_loss, reconstruction_loss, kl_loss)

    def test_step(self, data):
        """Evaluate the same losses on a batch without updating any weights."""
        if isinstance(data, tuple):
            data = data[0]

        total_loss, reconstruction_loss, kl_loss = self._vae_losses(data)
        return self._record_losses(total_loss, reconstruction_loss, kl_loss)

    @property
    def metrics(self):
        """Trackers owned by this model, in the order total / reconstruction / KL."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]
# Instantiate again so `vae` is built from the VAE class defined directly above.
vae = VAE(encoder=encoder, decoder=decoder, original_dim=original_dim)
vae.compile(optimizer='adam')
vae.build(input_shape=(None, original_dim))

print("VAE class definition updated and model re-instantiated with fix for MSE loss.")

# Training schedule.
batch_size = 32
epochs = 100

print(f"Starting VAE training for {epochs} epochs with a batch size of {batch_size}...")

# Features only, no targets: the custom train/test steps compute the loss
# from the inputs themselves. Validation data is a one-element tuple for the
# same reason.
history = vae.fit(
    x=X_train_scaled,
    validation_data=(X_val_scaled,),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
)

print("VAE model training completed.")

print("Predicting latent space parameters and reconstructions for the test set...")

z_mean_test, z_log_var_test, z_test = vae.encoder.predict(X_test_final_scaled)

# Decode the posterior mean instead of calling vae.predict(): the latter runs
# the encoder a second time and draws a fresh random latent that is unrelated
# to the z_test returned above, which makes the scores noisy and different on
# every run. Decoding z_mean_test gives one deterministic reconstruction per
# sample for a trained model.
reconstructions_test = vae.decoder.predict(z_mean_test)

print(f"Shape of z_mean_test: {z_mean_test.shape}")
print(f"Shape of z_log_var_test: {z_log_var_test.shape}")
print(f"Shape of z_test: {z_test.shape}")
print(f"Shape of reconstructions_test: {reconstructions_test.shape}")
print("Predictions completed.")

print("Calculating reconstruction errors for the test set...")

# predict() output is NumPy, so the scores are computed with NumPy and can be
# handed to scikit-learn without an implicit tensor conversion.
squared_diff = np.square(X_test_final_scaled - reconstructions_test)

mse_per_sample = np.mean(squared_diff, axis=1)

# Same scaling as the training objective: per-sample MSE times the input width.
reconstruction_errors_test = mse_per_sample * original_dim

print(f"Shape of reconstruction_errors_test: {reconstruction_errors_test.shape}")
print("Reconstruction errors calculated successfully.")

print("Calculating KL divergence for the test set...")

# Per-sample KL term, using the same closed form as the training loss but
# without averaging over the batch.
kl_divergence_test = -0.5 * np.sum(
    1 + z_log_var_test - np.square(z_mean_test) - np.exp(z_log_var_test), axis=1
)

print(f"Shape of kl_divergence_test: {kl_divergence_test.shape}")
print("KL divergence calculated successfully.")

print("Combining reconstruction error and KL divergence to calculate anomaly scores...")

# Bring both terms to float32 before adding them.
reconstruction_errors_test_float32 = reconstruction_errors_test.astype(np.float32)
kl_divergence_test_float32 = kl_divergence_test.astype(np.float32)

# Higher score = more anomalous.
anomaly_scores_test = reconstruction_errors_test_float32 + kl_divergence_test_float32

print(f"Shape of anomaly_scores_test: {anomaly_scores_test.shape}")
print("Anomaly scores calculated successfully.")

from sklearn.metrics import average_precision_score, precision_recall_curve
import matplotlib.pyplot as plt

print("Evaluating anomaly detection performance using AUC-PR...")

# Average precision summarises the precision-recall trade-off in one number;
# label 1 (anomaly) is the positive class.
auc_pr = average_precision_score(y_test_final, anomaly_scores_test)

print(f"Average Precision (AUC-PR): {auc_pr:.4f}")

# Thresholds are not needed for the plot, only the curve coordinates.
precision, recall, _ = precision_recall_curve(y_test_final, anomaly_scores_test)

# Draw the curve through the Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label=f'AUC-PR = {auc_pr:.2f}')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve for Anomaly Detection')
ax.legend()
ax.grid(True)
plt.show()

print("Anomaly detection performance evaluation complete.")