In [37]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

In [38]:
class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(VAE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.fc_mu = nn.Linear(64, latent_dim)
        self.fc_logvar = nn.Linear(64, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
            nn.Sigmoid()
        )

    def encode(self, x):
        h = self.encoder(x)
        return self.fc_mu(h), self.fc_logvar(h)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        return self.decoder(z)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

In [39]:
def preprocess_data(dataset_path, ignore_columns=None):
    df = pd.read_csv(dataset_path)

    full_df = df.copy()

    if 'source' in df.columns:
        features = df.drop(columns=['source'])
    else:
        features = df.copy()

    if ignore_columns is None:
        ignore_columns = [col for col in features.columns if col[:2] == 'F' and len(col) == 3 and col[2] in ['L', 'M']]

    features = features.drop(columns=ignore_columns)

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    return pd.DataFrame(scaled_features, columns=features.columns), full_df  


In [40]:
def create_dataloader(features, batch_size=64):
    tensor_data = torch.tensor(features.values, dtype=torch.float32)
    dataset = TensorDataset(tensor_data, tensor_data)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [41]:
def detect_anomalies(model, dataloader, threshold_percentile=95):
    model.eval()
    anomaly_scores = []
    latent_vectors = []
    with torch.no_grad():
        for batch, _ in dataloader:
            recon_batch, mu, logvar = model(batch)
            recon_error = torch.mean((recon_batch - batch) ** 2, dim=1)
            anomaly_scores.extend(recon_error.numpy())
            latent_vectors.append(mu.numpy())
    anomaly_scores = np.array(anomaly_scores)
    latent_vectors = np.vstack(latent_vectors)
    threshold = np.percentile(anomaly_scores, threshold_percentile)
    anomaly_labels = (anomaly_scores > threshold).astype(int)
    print(f"Anomaly threshold (percentile {threshold_percentile}): {threshold:.6f}")
    return anomaly_scores, latent_vectors, anomaly_labels, threshold

In [None]:
def save_results(features, anomaly_scores, latent_vectors, anomaly_labels, output_file='vae_anomaly_test_results.csv'):
    results = features.copy()
    results['Anomaly'] = anomaly_labels
    results['Reconstruction_Error'] = anomaly_scores
    latent_df = pd.DataFrame(latent_vectors, columns=[f'Latent_{i+1}' for i in range(latent_vectors.shape[1])])
    final_df = pd.concat([results, latent_df], axis=1)
    final_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

In [None]:
def plot_anomaly_visualizations(anomaly_scores, latent_vectors, anomaly_labels, threshold, output_prefix="vae_anomaly_plot"):
    plt.figure(figsize=(14, 5))

    # 1. Histogram of reconstruction errors
    plt.subplot(1, 2, 1)
    sns.histplot(anomaly_scores, bins=50, kde=True, color='skyblue')
    plt.axvline(threshold, color='red', linestyle='--', label=f'Threshold = {threshold:.4f}')
    plt.title("Histogram of Reconstruction Errors")
    plt.xlabel("Reconstruction Error")
    plt.ylabel("Frequency")
    plt.legend()

    # 2. Latent space scatter plot
    plt.subplot(1, 2, 2)
    latent_df = pd.DataFrame(latent_vectors, columns=["Latent_1", "Latent_2"])
    latent_df["Anomaly"] = anomaly_labels
    sns.scatterplot(data=latent_df, x="Latent_1", y="Latent_2", hue="Anomaly", palette=["green", "red"])
    plt.title("Latent Space Visualization")
    plt.xlabel("Latent Dimension 1")
    plt.ylabel("Latent Dimension 2")

    plt.tight_layout()

    # Save plot
    plot_filename = f"{output_prefix}_visualizations.png"
    plt.savefig(plot_filename)
    print(f"Visualizations saved to {plot_filename}")
    plt.close()


In [None]:
def main():
    test_path = 'train.csv'  
    features, full_df = preprocess_data(test_path)

    latent_dim = 2  

    input_dim = features.shape[1]

    vae = VAE(input_dim, latent_dim)
    vae.load_state_dict(torch.load('best_vae_model.pt'))

    dataloader = create_dataloader(features)
    anomaly_scores, latent_vectors, anomaly_labels, threshold = detect_anomalies(vae, dataloader)

    save_results(full_df, anomaly_scores, latent_vectors, anomaly_labels)
    plot_anomaly_visualizations(anomaly_scores, latent_vectors, anomaly_labels, threshold, output_prefix="vae_anomaly_test")


In [45]:
if __name__ == "__main__":
    main()

Anomaly threshold (percentile 95): 1.917189
Results saved to vae_anomaly_test_results_1.csv
Visualizations saved to vae_anomaly_test_1_visualizations.png
