# Demonstration of anomaly detection with CVAE using DASHlink data

**Author: Milad Memarzadeh (milad.memarzadeh@nasa.gov)**


# Demonstration of anomaly detection with CVAE using DASHlink data

**Author: Milad Memarzadeh (milad.memarzadeh@nasa.gov)**


# Import Libraries

In [70]:
# Standard library
import io
import os

# Third-party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import torch.nn.functional as F

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support, roc_curve, precision_recall_curve, average_precision_score
from sklearn.utils import shuffle

# Project-local
#from source.modelsCondVAE import *
from source.utilsCondVAEs5 import *

# Explicit standard-library imports placed after the wildcard import above so
# that nothing it exports can shadow these names.
import time
from itertools import combinations
# Run on a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


Using device: cpu


In [71]:
# --- Optimisation settings ---
train_learning_rate = 1e-3  # learning rate handed to the Adam optimiser
train_num_epochs = 100      # number of passes over the training loader

# --- Model / data-shaping settings ---
latent_dim = 6    # width of the autoencoder bottleneck (also part of save_dir)
batch_size = 64   # windows per mini-batch
window_size = 10  # time steps per sliding window

# --- Reproducibility ---
# Seed the global NumPy and PyTorch generators so that weight initialisation
# (and anything else drawing from them) is repeatable across kernel restarts.
# 42 matches the `random_state` used by the split/shuffle cells.
random_seed = 42
np.random.seed(random_seed)
torch.manual_seed(random_seed)


# Load and preprocess data

In [72]:
# Directory holding the benchmark CSV files. The per-OS defaults below are the
# original hard-coded locations; set the YAHOO_S5_A4_DIR environment variable
# to point somewhere else without editing the notebook.
if os.name == 'nt':
    _default_data_dir = 'C:/Users/jed95/Documents/GitHub/anomaly_detection/dataset/yahoo_s5/A4Benchmark/'
else:
    _default_data_dir = '/home/adlink3/Downloads/yahoo_s5/A4Benchmark/'
data_dir = os.environ.get('YAHOO_S5_A4_DIR', _default_data_dir)

# Output directory for saved arrays; keyed by the latent dimension.
save_dir="./CompressAE/A4/"+str(latent_dim)+"/"
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(save_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so the concatenated frame
# (and every window derived from it) is the same on every machine and run.
file_list = sorted(os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.csv'))
if not file_list:
    # pd.concat on an empty list fails with an unhelpful message; fail clearly.
    raise FileNotFoundError(f"No .csv files found in {data_dir!r}")


df_list = []
for file in file_list:
    df = pd.read_csv(file)
    df_list.append(df)

# Stack all files row-wise into one frame with a fresh 0..N-1 index.
data = pd.concat(df_list, ignore_index=True)
print("Data shape:", data.shape)


Data shape: (168000, 9)


In [73]:
# Report the number of missing entries per column, then drop any incomplete rows.
missing_per_column = data.isnull().sum()
print("Missing values:", missing_per_column)
data.dropna(inplace=True)

Missing values: timestamps      0
value           0
anomaly         0
changepoint     0
trend           0
noise           0
seasonality1    0
seasonality2    0
seasonality3    0
dtype: int64


In [74]:
# Rescale the 'value' column with a min-max scaler.
# NOTE(review): the scaler is fitted on every row before the train/validation/
# test split further down, so held-out data influences the scaling of the
# training inputs. All concatenated files also share one scaler.
scaler = MinMaxScaler()
raw_values = data['value'].values.reshape(-1, 1)  # scaler expects a 2-D column
data['value'] = scaler.fit_transform(raw_values)


# Create Sequences

In [75]:
def create_sequences(values, labels, window_size, step=1):
    """Cut a series into fixed-length sliding windows with one label per window.

    Args:
        values: 1-D sequence of observations.
        labels: Per-observation anomaly flags, same length as ``values``;
            any truthy entry marks that time step as anomalous.
        window_size: Number of consecutive observations per window (>= 1).
        step: Offset between the starts of consecutive windows (>= 1).

    Returns:
        A pair ``(sequences, seq_labels)`` of NumPy arrays. ``sequences`` holds
        one window per row; ``seq_labels[i]`` is 1 if any label inside window
        ``i`` is truthy, otherwise 0. Both are empty when ``values`` is shorter
        than ``window_size``.

    Raises:
        ValueError: If ``window_size`` or ``step`` is smaller than 1, or if
            ``values`` and ``labels`` differ in length.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    if step < 1:
        raise ValueError(f"step must be >= 1, got {step}")
    if len(values) != len(labels):
        # A shorter labels array would otherwise yield empty label slices that
        # silently evaluate to "normal" for the trailing windows.
        raise ValueError(
            f"values and labels must have the same length, got {len(values)} and {len(labels)}"
        )

    starts = range(0, len(values) - window_size + 1, step)
    sequences = [values[start:start + window_size] for start in starts]
    # A window is anomalous as soon as one of its time steps is flagged.
    seq_labels = [1 if np.any(labels[start:start + window_size]) else 0 for start in starts]
    return np.array(sequences), np.array(seq_labels)


# Window the scaled values and derive one anomaly label per window.
# NOTE(review): `data` is the row-wise concatenation of all CSV files, so
# windows that start near the end of one file run into the next file's rows.
value_series = data['value'].values
anomaly_flags = data['anomaly'].values
sequences, seq_labels = create_sequences(value_series, anomaly_flags, window_size=window_size)
print("Sequences shape:", sequences.shape)
print("Sequence labels shape:", seq_labels.shape)



Sequences shape: (167991, 10)
Sequence labels shape: (167991,)


# Split data into normal and anomalous sets, then into train/validation/test

In [76]:

# Partition the windows by their window-level label: 0 = normal, 1 = anomalous.
normal_indices = np.where(seq_labels == 0)[0]
normal_sequences, normal_labels = sequences[normal_indices], seq_labels[normal_indices]

anomalous_indices = np.where(seq_labels == 1)[0]
anomalous_sequences, anomalous_labels = sequences[anomalous_indices], seq_labels[anomalous_indices]

for class_windows in (normal_sequences, anomalous_sequences):
    print(class_windows.shape)


(159855, 10)
(8136, 10)


In [77]:
def _split_train_val_test(class_sequences, class_labels):
    """Split one class into train/validation/test parts.

    60% of the windows go to test; the remaining 40% are halved into train and
    validation. Both splits use random_state=42 and stratify on the labels.

    Returns (x_train, x_val, x_test, y_train, y_val, y_test).
    """
    x_rest, x_test, y_rest, y_test = train_test_split(
        class_sequences, class_labels, test_size=0.6, random_state=42, stratify=class_labels)
    x_train, x_val, y_train, y_val = train_test_split(
        x_rest, y_rest, test_size=0.5, random_state=42, stratify=y_rest)
    return x_train, x_val, x_test, y_train, y_val, y_test


# NOTE(review): windows were built with step=1, so neighbouring windows share
# most of their time steps; a random split places such near-duplicates in
# different partitions.
(X_train_normal, X_val_normal, X_test_normal,
 y_train_normal, y_val_normal, y_test_normal) = _split_train_val_test(
    normal_sequences, normal_labels)

(X_train_anomalous, X_val_anomalous, X_test_anomalous,
 y_train_anomalous, y_val_anomalous, y_test_anomalous) = _split_train_val_test(
    anomalous_sequences, anomalous_labels)


In [78]:
# Re-combine the normal and anomalous parts of each partition (normal rows first).
X_train = np.concatenate((X_train_normal, X_train_anomalous))
X_val = np.concatenate((X_val_normal, X_val_anomalous))
X_test = np.concatenate((X_test_normal, X_test_anomalous))

y_train = np.concatenate((y_train_normal, y_train_anomalous))
y_val = np.concatenate((y_val_normal, y_val_anomalous))
y_test = np.concatenate((y_test_normal, y_test_anomalous))

print("Training data shape:", X_train.shape, y_train.shape)
print("Validation data shape:", X_val.shape, y_val.shape)
print("Test data shape:", X_test.shape, y_test.shape)


Training data shape: (33598, 10) (33598,)
Validation data shape: (33598, 10) (33598,)
Test data shape: (100795, 10) (100795,)


In [79]:
# Shuffle each partition (features and labels together) so the normal-first
# ordering produced by the concatenation is removed. The inner tuple captures
# the unshuffled arrays before any name is rebound.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    shuffle(split_features, split_targets, random_state=42)
    for split_features, split_targets in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)


# Convert Data to Tensors

In [80]:
# Persist every split as a float32 .npy file inside save_dir.
_split_files = (
    ('X_train.npy', X_train),
    ('y_train.npy', y_train),
    ('X_val.npy', X_val),
    ('y_val.npy', y_val),
    ('X_test.npy', X_test),
    ('y_test.npy', y_test),
)
for _file_name, _split_array in _split_files:
    np.save(save_dir + _file_name, _split_array.astype(np.float32))

# On-disk size (bytes) of the uncompressed feature files, used later for the
# compression-ratio report.
X_train_size, X_val_size, X_test_size = (
    os.path.getsize(save_dir + _file_name)
    for _file_name in ('X_train.npy', 'X_val.npy', 'X_test.npy')
)

In [81]:
def _as_window_tensor(windows):
    """Copy `windows` into a float tensor with a singleton axis inserted at dim 1."""
    return torch.tensor(windows).unsqueeze(1).float()


def _as_label_tensor(labels):
    """Copy `labels` into an int64 tensor."""
    return torch.tensor(labels).long()


X_train_tensor, y_train_tensor = _as_window_tensor(X_train), _as_label_tensor(y_train)
X_val_tensor, y_val_tensor = _as_window_tensor(X_val), _as_label_tensor(y_val)
X_test_tensor, y_test_tensor = _as_window_tensor(X_test), _as_label_tensor(y_test)

print("X_train_tensor shape:", X_train_tensor.shape)
print("y_train_tensor shape:", y_train_tensor.shape)
print("X_val_tensor shape:", X_val_tensor.shape)
print("y_val_tensor shape:", y_val_tensor.shape)
print("X_test_tensor shape:", X_test_tensor.shape)
print("y_test_tensor shape:", y_test_tensor.shape)


X_train_tensor shape: torch.Size([33598, 1, 10])
y_train_tensor shape: torch.Size([33598])
X_val_tensor shape: torch.Size([33598, 1, 10])
y_val_tensor shape: torch.Size([33598])
X_test_tensor shape: torch.Size([100795, 1, 10])
y_test_tensor shape: torch.Size([100795])


# Create the Training Data Loader

In [82]:

#train_dataset = TimeSeriesDataset(X_train_tensor, y_train_tensor)
#train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False) #TODO adjust Shuffle training data? 
#full_train_dataset = TimeSeriesDataset(X_train_tensor, y_train_tensor)
#
#val_dataset = TimeSeriesDataset(X_val_tensor, y_val_tensor)
#val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
#
#
#test_dataset = TimeSeriesDataset(X_test_tensor, y_test_tensor)
#test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
#
# Feature-only dataset: each item is a 1-tuple holding one training window.
X_train_dataset = torch.utils.data.TensorDataset(X_train_tensor)
# Batches are served in stored order (no per-epoch reshuffling).
X_train_loader = torch.utils.data.DataLoader(
    dataset=X_train_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [83]:
class Conv1dAutoencoder(nn.Module):
    """1-D convolutional autoencoder with a linear bottleneck.

    Two stride-2 convolutions shrink a single-channel window, a linear layer
    maps the flattened features to `latent_dim` values, and the mirror-image
    path (linear layer + two stride-2 transposed convolutions + sigmoid)
    rebuilds the window. The decoder output is cropped to `input_length`.

    Args:
        input_length: Number of time steps in each input window.
        latent_dim: Size of the bottleneck vector.
    """

    def __init__(self, input_length, latent_dim):
        super().__init__()
        self.input_length = input_length
        self.latent_dim = latent_dim

        # Down-sampling path: (N, 1, L) -> (N, 8, L'), halving the length twice.
        self.encoder = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=16, out_channels=8, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
        )

        # Push an all-zero window through the encoder once to learn the shape
        # of its output, which fixes the sizes of the two linear layers.
        with torch.no_grad():
            probe = self.encoder(torch.zeros(1, 1, self.input_length))
        enc_channels, enc_length = probe.shape[1], probe.shape[2]
        self._flattened_size = enc_channels * enc_length

        # Linear bottleneck: into and back out of the latent space.
        self.fc1 = nn.Linear(self._flattened_size, self.latent_dim)
        self.fc2 = nn.Linear(self.latent_dim, self._flattened_size)

        # Up-sampling path; the sigmoid keeps reconstructions inside [0, 1].
        self.decoder = nn.Sequential(
            nn.Unflatten(1, (enc_channels, enc_length)),
            nn.ConvTranspose1d(
                enc_channels, 16, kernel_size=3, stride=2, padding=1, output_padding=1
            ),
            nn.ReLU(inplace=True),
            nn.ConvTranspose1d(
                16, 1, kernel_size=3, stride=2, padding=1, output_padding=1
            ),
            nn.Sigmoid(),
        )

    def encode(self, x):
        """Map windows of shape (N, 1, input_length) to latent vectors (N, latent_dim)."""
        features = self.encoder(x)
        return self.fc1(features.flatten(start_dim=1))

    def decode(self, z):
        """Map latent vectors (N, latent_dim) back to windows (N, 1, input_length)."""
        upsampled = self.decoder(self.fc2(z))
        # The transposed convolutions can overshoot the original length; crop.
        return upsampled[..., :self.input_length]

    def forward(self, x):
        """Return the pair (reconstruction, latent vector) for `x`."""
        latent = self.encode(x)
        return self.decode(latent), latent


In [84]:
# Train the convolutional autoencoder to reconstruct its own input windows.
# perf_counter is a monotonic clock meant for measuring elapsed intervals.
time_start = time.perf_counter()
model = Conv1dAutoencoder(input_length=window_size, latent_dim=latent_dim).to(device)
opt = optim.Adam(model.parameters(), lr=train_learning_rate)
loss_fn = nn.MSELoss()  # mean squared reconstruction error

model.train()
for ep in range(train_num_epochs):
    tot = 0  # running sum of per-batch losses for this epoch
    # The loader wraps a feature-only TensorDataset, so each batch is a 1-tuple.
    for bx, in X_train_loader:
        bx = bx.to(device)
        # The latent code is not needed for the loss. It is bound to a named
        # variable rather than `_`, which IPython uses for the last cell output.
        recon, _latent = model(bx)
        loss = loss_fn(recon, bx)
        opt.zero_grad()
        loss.backward()
        opt.step()
        tot += loss.item()
    # Report the mean of the per-batch losses.
    print(f'Epoch {ep+1} Loss {tot/len(X_train_loader):.6f}')
time_end = time.perf_counter()
print("Training time:", time_end - time_start, "seconds")

Epoch 1 Loss 0.003358
Epoch 2 Loss 0.000290
Epoch 3 Loss 0.000129
Epoch 4 Loss 0.000114
Epoch 5 Loss 0.000109
Epoch 6 Loss 0.000105
Epoch 7 Loss 0.000103
Epoch 8 Loss 0.000101
Epoch 9 Loss 0.000099
Epoch 10 Loss 0.000096
Epoch 11 Loss 0.000092
Epoch 12 Loss 0.000087
Epoch 13 Loss 0.000081
Epoch 14 Loss 0.000078
Epoch 15 Loss 0.000077
Epoch 16 Loss 0.000076
Epoch 17 Loss 0.000076
Epoch 18 Loss 0.000075
Epoch 19 Loss 0.000075
Epoch 20 Loss 0.000074
Epoch 21 Loss 0.000074
Epoch 22 Loss 0.000074
Epoch 23 Loss 0.000073
Epoch 24 Loss 0.000073
Epoch 25 Loss 0.000073
Epoch 26 Loss 0.000072
Epoch 27 Loss 0.000072
Epoch 28 Loss 0.000071
Epoch 29 Loss 0.000071
Epoch 30 Loss 0.000070
Epoch 31 Loss 0.000069
Epoch 32 Loss 0.000068
Epoch 33 Loss 0.000067
Epoch 34 Loss 0.000065
Epoch 35 Loss 0.000063
Epoch 36 Loss 0.000061
Epoch 37 Loss 0.000059
Epoch 38 Loss 0.000055
Epoch 39 Loss 0.000053
Epoch 40 Loss 0.000051
Epoch 41 Loss 0.000050
Epoch 42 Loss 0.000050
Epoch 43 Loss 0.000049
Epoch 44 Loss 0.0000

In [85]:
# Put the trained autoencoder into evaluation mode. `eval()` returns the module
# itself, so as the last expression of the cell it also displays the layer
# summary.
model.eval()
#print(summary(
#model,
#input_size=(32, 1, window_size),
#))

Conv1dAutoencoder(
  (encoder): Sequential(
    (0): Conv1d(1, 16, kernel_size=(3,), stride=(2,), padding=(1,))
    (1): ReLU(inplace=True)
    (2): Conv1d(16, 8, kernel_size=(3,), stride=(2,), padding=(1,))
    (3): ReLU(inplace=True)
  )
  (fc1): Linear(in_features=24, out_features=6, bias=True)
  (fc2): Linear(in_features=6, out_features=24, bias=True)
  (decoder): Sequential(
    (0): Unflatten(dim=1, unflattened_size=(8, 3))
    (1): ConvTranspose1d(8, 16, kernel_size=(3,), stride=(2,), padding=(1,), output_padding=(1,))
    (2): ReLU(inplace=True)
    (3): ConvTranspose1d(16, 1, kernel_size=(3,), stride=(2,), padding=(1,), output_padding=(1,))
    (4): Sigmoid()
  )
)

In [86]:
# Run every split through the autoencoder in one forward pass each and keep the
# latent codes (z_*) as the compressed representation.
time_start = time.perf_counter()  # monotonic clock for interval timing
with torch.no_grad():
    recon1, z_train = model(X_train_tensor.to(device))
    recon2, z_val = model(X_val_tensor.to(device))
    recon3, z_test = model(X_test_tensor.to(device))
if device.type == "cuda":
    # CUDA kernels are launched asynchronously; wait for them to finish so the
    # measured interval covers the actual computation.
    torch.cuda.synchronize()
time_end = time.perf_counter()
print("Compressing time:", time_end - time_start, "seconds")
np.save(save_dir+'z_train.npy', z_train.cpu().numpy())
np.save(save_dir+'z_val.npy', z_val.cpu().numpy())
np.save(save_dir+'z_test.npy', z_test.cpu().numpy())

#dfz = pd.DataFrame(z, columns=[f'z{i}' for i in range(z.shape[1])])
#dfz['label'] = y_train
#dfz.to_csv('compressed_with_labels.csv', index=False)
# On-disk size (bytes) of the saved latent codes, for the ratio report below.
z_train_size = os.path.getsize(save_dir+'z_train.npy')
z_val_size = os.path.getsize(save_dir+'z_val.npy')
z_test_size = os.path.getsize(save_dir+'z_test.npy')

Compressing time: 0.12424993515014648 seconds


In [87]:
def _size_ratio(compressed_bytes, uncompressed_bytes):
    """Return compressed/uncompressed, or NaN when the uncompressed size is 0."""
    if uncompressed_bytes:
        return compressed_bytes / uncompressed_bytes
    return float('nan')


# Compressed-to-uncompressed file-size ratio for each split (smaller is better).
train_ratio = _size_ratio(z_train_size, X_train_size)
print(f"Uncompressed: {X_train_size} bytes\nCompressed: {z_train_size} bytes\nRatio: {train_ratio:.3f}\n")

val_ratio = _size_ratio(z_val_size, X_val_size)
print(f"Uncompressed: {X_val_size} bytes\nCompressed: {z_val_size} bytes\nRatio: {val_ratio:.3f}\n")

test_ratio = _size_ratio(z_test_size, X_test_size)
print(f"Uncompressed: {X_test_size} bytes\nCompressed: {z_test_size} bytes\nRatio: {test_ratio:.3f}\n")


Uncompressed: 1344048 bytes
Compressed: 806480 bytes
Ratio: 0.600

Uncompressed: 1344048 bytes
Compressed: 806480 bytes
Ratio: 0.600

Uncompressed: 4031928 bytes
Compressed: 2419208 bytes
Ratio: 0.600



# Instantiate and Train the Conditional VAE

In [88]:
def evaluate_model(model, data_loader):
    """Compute mean cross-entropy loss and accuracy of `model` over a loader.

    Args:
        model: Module whose forward pass returns ``(reconstruction, class_logits)``.
        data_loader: Iterable of ``(data, target)`` batches.

    Returns:
        ``(avg_loss, avg_acc)``: per-sample average classification loss and the
        fraction of samples whose arg-max class equals the target.
    """
    model.eval()
    loss_fn = nn.CrossEntropyLoss()
    loss_sum, n_correct, n_seen = 0, 0, 0

    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Only the classification head is scored; the reconstruction is ignored.
            _, logits = model(inputs)
            n_batch = inputs.size(0)

            # The criterion returns a batch mean; weight it by the batch size
            # so the final figure is a true per-sample average.
            loss_sum += loss_fn(logits, labels).item() * n_batch
            n_correct += (logits.argmax(dim=1) == labels).sum().item()
            n_seen += n_batch

    return loss_sum / n_seen, n_correct / n_seen


In [89]:
def plot_training_loss(model_name, save_dir):
    """Plot the four recorded training-loss curves of a model in a 2x2 grid.

    Reads ``save_dir + model_name + "_training_loss.npz"`` and draws the total,
    reconstruction, KL and classification losses on a new 16x16 figure titled
    with `model_name`.

    Args:
        model_name: Prefix of the ``*_training_loss.npz`` file; also the figure title.
        save_dir: Directory prefix (including trailing separator) of the file.
    """
    # (panel title, key inside the .npz archive), in subplot order.
    panels = (
        ("Total Loss", 'training_total_loss'),
        ("Reconstruction Loss", 'training_rec_loss'),
        ("KL Loss", 'training_kl_loss'),
        ("Classification Loss", 'training_class_loss'),
    )

    # The archive loads arrays lazily and keeps the file open; read everything
    # inside the context manager so the handle is closed afterwards.
    with np.load(save_dir+model_name+"_training_loss.npz") as training_traj:
        curves = [(title, training_traj[key]) for title, key in panels]

    # Every panel shares the x-axis defined by the total-loss curve's length.
    n_points = len(curves[0][1])

    plt.figure(figsize=(16, 16))
    plt.suptitle(model_name, fontsize=16)
    for position, (title, curve) in enumerate(curves, start=1):
        plt.subplot(2, 2, position)
        plt.title(title, fontsize=12)
        plt.plot(range(n_points), curve)
        plt.tick_params(axis='both', which='major', labelsize=14)
    
#for fold in range(k_folds):
#    plot_training_loss("CondVAE_model"+str(fold+1), save_dir)

# Initialize 3 new models for fine-tuning

# Verify the 3 new models

# Evaluate the Model and Detect Anomalies

In [90]:
def compute_anomaly_scores(model, data_loader):
    """Collect per-sample reconstruction errors and classifier outputs.

    Args:
        model: Module whose forward pass returns ``(reconstruction, class_logits)``.
        data_loader: Iterable of ``(X_batch, y_batch)``; a channel axis is
            inserted into ``X_batch`` at dim 1 before it is fed to the model.

    Returns:
        Four NumPy arrays, one entry per sample:
        ``(rec_errors, anomaly_probs, true_labels, predictions)`` where
        ``rec_errors`` is the mean squared reconstruction error,
        ``anomaly_probs`` the soft-max probability of class 1, ``true_labels``
        the targets from the loader and ``predictions`` the arg-max class.
    """
    model.eval()
    collected_errors, collected_probs = [], []
    collected_labels, collected_preds = [], []

    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            inputs = X_batch.unsqueeze(1).to(device)
            targets = y_batch.to(device)

            reconstruction, logits = model(inputs)

            # Mean squared error over the channel and time axes of each sample.
            per_sample_mse = torch.mean((inputs - reconstruction) ** 2, dim=[1, 2])
            # Column 1 of the soft-max output is the anomaly-class probability.
            class_probs = F.softmax(logits, dim=1)
            predicted_class = torch.argmax(logits, dim=1)

            collected_errors.extend(per_sample_mse.cpu().numpy())
            collected_probs.extend(class_probs[:, 1].cpu().numpy())
            collected_labels.extend(targets.cpu().numpy())
            collected_preds.extend(predicted_class.cpu().numpy())

    return (
        np.array(collected_errors),
        np.array(collected_probs),
        np.array(collected_labels),
        np.array(collected_preds),
    )

