In [2]:
import numpy as np

# Path to your CSV file
path_train = 'data/train_data.npy'
path_test = 'data/test_data.npy'
path_test_label = 'data/test_labels.npy'

# Read the CSV file into a DataFrame
train = np.load(path_train)
test = np.load(path_test)
test_labels = np.load(path_test_label)

In [3]:
data = test
labels = test_labels

In [None]:
# Ideas: Look at papers about anomaly detection (using autoencoders)
# Do hyperparam search on lr and percentile

In [10]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Scale the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Split into training (normal data) and testing sets
X_train = train
X_test = test
y_test = test_labels

#X_train, X_test, y_train, y_test = train_test_split(data_scaled, labels, test_size=0.2, random_state=42)
#X_train = X_train[y_train == 0]  # Train only on normal data

# Convert data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Define the autoencoder model
class Autoencoder(nn.Module):
    
    #lr = 0.001
    lr = 0.0001
    percentile = 90
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 14),
            nn.ReLU(),
            nn.Linear(14, 7),
            nn.ReLU(),
            nn.Linear(7, 3),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(3, 7),
            nn.ReLU(),
            nn.Linear(7, 14),
            nn.ReLU(),
            nn.Linear(14, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Instantiate the model, define the loss function and optimizer
input_dim = X_train.shape[1]
model = Autoencoder(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=model.lr)

# Train the autoencoder
num_epochs = 10
batch_size = 64

for epoch in range(num_epochs):
    permutation = torch.randperm(X_train.size()[0])
    for i in range(0, X_train.size()[0], batch_size):
        indices = permutation[i:i + batch_size]
        batch_x = X_train[indices]

        # Forward pass
        outputs = model(batch_x)
        loss = criterion(outputs, batch_x)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Use the autoencoder to reconstruct the test data
X_test_pred = model(X_test).detach().numpy()

# Calculate reconstruction error
mse = np.mean((X_test.numpy() - X_test_pred) ** 2, axis=1)

# Set a threshold for anomaly detection (e.g., 95th percentile)
threshold = np.percentile(mse, model.percentile)

# Identify anomalies
anomalies = mse > threshold

# Evaluate the model
accuracy = accuracy_score(y_test.numpy(), anomalies)
precision = precision_score(y_test.numpy(), anomalies)
recall = recall_score(y_test.numpy(), anomalies)
f1 = f1_score(y_test.numpy(), anomalies)

print(f"Parameters: lr: {model.lr}, percentile: {model.percentile}")
print(f"Number of total samples: {len(X_test)}")
print(f"Number of predicted anomalies: {np.sum(anomalies)}")
print(f"Number of actual anomalies: {np.sum(y_test.numpy())}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

Epoch [1/10], Loss: 0.0140
Epoch [2/10], Loss: 0.0093
Epoch [3/10], Loss: 0.0035
Epoch [4/10], Loss: 0.0053
Epoch [5/10], Loss: 0.0070
Epoch [6/10], Loss: 0.0025
Epoch [7/10], Loss: 0.0025
Epoch [8/10], Loss: 0.0031
Epoch [9/10], Loss: 0.0024
Epoch [10/10], Loss: 0.0019
Parameters: lr: 0.0001, percentile: 90
Number of total samples: 449919
Number of predicted anomalies: 44992
Number of actual anomalies: 54584.0
Accuracy: 0.94
Precision: 0.79
Recall: 0.65
F1-score: 0.71


In [6]:
'''
percentile: 95

Epoch [50/50], Loss: 0.4138
Number of total samples: 89984
Number of predicted anomalies: 4500
Number of actual anomalies: 11032.0
Accuracy: 0.91
Precision: 0.86
Recall: 0.35
F1-score: 0.50
'''

'''
Epoch [10/10], Loss: 0.0019
Parameters: lr: 0.0001, percentile: 90
Number of total samples: 449919
Number of predicted anomalies: 44992
Number of actual anomalies: 54584.0
Accuracy: 0.94
Precision: 0.79
Recall: 0.65
F1-score: 0.71
'''

'\npercentile: 95\n\nEpoch [50/50], Loss: 0.4138\nNumber of total samples: 89984\nNumber of predicted anomalies: 4500\nNumber of actual anomalies: 11032.0\nAccuracy: 0.91\nPrecision: 0.86\nRecall: 0.35\nF1-score: 0.50\n'