In [41]:

import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq
from dataclasses import dataclass
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, fbeta_score
from sklearn.metrics import matthews_corrcoef
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, roc_auc_score

import warnings
warnings.filterwarnings("ignore")

import import_ipynb
import data_preprocessing


In [9]:
# training data
u4_train_equil_turbine = data_preprocessing.u4_train_equil_turbine
u4_train_equil_pump = data_preprocessing.u4_train_equil_pump
u5_train_equil_turbine = data_preprocessing.u5_train_equil_turbine
u5_train_equil_pump = data_preprocessing.u5_train_equil_pump
u6_train_equil_turbine = data_preprocessing.u6_train_equil_turbine
u6_train_equil_pump = data_preprocessing.u6_train_equil_pump

# synethetic test sets
u5_s01_equil_turbine = data_preprocessing.u5_s01_equil_turbine
u5_s01_equil_pump = data_preprocessing.u5_s01_equil_pump
u5_s02_equil_turbine = data_preprocessing.u5_s02_equil_turbine
u5_s02_equil_pump = data_preprocessing.u5_s02_equil_pump
u6_s01_equil_turbine = data_preprocessing.u6_s01_equil_turbine
u6_s01_equil_pump = data_preprocessing.u6_s01_equil_pump
u6_s02_equil_turbine = data_preprocessing.u6_s02_equil_turbine
u6_s02_equil_pump = data_preprocessing.u6_s02_equil_pump

# real test sets
u4_test_equil_turbine = data_preprocessing.u4_test_equil_turbine
u4_test_equil_pump = data_preprocessing.u4_test_equil_pump
u5_test_equil_turbine = data_preprocessing.u5_test_equil_turbine
u5_test_equil_pump = data_preprocessing.u5_test_equil_pump
u6_test_equil_turbine = data_preprocessing.u6_test_equil_turbine
u6_test_equil_pump = data_preprocessing.u6_test_equil_pump

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Simplified Autoencoder
class FastAutoencoder(nn.Module):
    def __init__(self, input_dim):
        super(FastAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 16),
        )
        self.decoder = nn.Sequential(
            nn.Linear(16, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
        )

    def forward(self, x):
        feature = self.encoder(x)
        reconstruction = self.decoder(feature)
        return reconstruction, feature

# Data preprocessing
def preprocess_data_fast(df):
    # Scale data and convert to tensor
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df)
    return torch.tensor(scaled_data, dtype=torch.float32), scaler

# Optimized training loop
def train_autoencoder_fast(model, train_data, val_data, epochs=5, batch_size=32, learning_rate=0.005):
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=False)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            reconstruction, _ = model(batch)
            loss = criterion(reconstruction, batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                reconstruction, _ = model(batch)
                loss = criterion(reconstruction, batch)
                val_loss += loss.item()
        val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    return model


In [6]:

# Preprocess u4_train_equil_turbine
u4_train_data, scaler = preprocess_data_fast(u4_train_equil_turbine)

# Train-validation split
train_tensor, val_tensor = train_test_split(u4_train_data, test_size=0.2, random_state=42)

# Initialize and train the autoencoder
input_dim = train_tensor.shape[1]
model = FastAutoencoder(input_dim)
u4_turbine_trained_model = train_autoencoder_fast(model, train_tensor, val_tensor)


Epoch 1/5, Train Loss: 0.0661, Validation Loss: 0.0460
Epoch 2/5, Train Loss: 0.0360, Validation Loss: 0.0339
Epoch 3/5, Train Loss: 0.0315, Validation Loss: 0.0298
Epoch 4/5, Train Loss: 0.0284, Validation Loss: 0.0259
Epoch 5/5, Train Loss: 0.0274, Validation Loss: 0.0261


In [7]:
# Preprocess the test data
def preprocess_test_data(test_df, train_columns, scaler):
    # Reindex test_df to match the columns of the training data
    test_df = test_df.reindex(columns=train_columns, fill_value=0)
    # Use the same scaler from the training data
    scaled_test_data = scaler.transform(test_df)
    return torch.tensor(scaled_test_data, dtype=torch.float32)

# Evaluate reconstruction error
def evaluate_reconstruction(model, test_data):
    model.eval()
    with torch.no_grad():
        reconstruction, _ = model(test_data)
        reconstruction_error = torch.mean((reconstruction - test_data) ** 2, dim=1)
    return reconstruction_error




def visualize_anomalies(reconstruction_errors, anomaly_scores, threshold):
    """
    Visualize reconstruction errors and detected anomalies.

    Args:
        reconstruction_errors (torch.Tensor): Reconstruction errors for the test data.
        anomaly_scores (torch.Tensor): Binary tensor indicating anomalies (1 for anomaly, 0 for normal).
        threshold (float or torch.Tensor): Threshold value used for anomaly detection.
    """
    # Ensure threshold is a float for consistent handling
    if isinstance(threshold, torch.Tensor):
        threshold = threshold.item()

    plt.figure(figsize=(10, 6))
    plt.plot(reconstruction_errors.numpy(), label="Reconstruction Error")
    plt.axhline(y=threshold, color='r', linestyle='--', label=f"Threshold ({threshold:.4f})")
    plt.scatter(
        range(len(anomaly_scores)), 
        reconstruction_errors.numpy(), 
        c=anomaly_scores.numpy(), 
        cmap='coolwarm', 
        label="Anomalies"
    )
    plt.title("Reconstruction Errors and Detected Anomalies")
    plt.xlabel("Time Steps")
    plt.ylabel("Reconstruction Error")
    plt.legend()
    plt.show()

from sklearn.metrics import fbeta_score

def define_anomaly_score(reconstruction_errors, threshold):
    """
    Define anomaly scores based on a given threshold.

    Args:
        reconstruction_errors (torch.Tensor): The reconstruction errors for the test data.
        threshold (float): Threshold value for anomaly detection.

    Returns:
        torch.Tensor: Binary tensor indicating anomalies (1 for anomaly, 0 for normal).
    """
    # Generate binary predictions based on the threshold
    anomaly_scores = (reconstruction_errors > threshold).int()
    return anomaly_scores


def custom_score(tp, fp, fn, tn, penalty_fp=2.0):
    tp_rate = tp / (tp + fn + 1e-6)
    fp_rate = fp / (fp + tn + 1e-6)
    return tp_rate - penalty_fp * fp_rate




In [11]:

# Example usage for u4_test_equil_turbine
# Assume `trained_model` is the autoencoder trained on u4_train_equil_turbine
# Assume `scaler` is the StandardScaler fitted to u4_train_equil_turbine

# Preprocess test data
u4_test_data = preprocess_test_data(u4_test_equil_turbine, u4_train_equil_turbine.columns, scaler)

# Evaluate reconstruction errors
reconstruction_errors = evaluate_reconstruction(u4_turbine_trained_model, u4_test_data)

# Define anomaly scores and threshold
# anomaly_scores, threshold = define_anomaly_score(reconstruction_errors)

# Visualize anomalies
# visualize_anomalies(reconstruction_errors, anomaly_scores, threshold)

In [16]:
u4_test_data.shape, u4_test_equil_turbine.shape

(torch.Size([18106, 94]), (18106, 94))

In [13]:
reconstruction_errors.shape

torch.Size([18106])

### u5 s01

In [47]:

# Preprocess u5_train_equil_turbine with scaler validation
scaler = StandardScaler()  # Initialize scaler
u5_train_turbine_scaled = scaler.fit_transform(u5_train_equil_turbine)  # Fit scaler and transform data
u5_train_turbine_data = torch.tensor(u5_train_turbine_scaled, dtype=torch.float32)  # Convert to tensor

# Train-validation split
train_tensor, val_tensor = train_test_split(u5_train_turbine_data, test_size=0.2, random_state=42)

# Initialize and train the autoencoder
input_dim = train_tensor.shape[1]
model = FastAutoencoder(input_dim)
u5_turbine_trained_model = train_autoencoder_fast(model, train_tensor, val_tensor)

# Ensure that the columns in the test data match the training data
u5_s01_turbine_data_preprocessed = preprocess_test_data(u5_s01_equil_turbine, u5_train_equil_turbine.columns, scaler)

ground_truth = u5_s01_equil_turbine['anomaly']  # Replace 'anomaly' with the actual column name

# Evaluate reconstruction errors
reconstruction_errors_u5_turbine = evaluate_reconstruction(u5_turbine_trained_model, u5_s01_turbine_data_preprocessed)


Epoch 1/5, Train Loss: 0.2920, Validation Loss: 0.2250
Epoch 2/5, Train Loss: 0.2699, Validation Loss: 0.3959
Epoch 3/5, Train Loss: 0.3113, Validation Loss: 0.3211
Epoch 4/5, Train Loss: 0.4137, Validation Loss: 0.4034
Epoch 5/5, Train Loss: 0.3956, Validation Loss: 0.4035


In [None]:
# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=10, random_state=42)
X = reconstruction_errors.reshape(-1, 1)
y = u5_s01_equil_turbine[['anomaly_01_type_a', 'anomaly_01_type_b', 'anomaly_01_type_c']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
conf_matrices = multilabel_confusion_matrix(y_test, y_pred)
print(conf_matrices)

Accuracy: 0.2726803743827843

Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.53      0.53      6092
           1       0.46      0.44      0.45      5498
           2       0.47      0.43      0.45      5852

   micro avg       0.49      0.47      0.48     17442
   macro avg       0.49      0.47      0.48     17442
weighted avg       0.49      0.47      0.48     17442
 samples avg       0.32      0.31      0.29     17442

[[[4717 2760]
  [2861 3231]]

 [[5302 2769]
  [3092 2406]]

 [[4868 2849]
  [3335 2517]]]


### u5 s02

In [58]:

# # Preprocess u5_train_equil_turbine with scaler validation
# scaler = StandardScaler()  # Initialize scaler
# u5_train_turbine_scaled = scaler.fit_transform(u5_train_equil_turbine)  # Fit scaler and transform data
# u5_train_turbine_data = torch.tensor(u5_train_turbine_scaled, dtype=torch.float32)  # Convert to tensor

# # Train-validation split
# train_tensor, val_tensor = train_test_split(u5_train_turbine_data, test_size=0.2, random_state=42)

# # Initialize and train the autoencoder
# input_dim = train_tensor.shape[1]
# model = FastAutoencoder(input_dim)
# u5_turbine_trained_model = train_autoencoder_fast(model, train_tensor, val_tensor)

# # Ensure that the columns in the test data match the training data
u5_s02_turbine_data_preprocessed = preprocess_test_data(u5_s02_equil_turbine, u5_train_equil_turbine.columns, scaler)

# ground_truth = u5_s01_equil_turbine['anomaly']  # Replace 'anomaly' with the actual column name

# Evaluate reconstruction errors
reconstruction_errors_u5_s02_turbine = evaluate_reconstruction(u5_turbine_trained_model, u5_s02_turbine_data_preprocessed)


In [59]:
# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=10, random_state=42)
X = reconstruction_errors_u5_s02_turbine.reshape(-1, 1)
y = u5_s02_equil_turbine[['anomaly_02_type_a', 'anomaly_02_type_b', 'anomaly_02_type_c']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
conf_matrices = multilabel_confusion_matrix(y_test, y_pred)
print(conf_matrices)

Accuracy: 0.31229869974949304

Classification Report:
               precision    recall  f1-score   support

           0       0.22      0.19      0.20      2882
           1       0.64      0.64      0.64      9762
           2       0.59      0.57      0.58      9759

   micro avg       0.57      0.55      0.56     22403
   macro avg       0.48      0.47      0.48     22403
weighted avg       0.57      0.55      0.56     22403
 samples avg       0.48      0.47      0.46     22403

[[[11954  1930]
  [ 2335   547]]

 [[ 3521  3483]
  [ 3504  6258]]

 [[ 3123  3884]
  [ 4183  5576]]]


### u6 s01 turbine

In [45]:
# Preprocess u6_train_equil_turbine
u6_train_turbine_data, scaler = preprocess_data_fast(u6_train_equil_turbine)

# Train-validation split
train_tensor, val_tensor = train_test_split(u6_train_turbine_data, test_size=0.2, random_state=42)

# Initialize and train the autoencoder
input_dim = train_tensor.shape[1]
model = FastAutoencoder(input_dim)
u6_turbine_trained_model = train_autoencoder_fast(model, train_tensor, val_tensor)

u6_s01_turbine_data_preprocessed = preprocess_test_data(u6_s01_equil_turbine, u6_train_equil_turbine.columns, scaler)

# Evaluate reconstruction errors
reconstruction_errors_u6_turbine = evaluate_reconstruction(u6_turbine_trained_model, u6_s01_turbine_data_preprocessed)

Epoch 1/5, Train Loss: 0.1708, Validation Loss: 0.1186
Epoch 2/5, Train Loss: 0.1251, Validation Loss: 0.1194
Epoch 3/5, Train Loss: 0.1866, Validation Loss: 0.1560
Epoch 4/5, Train Loss: 0.1668, Validation Loss: 0.1743
Epoch 5/5, Train Loss: 0.1848, Validation Loss: 0.1877


In [53]:
X = reconstruction_errors_u6_turbine.reshape(-1,1)
y = u6_s01_equil_turbine[['anomaly_01_type_a', 'anomaly_01_type_b', 'anomaly_01_type_c']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
conf_matrices = multilabel_confusion_matrix(y_test, y_pred)
print(conf_matrices)

Accuracy: 0.2209089801389058

Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.46      0.48      7708
           1       0.51      0.49      0.50      8119
           2       0.43      0.40      0.41      6054

   micro avg       0.48      0.45      0.47     21881
   macro avg       0.48      0.45      0.46     21881
weighted avg       0.48      0.45      0.47     21881
 samples avg       0.35      0.34      0.32     21881

[[[5052 3654]
  [4147 3561]]

 [[4496 3799]
  [4155 3964]]

 [[7175 3185]
  [3650 2404]]]
