In [None]:
# =============================================================================
# CONFIGURATION & PATHS
# =============================================================================

import os
import json
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# File paths.
# The defaults below are the original author's local Windows locations. Set the
# ANOMALY_INPUT_FILE / ANOMALY_OUTPUT_DIR environment variables to point the
# notebook at another machine's data without editing this cell.
INPUT_FILE = os.environ.get(
    "ANOMALY_INPUT_FILE",
    r"C:\Users\Ken Ira Talingting\Desktop\anomaly-detection-project\data\processed\equipment_anomaly_data_feature_engineered.csv",
)
OUTPUT_DIR = Path(
    os.environ.get(
        "ANOMALY_OUTPUT_DIR",
        r"C:\Users\Ken Ira Talingting\Desktop\anomaly-detection-project\data\processed_results",
    )
)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # Ensure the output directory exists

# Set random seeds for reproducibility (NumPy and TensorFlow global generators)
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)


In [None]:
# =============================================================================
# DATA LOADING & PREPROCESSING FUNCTIONS
# =============================================================================

def load_data(file_path):
    """
    Read a CSV file and return it as a DataFrame with boolean columns as integers.

    Parameters:
        file_path (str): Location of the CSV file to read.

    Returns:
        DataFrame: The parsed data; any column parsed as ``bool`` is cast to
        ``int`` (True -> 1, False -> 0), all other columns are left untouched.
    """
    frame = pd.read_csv(file_path)

    # Nothing to convert when the file has no boolean columns.
    flag_columns = frame.select_dtypes(include='bool').columns
    if flag_columns.empty:
        return frame

    # Cast only the boolean columns; every other dtype is preserved.
    return frame.astype({name: int for name in flag_columns})

def preprocess_data(df, target_column='faulty'):
    """
    Split a DataFrame into features and target, and standardize the features.

    Parameters:
        df (DataFrame): Input DataFrame containing the features and the target column.
        target_column (str): Name of the column containing ground truth anomaly labels.

    Returns:
        X_scaled (DataFrame): Standardized features, with the same column names
            and the same row index as ``df`` so rows can be aligned back to it.
        y (np.array): Ground truth anomaly labels taken from ``target_column``.
        scaler (StandardScaler): Scaler fitted on the features (for inverse
            transforming or future use).

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    # Separate features and target variable
    X = df.drop(columns=[target_column])
    y = df[target_column].values

    # Scale features to zero mean and unit variance.
    # The scaler is fitted on every row passed in (no train/test separation here).
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(
        scaler.fit_transform(X),
        columns=X.columns,
        index=X.index,  # keep the caller's index instead of resetting to 0..n-1
    )

    return X_scaled, y, scaler

# Example usage:
# df = load_data(INPUT_FILE)
# X, y, scaler = preprocess_data(df)


In [None]:
# =============================================================================
# AUTOENCODER MODEL DEFINITION
# =============================================================================

def build_autoencoder(input_dim, encoding_dim=8):
    """
    Create and compile a small dense autoencoder with a symmetric layout.

    The network is: input -> Dense(2 * encoding_dim, relu)
    -> Dense(encoding_dim, relu) -> Dense(2 * encoding_dim, relu)
    -> Dense(input_dim, linear).

    Parameters:
        input_dim (int): Number of input features (also the output width).
        encoding_dim (int): Width of the bottleneck (latent) layer.

    Returns:
        Model: Keras model compiled with the Adam optimizer and MSE loss.
    """
    hidden_units = encoding_dim * 2

    inputs = keras.Input(shape=(input_dim,))

    # Encoder half: widen-to-narrow down to the bottleneck.
    x = layers.Dense(hidden_units, activation='relu')(inputs)
    bottleneck = layers.Dense(encoding_dim, activation='relu')(x)

    # Decoder half: mirror of the encoder; linear output so any real value
    # can be reconstructed.
    x = layers.Dense(hidden_units, activation='relu')(bottleneck)
    outputs = layers.Dense(input_dim, activation='linear')(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='mse')
    return model

# Example usage:
# autoencoder = build_autoencoder(input_dim=X.shape[1], encoding_dim=8)


In [None]:
# =============================================================================
# TRAINING & EVALUATION FUNCTIONS
# =============================================================================

def train_autoencoder(model, X, epochs=50, batch_size=32, validation_split=0.1, verbose=1):
    """
    Train the autoencoder model to reconstruct its own input.

    Parameters:
        model (Model): Compiled Keras autoencoder.
        X (array-like): Training data (features); used as both input and target.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training.
        validation_split (float): Fraction of ``X`` held out for validation.
            Per the Keras documentation this slice is taken from the end of
            ``X`` before shuffling, so row order affects what is held out.
        verbose (int): Verbosity level forwarded to ``model.fit``.

    Returns:
        history: Whatever ``model.fit`` returns (the Keras training history,
        usable for plotting losses).
    """
    return model.fit(
        X, X,  # Input and target are the same for an autoencoder
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        validation_split=validation_split,
        verbose=verbose,
    )

def compute_reconstruction_errors(model, X):
    """
    Compute the reconstruction error for each sample.

    Parameters:
        model (Model): Trained autoencoder exposing ``predict``.
        X (DataFrame or array-like): Input data, one sample per row.

    Returns:
        errors (np.ndarray): 1-D array with the mean squared reconstruction
        error of each row. Always a NumPy array, even when ``X`` is a
        DataFrame (no pandas index is carried over).
    """
    # Work on a plain ndarray so the subtraction and the return type do not
    # depend on whether the caller passed a DataFrame, a list or an array.
    X_arr = np.asarray(X)

    # Reconstruct the input using the autoencoder
    X_pred = np.asarray(model.predict(X_arr))

    # Mean of the squared differences across features (axis=1) -> one value per sample
    return np.mean(np.square(X_arr - X_pred), axis=1)

def determine_threshold(errors, quantile=0.95):
    """
    Pick the anomaly cut-off as a quantile of the reconstruction errors.

    Parameters:
        errors (np.array): Per-sample reconstruction errors.
        quantile (float): Quantile in [0, 1] to use as the cut-off
            (0.95 means the 95th percentile).

    Returns:
        float: The value of ``errors`` at the requested quantile.
    """
    return np.quantile(errors, quantile)

def evaluate_anomalies(y_true, errors, threshold):
    """
    Evaluate anomaly detection performance against ground truth labels.

    Parameters:
        y_true (np.array): Ground truth anomaly labels (0 = normal, 1 = anomaly).
        errors (array-like): Reconstruction errors, one per sample.
        threshold (float): Errors strictly greater than this are flagged as anomalies.

    Returns:
        anomaly_labels (np.array): Binary labels (1 for anomaly, 0 for normal).
        report (str): Classification report.
        auc (float): ROC AUC score, computed from the raw errors as scores.

    Notes:
        ``roc_auc_score`` needs both classes present in ``y_true`` to be
        meaningful; how a single-class input is handled depends on the
        installed scikit-learn version.
    """
    # Accept lists / Series as well as arrays and always return an ndarray.
    errors = np.asarray(errors)

    # Flag samples with reconstruction error above threshold as anomalies
    anomaly_labels = (errors > threshold).astype(int)

    # Pin labels to [0, 1] so the two target_names always line up, even when
    # one class is missing from both y_true and the predictions; zero_division=0
    # reports 0 for undefined precision/recall instead of emitting warnings.
    report = classification_report(
        y_true,
        anomaly_labels,
        labels=[0, 1],
        target_names=['Normal', 'Anomaly'],
        zero_division=0,
    )
    auc = roc_auc_score(y_true, errors)

    return anomaly_labels, report, auc

# Example usage:
# history = train_autoencoder(autoencoder, X.values)
# errors = compute_reconstruction_errors(autoencoder, X.values)
# threshold = determine_threshold(errors)
# anomaly_labels, report, auc = evaluate_anomalies(y, errors, threshold)


In [None]:
# =============================================================================
# RESULT SAVING & VISUALIZATION
# =============================================================================

def save_model(model, output_dir, model_filename='autoencoder_model.h5'):
    """
    Save the trained Keras model.

    Parameters:
        model (Model): Trained Keras autoencoder (anything exposing ``save(path)``).
        output_dir (Path or str): Directory to save the model in.
        model_filename (str): Name of the model file.
    """
    # Path(...) lets callers pass a plain string as well as a Path object.
    model_path = Path(output_dir) / model_filename
    model.save(model_path)
    print("Model saved to:", model_path)

def save_results(anomaly_labels, errors, threshold, report, auc, output_dir):
    """
    Save evaluation results including anomaly labels, reconstruction errors, threshold, and metrics.

    Writes two files into ``output_dir``:
      * ``autoencoder_anomaly_detection_results.csv`` - one row per sample with
        the predicted label and reconstruction error.
      * ``autoencoder_model_metadata.json`` - threshold, ROC AUC and the
        classification report text.

    Parameters:
        anomaly_labels (np.array): Predicted anomaly labels.
        errors (np.array): Reconstruction errors.
        threshold (float): Anomaly threshold.
        report (str): Classification report.
        auc (float): ROC AUC score.
        output_dir (Path or str): Existing directory to save the results in.
    """
    # Path(...) lets callers pass a plain string as well as a Path object.
    output_dir = Path(output_dir)

    results_df = pd.DataFrame({
        'anomaly_label': anomaly_labels,
        'reconstruction_error': errors
    })
    results_csv = output_dir / 'autoencoder_anomaly_detection_results.csv'
    results_df.to_csv(results_csv, index=False)

    # Save threshold and metrics to JSON.
    # NumPy scalars other than float64 (e.g. np.float32) are rejected by the
    # json module, so convert to built-in floats before serialising.
    metadata = {
        'threshold': float(threshold),
        'roc_auc': float(auc),
        'classification_report': report
    }
    metadata_file = output_dir / 'autoencoder_model_metadata.json'
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print("Results saved to:", results_csv)
    print("Metadata saved to:", metadata_file)

def visualize_errors(errors, threshold, output_dir, dpi=300):
    """
    Plot and save a histogram of reconstruction errors with the threshold.

    The figure is written to ``reconstruction_error_distribution.png`` inside
    ``output_dir`` and is closed afterwards (it is not displayed inline).

    Parameters:
        errors (np.array): Reconstruction errors.
        threshold (float): Threshold value, drawn as a dashed vertical line.
        output_dir (Path or str): Existing directory to save the plot in.
        dpi (int): Resolution for the saved image.
    """
    # Path(...) lets callers pass a plain string as well as a Path object.
    plot_path = Path(output_dir) / 'reconstruction_error_distribution.png'

    # Explicit figure/axes instead of the implicit pyplot "current figure",
    # so nothing else that is open in the notebook can be drawn on by accident.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.histplot(errors, bins=50, kde=True, color='blue', ax=ax)
        ax.axvline(threshold, color='red', linestyle='--', label=f"Threshold: {threshold:.4f}")
        ax.set_title('Reconstruction Error Distribution')
        ax.set_xlabel('Reconstruction Error (MSE)')
        ax.set_ylabel('Frequency')
        ax.legend()
        fig.savefig(plot_path, dpi=dpi)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

    print("Error distribution plot saved to:", plot_path)

# Example usage:
# save_results(anomaly_labels, errors, threshold, report, auc, OUTPUT_DIR)
# save_model(autoencoder, OUTPUT_DIR)
# visualize_errors(errors, threshold, OUTPUT_DIR)



In [None]:
# =============================================================================
# MAIN EXECUTION FUNCTION
# =============================================================================

def main():
    """
    Run the whole pipeline end to end.

    Loads INPUT_FILE, standardizes the features, trains the autoencoder,
    scores every row by reconstruction error, thresholds at the 95th
    percentile, then writes the results, model, plot and scaler to OUTPUT_DIR.

    NOTE(review): the model is trained and then scored on the same full
    dataset (rows labelled faulty included), so the reported metrics are
    in-sample - confirm this is intended.
    """
    # --- Step 1: load the CSV and scale the features -----------------------
    raw_df = load_data(INPUT_FILE)
    X, y, scaler = preprocess_data(raw_df, target_column='faulty')
    print("Data loaded and preprocessed. Shape of X:", X.shape)

    # --- Step 2: build and fit the autoencoder -----------------------------
    feature_matrix = X.values
    autoencoder = build_autoencoder(input_dim=X.shape[1], encoding_dim=8)

    # The training history is not used further, so it is not kept.
    train_autoencoder(autoencoder, feature_matrix, epochs=50, batch_size=32)
    print("Autoencoder training complete.")

    # --- Step 3: score, threshold and evaluate -----------------------------
    errors = compute_reconstruction_errors(autoencoder, feature_matrix)
    threshold = determine_threshold(errors, quantile=0.95)
    anomaly_labels, report, auc = evaluate_anomalies(y, errors, threshold)

    print("Classification Report:\n", report)
    print("ROC AUC Score:", auc)
    print("Determined Threshold:", threshold)

    # --- Step 4: persist predictions, metrics and the model ----------------
    save_results(anomaly_labels, errors, threshold, report, auc, OUTPUT_DIR)
    save_model(autoencoder, OUTPUT_DIR)

    # --- Step 5: plot the error distribution -------------------------------
    visualize_errors(errors, threshold, OUTPUT_DIR)

    # Keep the fitted scaler next to the model so new data can be scaled the same way.
    scaler_file = OUTPUT_DIR / 'scaler.joblib'
    joblib.dump(scaler, scaler_file)
    print("Scaler saved to:", scaler_file)

# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
