In [1]:
import os

In [2]:
%pwd


'c:\\Users\\Jaison\\Documents\\Workspace\\Main Projects\\Audio-Based-Anomaly-Detection-for-Industrial-Machinery-End-to-End-Project-using-MLflow-DVC\\notebooks'

In [3]:
os.chdir("../")
%pwd

'c:\\Users\\Jaison\\Documents\\Workspace\\Main Projects\\Audio-Based-Anomaly-Detection-for-Industrial-Machinery-End-to-End-Project-using-MLflow-DVC'

## Entity

In [31]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TrainingConfig:
    root_dir: Path
    trained_model_path: Path
    normal_features_path: Path
    abnormal_features_path: Path
    feature_names_path: Path
    feature_importance_path: Path
    params_epochs: int
    params_batch_size: int
    params_feature_count: int

In [32]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TrainingConfig:
    root_dir: Path
    trained_model_path: Path
    normal_features_path: Path
    abnormal_features_path: Path
    feature_names_path: Path
    feature_importance_path: Path
    params_epochs: int
    params_batch_size: int
    params_feature_count: int


##Config

In [33]:
from Anomaly_Detection.constants import *
from Anomaly_Detection.utils.common import read_yaml, create_directories,save_bin,load_bin

In [35]:

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])


    def get_training_config(self) -> TrainingConfig:
        training = self.config.training
        params = self.params.training
        create_directories([
            Path(training.root_dir)
        ])

        training_config  = TrainingConfig(
            root_dir=Path(training.root_dir),
            trained_model_path=Path(training.trained_model_path),
            normal_features_path=Path(training.normal_features_path),
            abnormal_features_path=Path(training.abnormal_features_path),
            feature_names_path=Path(training.feature_names_path),
            feature_importance_path=Path(training.feature_importance_path),
            params_epochs=params.EPOCHS,
            params_batch_size=params.BATCH_SIZE,
            params_feature_count=params.FEATURE_COUNT
            
        )

        return training_config

## Component

In [36]:
import os
import urllib.request as request
import tensorflow as tf

from Anomaly_Detection import logger
from Anomaly_Detection.utils.common import get_size
import pandas as pd

import numpy as np
import librosa
import os
import joblib
from sklearn.model_selection import train_test_split

In [37]:
from sklearn.model_selection import StratifiedKFold
from scipy.stats import ttest_ind
from tensorflow.keras import layers, models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, precision_recall_curve
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [51]:
class Training:
    def __init__(self, config: TrainingConfig):
        self.config = config

    def enhanced_autoencoder(self,input_dim):
        input_layer = Input(shape=(input_dim,))

        # Encoder
        encoder = Dense(128, activation='relu')(input_layer)
        encoder = BatchNormalization()(encoder)
        encoder = Dropout(0.1)(encoder)
        encoder = Dense(64, activation='relu')(encoder)
        encoder = BatchNormalization()(encoder)
        encoder = Dropout(0.1)(encoder)
        encoder = Dense(32, activation='relu')(encoder)

        # Decoder
        decoder = Dense(64, activation='relu')(encoder)
        decoder = BatchNormalization()(decoder)
        decoder = Dropout(0.1)(decoder)
        decoder = Dense(128, activation='relu')(decoder)
        decoder = BatchNormalization()(decoder)
        decoder = Dropout(0.1)(decoder)
        output_layer = Dense(input_dim, activation='sigmoid')(decoder)

        autoencoder = Model(inputs=input_layer, outputs=output_layer)
        autoencoder.compile(optimizer='adam', loss='mean_squared_error')
        return autoencoder
    
    def model_training(self,X_train_scaled, X_val_scaled):
        # Adjusting input_dim based on your feature dimensions
        input_dim = X_train_scaled.shape[1]
        autoencoder = self.enhanced_autoencoder(input_dim)
        early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        autoencoder.fit(
            X_train_scaled, X_train_scaled,
            epochs=400,  # Increase epochs if necessary
            batch_size=256,
            shuffle=True,
            validation_data=(X_val_scaled, X_val_scaled),
            callbacks=[early_stopping],
            verbose=0
            )
        return autoencoder

    def feature_importance(self,autoencoder, X_combined_test):
        # Predict the reconstructed sounds for the combined test set
        reconstructed_combined = autoencoder.predict(X_combined_test)

        # Calculate the mean squared reconstruction error for each feature
        mse_features = np.mean(np.power(X_combined_test - reconstructed_combined, 2), axis=0)

        # Rank features by reconstruction error
        feature_importance_ranking = np.argsort(mse_features)[::-1]  # Features with the highest error first
        logger.info(f"feature_importance_ranking: {feature_importance_ranking}")
        return feature_importance_ranking
    
    def model_evaluation(self,autoencoder,X_combined_test, y_combined_test):
        reconstructed_combined = autoencoder.predict(X_combined_test)
        mse_combined = np.mean(np.power(X_combined_test - reconstructed_combined, 2), axis=1)
        precisions, recalls, thresholds = precision_recall_curve(y_combined_test, mse_combined)
        # Calculate precision-recall curve
        precisions, recalls, thresholds = precision_recall_curve(y_combined_test, mse_combined)

        # Calculate F1 score for each threshold
        f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
        optimal_idx = np.argmax(f1_scores)
        optimal_threshold = thresholds[optimal_idx]

        # Use the optimal threshold to define anomalies
        optimal_predictions = (mse_combined > optimal_threshold).astype(int)

        # Calculate metrics using the optimal threshold
        optimal_accuracy = accuracy_score(y_combined_test, optimal_predictions)
        optimal_precision = precision_score(y_combined_test, optimal_predictions)
        optimal_recall = recall_score(y_combined_test, optimal_predictions)
        optimal_f1 = f1_score(y_combined_test, optimal_predictions)
        optimal_cm = confusion_matrix(y_combined_test, optimal_predictions)
        # Print metrics using the optimal threshold
        logger.info(f"Optimal Threshold: {optimal_threshold}")
        logger.info(f"Accuracy: {optimal_accuracy}")
        logger.info(f"Precision: {optimal_precision}")
        logger.info(f"Recall: {optimal_recall}")
        logger.info(f"F1 Score: {optimal_f1}")
        logger.info(f"confusion_matrix: {optimal_cm}")

    
    def feature_selection(self, N, feature_importance_ranking, feature_names):
        top_features_indices = feature_importance_ranking[:N]
        top_features=[]
        for rank in feature_importance_ranking[:N]:
            top_features.append(feature_names[rank])

        return top_features,top_features_indices

    def train_test_spliting(self,top_features_indices):

        # Load normal_features.pkl
        normal_features = joblib.load(self.config.normal_features_path)
        logger.info(f"Loaded normal features {normal_features.shape}.")
        # Load abnormal_features.pkl
        abnormal_features = joblib.load(self.config.abnormal_features_path)
        logger.info(f"Loaded abnormal features {abnormal_features.shape}.")
        
        # Subset the features for both normal and abnormal data
        normal_features = normal_features[:, top_features_indices]
        abnormal_features = abnormal_features[:, top_features_indices]

        X_train, X_val = train_test_split(normal_features, test_size=0.2, random_state=42)
        X_test = abnormal_features
        scaler = StandardScaler()
        # Fit the scaler on the training data and transform both training, validation, and test sets
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        X_test_scaled = scaler.transform(X_test)
        # Combine normal and abnormal data
        X_combined_test = np.concatenate((X_val_scaled, X_test_scaled))
        y_combined_test = np.concatenate((np.zeros(len(X_val_scaled)), np.ones(len(X_test_scaled))))  # 0 for normal, 1 for abnormal

        joblib.dump(X_train_scaled,(os.path.join(self.config.root_dir, "X_train_scaled.pkl")))
        joblib.dump(X_val_scaled,(os.path.join(self.config.root_dir, "X_val_scaled.pkl")))
        joblib.dump(X_combined_test,(os.path.join(self.config.root_dir, "X_combined_test.pkl")))
        joblib.dump(y_combined_test,(os.path.join(self.config.root_dir, "y_combined_test.pkl")))
        
        logger.info("Splited data into training and test sets")
        logger.info(f"Saved X_train_scaled {X_train_scaled.shape} into file.")
        logger.info(f"Saved X_train_scaled {X_val_scaled.shape} into file.")
        logger.info(f"Saved X_combined_test {X_combined_test.shape} into file.")
        logger.info(f"Saved y_combined_test {y_combined_test.shape} into file.")
        return X_train_scaled,X_val_scaled,X_combined_test,y_combined_test

    def train(self):
        logger.info(f"Starting Model Building")
        feature_names = joblib.load(self.config.feature_names_path)
        abnormal_features_path = joblib.load(self.config.abnormal_features_path)
        normal_features_path = joblib.load(self.config.normal_features_path)
        n = self.config.params_feature_count
        feature_importance_ranking= joblib.load(self.config.feature_importance_path)
        
        top_features,top_features_indices = self.feature_selection(n, feature_importance_ranking, feature_names)
        X_train_scaled,X_val_scaled,X_combined_test,y_combined_test=self.train_test_spliting(top_features_indices)
        
        autoencoder = self.model_training(X_train_scaled, X_val_scaled)
        self.model_evaluation(autoencoder,X_combined_test, y_combined_test)
        feature_importance_ranking = self.feature_importance(autoencoder,X_combined_test)

        joblib.dump(autoencoder,(os.path.join(self.config.root_dir, "autoencoder.pkl")))
        joblib.dump(feature_importance_ranking,(os.path.join(self.config.root_dir, "updated_feature_importance_ranking.pkl")))
        autoencoder.save((os.path.join(self.config.root_dir, 'Encoder_Model.keras')))        


## Pipeline

In [52]:
try:
    config = ConfigurationManager()
    training_config = config.get_training_config()
    prepare_base_model = Training(config=training_config)
    prepare_base_model.train()

except Exception as e:
    raise e


[2023-12-03 11:10:36,596: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-03 11:10:36,609: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-03 11:10:36,613: INFO: common: created directory at: artifacts]
[2023-12-03 11:10:36,618: INFO: common: created directory at: artifacts\training]
[2023-12-03 11:10:36,623: INFO: 3476346288: Starting Model Building]
[2023-12-03 11:10:36,635: INFO: 3476346288: Loaded normal features (381, 23).]
[2023-12-03 11:10:36,643: INFO: 3476346288: Loaded abnormal features (138, 23).]
[2023-12-03 11:10:36,668: INFO: 3476346288: Splited data into training and test sets]
[2023-12-03 11:10:36,673: INFO: 3476346288: Saved X_train_scaled (304, 5) into file.]
[2023-12-03 11:10:36,678: INFO: 3476346288: Saved X_train_scaled (77, 5) into file.]
[2023-12-03 11:10:36,682: INFO: 3476346288: Saved X_combined_test (215, 5) into file.]
[2023-12-03 11:10:36,687: INFO: 3476346288: Saved y_combined_test (215,) into file.]


[2023-12-03 11:11:01,650: INFO: 3476346288: Optimal Threshold: 1.2570316070644278]
[2023-12-03 11:11:01,656: INFO: 3476346288: Accuracy: 0.9488372093023256]
[2023-12-03 11:11:01,661: INFO: 3476346288: Precision: 0.9847328244274809]
[2023-12-03 11:11:01,665: INFO: 3476346288: Recall: 0.9347826086956522]
[2023-12-03 11:11:01,672: INFO: 3476346288: F1 Score: 0.9591078066914499]
[2023-12-03 11:11:01,676: INFO: 3476346288: confusion_matrix: [[ 75   2]
 [  9 129]]]
[2023-12-03 11:11:01,925: INFO: 3476346288: feature_importance_ranking: [0 1 4 3 2]]
