In [None]:
# Import dependencies
import os
import sys
from pathlib import Path

try:
    # The project root is taken to be the parent of the current working directory
    current_dir = os.getcwd()
    root_dir = Path(current_dir).parent

    # Make the root importable so Python can find the utils package.
    # Guarded so that re-running this cell does not keep growing sys.path.
    if str(root_dir) not in sys.path:
        sys.path.append(str(root_dir))
        print(f"Added {root_dir} to Python path")

    # Standard libraries
    import numpy as np
    import pandas as pd
    import itertools
    import h5py

    # Data processing and visualization
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import signal, stats
    import pywt
    from tqdm import tqdm

    # Machine learning
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report, confusion_matrix
    from imblearn.over_sampling import SMOTE, SMOTENC
    import lightgbm as lgb

    # Custom utilities
    from utils import data_loader_utils
    from utils.feature_extraction import transform_data
    from utils.model_validation import perform_cross_validation

    print("Dependencies loaded successfully ✅")
except Exception as e:
    print(f"Error loading dependencies: {e}")
    # Re-raise: continuing with a half-initialised namespace would only surface
    # later as confusing NameErrors in the cells that use these imports.
    raise


In [None]:
def load_data(exclude_processes=None):
    """
    Gather the samples of every machine / process / label folder under ``<root_dir>/data``.

    Args:
        exclude_processes (list, optional): Process names (e.g. "OP03") to leave out.
            A falsy value (None, empty list) keeps every process.

    Returns:
        tuple: (X_data, y_data, y_binary) - the loaded samples, the label strings handed
        back by the loader, and one 0/1 flag per label string (0 when the text after the
        last underscore is "good", 1 otherwise).

    Note:
        A failure while loading is caught and printed; whatever was collected up to
        that point is still returned.
    """
    machines = ("M01", "M02", "M03")
    labels = ("good", "bad")

    # OP00 .. OP14, minus anything the caller asked to skip
    skipped = exclude_processes or ()
    process_names = [
        name for name in (f"OP{idx:02d}" for idx in range(15)) if name not in skipped
    ]

    path_to_dataset = os.path.join(root_dir, "data")
    X_data, y_data = [], []

    try:
        # Materialise the (process, machine, label) grid once so the bar knows its length
        combos = list(itertools.product(process_names, machines, labels))

        with tqdm(total=len(combos), desc="Loading data") as pbar:
            for process_name, machine, label in combos:
                folder = os.path.join(path_to_dataset, machine, process_name, label)
                samples, sample_labels = data_loader_utils.load_tool_research_data(
                    folder, label=label
                )
                X_data.extend(samples)
                y_data.extend(sample_labels)
                pbar.update(1)
                pbar.set_postfix({"Samples": len(X_data)})

        print(f"Data loaded successfully ✅ - {len(X_data)} samples")
    except Exception as e:
        print(f"Error loading data: {e}")

    # Derive the binary target from the suffix of each full label string
    y_binary = []
    for label_str in y_data:
        suffix = label_str.split("_")[-1]
        y_binary.append(0 if suffix == "good" else 1)

    return X_data, y_data, y_binary

In [None]:
# Load every machine and process (nothing excluded).
# Binary label encoding produced by load_data: 0 == good, 1 == bad.
X, y, y_binary = load_data()

In [None]:
# Sanity check: display the binary label of the first loaded sample
y_binary[0]

In [None]:
# Hold out 20% of the samples for testing. Stratifying on the binary label keeps the
# good/bad ratio the same in both splits; the fixed seed makes the split repeatable.
trainX, testX, trainy, testy = train_test_split(
    X,
    y_binary,
    test_size=0.2,
    stratify=y_binary,
    random_state=42,
)

In [None]:

# Pass the training split through utils.feature_extraction.transform_data in binary-label mode.
# NOTE(review): presumably returns a feature matrix plus a matching label array — confirm in utils.
trainX_tr, trainy_tr = transform_data(trainX,trainy, label_type='binary')



In [None]:
# Pass the held-out split through the same transform_data call used for the training split.
# NOTE(review): presumably returns a feature matrix plus a matching label array — confirm in utils.
testX_tr, testy_tr = transform_data(testX, testy, label_type='binary')


In [None]:
# Tally the normal (label 0) and anomaly (label 1) samples in each split
train_normal_count, train_anomaly_count = [(trainy_tr == cls).sum() for cls in (0, 1)]
test_normal_count, test_anomaly_count = [(testy_tr == cls).sum() for cls in (0, 1)]

# Report the class balance of both splits
print(f"Training set: {train_normal_count} normal samples, {train_anomaly_count} anomaly samples")
print(f"Test set: {test_normal_count} normal samples, {test_anomaly_count} anomaly samples")


In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.svm import OneClassSVM

In [None]:
# One-Class SVM anomaly detection: train on normal samples only, evaluate on the test split.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# define outlier detection model
model = OneClassSVM(gamma='scale', nu=0.01)
# fit on the majority (normal, label 0) class only
trainX_tr_zero = trainX_tr[trainy_tr == 0]
model.fit(trainX_tr_zero)
# detect outliers in the test set; the model marks inliers as 1 and outliers as -1
yhat = model.predict(testX_tr)

# Re-encode the ground truth into the same convention (normal 0 -> 1, anomaly 1 -> -1).
# This is done on a copy, with both masks taken from the untouched testy_tr, so the
# cell can be re-run safely: rewriting testy_tr in place would turn every label into
# -1 on a second execution.
testy_oc = testy_tr.copy()
testy_oc[testy_tr == 1] = -1
testy_oc[testy_tr == 0] = 1

# calculate score, treating the anomaly class (-1) as the positive class
score = f1_score(testy_oc, yhat, pos_label=-1, average='binary')
print('F1 Score: %.3f' % score)

# create and display confusion matrix
# The class order is pinned explicitly: row/column 0 is -1 (anomaly) and row/column 1
# is 1 (normal), so the display labels must be listed in that same order.
cm = confusion_matrix(testy_oc, yhat, labels=[-1, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Anomaly', 'Normal'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()