In [None]:
# Import dependencies and make the project root importable.
import os
import sys
from pathlib import Path

# NOTE(review): everything below runs inside a single try/except that only prints
# the error. If any import fails, the notebook keeps going and the names that were
# not yet imported stay undefined, so later cells will fail with NameError.
try:
    # Get the current working directory
    current_dir = os.getcwd()

    # Set the root directory to the parent of the current directory
    # (root_dir is read later by load_data to locate the "data" folder)
    root_dir = Path(current_dir).parent

    # Add the root directory to sys.path so Python can find the utils module
    sys.path.append(str(root_dir))
    print(f"Added {root_dir} to Python path")

    # Standard libraries
    import numpy as np
    import pandas as pd
    import itertools
    import h5py

    # Data processing and visualization
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import signal, stats
    import pywt
    from tqdm import tqdm

    # Machine learning
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report, confusion_matrix
    from imblearn.over_sampling import SMOTE, SMOTENC
    import lightgbm as lgb
    

    # Custom utilities (resolved through the root_dir entry added to sys.path above)
    from utils import data_loader_utils
    from utils.feature_extraction import transform_data
    from utils.model_validation import perform_cross_validation

    # NOTE(review): SMOTE, confusion_matrix and matplotlib.pyplot are imported a
    # second time below; the repeats rebind the same names and are redundant.
    from imblearn.over_sampling import SMOTE

    from sklearn.datasets import make_classification
    from sklearn.metrics import f1_score
    from sklearn.svm import OneClassSVM
    # create and display confusion matrix
    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
    import matplotlib.pyplot as plt


    print("Dependencies loaded successfully ✅")
except Exception as e:
    # Failure is reported but not re-raised (see the note above the try).
    print(f"Error loading dependencies: {e}")


In [None]:
def load_data(exclude_processes=None):
    """
    Read the samples of every machine / process / quality-label combination.

    The dataset is looked up under ``<root_dir>/data/<machine>/<process>/<label>``,
    where ``root_dir`` is the notebook-level variable set in the setup cell.
    If loading raises, the error is printed and whatever was collected up to
    that point is still returned.

    Args:
        exclude_processes (list, optional): Process names (e.g. "OP03") to leave out.

    Returns:
        tuple: (X_data, y_data, y_binary) where X_data holds the samples, y_data the
        full label strings, and y_binary is 0 when the last "_"-separated field of
        the label is "good" and 1 otherwise.
    """
    machine_ids = ["M01", "M02", "M03"]
    quality_labels = ["good", "bad"]
    # OP00 .. OP14
    selected_processes = [f"OP{idx:02d}" for idx in range(15)]

    # Drop the processes the caller asked to skip
    if exclude_processes:
        selected_processes = [
            name for name in selected_processes if name not in exclude_processes
        ]

    dataset_root = os.path.join(root_dir, "data")

    X_data, y_data = [], []

    try:
        # Every (process, machine, label) folder to visit, in product order
        combos = list(itertools.product(selected_processes, machine_ids, quality_labels))

        # Progress bar advances once per folder
        with tqdm(total=len(combos), desc="Loading data") as progress:
            for process_name, machine, label in combos:
                folder = os.path.join(dataset_root, machine, process_name, label)
                samples, sample_labels = data_loader_utils.load_tool_research_data(folder, label=label)
                X_data += samples
                y_data += sample_labels
                progress.update(1)
                progress.set_postfix({"Samples": len(X_data)})

        print(f"Data loaded successfully ✅ - {len(X_data)} samples")
    except Exception as e:
        print(f"Error loading data: {e}")

    # Binary target derived from the trailing field of each label string
    y_binary = [int(full_label.rsplit("_", 1)[-1] != "good") for full_label in y_data]

    return X_data, y_data, y_binary

import pandas as pd
from sklearn.metrics import f1_score, confusion_matrix

def create_results_df():
    """
    Build the empty results table that record_result fills one row at a time.

    Columns, in order:
      – M01_pct, M02_pct, M03_pct: fractions used in the train split
      – train_normals, train_anomalies: counts before SMOTE
      – train_resampled_normals, train_resampled_anomalies: counts after SMOTE
      – test_normals, test_anomalies: counts in the test set
      – f1_score: F1 on the test set
      – tn, fp, fn, tp: confusion matrix entries
      – confusion_matrix: the full confusion matrix object
    """
    split_cols = ['M01_pct', 'M02_pct', 'M03_pct']
    count_cols = [
        'train_normals', 'train_anomalies',
        'train_resampled_normals', 'train_resampled_anomalies',
        'test_normals', 'test_anomalies',
    ]
    metric_cols = ['f1_score', 'tn', 'fp', 'fn', 'tp', 'confusion_matrix']
    return pd.DataFrame(columns=split_cols + count_cols + metric_cols)

def record_result(
    df,
    m01_pct, m02_pct, m03_pct,
    trainy, trainy_resampled,
    testy,f1, confusion_matrix
):
    """
    Append one experiment's numbers to df as a new row.

    Args:
      df                    – the results DataFrame to append to (modified in place)
      m?_pct                – fraction of each machine in the train split
      trainy                – original train labels before SMOTE (0 = normal, 1 = anomaly)
      trainy_resampled      – train labels after SMOTE (same encoding)
      testy                 – ground‐truth labels for the test set (same encoding)
      f1                    – F1 score to store in the f1_score column
      confusion_matrix      – object with .ravel() yielding four values, which are
                              stored, in that order, as tn, fp, fn, tp; the object
                              itself goes into the confusion_matrix column

    Returns:
      The same DataFrame, with the new row written at position len(df).
    """
    def _tally(labels, target):
        # How many entries of `labels` compare equal to `target`
        return sum(1 for value in labels if value == target)

    tn, fp, fn, tp = confusion_matrix.ravel()

    # Normal (0) and anomaly (1) counts for train, resampled train, and test
    counts = [
        _tally(labels, target)
        for labels in (trainy, trainy_resampled, testy)
        for target in (0, 1)
    ]

    # Values are positional and must follow the column order of the results table
    new_row = [
        m01_pct, m02_pct, m03_pct,
        *counts,
        f1, tn, fp, fn, tp, confusion_matrix,
    ]
    df.loc[len(df)] = new_row
    return df

In [None]:
# Binary label encoding: 1 == bad | 0 == good
# Load every machine/process combination (no processes excluded).
# X: samples, y: full label strings, y_binary: 0/1 labels derived from y.
X, y, y_binary = load_data()

In [None]:
def train_rf_with_adoption(X_data=None, y_data=None, M01=0, M02=0, M03=0, verbose=False):
    """
    Train a Random Forest on a chosen share of each machine's data and evaluate it
    on everything that was not used for training.

    Each label string is split on "_" into six fields
    (machine, month, year, process, sample_id, status). For every machine the given
    share goes to the training set (stratified on status) and the remainder to the
    test set. The training features are oversampled with SMOTE before fitting.

    Args:
        X_data (list): Raw samples, aligned with y_data. Required.
        y_data (list): Full label strings, one per sample. Required.
        M01 (float): Share (0-1) of M01 data to include in training
        M02 (float): Share (0-1) of M02 data to include in training
        M03 (float): Share (0-1) of M03 data to include in training
        verbose (bool): Print split sizes, distributions and evaluation output.

    Returns:
        dict: 'model', 'classification_report' (dict form), 'confusion_matrix',
        'feature_importances', 'machine_adoption' and 'status_counts'.

    Raises:
        ValueError: if X_data / y_data are missing, if the shares leave the
            training set empty, or if no samples remain for testing.
    """
    if X_data is None or y_data is None:
        raise ValueError("X_data and y_data must be provided (e.g. from load_data()).")

    metadata_cols = ['machine', 'month', 'year', 'process', 'sample_id', 'status']

    # Create DataFrame and unpack the label string into its metadata fields
    df = pd.DataFrame({'data': X_data, 'label': y_data})
    df[metadata_cols] = df['label'].str.split('_', expand=True)

    train_dfs = []
    test_dfs = []

    # Process each machine separately
    for machine, percentage in [('M01', M01), ('M02', M02), ('M03', M03)]:
        machine_data = df[df['machine'] == machine]
        if len(machine_data) == 0:
            continue

        if percentage == 1:
            # Whole machine goes to training; nothing left for the test set
            train_dfs.append(machine_data)
        elif percentage > 0:
            # Stratify on status so the good/bad ratio is kept in both parts
            train_samples, test_samples = train_test_split(
                machine_data,
                train_size=percentage,
                stratify=machine_data['status'],
                random_state=42
            )
            train_dfs.append(train_samples)
            test_dfs.append(test_samples)
        else:
            # Share of 0: the machine is only used for testing
            test_dfs.append(machine_data)

    # Fail early with a clear message instead of crashing further down
    if not train_dfs:
        raise ValueError(
            "No training data selected: set at least one of M01, M02, M03 above 0 "
            "for a machine that has samples."
        )
    if not test_dfs:
        raise ValueError(
            "No test data left: at least one machine with samples needs a share below 1."
        )

    # Combine all machine splits
    train_df = pd.concat(train_dfs)
    test_df = pd.concat(test_dfs)

    if verbose:
        print(f"Training set size: {len(train_df)} samples")
        print(f"Test set size: {len(test_df)} samples")
        print("\nMachine distribution in training set:")
        print(train_df['machine'].value_counts())
        print("\nStatus distribution in training set:")
        print(train_df['status'].value_counts())
        print("\nMachine distribution in test set:")
        print(test_df['machine'].value_counts())
        print("\nStatus distribution in test set:")
        print(test_df['status'].value_counts())

    # Good/bad counts of both splits, reported in the result dictionary
    train_status_counts = train_df['status'].value_counts()
    test_status_counts = test_df['status'].value_counts()

    train_good = train_status_counts.get('good', 0)
    train_bad = train_status_counts.get('bad', 0)
    test_good = test_status_counts.get('good', 0)
    test_bad = test_status_counts.get('bad', 0)

    # Only the raw samples and full labels are passed on; the metadata columns
    # are simply not selected (no in-place drop needed).
    X_train = train_df['data'].tolist()
    y_train = train_df['label'].tolist()
    X_test = test_df['data'].tolist()
    y_test = test_df['label'].tolist()

    # transform data to features and transform labels to 0 and 1
    X_train_features, y_train_labels = transform_data(X_train, y_train, include_metadata=False)
    X_test_features, y_test_labels = transform_data(X_test, y_test, include_metadata=False)

    # SMOTE oversampling on the training data only; the test set stays untouched
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_features, y_train_labels)

    if verbose:
        print("X_train_resampled shape:", X_train_resampled.shape , "with smote")
        print("X_test_features shape:", X_test_features.shape , "without smote")

    # Train Random Forest classifier with optimized hyperparameters
    RF = RandomForestClassifier(max_features='log2',
                                n_estimators=150,
                                max_depth=15,
                                min_samples_leaf=1,
                                min_samples_split=2,
                                random_state=42)

    RF.fit(X_train_resampled, y_train_resampled)

    # Evaluate the model
    y_pred = RF.predict(X_test_features)
    cm = confusion_matrix(y_test_labels, y_pred)

    if verbose:
        print(classification_report(y_test_labels, y_pred))
        print("Confusion Matrix:")
        print(cm)

    # Feature importance
    # assumes the resampled features are a DataFrame (uses .columns) — TODO confirm
    feature_importances = pd.DataFrame(
        RF.feature_importances_,
        index=X_train_resampled.columns,
        columns=['importance']
    ).sort_values('importance', ascending=False)

    if verbose:
        print("Top 5 most important features:")
        print(feature_importances.head(5))

    report = classification_report(y_test_labels, y_pred, output_dict=True)

    # Create a dictionary with model and evaluation results
    result = {
        'model': RF,
        'classification_report': report,
        'confusion_matrix': cm,
        'feature_importances': feature_importances,
        'machine_adoption': {
            'M01': M01,
            'M02': M02,
            'M03': M03
        },
        'status_counts': {
            'train_good': train_good,
            'train_bad': train_bad,
            'test_good': test_good,
            'test_bad': test_bad
        }
    }

    # Store results in DataFrame
    # NOTE(review): store_results is not defined in the visible part of this
    # notebook; confirm it exists before relying on this function.
    store_results(result, M01, M02, M03, y_train_labels, y_test_labels)

    return result

In [None]:
# trainX, testX, trainy, testy = train_test_split(X, y_binary, test_size=0.2, random_state=42, stratify=y_binary)

# trainX = [x_i for x_i, y_i in zip(X, y) 
#          if y_i.split('_')[0] == 'M01']

# # If you also want the corresponding filtered labels:
# trainy = [y_i for y_i in y 
#          if y_i.split('_')[0] == 'M01']

# trainy = [0 if label.split('_')[-1] == 'good' else 1 for label in trainy]

# testX = [x_i for x_i, y_i in zip(X, y) 
#          if y_i.split('_')[0] != 'M01']

# # If you also want the corresponding filtered labels:
# testy = [y_i for y_i in y 
#          if y_i.split('_')[0] != 'M01']

# testy = [0 if label.split('_')[-1] == 'good' else 1 for label in testy]


# print(f"Train set size: {len(trainX)} samples")
# print(f"Test set size: {len(testX)} samples")


In [None]:
# Start with an empty results table; each experiment run appends one row to it.
# Re-running this cell discards previously recorded rows.
result_df = create_results_df()

In [None]:
# Define the fraction of each machine to go into the TRAIN set
machine_train_frac = {
    'M01': 1.00,   # 100%
    'M02': 0.00,   # 0%
    'M03': 0.00    # 0%
}

trainX, trainy, testX, testy = [], [], [], []

for machine, frac in machine_train_frac.items():
    # Keep only this machine's samples; the machine id is the first "_" field of the label
    data = [(x_i, y_i)
            for x_i, y_i in zip(X, y)
            if y_i.split('_')[0] == machine]
    X_m = [d[0] for d in data]
    # Binary label from the last "_" field: 0 == good, 1 == bad
    y_m = [0 if d[1].split('_')[-1] == 'good' else 1 for d in data]

    # if frac is 0 or 1, no split needed
    if frac == 1.0:
        trainX += X_m
        trainy += y_m
    elif frac == 0.0:
        testX += X_m
        testy += y_m
    else:
        # do a stratified split to keep minority ratio
        X_tr, X_te, y_tr, y_te = train_test_split(
            X_m, y_m,
            train_size=frac,
            test_size=1 - frac,
            stratify=y_m,
            random_state=42
        )
        trainX += X_tr
        trainy += y_tr
        testX += X_te
        testy += y_te

# check minority class distribution (mean of 0/1 labels == share of "bad" samples)
print(f"Train size: {len(trainX)} samples")
print(f" Train minority ratio: {np.mean(trainy):.3f}")
print(f"Test size:  {len(testX)} samples")
print(f" Test minority ratio: {np.mean(testy):.3f}")

trainX_tr, trainy_tr = transform_data(trainX,trainy, label_type='binary')
testX_tr, testy_tr = transform_data(testX, testy, label_type='binary')

# SMOTE oversampling on training data.
# The One-Class SVM below is fitted on the un-resampled normals; the resampled
# labels are only used for the counts stored by record_result.
smote = SMOTE(random_state=42)
trainX_tr_resampled, trainy_tr_resampled = smote.fit_resample(trainX_tr, trainy_tr)

# Re-encode the test labels to the outlier-detector convention
# (anomaly 1 -> -1, normal 0 -> +1) without modifying testy_tr in place.
test_y = np.where(np.asarray(testy_tr) == 1, -1, 1)

model_X = trainX_tr
model_y = trainy_tr


# define outlier detection model
model = OneClassSVM(gamma='scale', nu=0.01)
# fit on majority class
trainX_tr_zero = model_X[model_y==0]
model.fit(trainX_tr_zero)
# detect outliers in the test set: inliers are marked 1, outliers -1
yhat = model.predict(testX_tr)

# calculate score (the anomaly class, -1, is the positive class)
score = f1_score(test_y, yhat, pos_label=-1, average='binary')
print('F1 Score: %.3f' % score)

# Order the labels as [normal, anomaly] so that cm.ravel() yields tn, fp, fn, tp
# with anomaly as the positive class, which is the order record_result unpacks.
# Fixing the labels also keeps the matrix 2x2 when a class is absent.
cm = confusion_matrix(test_y, yhat, labels=[1, -1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Normal', 'Anomaly'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

# Pass the resampled LABELS (not the resampled feature matrix) for the post-SMOTE counts
record_result(result_df, machine_train_frac['M01'], machine_train_frac['M02'], machine_train_frac['M03'], trainy, trainy_tr_resampled, testy, score, cm)

In [None]:
# Display the results table accumulated so far.
result_df