In [None]:
# Import dependencies
import os
import sys
from pathlib import Path

try:
    # The notebook is run from a sub-folder of the project; its parent
    # directory is treated as the project root.
    current_dir = os.getcwd()
    root_dir = Path(current_dir).parent

    # Put the root on sys.path so Python can find the local `utils` package.
    # The membership check keeps re-runs of this cell from stacking duplicates.
    if str(root_dir) not in sys.path:
        sys.path.append(str(root_dir))
        print(f"Added {root_dir} to Python path")

    # Standard library
    import itertools

    # Numerical / data handling
    import numpy as np
    import pandas as pd
    import h5py

    # Data processing and visualization
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import signal, stats
    import pywt
    from tqdm import tqdm

    # Machine learning
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler

    # Custom utilities (resolved through the sys.path entry added above)
    from utils import data_loader_utils
    from utils.feature_extraction import transform_data
    from utils.load_data import load_data
    from utils.result_utils import create_results_df, record_result

    print("Dependencies loaded successfully ✅")
except Exception as e:
    print(f"Error loading dependencies: {e}")
    # Re-raise so "Run All" stops here with the real traceback instead of
    # failing in a later cell with an unrelated NameError.
    raise


In [None]:
# Binary label encoding: 0 == good | 1 == bad (presumably describes y_binary; confirm in load_data).
# y is parsed further down as '_'-separated strings: run_experiment reads the
# first token as the machine id and checks whether the last token is 'good'.
X, y, y_binary = load_data()

In [None]:
# Results table shared by every experiment below; each experiment passes it to
# record_result together with its score and confusion matrix.
result_df_RF = create_results_df()

In [None]:
# Baseline experiment: stratified random 80/20 split on the binary labels,
# with no per-machine control over what goes into training.
trainX, testX, trainy, testy = train_test_split(
    X,
    y_binary,
    test_size=0.2,
    stratify=y_binary,
    random_state=42,
)

print(f"Train set size: {len(trainX)} samples")
print(f"Test set size: {len(testX)} samples")

# Feature extraction for both splits; only the training side is resampled.
trainX_tr, trainy_tr = transform_data(trainX, trainy, label_type='binary')
testX_tr, testy_tr = transform_data(testX, testy, label_type='binary')

# Resampling: random undersampling first, then SMOTE on the reduced set.
# NOTE(review): this cell undersamples with ratio 0.5 while run_experiment uses
# 0.25 -- confirm the difference is intentional.
rus = RandomUnderSampler(random_state=42, sampling_strategy=0.5)
smote = SMOTE(random_state=42, k_neighbors=5)
trainX_tr, trainy_tr = rus.fit_resample(trainX_tr, trainy_tr)
trainX_tr_resampled, trainy_tr_resampled = smote.fit_resample(trainX_tr, trainy_tr)

# Random Forest with the tuned hyperparameters.
RF = RandomForestClassifier(
    n_estimators=150,
    max_depth=15,
    max_features='log2',
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
RF.fit(trainX_tr_resampled, trainy_tr_resampled)

# Score on the untouched test split; class 1 is the positive class for F1.
yhat = RF.predict(testX_tr)
score = f1_score(testy_tr, yhat, average='binary', pos_label=1)
# Flattened confusion matrix (for 0/1 labels the order is tn, fp, fn, tp).
cm = confusion_matrix(testy_tr, yhat).ravel()

# Record the baseline row; the three zeros fill the per-machine fraction slots.
record_result(
    result_df_RF,
    0,
    0,
    0,
    trainy,
    trainy_tr_resampled,
    testy,
    score,
    cm,
    experiment_id='exp0_no_machine_adoption_random_split',
)


In [None]:
def run_experiment(m01, m02, m03, show_confusion_matrix=False):
    '''Train and score a Random Forest for one per-machine training mix.

    A fixed stratified 80/20 split of the notebook-level ``X`` / ``y`` is made
    first. From the 80% pool, each machine (the first ``_``-separated token of
    a sample's label) contributes the requested share of its samples to the
    training set. The training set is transformed, undersampled (ratio 0.25)
    and SMOTE-oversampled; the model is scored on the 20% hold-out and one row
    is recorded into the notebook-level ``result_df_RF`` via ``record_result``.

    Args:
        m01: Share of the M01 pool used for training.
        m02: Share of the M02 pool used for training.
        m03: Share of the M03 pool used for training.
            For all three, ``1.0`` uses every sample, ``0.0`` uses none, and
            any other value is passed to ``train_test_split`` as ``train_size``.
        show_confusion_matrix: When true, plot and return the confusion matrix
            of the hold-out predictions.

    Returns:
        A ``ConfusionMatrixDisplay`` when ``show_confusion_matrix`` is true,
        otherwise ``None``.
    '''
    # Same seed and stratification for every call, so all experiments are
    # scored on the same hold-out set.
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y_binary)

    machine_train_frac = {'M01': m01, 'M02': m02, 'M03': m03}

    # Bucket the training pool by machine in a single pass (instead of
    # re-scanning the whole pool twice per machine). Labels are '_'-separated:
    # first token = machine id, last token 'good' -> 0, anything else -> 1.
    per_machine = {machine: ([], []) for machine in machine_train_frac}
    for x_i, y_i in zip(Xtrain, ytrain):
        tokens = y_i.split('_')
        bucket = per_machine.get(tokens[0])
        if bucket is None:
            # Sample belongs to a machine that is not part of the experiment.
            continue
        bucket[0].append(x_i)
        bucket[1].append(0 if tokens[-1] == 'good' else 1)

    trainX, trainy, testX, testy = [], [], [], []
    for machine, frac in machine_train_frac.items():
        X_m, y_m = per_machine[machine]
        if frac == 1.0:
            # train_test_split rejects a float train_size of 1.0, so take all.
            trainX.extend(X_m)
            trainy.extend(y_m)
        elif frac == 0.0:
            # Likewise for 0.0: nothing from this machine goes into training.
            testX.extend(X_m)
            testy.extend(y_m)
        else:
            X_tr, X_te, y_tr, y_te = train_test_split(
                X_m, y_m, train_size=frac, stratify=y_m, random_state=42
            )
            trainX.extend(X_tr)
            trainy.extend(y_tr)
            testX.extend(X_te)
            testy.extend(y_te)

    # Transform features; evaluation uses the fixed hold-out (Xtest / ytest),
    # not the samples left over from the per-machine selection.
    trainX_tr, trainy_tr = transform_data(trainX, trainy)
    testX_tr, testy_tr = transform_data(Xtest, ytest, label_type='string')

    # Resample the training set only: undersample first, then SMOTE.
    rus = RandomUnderSampler(sampling_strategy=0.25, random_state=42)
    trainX_tr, trainy_tr = rus.fit_resample(trainX_tr, trainy_tr)

    smote = SMOTE(random_state=42)
    trainX_tr_resampled, trainy_tr_resampled = smote.fit_resample(trainX_tr, trainy_tr)

    RF = RandomForestClassifier(max_features='log2',
                                n_estimators=150,
                                max_depth=15,
                                min_samples_leaf=1,
                                min_samples_split=2,
                                random_state=42,
                                n_jobs=-1)

    RF.fit(trainX_tr_resampled, trainy_tr_resampled)

    yhat = RF.predict(testX_tr)
    score = f1_score(testy_tr, yhat, pos_label=1, average='binary')
    # NOTE(review): the baseline cell passes a raveled (1-D) confusion matrix,
    # this one passes the 2-D matrix -- confirm record_result accepts both.
    cm = confusion_matrix(testy_tr, yhat)

    # NOTE(review): `testy` holds the labels of the training-pool samples that
    # were NOT selected, not the labels the score was computed on (testy_tr).
    # Confirm this is what record_result is meant to receive.
    record_result(result_df_RF, m01, m02, m03, trainy, trainy_tr_resampled, testy, score, cm)

    if show_confusion_matrix:
        return ConfusionMatrixDisplay.from_estimator(RF, testX_tr, testy_tr)

# Sweep plan. Each entry is:
#   (experiment_id, progress-message template, fractions to sweep,
#    function mapping a fraction to the (m01, m02, m03) arguments)
# M01 is always used in full; the sweeps differ in which other machine(s)
# gradually enter the training set.
sweeps = (
    # experiment 1: vary M02 (starts at 0, i.e. M01-only training)
    (
        'exp1_vary_M02',
        "Finished running experiment 1 (vary M02) with M02={:.2f}",
        np.arange(0, 0.51, 0.05),
        lambda share: (1.0, share, 0.0),
    ),
    # experiment 2: vary M03
    (
        'exp2_vary_M03',
        "Finished running experiment 2 (vary M03) with M03={:.2f}",
        np.arange(0.05, 0.51, 0.05),
        lambda share: (1.0, 0.0, share),
    ),
    # experiment 3: vary M02 and M03 together
    (
        'exp3_vary_both',
        "Finished running experiment 3 (vary both) with frac={:.2f}",
        np.arange(0.05, 0.51, 0.05),
        lambda share: (1.0, share, share),
    ),
)

for exp_id, message, fractions, to_args in sweeps:
    for value in fractions:
        run_experiment(*to_args(value))
        print(message.format(value))
        # run_experiment has just recorded a row; tag that last row.
        result_df_RF.loc[result_df_RF.index[-1], 'experiment_id'] = exp_id

# Display the compiled results
result_df_RF

In [None]:
# Export the results table to <parent of the working directory>/export/results/.
path = os.path.join(os.path.dirname(os.getcwd()), 'export')
results_dir = Path(path) / 'results'
# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists before writing (no-op when it is already there).
results_dir.mkdir(parents=True, exist_ok=True)
result_df_RF.to_csv(results_dir / 'result_df_RF_0.25UND_SMOTE.csv', index=False)

In [None]:
# Show the compiled results table (the same frame that is written to CSV).
result_df_RF

In [None]:
# Run the M01=1.0 / M02=0.5 / M03=0.0 configuration and plot its confusion matrix.
# Note: run_experiment calls record_result on every invocation, so this also
# adds one more row to result_df_RF, and nothing here assigns it an experiment_id.
run_experiment(1.0, 0.5, 0.0,show_confusion_matrix=True)