# Experiment 1: Problem of unbalanced data

### One class classification vs Binary classification

In [None]:
# Import dependencies
import os
import sys
from pathlib import Path

try:
    # Get the current working directory
    current_dir = os.getcwd()

    # Set the root directory to the parent of the current directory
    root_dir = os.path.dirname(os.path.dirname(current_dir))

    # Add the root directory to sys.path so Python can find the utils module
    sys.path.append(str(root_dir))

    # Standard libraries
    import numpy as np
    import pandas as pd
    from IPython.display import display
    import itertools
    import h5py

    # Data processing and visualization
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import signal, stats
    import pywt
    from tqdm import tqdm

    # Machine learning
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import OneClassSVM
    from sklearn.ensemble import IsolationForest,RandomForestClassifier

    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler

    # Custom utilities
    from utils.feature_extraction import transform_data
    from utils.load_data import load_data
    from utils.result_utils import create_results_df, record_result


    from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report,accuracy_score,precision_score,recall_score
    import matplotlib.pyplot as plt


    print("Dependencies loaded successfully ✅")
except Exception as e:
    print(f"Error loading dependencies: {e}")

In [None]:
# Binary label encoding used throughout this notebook: 0 == good | 1 == bad
# load_data() is a project utility. Below, X is used as the samples, y as
# string labels (machine id first, 'good'/other last when split on '_') and
# y_binary as the stratification target.
X, y, y_binary = load_data()

# Results table that run_experiment() passes to record_result().
result_df_RF = create_results_df()

In [None]:
def run_experiment(m01, m02, m03, show_confusion_matrix=False):
    '''Train and score a Random Forest using a chosen share of each machine's data.

    The notebook-level ``X`` / ``y`` are split 80/20 (stratified on
    ``y_binary``). From the 80 % part, the fractions ``m01`` / ``m02`` /
    ``m03`` of the samples whose label starts with ``M01`` / ``M02`` / ``M03``
    are used for training. The training data goes through ``transform_data``,
    random under-sampling (``sampling_strategy=0.25``) and SMOTE before a
    ``RandomForestClassifier`` is fitted. The model is scored on the 20 %
    hold-out with the binary F1 score (positive class 1 == bad) and the
    outcome is handed to ``record_result`` together with ``result_df_RF``.

    Args:
        m01, m02, m03: Fraction of the machine's training-split samples to
            train on. ``1.0`` uses all of them, ``0.0`` none, anything in
            between is drawn with a stratified ``train_test_split``.
        show_confusion_matrix: When true, return a ``ConfusionMatrixDisplay``
            for the hold-out set; otherwise the function returns ``None``.

    Note:
        ``testy`` collects the labels of the training-split samples that were
        *left out* of training. The model is not evaluated on those samples
        (evaluation uses ``Xtest`` / ``ytest``); ``testy`` is only passed on
        to ``record_result``.
    '''
    Xtrain, Xtest, ytrain, ytest = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y_binary
    )

    machine_train_frac = {'M01': m01, 'M02': m02, 'M03': m03}

    # Group the training split by machine in a single pass instead of
    # re-scanning (and re-splitting every label of) the whole split once per
    # machine. Sample order within a machine is preserved.
    per_machine = {machine: ([], []) for machine in machine_train_frac}
    for x_i, label in zip(Xtrain, ytrain):
        parts = label.split('_')
        bucket = per_machine.get(parts[0])
        if bucket is None:
            continue  # label belongs to a machine that is not part of the experiment
        bucket[0].append(x_i)
        bucket[1].append(0 if parts[-1] == 'good' else 1)

    trainX, trainy, testy = [], [], []
    for machine, frac in machine_train_frac.items():
        X_m, y_m = per_machine[machine]
        if frac == 1.0:
            trainX.extend(X_m)
            trainy.extend(y_m)
        elif frac == 0.0:
            testy.extend(y_m)
        else:
            X_tr, _, y_tr, y_te = train_test_split(
                X_m, y_m, train_size=frac, stratify=y_m, random_state=42
            )
            trainX.extend(X_tr)
            trainy.extend(y_tr)
            testy.extend(y_te)

    # transform and resample
    trainX_tr, trainy_tr = transform_data(trainX, trainy)
    # ytest still holds the raw string labels, hence label_type='string'
    # (trainy was already converted to 0/1 above).
    testX_tr, testy_tr = transform_data(Xtest, ytest, label_type='string')

    # First under-sample the majority class, then let SMOTE synthesise
    # minority samples on the reduced set.
    rus = RandomUnderSampler(sampling_strategy=0.25, random_state=42)
    trainX_tr, trainy_tr = rus.fit_resample(trainX_tr, trainy_tr)

    smote = SMOTE(random_state=42)
    trainX_tr_resampled, trainy_tr_resampled = smote.fit_resample(trainX_tr, trainy_tr)

    RF = RandomForestClassifier(max_features='log2',
                                n_estimators=150,
                                max_depth=15,
                                min_samples_leaf=1,
                                min_samples_split=2,
                                random_state=42,
                                n_jobs=-1)

    RF.fit(trainX_tr_resampled, trainy_tr_resampled)

    yhat = RF.predict(testX_tr)
    score = f1_score(testy_tr, yhat, pos_label=1, average='binary')
    cm = confusion_matrix(testy_tr, yhat)

    record_result(result_df_RF, m01, m02, m03, trainy, trainy_tr_resampled, testy, score, cm)

    if show_confusion_matrix:
        return ConfusionMatrixDisplay.from_estimator(RF, testX_tr, testy_tr)

In [None]:
import numpy as np
import pandas as pd
from itertools import permutations
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

In [None]:
# ------------------------------------------------------------------
# 1. ONE FIXED PIPELINE – freeze the hyper-parameters
# ------------------------------------------------------------------
def build_fixed_pipeline(best_params, random_state=42):
    """Build a RUS -> SMOTE -> RandomForest pipeline with frozen parameters.

    Args:
        best_params: Mapping with the keys ``"rus__sampling_strategy"`` and
            ``"smote__sampling_strategy"`` plus any number of ``"rf__<name>"``
            entries, which are forwarded to ``RandomForestClassifier`` as
            ``<name>=value``.
        random_state: Seed shared by all three pipeline steps.

    Returns:
        An unfitted ``Pipeline`` with the steps ``"rus"``, ``"smote"`` and
        ``"rf"``.

    Raises:
        KeyError: If one of the two sampling-strategy keys is missing.
    """
    rus = RandomUnderSampler(
        sampling_strategy=best_params["rus__sampling_strategy"],
        random_state=random_state
    )
    sm = SMOTE(
        sampling_strategy=best_params["smote__sampling_strategy"],
        random_state=random_state
    )
    # Strip only the leading "rf__" prefix. Slicing (rather than
    # split("rf__")[1]) keeps the remainder intact even if the parameter name
    # itself happens to contain "rf__" again.
    rf_prefix = "rf__"
    rf_params = {
        name[len(rf_prefix):]: value
        for name, value in best_params.items()
        if name.startswith(rf_prefix)
    }
    rf = RandomForestClassifier(random_state=random_state, **rf_params)
    return Pipeline([("rus", rus), ("smote", sm), ("rf", rf)])


# ------------------------------------------------------------------
# 2. HELPER – assemble the training set for a single k value
# ------------------------------------------------------------------
def assemble_train_set(X_src, y_src,
                       X_tgt, y_tgt,
                       k, rnd, op_ids_src=None, op_ids_tgt=None):
    """Augment the source set with k 'bad' target samples plus 'good' ones.

    ``k`` bad samples (label 1) are drawn without replacement from the target
    data. The accompanying good samples (label 0) are chosen as follows:

    * ``op_ids_tgt`` given: every good target sample whose operation id
      matches the operation id of one of the selected bad samples.
    * otherwise: ``2 * k`` good target samples drawn at random without
      replacement (none when ``k == 0``).

    Args:
        X_src, y_src: Source-machine samples and 0/1 labels.
        X_tgt, y_tgt: Target-machine samples and 0/1 labels. Anything
            ``np.asarray`` accepts (lists, arrays, pandas objects) works;
            rows are always selected by position.
        k: Number of bad target samples to add.
        rnd: Seed for ``np.random.default_rng``.
        op_ids_src: Unused; kept for interface compatibility.
        op_ids_tgt: Optional operation id per target sample, aligned with
            ``y_tgt``.

    Returns:
        ``(X_aug, y_aug)``: source rows followed by the selected bad and good
        target rows.

    Raises:
        ValueError: If ``k`` exceeds the number of bad target samples, if the
            random fallback needs more good samples than exist, or if
            ``op_ids_tgt`` is not the same length as ``y_tgt``.
    """
    # Positional indexing below needs real arrays: `list == 1` is a plain
    # False and DataFrame[...] with an index array selects columns, not rows.
    X_tgt = np.asarray(X_tgt)
    y_tgt = np.asarray(y_tgt)

    # -- split target into bad / good indices ----------------------
    idx_bad = np.where(y_tgt == 1)[0]
    idx_good = np.where(y_tgt == 0)[0]

    # -- random draws for this repetition --------------------------
    rng = np.random.default_rng(rnd)
    if k > len(idx_bad):
        raise ValueError(f"k = {k} groter dan #available bad samples ({len(idx_bad)})")
    sel_bad = rng.choice(idx_bad, size=k, replace=False)

    if op_ids_tgt is not None:
        # take all good samples that share an operation with a chosen bad one
        op_ids_tgt = np.asarray(op_ids_tgt)
        if len(op_ids_tgt) != len(y_tgt):
            raise ValueError(
                f"op_ids_tgt has {len(op_ids_tgt)} entries but y_tgt has "
                f"{len(y_tgt)}; they must describe the same samples"
            )
        ops_chosen = np.unique(op_ids_tgt[sel_bad])
        sel_good = np.where(np.isin(op_ids_tgt, ops_chosen) & (y_tgt == 0))[0]
    elif k:
        # fallback: draw twice as many good samples as bad ones
        n_good = k * 2
        if n_good > len(idx_good):
            raise ValueError(
                f"need {n_good} good samples for k = {k} but only "
                f"{len(idx_good)} are available"
            )
        sel_good = rng.choice(idx_good, size=n_good, replace=False)
    else:
        sel_good = np.empty(0, dtype=np.intp)

    X_aug = np.concatenate([X_src, X_tgt[sel_bad], X_tgt[sel_good]])
    y_aug = np.concatenate([y_src, y_tgt[sel_bad], y_tgt[sel_good]])
    return X_aug, y_aug


# ------------------------------------------------------------------
# 3. MAIN LOOP – a single bootstrap repetition
# ------------------------------------------------------------------
def single_run(src, tgt,
               X_src, y_src, X_tgt_pool, y_tgt_pool, X_test, y_test,
               k, best_params, rep, op_ids=None, random_state=42):
    """Fit one repetition and return its macro F1 on the test set.

    The seed ``random_state + rep`` drives both the pipeline and the sample
    selection, so every repetition gets its own draw. ``src`` is not used in
    the body; ``tgt`` is only used to look up ``op_ids[tgt]`` when ``op_ids``
    is given.
    """
    seed = random_state + rep  # a different seed for each repetition
    model = build_fixed_pipeline(best_params, random_state=seed)

    target_ops = op_ids[tgt] if op_ids is not None else None
    X_fit, y_fit = assemble_train_set(
        X_src, y_src, X_tgt_pool, y_tgt_pool, k, seed,
        op_ids_src=None,
        op_ids_tgt=target_ops,
    )

    model.fit(X_fit, y_fit)
    return f1_score(y_test, model.predict(X_test), average="macro")


# ------------------------------------------------------------------
# 4. EXPERIMENT – all pairs, all k values, 20×
# ------------------------------------------------------------------
def run_cross_machine_experiment(machine_data,
                                 best_params,
                                 k_grid=(0,2,4,6,8,10,15,20),
                                 n_reps=20,
                                 test_size=0.3,
                                 n_jobs=-1,
                                 random_state=42,
                                 op_ids=None):
    """Run the sample-efficiency experiment for every (source -> target) pair.

    For each ordered pair of machines the target data is split once into a
    pool and a fixed test set. A baseline pipeline is trained on the target
    pool only; its macro F1 times 0.95 is the target score. Then, for every
    ``k`` in ``k_grid``, ``n_reps`` repetitions of ``single_run`` are executed
    in parallel and their median F1 is recorded.

    Args:
        machine_data: Mapping ``machine -> (X, y)`` with 0/1 labels.
        best_params: Parameters for ``build_fixed_pipeline``.
        k_grid: Numbers of bad target samples to add to the source data.
        n_reps: Repetitions per ``k``.
        test_size: Share of the target data held out as test set.
        n_jobs: Passed to ``joblib.Parallel``.
        random_state: Seed for the split, the baseline and the repetitions.
        op_ids: Optional mapping ``machine -> per-sample operation ids``,
            aligned with that machine's ``y``.

    Returns:
        ``(df, summary_df)``: ``df`` holds one row per (source, target, k);
        ``summary_df`` holds one row per (source, target) with the smallest
        ``k`` ("n_star") whose median F1 reaches the target score, or NaN
        when none does. Both frames are empty (columns only) when there was
        nothing to run.
    """
    record_columns = ["source", "target", "k", "median_F1",
                      "F1_target", "meets_95pct", "tot_bad_target"]
    summary_columns = ["source", "target", "F1_baseline",
                       "n_star", "pct_bad", "median_F1_at_n_star"]

    records = []
    baselines = {}  # (src, tgt) -> baseline F1, kept exact for the summary
    # ----------------------------------------------------------------
    for src, tgt in permutations(machine_data.keys(), 2):
        X_src, y_src = machine_data[src]
        X_tgt, y_tgt = machine_data[tgt]
        # asarray so that list labels are compared element-wise
        tot_bad_target = int(np.sum(np.asarray(y_tgt) == 1))

        # one fixed test set per (src -> tgt)
        if op_ids is None:
            X_tgt_pool, X_test, y_tgt_pool, y_test = train_test_split(
                X_tgt, y_tgt, test_size=test_size, stratify=y_tgt,
                random_state=random_state
            )
            op_ids_run = None
        else:
            # Split the operation ids together with X / y so they stay
            # aligned with the pool that single_run samples from.
            (X_tgt_pool, X_test, y_tgt_pool, y_test,
             ops_pool, _ops_test) = train_test_split(
                X_tgt, y_tgt, np.asarray(op_ids[tgt]),
                test_size=test_size, stratify=y_tgt,
                random_state=random_state
            )
            op_ids_run = {tgt: ops_pool}

        # baseline F1 (without cross-machine samples) ----------------
        baseline_pipe = build_fixed_pipeline(best_params, random_state)
        baseline_pipe.fit(X_tgt_pool, y_tgt_pool)
        baseline_pred = baseline_pipe.predict(X_test)
        F1_baseline = f1_score(y_test, baseline_pred, average="macro")
        F1_target = 0.95 * F1_baseline
        baselines[(src, tgt)] = F1_baseline

        # all k values (Parallel for speed) --------------------------
        for k in k_grid:
            f1_vals = Parallel(n_jobs=n_jobs)(
                delayed(single_run)(
                    src, tgt,
                    X_src, y_src, X_tgt_pool, y_tgt_pool,
                    X_test, y_test,
                    k, best_params, rep,
                    op_ids_run, random_state
                )
                for rep in range(n_reps)
            )
            median_f1 = float(np.median(f1_vals))
            records.append({
                "source": src, "target": tgt,
                "k": k, "median_F1": median_f1,
                "F1_target": F1_target,
                "meets_95pct": median_f1 >= F1_target,
                "tot_bad_target": tot_bad_target
            })

    if not records:
        # Fewer than two machines or an empty k_grid: grouping an empty,
        # column-less frame would raise KeyError, so return empty frames.
        return (pd.DataFrame(columns=record_columns),
                pd.DataFrame(columns=summary_columns))

    df = pd.DataFrame.from_records(records)

    # compute n_star --------------------------------------------------
    summary = []
    for (src, tgt), grp in df.groupby(["source", "target"]):
        ok = grp[grp["meets_95pct"]].sort_values("k")
        if ok.empty:
            n_star = pct_star = f1_at_star = np.nan
        else:
            n_star = int(ok["k"].iloc[0])
            tot_bad = grp["tot_bad_target"].iloc[0]
            pct_star = 100 * n_star / tot_bad if tot_bad else np.nan
            f1_at_star = ok["median_F1"].iloc[0]
        summary.append({
            "source": src, "target": tgt,
            "F1_baseline": baselines[(src, tgt)],
            "n_star": n_star,
            "pct_bad": pct_star,
            "median_F1_at_n_star": f1_at_star
        })
    summary_df = pd.DataFrame(summary)
    return df, summary_df


# ------------------------------------------------------------------
# 5. EXAMPLE OF HOW TO CALL IT --------------------------------------
if __name__ == "__main__":
    # Example driver for the cross-machine experiment. The body uses one
    # consistent 4-space indent; mixing indentation levels inside this block
    # is an IndentationError.

    # (1) load or prepare your data here
    # NOTE: X_M01 ... y_M03 are built in cells further down in this notebook;
    # those cells have to be run before this one.
    machine_data = {
        "M01": (X_M01, y_M01),
        "M02": (X_M02, y_M02),
        "M03": (X_M03, y_M03),
    }
    # (2) paste your best_params here, e.g.:
    best_params = {
        'rus__sampling_strategy': 0.2,
        'smote__sampling_strategy': 0.5,
        'rf__n_estimators': 200,
        'rf__max_depth': 15,
        'rf__min_samples_split': 2,
        'rf__min_samples_leaf': 1,
        'rf__max_features': 'sqrt'
    }

    df_all, df_summary = run_cross_machine_experiment(
        machine_data, best_params,
        k_grid=(0,2,4,6,8,10,15,20),
        n_reps=20, n_jobs=-1, random_state=42
    )

    print("\n=== Sample-efficiency resultaten ===")
    print(df_summary.to_string(index=False))
    df_all.to_csv("exp2_full_results.csv", index=False)


In [None]:
# Load the dataset again (same call as in the loading cell near the top).
X, y, y_binary = load_data()




In [None]:
# Display the first label (notebook cells echo their last expression).
y[0]

In [None]:
# Collect samples and binary labels (0 = good, 1 = bad) per machine.
# Labels are split on "_": the first part is the machine id, the last part
# is compared against "good".
_per_machine = {'M01': ([], []), 'M02': ([], []), 'M03': ([], [])}

for i in range(len(y)):
    label_parts = y[i].split("_")
    machine = label_parts[0]
    # NB: despite the name, this is the binary label (0 = good, 1 = bad).
    is_good = 0 if label_parts[-1] == "good" else 1

    if machine in _per_machine:
        samples, labels = _per_machine[machine]
        samples.append(X[i])
        labels.append(is_good)

# Convert to numpy arrays
X_M01, y_M01 = (np.array(part) for part in _per_machine['M01'])
X_M02, y_M02 = (np.array(part) for part in _per_machine['M02'])
X_M03, y_M03 = (np.array(part) for part in _per_machine['M03'])


In [None]:
def _select_machine(machine_id):
    """Return (features, binary labels) for the rows of one machine.

    Features are the rows of X whose 'machine' column equals machine_id, with
    that column dropped. Labels are 0 when the string label ends in "good"
    (after splitting on "_"), otherwise 1.
    """
    mask = X['machine'] == machine_id
    features = X[mask].drop('machine', axis=1)
    labels = np.array(
        [0 if label_str.split("_")[-1] == "good" else 1 for label_str in y[mask]]
    )
    return features, labels


# Filter data by machine and create binary labels (0 for good, 1 for anomaly)
X_M01, y_M01 = _select_machine('M01')
X_M02, y_M02 = _select_machine('M02')
X_M03, y_M03 = _select_machine('M03')

machine_data = {
    "M01": (X_M01, y_M01),
    "M02": (X_M02, y_M02),
    "M03": (X_M03, y_M03),
}


In [None]:
# Feature importances of the Random Forest step ('rf') of the best model.
# NOTE(review): best_model and a module-level trainX_tr are not defined in the
# cells shown here — presumably they come from a grid-search cell; confirm
# before running.
feature_importances = best_model.named_steps['rf'].feature_importances_
feature_names = trainX_tr.columns

# One row per feature, then keep the 20 most important ones.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
)
top_features = importance_df.sort_values('Importance', ascending=False).head(20)
print("Top 20 features selected by the model:")
display(top_features)

# Horizontal bar chart of the first ten rows of the top-20 table.
top_ten = top_features.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_ten['Feature'], top_ten['Importance'])
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Top 10 Most Important Features')
fig.tight_layout()
plt.show()

### Experiment 1: Results

In [None]:
import pickle 
import os

# Path to the models folder: two directory levels above the current working
# directory, then 'models'.
models_path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'models')

# Ensure the models directory exists
os.makedirs(models_path, exist_ok=True)

# Save best model in the models folder. The context manager guarantees the
# file is flushed and closed before it is read back below.
model_path = os.path.join(models_path, 'EXP1_BINARY_MODEL.pkl')
with open(model_path, 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Read the model back as a round-trip check.
# NOTE: pickle.load executes arbitrary code from the file — only load pickles
# written by a trusted source such as the dump above.
with open(model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
