## Experiment 2: Cross-Machine Sample-Efficiency

In [None]:
# Import dependencies
import os
import sys
from pathlib import Path
import random
import json
from collections import defaultdict
from typing import Dict, List, Sequence, Tuple

# Resolve the directory the notebook is executed from (the current working directory)
current_dir = Path().resolve()

# The project root is assumed to be two levels above the notebook directory
project_root = current_dir.parent.parent

# Make the project root importable so Python can find the utils module.
# The membership check keeps sys.path free of duplicates when this cell is re-run.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# Custom utilities
from utils.load_data import load_data
from utils.feature_extraction import transform_data

print("Dependencies loaded successfully ✅")

# --------------------------------------------------
# Experiment configuration
# --------------------------------------------------

# Seed shared by the samplers, the forest and the index sampling
RANDOM_STATE: int = 42
# Forwarded to RandomForestClassifier(n_jobs=...)
N_JOBS: int = -1

# Numbers of target-machine "bad" samples added to the training set
K_GRID: Sequence[int] = (0, 2, 4, 6, 8, 10, 15, 20)
# Repetitions per value of k
N_REPS: int = 20
# fraction of *target* kept for final test
TEST_SIZE: float = 0.3

# Hyper-parameters fixed from Experiment 1 (best RF); the prefix names the
# pipeline step each value belongs to (rus / smote / rf)
BEST_PARAMS: Dict[str, object] = dict(
    rus__sampling_strategy=0.2,
    smote__sampling_strategy=0.5,
    rf__n_estimators=200,
    rf__max_depth=15,
    rf__min_samples_split=2,
    rf__min_samples_leaf=1,
    rf__max_features="sqrt",
)

# --------------------------------------------------
# Helper functions
# --------------------------------------------------

def _parse_label(label: str) -> Tuple[str, str, str]:
    """Split a full label string into its (machine, process, status) fields.

    The label is split on underscores: the machine is the first field, the
    process the fourth field and the status the last field. A label with
    fewer than four fields raises ``IndexError``.
    """
    fields = label.split("_")
    machine = fields[0]
    process = fields[3]
    status = fields[-1]
    return machine, process, status


def _build_metadata(labels: Sequence[str]) -> pd.DataFrame:
    """Convert a sequence of label strings to a metadata DataFrame.

    Each label is parsed with ``_parse_label`` and becomes one row, in input
    order, with the columns ``machine``, ``process`` and ``status``.

    The column names are passed explicitly so that an empty *labels* sequence
    still yields a frame with the three expected columns instead of a
    column-less frame (on which ``meta["machine"]`` would raise ``KeyError``).
    """
    columns = ["machine", "process", "status"]
    rows = [_parse_label(label) for label in labels]
    return pd.DataFrame(rows, columns=columns)


def _prepare_features() -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
    """Load the raw data, extract features and return ``(X_feat, y_num, meta)``.

    Returns:
        X_feat: features produced by ``transform_data`` for the full dataset.
        y_num: targets produced by ``transform_data`` (``label_type="string"``).
        meta: one row per sample with ``machine``, ``process`` and ``status``
            parsed from the full label strings.

    All three objects share the same 0..n-1 integer index, so row labels
    selected on ``meta`` can be used with ``.loc`` on ``X_feat`` and ``y_num``.
    """
    X_raw, y_raw, _ = load_data()  # y_raw are full strings
    meta = _build_metadata(y_raw)

    # One-shot feature extraction for the full dataset
    X_feat, y_num = transform_data(X_raw, y_raw, label_type="string")

    # Ensure consistent indices on all three outputs. Previously only X_feat
    # and meta were re-indexed, leaving y_num with whatever index (if any)
    # transform_data gave it.
    shared_index = pd.Index(np.arange(len(meta)))
    X_feat.index = shared_index
    meta.index = shared_index
    # NOTE(review): transform_data's return type is not visible here; a Series
    # is re-indexed in place (like X_feat), anything else is wrapped -- confirm.
    if isinstance(y_num, pd.Series):
        y_num.index = shared_index
    else:
        y_num = pd.Series(y_num, index=shared_index)
    return X_feat, y_num, meta


def _make_pipeline(best: Dict[str, object], random_state: int) -> Pipeline:
    """Build the under-sampling -> SMOTE -> random-forest pipeline.

    Args:
        best: fixed hyper-parameters, keyed ``<step>__<parameter>`` for the
            steps ``rus``, ``smote`` and ``rf``.
        random_state: seed passed to all three steps.

    Returns:
        An unfitted pipeline with the steps ``rus``, ``smote`` and ``rf``.
    """
    steps = [
        (
            "rus",
            RandomUnderSampler(
                sampling_strategy=best["rus__sampling_strategy"],
                random_state=random_state,
            ),
        ),
        (
            "smote",
            SMOTE(
                sampling_strategy=best["smote__sampling_strategy"],
                random_state=random_state,
            ),
        ),
        (
            "rf",
            RandomForestClassifier(
                n_estimators=best["rf__n_estimators"],
                max_depth=best["rf__max_depth"],
                min_samples_split=best["rf__min_samples_split"],
                min_samples_leaf=best["rf__min_samples_leaf"],
                max_features=best["rf__max_features"],
                n_jobs=N_JOBS,
                random_state=random_state,
            ),
        ),
    ]
    return Pipeline(steps)


# --------------------------------------------------
# Within-machine reference (for 95 % threshold)
# --------------------------------------------------

def _within_machine_f1(
    X: pd.DataFrame,
    y: pd.Series,
    meta: pd.DataFrame,
    machine: str,
    test_size: float,
    random_state: int,
) -> float:
    """Return the macro-F1 of a model trained and tested on a single machine.

    The rows of *machine* are split into a stratified train/hold-out pair
    (``test_size`` is the hold-out fraction), the fixed-parameter pipeline is
    fitted on the training part and scored on the hold-out part.
    """
    machine_rows = meta.loc[meta["machine"] == machine].index
    X_machine = X.loc[machine_rows]
    y_machine = y.loc[machine_rows]

    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X_machine,
        y_machine,
        test_size=test_size,
        stratify=y_machine,
        random_state=random_state,
    )

    model = _make_pipeline(BEST_PARAMS, random_state)
    model.fit(X_fit, y_fit)
    return f1_score(y_hold, model.predict(X_hold), average="macro")


# --------------------------------------------------
# Core experiment
# --------------------------------------------------

def _indices_by_process(meta: pd.DataFrame, idx: pd.Index) -> Dict[str, List[int]]:
    """Group the row labels in *idx* by their upper-cased ``process`` value."""
    grouped: Dict[str, List[int]] = defaultdict(list)
    for i, proc in zip(idx, meta.loc[idx, "process"]):
        grouped[proc.upper()].append(i)
    return dict(grouped)


def run_cross_machine_experiment(
    X: pd.DataFrame,
    y: pd.Series,
    meta: pd.DataFrame,
    k_grid: Sequence[int] = K_GRID,
    n_reps: int = N_REPS,
    test_size: float = TEST_SIZE,
    random_state: int = RANDOM_STATE,
    baseline_override: float | None = None,
    machines: Sequence[str] = ("M01", "M02", "M03"),
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Run the cross-machine sample-efficiency experiment.

    For every ordered (source, target) pair of distinct machines, and for
    every ``k`` in *k_grid*, the pipeline is trained ``n_reps`` times on all
    source rows plus ``k`` randomly drawn "bad" target rows and all "good"
    target rows of the processes those drawn rows belong to. It is evaluated
    with macro-F1 on the remaining target rows.

    Args:
        X: feature matrix, indexed like *meta*.
        y: targets, indexed like *meta*.
        meta: per-sample metadata with ``machine``, ``process`` and ``status``
            columns.
        k_grid: numbers of target "bad" samples to add to the training set.
        n_reps: repetitions per value of ``k``.
        test_size: hold-out fraction used for the within-machine baselines.
        random_state: seed for the index sampling; repetition ``rep`` fits its
            pipeline with ``random_state + rep``.
        baseline_override: if given, used as the baseline F1 of every target
            machine instead of computing within-machine baselines.
        machines: machine identifiers to pair up.

    Returns:
        ``(df_all, df_summary)``. ``df_all`` has one row per evaluated
        (source, target, k, rep) with its ``f1_macro``. ``df_summary`` has one
        row per (source, target) with ``baseline_f1`` and ``n_star``, the
        smallest ``k`` whose median F1 reaches 95 % of the target baseline
        (``None`` if no ``k`` does). Both frames keep their columns even when
        no configuration could be evaluated.
    """
    rng = random.Random(random_state)

    # --------------------------------------------------
    # Baseline(s)
    # --------------------------------------------------
    if baseline_override is None:
        baseline_f1 = {
            m: _within_machine_f1(X, y, meta, m, test_size, random_state)
            for m in machines
        }
    else:
        # same baseline for every target machine
        baseline_f1 = dict.fromkeys(machines, baseline_override)

    records: List[Dict[str, object]] = []

    for source in machines:
        source_rows = list(meta.index[meta["machine"] == source])

        for target in machines:
            if source == target:
                continue  # only cross-machine pairs

            idx_target = meta.index[meta["machine"] == target]
            target_status = meta.loc[idx_target, "status"]
            # Pool the k faulty samples are drawn from; built once per pair
            bad_pool = list(idx_target[(target_status == "bad").to_numpy()])
            # process -> good row labels of the target machine
            proc_good_map = _indices_by_process(
                meta, idx_target[(target_status == "good").to_numpy()]
            )

            for k in k_grid:
                # Keep at least one faulty sample for the test set. The check
                # does not depend on the repetition, so it is done once per k.
                if k >= len(bad_pool):
                    continue

                for rep in range(n_reps):
                    # draw the k faulty samples that will be added to the training set
                    chosen_bad_idx = rng.sample(bad_pool, k) if k else []

                    chosen_processes = {
                        meta.at[i, "process"].upper() for i in chosen_bad_idx
                    }
                    # Include ALL corresponding good samples of those processes.
                    # Sorting fixes the row order: plain set iteration depends on
                    # string-hash randomisation and made runs irreproducible.
                    chosen_good_idx: List[int] = []
                    for proc in sorted(chosen_processes):
                        chosen_good_idx.extend(proc_good_map.get(proc, []))

                    train_idx = source_rows + chosen_bad_idx + chosen_good_idx
                    test_idx = list(idx_target.difference(train_idx))

                    # Defensive: some k may exhaust target test set
                    if not test_idx:
                        continue
                    y_train, y_test = y.loc[train_idx], y.loc[test_idx]
                    if len(set(y_train)) < 2 or len(set(y_test)) < 2:
                        continue  # single-class train or test set: not informative

                    pipe = _make_pipeline(BEST_PARAMS, random_state + rep)
                    pipe.fit(X.loc[train_idx], y_train)
                    y_pred = pipe.predict(X.loc[test_idx])

                    records.append(
                        {
                            "source": source,
                            "target": target,
                            "k": k,
                            "rep": rep,
                            "f1_macro": f1_score(y_test, y_pred, average="macro"),
                        }
                    )

    # Explicit columns keep the groupby below working when nothing was recorded
    df_all = pd.DataFrame(
        records, columns=["source", "target", "k", "rep", "f1_macro"]
    )

    # --------------------------------------------------
    # Aggregate to summary table with minimal k (n★)
    # --------------------------------------------------
    summary_rows = []
    for (src, tgt), group in df_all.groupby(["source", "target"]):
        medians = group.groupby("k")["f1_macro"].median()
        threshold = 0.95 * baseline_f1[tgt]
        n_star = min(
            (k for k, val in medians.items() if val >= threshold), default=None
        )
        summary_rows.append(
            {
                "source": src,
                "target": tgt,
                "baseline_f1": baseline_f1[tgt],
                "n_star": n_star,
            }
        )

    df_summary = pd.DataFrame(
        summary_rows, columns=["source", "target", "baseline_f1", "n_star"]
    )
    return df_all, df_summary

In [None]:
# Prepare features (will take some time): loads the raw data once and returns
# the feature matrix, the targets and the per-sample metadata frame
print(">>> Preparing features (this may take a while)…")
X_features, y_numeric, meta_df = _prepare_features()

In [None]:
# Load the raw samples together with their full string labels
X, y, y_binary = load_data()

# Split X and y per machine; X stays a plain list of arrays (no 2D stacking)
machines = ['M01', 'M02', 'M03']
machine_data = {}

for m in machines:
    prefix = f"{m}_"
    # (sample, label) pairs whose label starts with this machine's prefix
    pairs = [(xi, yi) for xi, yi in zip(X, y) if yi.startswith(prefix)]
    X_m = [xi for xi, _ in pairs]
    # Binary target: 0 for labels ending in "_good", 1 for everything else
    y_m = [0 if yi.endswith("_good") else 1 for _, yi in pairs]
    machine_data[m] = (X_m, np.array(y_m))

# Per-machine raw subsets
X_M01, y_M01 = machine_data['M01']
X_M02, y_M02 = machine_data['M02']
X_M03, y_M03 = machine_data['M03']

# Per-machine feature extraction with binary labels
X_M01_tr, y_M01_tr = transform_data(X_M01, y_M01, label_type='binary')
X_M02_tr, y_M02_tr = transform_data(X_M02, y_M02, label_type='binary')
X_M03_tr, y_M03_tr = transform_data(X_M03, y_M03, label_type='binary')

# Print the total sample count followed by a good/bad breakdown per machine
print("Dataset Overview:")
print(f"Total samples: {len(X)}")
print("\nMachine-specific breakdown:")
for machine in machines:
    X_m, y_m = machine_data[machine]
    total_samples = len(y_m)
    # Labels are encoded as 0 = good, 1 = bad
    good_samples = int((y_m == 0).sum())
    bad_samples = int((y_m == 1).sum())

    print(f"\n{machine} Dataset:")
    print(f"  Total samples: {total_samples}")
    print(f"  Good samples: {good_samples} ({good_samples/total_samples:.2%})")
    print(f"  Bad samples: {bad_samples} ({bad_samples/total_samples:.2%})")

# Visualize class distribution: one bar chart (good vs. bad counts) per machine
plt.figure(figsize=(12, 6))

# One subplot per machine; the column count follows the machine list
for i, machine in enumerate(machines, 1):
    plt.subplot(1, len(machines), i)
    _, y_m = machine_data[machine]
    # minlength=2 guarantees a count for both classes even when a machine has
    # no bad samples (otherwise the two bar labels get a single count and
    # plt.bar fails). The integer cast covers an empty label array, which
    # numpy creates as float.
    counts = np.bincount(y_m.astype(int), minlength=2)
    bars = plt.bar(['Good', 'Bad'], counts, color=['green', 'red'])

    # Add count labels on top of each bar
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height,
                 f'{int(height)}',
                 ha='center', va='bottom')

    plt.title(f'{machine} Class Distribution')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# NOTE(review): `best_model` and `trainX_tr` are not defined in any cell visible
# here -- presumably they come from the Experiment 1 grid search; confirm
# before running this cell.

# Feature importances of the random-forest step of the best pipeline
feature_importances = best_model.named_steps['rf'].feature_importances_
feature_names = trainX_tr.columns

# One row per feature with its importance
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
)

# Rank by importance (highest first) and keep the top 20 features
ranked = importance_df.sort_values('Importance', ascending=False)
top_features = ranked.head(20)
print("Top 20 features selected by the model:")
display(top_features)

# Horizontal bar chart of the ten highest-ranked features
top_ten = top_features.head(10)
plt.figure(figsize=(10, 6))
plt.barh(top_ten['Feature'], top_ten['Importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Top 10 Most Important Features')
plt.tight_layout()
plt.show()

### Experiment 1: Results

In [None]:
import pickle 
import os

# Path to the models folder: two directory levels above the current working
# directory, then into 'models'
models_path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'models')

# Ensure the models directory exists
os.makedirs(models_path, exist_ok=True)

# Save best model in the models folder. The context manager closes (and
# flushes) the file before it is read back.
# NOTE(review): `best_model` is not defined in any cell visible here -- confirm.
model_path = os.path.join(models_path, 'EXP1_BINARY_MODEL.pkl')
with open(model_path, 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Read the model back from the file that was just written
with open(model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
