In [None]:
# Import dependencies
import os
import sys
from pathlib import Path
import random
import json
from collections import defaultdict
from typing import Dict, List, Sequence, Tuple

# Absolute path of the directory the notebook kernel is running in
current_dir = Path().resolve()

# The project root is taken to be two directory levels above the notebook directory
project_root = current_dir.parent.parent

# Make the project root importable so the `utils` package can be found.
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# Custom utilities
from utils.load_data import load_data
from utils.feature_extraction import transform_data

print("Dependencies loaded successfully ✅")

# --------------------------------------------------
# Hyper-parameters fixed from Experiment 1 (best RF)
# --------------------------------------------------
# Pipeline settings, keyed as "<step>__<parameter>" where the step is one of
# "rus" (random under-sampler), "smote" or "rf" (random forest).
BEST_PARAMS: Dict[str, object] = dict(
    rus__sampling_strategy=0.2,
    smote__sampling_strategy=0.5,
    rf__n_estimators=200,
    rf__max_depth=15,
    rf__min_samples_split=2,
    rf__min_samples_leaf=1,
    rf__max_features="sqrt",
)

# Base seed shared by the samplers, the forest and the sample draws
RANDOM_STATE: int = 42
# Forwarded as `n_jobs` to the random forest
N_JOBS: int = -1

# Default numbers of faulty target samples (k) added to the training set
K_GRID: Sequence[int] = (0, 2, 4, 6, 8, 10, 15, 20)
# Repetitions per (source, target, k) configuration
N_REPS: int = 20
# Hold-out fraction used by the within-machine baseline split
TEST_SIZE: float = 0.3

# --------------------------------------------------
# Helper functions
# --------------------------------------------------

def _parse_label(label: str) -> Tuple[str, str, str]:
    """Return (machine, process, status) from full label string."""
    parts = label.split("_")
    return parts[0], parts[3], parts[-1]


def _build_metadata(labels: Sequence[str]) -> pd.DataFrame:
    """Build a metadata frame with one row per label.

    Each label is parsed with ``_parse_label`` and contributes the columns
    ``machine``, ``process`` and ``status``, in input order.
    """
    rows = []
    for label in labels:
        machine, process, status = _parse_label(label)
        rows.append({"machine": machine, "process": process, "status": status})
    return pd.DataFrame(rows)


def _prepare_features() -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
    """Load the raw data, extract features and return ``(X_feat, y_num, meta)``.

    Returns
    -------
    X_feat : pd.DataFrame
        Feature table produced by ``transform_data`` (wavelet features,
        per the original author's note).
    y_num : pd.Series
        Target labels produced by ``transform_data``.
    meta : pd.DataFrame
        Machine / process / status parsed from the raw label strings.

    All three objects share the positional index ``0 .. n-1`` so that the
    same row labels can be used to select from any of them with ``.loc``.

    Raises
    ------
    ValueError
        If features, labels and metadata do not have the same number of rows.
    """
    X_raw, y_raw, _ = load_data()  # y_raw holds the full label strings
    meta = _build_metadata(y_raw)

    # One-shot feature extraction for the full dataset
    X_feat, y_num = transform_data(X_raw, y_raw, label_type="string")

    # The labels must be a Series on the same index as the features and the
    # metadata; previously only X_feat and meta were re-indexed, leaving y_num
    # on whatever index (or container type) transform_data happened to return.
    if isinstance(y_num, pd.Series):
        y_num = y_num.reset_index(drop=True)
    else:
        y_num = pd.Series(np.asarray(y_num))

    if not (len(X_feat) == len(y_num) == len(meta)):
        raise ValueError(
            "Row count mismatch: "
            f"{len(X_feat)} feature rows, {len(y_num)} labels, {len(meta)} metadata rows"
        )

    # Ensure consistent indices
    X_feat.index = meta.index = np.arange(len(meta))
    return X_feat, y_num, meta


def _make_pipeline(best: Dict[str, object], random_state: int) -> Pipeline:
    """Assemble the under-sample -> SMOTE -> random-forest pipeline.

    ``best`` supplies the settings under "rus__*", "smote__*" and "rf__*"
    keys; ``random_state`` seeds all three steps. The forest's ``n_jobs``
    comes from the module-level ``N_JOBS``.
    """
    under_sampler = RandomUnderSampler(
        sampling_strategy=best["rus__sampling_strategy"],
        random_state=random_state,
    )
    over_sampler = SMOTE(
        sampling_strategy=best["smote__sampling_strategy"],
        random_state=random_state,
    )
    forest_kwargs = {
        "n_estimators": best["rf__n_estimators"],
        "max_depth": best["rf__max_depth"],
        "min_samples_split": best["rf__min_samples_split"],
        "min_samples_leaf": best["rf__min_samples_leaf"],
        "max_features": best["rf__max_features"],
    }
    forest = RandomForestClassifier(
        n_jobs=N_JOBS, random_state=random_state, **forest_kwargs
    )

    steps = [("rus", under_sampler), ("smote", over_sampler), ("rf", forest)]
    return Pipeline(steps)


# --------------------------------------------------
# Within-machine reference (for 95 % threshold)
# --------------------------------------------------

def _within_machine_f1(
    X: pd.DataFrame,
    y: pd.Series,
    meta: pd.DataFrame,
    machine: str,
    test_size: float,
    random_state: int,
) -> float:
    """Macro-F1 of a model trained and evaluated on a single machine.

    The rows whose ``meta["machine"]`` equals ``machine`` are split into a
    stratified train / hold-out partition (``test_size`` is the hold-out
    fraction), the pipeline built from ``BEST_PARAMS`` is fitted on the
    training part, and the macro-averaged F1 on the hold-out is returned.
    """
    on_machine = meta["machine"] == machine
    rows = meta.index[on_machine]
    features, labels = X.loc[rows], y.loc[rows]

    split = train_test_split(
        features,
        labels,
        test_size=test_size,
        stratify=labels,
        random_state=random_state,
    )
    X_fit, X_hold, y_fit, y_hold = split

    model = _make_pipeline(BEST_PARAMS, random_state)
    model.fit(X_fit, y_fit)
    return f1_score(y_hold, model.predict(X_hold), average="macro")


# --------------------------------------------------
# Core experiment
# --------------------------------------------------

def run_cross_machine_experiment(
    X: pd.DataFrame,
    y: pd.Series,
    meta: pd.DataFrame,
    k_grid: Sequence[int] = K_GRID,
    n_reps: int = N_REPS,
    test_size: float = TEST_SIZE,
    random_state: int = RANDOM_STATE,
    baseline_override: float | None = None,
    machines: Sequence[str] = ("M01", "M02", "M03"),
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Measure how many faulty target samples a source-trained model needs.

    For every ordered (source, target) pair of distinct machines and every
    ``k`` in ``k_grid``, the pipeline is trained ``n_reps`` times on all
    source rows plus ``k`` randomly drawn "bad" target rows and all "good"
    target rows of the processes those bad rows belong to. It is evaluated
    (macro-F1) on the remaining target rows.

    Parameters
    ----------
    X, y, meta
        Features, labels and metadata sharing one index. ``meta`` must have
        the columns ``machine``, ``process`` and ``status`` (the status
        values "good" and "bad" are used).
    k_grid
        Numbers of faulty target samples to add to the training set.
    n_reps
        Repetitions per (source, target, k); repetition ``rep`` seeds the
        pipeline with ``random_state + rep``.
    test_size
        Hold-out fraction for the within-machine baseline; unused when
        ``baseline_override`` is given.
    random_state
        Seed for the sample draws and base seed for the pipelines.
    baseline_override
        If given, used as the baseline F1 of every machine instead of
        computing the within-machine baselines.
    machines
        Machine identifiers to pair up; defaults to the three machines the
        experiment was written for.

    Returns
    -------
    (df_all, df_summary)
        ``df_all`` has one row per evaluated run with the columns ``source``,
        ``target``, ``k``, ``rep`` and ``f1_macro``. ``df_summary`` has one
        row per (source, target) pair with ``baseline_f1`` and ``n_star``,
        the smallest ``k`` whose median F1 reaches 95 % of the target
        baseline (``None`` if no ``k`` does). Both frames are empty, with
        their columns in place, when no configuration could be evaluated.
    """
    rng = random.Random(random_state)
    machines = tuple(machines)

    # --------------------------------------------------
    # Baseline(s)
    # --------------------------------------------------
    if baseline_override is None:
        baseline_f1 = {
            m: _within_machine_f1(X, y, meta, m, test_size, random_state)
            for m in machines
        }
    else:
        # same baseline for every target machine
        baseline_f1 = dict.fromkeys(machines, baseline_override)

    # Upper-cased process name per row, computed once instead of per lookup
    process_upper = meta["process"].map(str.upper)

    records: List[Dict[str, object]] = []

    for source in machines:
        source_rows = list(meta.index[meta["machine"] == source])

        for target in machines:
            if source == target:
                continue  # only cross-machine pairs

            idx_target = meta.index[meta["machine"] == target]
            target_status = meta.loc[idx_target, "status"]
            idx_target_bad = idx_target[target_status == "bad"]
            idx_target_good = idx_target[target_status == "good"]
            target_bad_pool = list(idx_target_bad)

            # Operations mapping (process -> good target indices)
            proc_good_map: Dict[str, List[int]] = defaultdict(list)
            for i, proc in process_upper.loc[idx_target_good].items():
                proc_good_map[proc].append(i)

            for k in k_grid:
                # Keep at least one faulty sample for the test set; if that is
                # impossible the whole k is skipped (the check does not depend
                # on the repetition, so it is done once per k).
                if k >= len(target_bad_pool):
                    continue

                for rep in range(n_reps):
                    # draw the k faulty samples that will be added to the training set
                    chosen_bad_idx = rng.sample(target_bad_pool, k) if k else []

                    # Sorted so the training-row order, and with it the seeded
                    # samplers and forest, does not depend on the interpreter's
                    # per-process string hash randomization.
                    chosen_processes = sorted(
                        {process_upper.loc[i] for i in chosen_bad_idx}
                    )

                    # Include ALL corresponding good samples of those processes
                    chosen_good_idx: List[int] = [
                        i
                        for proc in chosen_processes
                        for i in proc_good_map.get(proc, [])
                    ]

                    train_idx = source_rows + chosen_bad_idx + chosen_good_idx
                    test_idx = list(idx_target.difference(train_idx))

                    # Defensive: some k may exhaust the target test set
                    if (
                        len(test_idx) == 0                       # empty test set
                        or len(set(y.loc[train_idx])) < 2        # training has 1 class
                        or len(set(y.loc[test_idx])) < 2         # test has 1 class
                    ):
                        continue  # skip this rep - not informative

                    X_train, y_train = X.loc[train_idx], y.loc[train_idx]
                    X_test, y_test = X.loc[test_idx], y.loc[test_idx]

                    pipe = _make_pipeline(BEST_PARAMS, random_state + rep)
                    pipe.fit(X_train, y_train)
                    y_pred = pipe.predict(X_test)
                    f1 = f1_score(y_test, y_pred, average="macro")

                    records.append(
                        {
                            "source": source,
                            "target": target,
                            "k": k,
                            "rep": rep,
                            "f1_macro": f1,
                        }
                    )

    # Explicit columns keep the frame usable (and groupable) even when every
    # configuration was skipped and `records` is empty.
    df_all = pd.DataFrame.from_records(
        records, columns=["source", "target", "k", "rep", "f1_macro"]
    )

    # --------------------------------------------------
    # Aggregate to summary table with minimal k (n★)
    # --------------------------------------------------
    summary_rows = []
    for (src, tgt), group in df_all.groupby(["source", "target"]):
        medians = group.groupby("k")["f1_macro"].median()
        threshold = 0.95 * baseline_f1[tgt]
        eligible_ks = [k for k, val in medians.items() if val >= threshold]
        summary_rows.append(
            {
                "source": src,
                "target": tgt,
                "baseline_f1": baseline_f1[tgt],
                "n_star": min(eligible_ks) if eligible_ks else None,
            }
        )

    df_summary = pd.DataFrame(
        summary_rows, columns=["source", "target", "baseline_f1", "n_star"]
    )
    return df_all, df_summary

In [None]:
# Prepare features (will take some time)
print(">>> Preparing features (this may take a while)…")
X_features, y_numeric, meta_df = _prepare_features()

# The experiment selects rows from all three objects with the same labels,
# so fail early if they do not describe the same number of samples.
assert len(X_features) == len(y_numeric) == len(meta_df), (
    f"row count mismatch: {len(X_features)} features, "
    f"{len(y_numeric)} labels, {len(meta_df)} metadata rows"
)

In [None]:
# Run the cross-machine experiment

# Dense grid k = 0, 1, ..., 20 (finer than the default K_GRID)
DENSE_K_GRID = tuple(range(21))

# Baseline macro-F1 applied to every target machine instead of recomputing the
# within-machine baselines.
# NOTE(review): the origin of 0.914 is not recorded in this notebook - confirm.
BASELINE_F1_OVERRIDE = 0.914

print(">>> Running cross-machine experiment …")
df_all, df_summary = run_cross_machine_experiment(
    X_features,
    y_numeric,
    meta_df,
    k_grid=DENSE_K_GRID,
    n_reps=N_REPS,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    baseline_override=BASELINE_F1_OVERRIDE,
)

In [None]:
# Display summary results: the header announces the n★ table, so show
# df_summary (one row per source → target pair) rather than the full run log.
print("\n=== Sample-efficiency summary (n★ per Source → Target) ===")
display(df_summary)

# Short preview of the per-repetition results; the complete table goes to CSV below
display(df_all.head())

# Save both tables to CSV in the current working directory
df_all.to_csv("exp2_full_results.csv", index=False)
df_summary.to_csv("exp2_summary.csv", index=False)

In [None]:
# Visualization: median macro-F1 as a function of k, one curve per source → target pair
fig, ax = plt.subplots(figsize=(12, 8))

# Baseline F1 of the target machine, looked up by (source, target)
baseline_by_pair = dict(
    zip(
        zip(df_summary['source'], df_summary['target']),
        df_summary['baseline_f1'],
    )
)

for (src, tgt), pair_runs in df_all.groupby(["source", "target"]):
    median_f1 = pair_runs.groupby("k")["f1_macro"].median()
    ax.plot(median_f1.index, median_f1.values, marker='o', label=f"{src} → {tgt}")

    # Dashed horizontal line at 95% of the target baseline
    ax.axhline(
        y=0.95 * baseline_by_pair[(src, tgt)],
        linestyle='--',
        color='gray',
        alpha=0.7,
    )

ax.set_xlabel('Number of target faulty samples (k)')
ax.set_ylabel('Median Macro F1-score')
ax.set_title('Cross-machine transfer performance by number of target samples')
ax.grid(True, alpha=0.3)
ax.legend()
ax.set_xticks(range(0, 21, 1))  # one tick per k from 0 to 20
fig.tight_layout()
plt.show()

In [None]:
# Print the summary table (source, target, baseline_f1, n_star) as LaTeX source
print(df_summary.to_latex())