# SECOM Yield Prediction — End-to-End Notebook

**Goal:** Predict *Fail* outcomes from process measurements to reduce scrap and downtime.

**Data:** `data/raw/secom.data`, `data/raw/secom_labels.data` (plus the accompanying `secom.names` description file; UCI ML Repository, real fab data).

**Primary metric:** Recall on *Fail* at acceptable precision. Report PR-AUC and Balanced Error Rate (BER).

> Safety: No unsupported claims. Treat outputs as decision support, not automation.

### 1. Setup

In [None]:
# Basic libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import joblib


# Apply the seaborn theme FIRST. sns.set_theme() rewrites matplotlib's rcParams
# (font family, font sizes, tick/edge colours, ...), so calling it after
# plt.rcParams.update() would silently discard most of the custom style below.
sns.set_theme(context="notebook", style="ticks")

# Global style for plots (layered on top of the seaborn theme)
plt.rcParams.update({
    "font.family": "Times New Roman",
    "font.size": 12,
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "axes.edgecolor": "black",
    "axes.linewidth": 1,
    "xtick.color": "black",
    "ytick.color": "black",
    "xtick.direction": "out",
    "ytick.direction": "out",
    "xtick.bottom": True,
    "ytick.left": True,
    "xtick.top": False,
    "ytick.right": False,
    "figure.dpi": 150,
    "legend.frameon": True,
    "legend.facecolor": "white",
    "legend.edgecolor": "black",
    "legend.fontsize": 12,
})

# Reproducibility: one seed reused for NumPy's global RNG
rnd_num = 42
np.random.seed(rnd_num)

# Root directory: taken to be the parent of the current working directory
ROOT = Path.cwd().parent

# Data directory
DATA_DIR = ROOT / "data"
RAW = DATA_DIR / "raw"

# Fail early, and say exactly which raw file is absent. An explicit raise is
# used instead of `assert` so the check also runs under `python -O`.
missing_raw = [p for p in (RAW / "secom.data", RAW / "secom_labels.data") if not p.exists()]
if missing_raw:
    raise FileNotFoundError(
        "Data files are missing! " + ", ".join(str(p) for p in missing_raw)
    )

# Results directory (created on first run)
RESULT_DIR = ROOT / "results"
RESULT_DIR.mkdir(exist_ok=True, parents=True)

# Model directory (created on first run)
MODEL_DIR = ROOT / "models"
MODEL_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# Folder that receives every figure produced in this notebook
RESULT_DIR_ETL = RESULT_DIR / "ETL"
RESULT_DIR_ETL.mkdir(parents=True, exist_ok=True)


def savefig(name):
    """Save the current matplotlib figure as ``RESULT_DIR_ETL / name``.

    Runs ``tight_layout`` first, writes the file at 300 dpi with a tight
    bounding box, and prints the destination path. Meant to be called right
    after a plot has been drawn.
    """
    target = RESULT_DIR_ETL / name
    plt.tight_layout()
    plt.savefig(target, bbox_inches="tight", dpi=300)
    print("Saved:", target)

Note: Section numbering continues from `01_EDA.ipynb`.

### 5. Extract, Transform, Load (ETL)

In [None]:
# Libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Load the interim dataset written by the EDA step.
df_EDA = pd.read_parquet(DATA_DIR / "interim" / "SECOM_EDA.parquet")

# Working frame without the timestamp (not a model input). df_EDA itself is
# left untouched, so the timestamp stays available for the chronological split.
df = df_EDA.drop(columns="timestamp")

#### 5.1. Missing value handling (audit + strategy)

From EDA we know:
- Most features have almost no missing values.
- A few have extreme missingness (up to `~91%`).
- Average missingness `~4.5%`.

In [None]:
# Share of missing values for every feature column (label excluded)
missing_frac = df.drop(columns="label").isna().mean()

# Candidate thresholds to test: 21 evenly spaced values from 0 to 1
thresh_values = np.linspace(0, 1, 21)

# For each candidate threshold, count the features at/below it (kept)
# and the features above it (dropped)
kept = [(missing_frac <= t).sum() for t in thresh_values]
dropped = [(missing_frac > t).sum() for t in thresh_values]

plt.plot(thresh_values, kept, label="Kept Features", marker="o")
plt.plot(thresh_values, dropped, label="Dropped Features", marker="s")
# Reference line at the drop threshold used later in this section
plt.axvline(0.7, linestyle="--", color="red", label="drop_thresh=0.7")
plt.title("Impact of drop_thresh on features retained")
plt.xlabel("Drop threshold (fraction missing)")
plt.ylabel("Number of features")
plt.legend()
savefig("01_missing_drop_curve.png")

plt.show()

In [None]:
# For each candidate flag threshold t, count the features whose missing
# fraction lies in (t, 0.7] - i.e. features that would be kept but flagged.
flag_counts = [
    ((missing_frac > t) & (missing_frac <= 0.7)).sum()
    for t in thresh_values
]

plt.plot(thresh_values, flag_counts, color="purple", marker="d")
# Reference line at the flag threshold used later in this section
plt.axvline(0.1, linestyle="--", color="red", label="flag_thresh=0.1")
plt.title("Impact of flag_thresh on missingness indicators")
plt.xlabel("Flag threshold (fraction missing)")
plt.ylabel("Number of flagged features")
plt.legend()
savefig("02_missing_flag_curve.png")
plt.show()

In [None]:
# 1. Audit missingness (from EDA)
print(missing_frac.describe())

# 2. Define thresholds
drop_thresh = 0.7   # drop a feature when more than 70% of its values are missing
flag_thresh = 0.1   # add a missing-indicator when more than 10% are missing

# 3. Identify feature groups
drop_cols = missing_frac[missing_frac > drop_thresh].index.tolist()
flag_cols = missing_frac[(missing_frac > flag_thresh) & (missing_frac <= drop_thresh)].index.tolist()
keep_cols = missing_frac[missing_frac <= drop_thresh].index.tolist()

# The messages are derived from the thresholds so they cannot drift from the
# values above; the "keep" group is inclusive (<=), matching its mask.
print(f"Drop {len(drop_cols)} features with >{drop_thresh:.0%} missing")
print(f"Flag {len(flag_cols)} features with {flag_thresh * 100:.0f}-{drop_thresh:.0%} missing")
print(f"Keep {len(keep_cols)} features with <={drop_thresh:.0%} missing")

# 4. Drop high-missing features
df_etl = df.drop(columns=drop_cols)

# 5. Add missingness indicators (1 where the original value was NaN).
#    Must happen before imputation, otherwise the NaN pattern is lost.
for col in flag_cols:
    df_etl[col+"_missing"] = df_etl[col].isna().astype(int)

# 6. Impute remaining missing with median
# NOTE(review): the imputer is fit on every row, i.e. before the chronological
# split in section 5.4, so validation/test rows contribute to the medians.
# Consider fitting on the training window only.
imputer = SimpleImputer(strategy="median")
df_etl[keep_cols] = imputer.fit_transform(df_etl[keep_cols])

print("Shape:", df_etl.shape)

#### 5.2. Outlier Handling.

In [None]:
from collections import Counter

from scipy.stats import skew

# Decide one outlier strategy per sensor feature:
#   "log"    -> strongly skewed (|skew| > 1) and entirely non-negative
#   "winsor" -> 99th percentile exceeds 5x the median (extreme tail)
#   "keep"   -> anything else
strategies = {}
X = df_etl.drop(columns="label")

for col in X.columns:
    # The binary "<feature>_missing" indicators are flags, not measurements.
    # Giving them a strategy would let them be log-transformed or clipped and
    # would make them count as sensor features downstream, so skip them.
    if col.endswith("_missing"):
        continue
    s = X[col].dropna()
    sk = skew(s)
    if abs(sk) > 1 and (s >= 0).all():
        strategies[col] = "log"
    elif s.quantile(0.99) > s.quantile(0.5) * 5:  # extreme tail
        strategies[col] = "winsor"
    else:
        strategies[col] = "keep"

# Summary of counts
print(Counter(strategies.values()))

In [None]:
# Helper functions for outlier handling.
def winsorize_col(s: pd.Series, lower=0.01, upper=0.99):
    """Return ``s`` clipped to its own ``lower`` and ``upper`` quantiles.

    NaNs are ignored when computing the quantiles and remain NaN in the result.
    """
    bounds = s.quantile([lower, upper])
    return s.clip(lower=bounds.iloc[0], upper=bounds.iloc[1])

def apply_outlier_transforms(df_in: pd.DataFrame,
                             strategies: dict,
                             winsor=(0.01, 0.99),
                             log_safe=True) -> pd.DataFrame:
    """
    Return a copy of ``df_in`` with a per-column outlier treatment applied.

    ``strategies`` maps column name -> one of:
        'log'     -> np.log1p; when ``log_safe`` is true and the column has a
                     negative minimum, the column is winsorized instead
        'winsor'  -> clip to the ``winsor`` (lower, upper) quantiles
        anything else, or a column absent from the mapping -> left unchanged
    The 'label' column is never touched. A one-line summary of how many
    columns received each treatment is printed before returning.
    """
    out = df_in.copy()
    tally = {"log": 0, "winsor": 0, "keep": 0, "fallback": 0}

    for name in [c for c in out.columns if c != "label"]:
        plan = strategies.get(name, "keep")

        if plan == "winsor":
            out[name] = winsorize_col(out[name], *winsor)
            tally["winsor"] += 1
            continue

        if plan != "log":
            # 'keep' and any unrecognised strategy: leave the column as it is
            tally["keep"] += 1
            continue

        values = out[name]
        if log_safe and values.min(skipna=True) < 0:
            # log1p is not usable on negative data -> clip the tails instead
            out[name] = winsorize_col(values, *winsor)
            tally["fallback"] += 1
            tally["winsor"] += 1
        else:
            # log1p maps 0 to 0; NaNs pass through unchanged
            out[name] = np.log1p(values)
            tally["log"] += 1

    n_log, n_win = tally["log"], tally["winsor"]
    n_badlog, n_keep = tally["fallback"], tally["keep"]
    print(f"Applied transforms --> log: {n_log}, winsor: {n_win} "
          f"(fallbacks from log: {n_badlog}), keep: {n_keep}")
    return out

In [None]:
# --- apply the per-column strategies to the imputed frame (df_etl) ---
# Winsorized columns are clipped to their 1st/99th percentiles.
df_etl2 = apply_outlier_transforms(df_in=df_etl, strategies=strategies, winsor=(0.01, 0.99))
print("After outliers handling step:", df_etl2.shape)

In [None]:
# --- Sanity check after the ETL outlier-handling step ---

# 1. Identify indicator columns (added in section 5.1)
indicator_cols = [c for c in df_etl2.columns if c.endswith("_missing")]

# 2. Original sensor features: columns that received a strategy, excluding any
#    "_missing" indicator so the sensor and indicator counts never overlap.
sensor_cols = [c for c in strategies if not c.endswith("_missing")]

# 3. Label column
label_col = "label"

# 4. Safety check: All other columns should be accounted for
other_cols = [c for c in df_etl2.columns
              if c not in indicator_cols and c not in sensor_cols and c != label_col]

# --- Counts ---
print(f"Total columns: {df_etl2.shape[1]}")
print(f"  Sensor features: {len(sensor_cols)}")
print(f"  Indicator features: {len(indicator_cols)}")
print(f"  Label: 1")
print(f"  Other (unexpected): {len(other_cols)}")

# --- Quick data type check ---
print("\nIndicator columns dtype check (should all be int or 0/1):")
print(df_etl2[indicator_cols].dtypes.value_counts())

# --- Sample preview ---
print("\nSample indicator preview (first 5 rows):")
print(df_etl2[indicator_cols].head())

#### 5.3. Feature Scaling

In [None]:
# Split the post-outlier frame into the target vector and the feature matrix
y = df_etl2["label"]
X = df_etl2.drop(columns=["label"])

In [None]:
# Columns to standardise: every feature except the binary "_missing" indicators
scale_cols = X.columns[~X.columns.str.endswith("_missing")].tolist()
scale_cols

In [None]:
# Standardise the sensor columns; indicator columns are copied through unchanged.
# NOTE(review): the scaler is fit on every row, i.e. before the chronological
# split in section 5.4 - confirm whether it should be fit on train rows only.
scaler = StandardScaler().fit(X[scale_cols])
X_scaled = X.copy()
X_scaled[scale_cols] = scaler.transform(X[scale_cols])

In [None]:
# Re-attach the label to the scaled features (joined on the row index)
df_etl3 = X_scaled.join(y)

print("After ETL Feature Scaling:", df_etl3.shape)

# Spot check: scaled columns should show mean close to 0 and std close to 1
scaled_view = df_etl3[scale_cols]
print("Mean of first 5 scaled features:")
print(scaled_view.mean().head())
print("Std of first 5 scaled features:")
print(scaled_view.std().head())

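#### 5.4. Chronological split (guards against temporal leakage)

> Note: the imputation (5.1), outlier handling (5.2) and scaling (5.3) above are fit on the full dataset before this split, so validation/test rows still influence those statistics.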

In [None]:
# Pre-req: df_etl3 holds the features + 'label'
#          df_EDA still carries the 'timestamp' column on the same row index

# 1) attach timestamp (assignment aligns on the row index)
assert len(df_etl3) == len(df_EDA), "row count mismatch"
df_etl4 = df_etl3.assign(timestamp=df_EDA["timestamp"])

# 2) drop rows with missing timestamps (should be none)
df_etl4 = df_etl4.dropna(subset=["timestamp"])
print("After attaching timestamp:", df_etl4.shape)

In [None]:
# 3) sort by time and renumber the rows 0..n-1
df_etl4 = df_etl4.sort_values(by="timestamp", ignore_index=True)

# 4) time-based split indices (60/20/20): first 60% train, next 20% val, rest test
n = df_etl4.shape[0]
i_tr, i_val = int(0.60 * n), int(0.80 * n)

In [None]:
# 5) build splits as NumPy arrays.
# Rows were renumbered 0..n-1 after sorting, so positional slices select the
# same rows as the inclusive label ranges [0, i_tr-1], [i_tr, i_val-1], [i_val, n-1].
cols_feat = [c for c in df_etl4.columns if c not in {"label", "timestamp"}]
X_train, y_train = df_etl4.iloc[:i_tr][cols_feat].to_numpy(), df_etl4["label"].iloc[:i_tr].to_numpy()
X_val,   y_val   = df_etl4.iloc[i_tr:i_val][cols_feat].to_numpy(), df_etl4["label"].iloc[i_tr:i_val].to_numpy()
X_test,  y_test  = df_etl4.iloc[i_val:][cols_feat].to_numpy(), df_etl4["label"].iloc[i_val:].to_numpy()

In [None]:
# 6) quick sanity
def stats(y):
    """Summarise a label array: row count, number of fails (label == 1), fail rate."""
    is_fail = (y == 1)
    return {
        "n": len(y),
        "fails": int(is_fail.sum()),
        "fail_rate": float(is_fail.mean()),
    }
# Report size and fail rate of each split (tags padded so the dicts line up)
for split_tag, split_labels in (("Train:", y_train), ("Val:  ", y_val), ("Test: ", y_test)):
    print(split_tag, stats(split_labels))

In [None]:
# Backup: keep each split as a DataFrame (features + label + timestamp) for inspection.
# Positional slices match the label ranges because the rows were renumbered after sorting.
train_df = df_etl4.iloc[:i_tr][[*cols_feat, "label", "timestamp"]]
val_df   = df_etl4.iloc[i_tr:i_val][[*cols_feat, "label", "timestamp"]]
test_df  = df_etl4.iloc[i_val:][[*cols_feat, "label", "timestamp"]]

#### 5.5. Low-variance and duplicate feature pruning (train-only)

In [None]:
cols_feat # Display the feature column names (every column except "label" and "timestamp").

In [None]:
# Work on TRAIN ONLY to avoid leakage:
# cols_feat selects the modelling features, "label" the target.
Xtr_df = train_df.loc[:, cols_feat]
ytr = train_df.loc[:, "label"]

In [None]:
# 5a) Low-variance filter. VarianceThreshold compares each column's *variance*
# (not its std) on the training rows against the threshold, so 1e-8 removes
# columns that are effectively constant within the training window.
vt = VarianceThreshold(threshold=1e-8)
vt.fit(Xtr_df.values)
keep_mask = vt.get_support()
# Pair names with the mask directly: np.array(cols_feat)[keep_mask] would first
# coerce every name to a single NumPy dtype and return NumPy scalars.
keep_cols_lv = [name for name, keep in zip(cols_feat, keep_mask) if keep]
print(f"Low-variance removed: {len(cols_feat) - len(keep_cols_lv)}")

In [None]:
# 5b) Duplicate columns on train: two columns count as duplicates when their
# training values agree after rounding to 8 decimals. The first column seen
# is kept; later copies are dropped.
Xtr_lv = Xtr_df[keep_cols_lv]
seen = {}   # rounded value signature -> first column that produced it
dups = {}   # redundant column -> the earlier column it duplicates
for c in keep_cols_lv:
    key = tuple(Xtr_lv[c].to_numpy().round(8))
    if key not in seen:
        seen[key] = c
        continue
    dups[c] = seen[key]
drop_dups = list(dups)
keep_cols_uniq = [c for c in keep_cols_lv if c not in dups]
print(f"Duplicate columns removed: {len(drop_dups)}")

In [None]:
# Apply to all splits
def apply_colsubset(df, cols):
    """Return a copy of ``df`` restricted to ``cols`` plus 'label' and 'timestamp'.

    ``cols`` may be any iterable of column names (list, tuple, pandas Index).
    The names are unpacked into a fresh list, so a tuple no longer raises and
    an Index is no longer concatenated element-wise with the suffix list.
    Column order in the result is ``cols`` followed by 'label', 'timestamp'.
    """
    return df[[*cols, "label", "timestamp"]].copy()

In [None]:
# Apply the train-derived column subset identically to all three splits
train_df5, val_df5, test_df5 = (
    apply_colsubset(part, keep_cols_uniq) for part in (train_df, val_df, test_df)
)

In [None]:
# Shapes of the train / val / test frames after the pruning above
print("Shapes after Low-variance and duplicate feature pruning:",
      *(part.shape for part in (train_df5, val_df5, test_df5)))

#### 5.6. Correlation pruning (train-only, within highly correlated groups)

In [None]:
# Absolute pairwise Pearson correlations between the TRAIN features; used below
# to remove one feature from any pair with |corr| >= 0.98.
Xtr = train_df5.drop(["label", "timestamp"], axis=1)
corr = Xtr.corr(method="pearson").abs()

In [None]:
# Keep only the strict upper triangle so every pair is inspected once; a column
# is marked when it correlates >= 0.98 with any column that precedes it.
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
high_corr_cols = upper.columns[(upper >= 0.98).any(axis=0).to_numpy()].tolist()

In [None]:
# Keep = all train features minus the high-correlation ones (original order preserved)
pruned_names = set(high_corr_cols)
keep_cols_corr = [c for c in Xtr.columns if c not in pruned_names]
print(f"Correlation-pruned: {len(Xtr.columns) - len(keep_cols_corr)}")

In [None]:
# Apply to splits
def apply_corrsubset(df, cols):
    """Return a copy of ``df`` with only ``cols`` plus 'label' and 'timestamp'.

    ``cols`` may be any iterable of column names (list, tuple, pandas Index);
    it is unpacked into a new list rather than concatenated with ``+``, which
    only behaved correctly for plain lists. Result column order is ``cols``
    followed by 'label', 'timestamp'.
    """
    return df[[*cols, "label", "timestamp"]].copy()

In [None]:
# Apply the train-derived correlation pruning identically to all three splits
train_df6, val_df6, test_df6 = (
    apply_corrsubset(part, keep_cols_corr) for part in (train_df5, val_df5, test_df5)
)

In [None]:
# Shapes of the train / val / test frames after correlation pruning
print("Shapes after Correlation pruning:",
      *(part.shape for part in (train_df6, val_df6, test_df6)))

#### 5.7. Persist clean artifacts for modeling

In [None]:
# Write next to the other data folders. DATA_DIR is derived from ROOT, so the
# destination is resolved the same way as every other path in this notebook
# instead of through a separate hard-coded relative path.
PROC = DATA_DIR / "processed"
PROC.mkdir(parents=True, exist_ok=True)

# One parquet file per split; the row index is not stored
train_df6.to_parquet(PROC/"train.parquet", index=False)
val_df6.to_parquet(PROC/"val.parquet", index=False)
test_df6.to_parquet(PROC/"test.parquet", index=False)

# Also save feature list (one name per line, preceded by a "features" header row)
feat_final = [c for c in train_df6.columns if c not in ("label","timestamp")]
pd.Series(feat_final, name="features").to_csv(PROC/"features_final.txt", index=False)

print("Saved:", list(PROC.iterdir()))