# Module 3.2 – Classification Analysis (Semiconductor Manufacturing)
This notebook develops and evaluates supervised classification approaches for semiconductor yield/excursion detection tasks. It mirrors the regression workflow patterns from Module 3.1, extending them to imbalanced binary classification: baseline models, imbalance strategies (class weighting vs SMOTE variants), threshold tuning, probability calibration, interpretability, and production pipeline handoff.

Sections:
1. Imports & Global Configuration
2. Data Loading (SECOM + synthetic generator) & Target Derivation
3. Exploratory Data Analysis (class distribution, feature summaries)
4. Baseline Logistic Regression
5. Model Comparison (SVM, Tree, RandomForest, Gradient Boosting)
6. Imbalance Strategies (Weights vs SMOTE)
7. Threshold Tuning (Precision-Recall / ROC)
8. Probability Calibration (Reliability Curve, Brier)
9. Interpretability (Permutation Importance, Coefficients)
10. ClassificationPipeline Assembly & Persistence
11. Monitoring & Drift Considerations
12. Next Steps


In [None]:
# 1. Imports & Global Configuration
import os
import json
import math
from pathlib import Path
from datetime import datetime
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (roc_auc_score, average_precision_score, precision_recall_curve,
                             roc_curve, classification_report, confusion_matrix,
                             brier_score_loss, log_loss, matthews_corrcoef, balanced_accuracy_score)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.utils import class_weight
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns

# Single seed reused by every stochastic step so a fresh run reproduces the same numbers.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)  # seeds NumPy's legacy global RNG

# 'seaborn-v0_8' only exists in matplotlib >= 3.6; earlier releases expose the
# same style sheet as 'seaborn'. Pick whichever is installed instead of letting
# plt.style.use raise OSError on an older environment.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_context('talk')
# NOTE(review): this silences *every* warning for the rest of the session,
# including scikit-learn convergence and deprecation warnings. Kept as-is to
# preserve the notebook's output; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

print('Environment ready:', {'numpy': np.__version__, 'pandas': pd.__version__})

In [None]:
# 2. Data Loading & Synthetic Generator
# Placeholder for loading the SECOM dataset (the regression dataset repurposed for classification).
# The binary target is either derived from a continuous yield/quality metric or simulated.
# NOTE: the code in this cell only simulates data; DATA_DIR is defined but not read here.

# Dataset root. The relative path is resolved against the current working directory at run time.
DATA_DIR = Path('../../../datasets').resolve()

# Synthetic classification data generator (imbalanced)
def generate_synthetic_classification(n=1200, minority_frac=0.08, seed=RANDOM_SEED):
    """Simulate an imbalanced binary-classification table of process readings.

    Four process variables are drawn from normal distributions and combined
    into a latent score. Rows whose score is at or above the
    ``1 - minority_frac`` quantile of that score are labelled ``1``.

    Parameters
    ----------
    n : int
        Number of rows to simulate.
    minority_frac : float
        Target share of rows labelled ``1``.
    seed : int
        Seed passed to ``np.random.default_rng``.

    Returns
    -------
    pandas.DataFrame
        Columns ``temperature``, ``pressure``, ``flow``, ``time``, the label
        ``rare_event``, followed by the engineered columns ``temp_centered``,
        ``pressure_sq``, ``flow_time_inter`` and ``temp_flow_inter``.
    """
    generator = np.random.default_rng(seed)
    # The draw order is part of the contract: reordering these calls would
    # change the generated values for a given seed.
    temperature = generator.normal(loc=450, scale=15, size=n)
    pressure_vals = generator.normal(loc=2.5, scale=0.3, size=n)
    flow_vals = generator.normal(loc=120, scale=10, size=n)
    duration = generator.normal(loc=60, scale=5, size=n)

    temp_offset = temperature - 450
    cross_term = 0.001 * temp_offset * (flow_vals - 120)
    # Latent score: linear terms, a quadratic pressure penalty and one interaction.
    latent = (
        0.04 * temp_offset
        - 1.2 * (pressure_vals - 2.5) ** 2
        + 0.03 * flow_vals
        + 0.15 * duration
        + cross_term
    )

    # Label the upper tail of the score distribution as the rare class.
    threshold = np.quantile(latent, 1 - minority_frac)
    labels = (latent >= threshold).astype(int)

    frame = pd.DataFrame({
        'temperature': temperature,
        'pressure': pressure_vals,
        'flow': flow_vals,
        'time': duration,
        'rare_event': labels,
    })
    # Simple engineered features, appended in this order after the label column.
    return frame.assign(
        temp_centered=lambda d: d['temperature'] - d['temperature'].mean(),
        pressure_sq=lambda d: d['pressure'] ** 2,
        flow_time_inter=lambda d: d['flow'] * d['time'],
        temp_flow_inter=lambda d: d['temperature'] * d['flow'],
    )

# Build the synthetic dataset and report its size and positive-class rate
synthetic_df = generate_synthetic_classification()
print('Synthetic shape:', synthetic_df.shape, 'Minority rate:', synthetic_df['rare_event'].mean())

# Separate the predictors (every column except the label) from the label array
TARGET = 'rare_event'
feature_columns = [col for col in synthetic_df.columns if col != TARGET]
X = synthetic_df[feature_columns]
y = synthetic_df[TARGET].to_numpy()

# Stratified 75/25 train/holdout split so both parts keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_SEED,
)
print('Train minority rate:', y_train.mean(), 'Test minority rate:', y_test.mean())

In [None]:
# 3. Exploratory Data Analysis
# Per-class row counts in the training labels, ordered by class label
class_counts = pd.Series(y_train).value_counts().sort_index()
print('Class counts (train):')
print(class_counts)

# The header announces percentages, so scale the shares by 100. Previously raw
# fractions (e.g. 0.92) were printed under a "(%)" label, which was also
# inconsistent with `missing_pct` below.
class_pct = class_counts.div(class_counts.sum()).mul(100).round(2)
print('\nClass distribution (%):')
print(class_pct)

# Descriptive statistics with one row per feature, plus the percentage of missing values
missing_pct = X_train.isna().sum().div(len(X_train)).mul(100)
summary = X_train.describe().T.assign(missing_pct=missing_pct)
summary


In [None]:
# 4. Baseline Logistic Regression
# Median-impute, standardise, then fit a class-weighted logistic regression
baseline_steps = [
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
    ('clf', LogisticRegression(max_iter=500, class_weight='balanced', random_state=RANDOM_SEED)),
]
baseline_pipeline = Pipeline(steps=baseline_steps)
baseline_pipeline.fit(X_train, y_train)

# Column 1 of predict_proba holds the score for the second class (label 1 here).
# NOTE: despite the `_val` suffix, these scores are computed on X_test.
probs_val = baseline_pipeline.predict_proba(X_test)[:, 1]
roc = roc_auc_score(y_test, probs_val)
ap = average_precision_score(y_test, probs_val)
print({'roc_auc': roc, 'average_precision': ap})