In [1]:
# 04_reason_per_cause_models.ipynb

# ── 1. Imports ──────────────────────────────────────────
from pathlib import Path

import joblib
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# ── 2. Load Data ────────────────────────────────────────
# Full table as read from disk; the per-cause target columns are pulled
# from this frame later on.
df = pd.read_csv("../data/ai4i2020.csv")

# Predictor columns shared by every per-cause model: one categorical
# column ("Type") followed by the numeric readings.
feature_cols = [
    "Type",
    "Air temperature [K]",
    "Process temperature [K]",
    "Rotational speed [rpm]",
    "Torque [Nm]",
    "Tool wear [min]",
]
X_full = df.loc[:, feature_cols]

# ── 3. Preprocessing pipeline (shared) ──────────────────
cat_cols = ["Type"]
# Every feature that is not listed as categorical is handled as numeric.
num_cols = [col for col in feature_cols if col not in cat_cols]

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories not seen during fit are ignored.
categorical_steps = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: fill missing values with the column median, then
# standardize.
numeric_steps = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])

# Column-wise preprocessing template used as the first step of each model.
preprocess = ColumnTransformer(transformers=[
    ("cat", categorical_steps, cat_cols),
    ("num", numeric_steps, num_cols),
])

# ── 4. Train a model per cause ──────────────────────────
# One binary classifier is trained per failure-cause column listed here.
targets = ["TWF", "HDF", "PWF", "OSF", "RNF"]

# Output directory for the serialized pipelines (relative to the current
# working directory); created if it does not exist yet.
model_dir = Path("notebooks")
model_dir.mkdir(exist_ok=True)

# Fitted pipelines kept in memory, keyed by target column name.
models = {}

for target in targets:
    print(f"\n================ Training for {target} ================")
    y = df[target]

    # Hold out 20% for evaluation; stratifying on the target keeps the
    # positive/negative ratio the same in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X_full, y,
        test_size=0.2,
        stratify=y,
        random_state=42
    )

    # Oversample the training part only, so the test split keeps the
    # original class distribution.
    ros = RandomOverSampler(random_state=42)
    X_bal, y_bal = ros.fit_resample(X_train, y_train)

    # Build pipeline: preprocessing + classifier.
    # The preprocessing template is cloned for each target. Pipeline.fit
    # fits its steps in place, so passing the shared `preprocess` object
    # would make every pipeline in `models` point at one transformer that
    # ends up holding the statistics of the last target fitted.
    pipe = Pipeline([
        ("prep", clone(preprocess)),
        ("model", RandomForestClassifier(
            n_estimators=400,
            random_state=42,
            n_jobs=-1
        ))
    ])

    pipe.fit(X_bal, y_bal)
    y_pred = pipe.predict(X_test)

    # Positives are rare in the held-out split, so the per-class rows are
    # more informative than the overall accuracy line.
    print(classification_report(y_test, y_pred, zero_division=0))

    # Save the trained pipeline (preprocessing and classifier together).
    fname = model_dir / f"reason_{target}_model.joblib"
    joblib.dump(pipe, fname)
    print(f"Saved: {fname}")
    models[target] = pipe

print("\nAll models trained and saved.")



              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1991
           1       0.00      0.00      0.00         9

    accuracy                           1.00      2000
   macro avg       0.50      0.50      0.50      2000
weighted avg       0.99      1.00      0.99      2000

Saved: notebooks\reason_TWF_model.joblib

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1977
           1       0.94      0.74      0.83        23

    accuracy                           1.00      2000
   macro avg       0.97      0.87      0.91      2000
weighted avg       1.00      1.00      1.00      2000

Saved: notebooks\reason_HDF_model.joblib

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1981
           1       0.81      0.68      0.74        19

    accuracy                           1.00      2000
   macro avg       0.90      0.84      0.87