# Modeling

## Setup & Imports

In [None]:
# Standard imports + our modeling module
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
from pathlib import Path

# Make the project root (the parent of the current working directory)
# importable so that `src.modeling` can be resolved from this notebook.
project_root = Path.cwd().parent
# Rebuild sys.path in place with the project root first and any earlier
# copies of it dropped: re-running this cell then stays idempotent instead
# of stacking one more duplicate entry per execution.
sys.path[:] = [str(project_root)] + [
    entry for entry in sys.path if entry != str(project_root)
]

# Modelling utilities
from sklearn.ensemble import RandomForestClassifier
from src.modeling import (
    load_features,
    cross_validate_model,
    train_test_evaluate,
    evaluate_models
)

# Scikit-learn imports for experiments
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Other third-party modelling libraries
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

# Shared cross-validation splitter: five stratified, shuffled folds with a
# fixed seed. Reused by the cross-validation summary and the tuning cell.
CV = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


## Load Data

In [6]:
# Processed feature table, given relative to the current working directory.
FEATURE_CSV = "../data/processed/features.csv"

# load_features hands back the feature matrix, a dict of label vectors
# keyed by target name, and the list of feature names.
X, targets, feature_names = load_features(FEATURE_CSV)

# Quick sanity check of what was loaded.
print(f"Features shape: {X.shape}")
print(f"Targets keys: {targets.keys()}")

Features shape: (2205, 136)
Targets keys: dict_keys(['cooler_pct', 'valve_pct', 'pump_leak', 'acc_pressure'])


## Baseline Evaluation of Candidate Models
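Compare RandomForest and GradientBoosting using train/test metrics (accuracy and macro-F1) reported per model and target. SVC is listed as a candidate but is currently commented out in the cell below.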

In [7]:
# Baseline estimators to compare, keyed by the display name used in the
# metrics table and in the report keys.
candidates = {}
candidates["RandomForest"] = RandomForestClassifier(n_estimators=100, random_state=42)
candidates["GradientBoosting"] = GradientBoostingClassifier(random_state=42)
# NOTE(review): an SVC(random_state=42) candidate was disabled in the original
# cell; confirm whether it should be re-enabled or dropped for good.

# evaluate_models returns a metrics table plus a dict of text reports;
# the key used below follows the "<model>_<target>" pattern.
metrics_df, reports = evaluate_models(candidates, X, targets)
print(metrics_df)

# Show one classification report as an example.
print(
    "\nSample report for RandomForest on cooler_pct:\n",
    reports["RandomForest_cooler_pct"],
    sep="\n",
)


              model        target  accuracy  f1_macro
0      RandomForest    cooler_pct  1.000000  1.000000
1      RandomForest     valve_pct  0.986395  0.982805
2      RandomForest     pump_leak  0.997732  0.996616
3      RandomForest  acc_pressure  0.968254  0.965000
4  GradientBoosting    cooler_pct  1.000000  1.000000
5  GradientBoosting     valve_pct  0.986395  0.981546
6  GradientBoosting     pump_leak  0.995465  0.993230
7  GradientBoosting  acc_pressure  0.963719  0.958744

Sample report for RandomForest on cooler_pct:

              precision    recall  f1-score   support

           3       1.00      1.00      1.00       147
          20       1.00      1.00      1.00       146
         100       1.00      1.00      1.00       148

    accuracy                           1.00       441
   macro avg       1.00      1.00      1.00       441
weighted avg       1.00      1.00      1.00       441



## Cross-Validation Summary
Compute 5-fold CV (accuracy & macro-F1) for each model/target.

In [8]:
# Columns of the one-row summary frame returned by cross_validate_model,
# in the order they are printed.
for model_name, estimator in candidates.items():
    print(f"\n### {model_name}")
    for target_name, y_target in targets.items():
        summary = cross_validate_model(estimator, X, y_target, cv=CV)
        # Pull the four statistics out of the first (only) row.
        acc_mean, acc_std, f1_mean, f1_std = (
            summary[column].iloc[0]
            for column in ("accuracy_mean", "accuracy_std", "f1_macro_mean", "f1_macro_std")
        )
        print(
            f"{target_name:15s} → "
            f"acc {acc_mean:.3f}±{acc_std:.3f}, "
            f"F1 {f1_mean:.3f}±{f1_std:.3f}"
        )



### RandomForest
cooler_pct      → acc 0.998±0.001, F1 0.998±0.001
valve_pct       → acc 0.993±0.002, F1 0.991±0.002
pump_leak       → acc 0.995±0.002, F1 0.994±0.003
acc_pressure    → acc 0.986±0.004, F1 0.984±0.004

### GradientBoosting
cooler_pct      → acc 0.999±0.001, F1 0.999±0.001
valve_pct       → acc 0.990±0.004, F1 0.988±0.005
pump_leak       → acc 0.992±0.005, F1 0.989±0.007


KeyboardInterrupt: 

## Experiment A: Hyperparameter Tuning (RandomizedSearchCV)
Run a randomized hyperparameter search for RandomForest separately for each target and keep the best parameter set per target.

In [9]:
# RandomizedSearchCV is already imported in the setup cell, so it is not
# re-imported here.

# Search space sampled by the randomized search for RandomForestClassifier.
param_dist = {
    "n_estimators": [50, 100, 200, 300],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}

# Tune one RandomForest per target. Each search draws 30 parameter
# combinations and scores them by macro-F1 on the shared CV splitter, so this
# cell fits many forests per target and can take a while to run.
best_params = {}
for tgt, y in targets.items():
    print(f"\nTuning RandomForest for {tgt!r}")
    rs = RandomizedSearchCV(
        RandomForestClassifier(random_state=42),
        param_distributions=param_dist,
        n_iter=30,
        cv=CV,
        scoring="f1_macro",
        n_jobs=-1,  # use all available cores for the search
        random_state=42,
        verbose=0,
    )
    rs.fit(X, y)
    best_params[tgt] = rs.best_params_
    print(f" → Best f1_macro: {rs.best_score_:.4f} with {rs.best_params_}")

# now `best_params` holds one dict of tuned hyper-params per target


Tuning RandomForest for 'cooler_pct'
 → Best f1_macro: 0.9982 with {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 10, 'bootstrap': True}

Tuning RandomForest for 'valve_pct'


KeyboardInterrupt: 

## Experiment B: Handling Class Imbalance
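For each target, compare two class-imbalance strategies on a stratified train/test split: an SVC with balanced class weights, and a RandomForest trained on SMOTE-oversampled data.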


In [None]:
# SMOTE is already imported in the setup cell, so it is not re-imported here.

# One record per target with the macro-F1 of each imbalance strategy.
imbalance_results = []

for tgt, y in targets.items():
    print(f"\n=== Handling imbalance for {tgt!r} ===")
    # Stratified 80/20 hold-out split with a fixed seed.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

    # a) SVC with class_weight='balanced'
    # SVMs are not scale-invariant, so the features are standardised first.
    # The scaler lives inside the pipeline so its statistics come from the
    # training split only and nothing leaks from the test split.
    svc_bal = make_pipeline(
        StandardScaler(),
        SVC(class_weight="balanced", random_state=42),
    )
    svc_bal.fit(X_tr, y_tr)
    f1_bal = f1_score(y_te, svc_bal.predict(X_te), average="macro")

    # b) RandomForest + SMOTE
    # Oversample the training split only; the test split keeps its original
    # class distribution.
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X_tr, y_tr)
    rf_sm = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_sm.fit(X_res, y_res)
    f1_sm = f1_score(y_te, rf_sm.predict(X_te), average="macro")

    imbalance_results.append({
        "target": tgt,
        "svc_bal_f1": f1_bal,
        "rf+smote_f1": f1_sm
    })

pd.DataFrame(imbalance_results)

## Experiment C: Alternative Models (LightGBM & MLP)
For each target, train LightGBM and an MLP on a stratified train/test split and compare their macro-F1 scores.

In [None]:
# One record per target with the macro-F1 of each alternative model.
alt_results = []
for tgt, y in targets.items():
    print(f"\n*** Alternative models for {tgt!r} ***")
    # Stratified 80/20 hold-out split with a fixed seed.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

    # LightGBM
    lgbm = lgb.LGBMClassifier(random_state=42)
    lgbm.fit(X_tr, y_tr)
    report_lgbm = classification_report(y_te, lgbm.predict(X_te), output_dict=True)

    # MLP
    # MLPs are sensitive to feature scale, so the features are standardised
    # first. The scaler lives inside the pipeline so its statistics come from
    # the training split only and nothing leaks from the test split.
    mlp = make_pipeline(
        StandardScaler(),
        MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300, random_state=42),
    )
    mlp.fit(X_tr, y_tr)
    report_mlp = classification_report(y_te, mlp.predict(X_te), output_dict=True)

    alt_results.append({
        "target": tgt,
        "lgbm_f1_macro": report_lgbm["macro avg"]["f1-score"],
        "mlp_f1_macro": report_mlp["macro avg"]["f1-score"]
    })

pd.DataFrame(alt_results)
