In [None]:
# 04_modeling.ipynb
import pandas as pd, os, joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Columns that are split off as prediction targets; every other column of the
# feature file stays in X.
target_cols = ["cooler_pct", "valve_pct", "pump_leak", "acc_pressure"]

df = pd.read_csv("../data/processed/features.csv")
y = df[target_cols]
X = df.drop(columns=target_cols)

# The output directory is the same for every target, so create it once up
# front rather than on each loop iteration.
os.makedirs("../artifacts", exist_ok=True)

# Maps each target column name to its classification_report in dict form.
results = {}
for target in y.columns:
    # One independent classifier is fitted and evaluated per target column.
    # The 80/20 split is stratified on the target and seeded, so it is
    # reproducible across runs.
    X_tr, X_val, y_tr, y_val = train_test_split(
        X, y[target], stratify=y[target], test_size=0.2, random_state=42
    )
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_tr, y_tr)
    preds = clf.predict(X_val)
    # The dict form is stored in `results`; the text form is printed below.
    report = classification_report(y_val, preds, output_dict=True)
    results[target] = report
    print(f"\n--- {target} ---")
    print(classification_report(y_val, preds))
    # save model
    joblib.dump(clf, f"../artifacts/{target}_rf.pkl")

# Optional: assemble a summary table
import pandas as pd


def _summary_row(report):
    """Flatten one classification report dict into a single summary row.

    The row holds the report's "accuracy" value followed by one
    "<label>_f1" entry for every key that is not one of the aggregate
    entries ("accuracy", "macro avg", "weighted avg").
    """
    row = {"accuracy": report["accuracy"]}
    for label, metrics in report.items():
        if label in ["accuracy", "macro avg", "weighted avg"]:
            continue
        row[f"{label}_f1"] = metrics["f1-score"]
    return row


# Build one column per target, then transpose so each target becomes a row.
summary = pd.DataFrame(
    {name: _summary_row(rep) for name, rep in results.items()}
).T
summary.to_csv("../artifacts/model_performance.csv")
print("\nSaved performance summary to artifacts/model_performance.csv")
summary
