# In [None]:
# Predict WIN_PCT from PPP + POSS_PCT + SEASON
import re
import json
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.inspection import permutation_importance
from joblib import dump

# Input: the CSV loaded with pandas below; it must contain a WIN_PCT column.
CSV_PATH = "/mnt/data/standardized_all_data.csv"

# Outputs: the JSON metrics report and the joblib-serialized fitted pipeline.
REPORT_PATH = "/mnt/data/win_pct_report.json"
MODEL_PATH = "/mnt/data/win_pct_hgbr.joblib"


# -------------------------
# 1) Load data
# -------------------------
df = pd.read_csv(CSV_PATH)

# Target
target_col = "WIN_PCT"
# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would defer the failure to a less obvious place.
if target_col not in df.columns:
    raise ValueError(f"{target_col} not found in CSV columns")

# -------------------------
# 2) Feature selection
# -------------------------
# All PPP_* and POSS_PCT_* features (plain prefix tests, same as the season
# filter below; no regex is needed for a literal prefix).
ppp_cols = [c for c in df.columns if c.startswith("PPP_")]
poss_cols = [c for c in df.columns if c.startswith("POSS_PCT_")]

# Season feature(s):
# Prefer your existing dummy columns: SEASON__2015-16, ..., SEASON__2024-25
season_onehots = [c for c in df.columns if c.startswith("SEASON__")]
use_onehots = len(season_onehots) > 0

if use_onehots:
    # Seasons are already numeric dummies, so no preprocessing step is needed.
    feature_cols = ppp_cols + poss_cols + season_onehots
    preproc = "passthrough"
else:
    # Fall back to one-hot encoding SEASON_ORIG if present
    season_cat = ["SEASON_ORIG"] if "SEASON_ORIG" in df.columns else []
    feature_cols = ppp_cols + poss_cols + season_cat
    preproc = ColumnTransformer(
        transformers=[("season", OneHotEncoder(handle_unknown="ignore"), season_cat)],
        remainder="passthrough",
        # OneHotEncoder emits a sparse matrix by default and the stacked output
        # stays sparse when its overall density is low. The downstream
        # HistGradientBoostingRegressor requires dense input, so always densify.
        sparse_threshold=0,
        verbose_feature_names_out=False,
    )

# Fail early with a clear message instead of letting the estimator reject a
# zero-column design matrix later on.
if not feature_cols:
    raise ValueError(
        "No feature columns found: expected PPP_*, POSS_PCT_*, SEASON__* "
        "or SEASON_ORIG columns in the CSV"
    )

X = df[feature_cols].copy()
y = df[target_col].astype(float).values

print(f"Samples: {X.shape[0]}  |  Features: {X.shape[1]}  "
      f"(PPP: {len(ppp_cols)}, POSS_PCT: {len(poss_cols)}, Season feats: {len(season_onehots) if use_onehots else 'one-hot via transformer'})")

# -------------------------
# 3) Train/test split
# -------------------------
# Hold out 20% of the rows for evaluation; the fixed seed makes the
# partition reproducible from run to run.
split_kwargs = {"test_size": 0.20, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# -------------------------
# 4) Model pipeline
# -------------------------
# Define the estimator once so both pipeline shapes share exactly the same
# hyperparameters (previously the configuration was duplicated per branch).
regressor = HistGradientBoostingRegressor(
    random_state=42,
    max_depth=4,
    max_leaf_nodes=31,
    learning_rate=0.1,
    early_stopping=True,
)

# `preproc` is either the string "passthrough" (features are used as-is) or a
# ColumnTransformer. Test its type rather than comparing an estimator object
# against a string.
steps = [("model", regressor)]
if not isinstance(preproc, str):
    steps.insert(0, ("pre", preproc))
pipe = Pipeline(steps)

# -------------------------
# 5) Train
# -------------------------
pipe.fit(X_train, y_train)

# -------------------------
# 6) Evaluate
# -------------------------
def eval_split(name, Xs, ys, model):
    """Score ``model`` on a single data split.

    Args:
        name: Label stored under the ``"split"`` key of the result.
        Xs: Feature data passed to ``model.predict``.
        ys: True target values aligned with ``Xs``.
        model: Fitted estimator exposing ``predict``.

    Returns:
        A dict with keys ``"split"``, ``"rmse"``, ``"mae"`` and ``"r2"``; the
        three metrics are plain Python floats so the dict can be written with
        ``json.dump``.
    """
    pred = model.predict(Xs)
    # Take the square root of the MSE ourselves: the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(ys, pred)))
    mae = float(mean_absolute_error(ys, pred))
    r2 = float(r2_score(ys, pred))
    return {"split": name, "rmse": rmse, "mae": mae, "r2": r2}

# Score both partitions in a fixed order: train first, then test.
split_metrics = [
    eval_split(label, features, targets, pipe)
    for label, features, targets in (
        ("train", X_train, y_train),
        ("test", X_test, y_test),
    )
]

# Everything collected here is later serialized to the JSON report.
report = {
    "metrics": split_metrics,
    "n_samples": len(df),
    "n_features": X.shape[1],
    "feature_cols": feature_cols,
}

print("\n=== Metrics ===")
for row in split_metrics:
    print(f"{row['split']:>5} -> RMSE: {row['rmse']:.4f} | MAE: {row['mae']:.4f} | R2: {row['r2']:.4f}")

# -------------------------
# 7) Permutation importance (optional but useful)
# -------------------------
try:
    # permutation_importance is given the whole pipeline, so it shuffles the
    # raw input columns of X_test and returns one score per *input* column.
    # Label the scores with those input column names in both configurations:
    # the transformer's expanded one-hot names have a different length, so the
    # DataFrame below could not be built and this section was always skipped
    # on the SEASON_ORIG fallback path.
    feat_names = list(X_test.columns)

    perm = permutation_importance(
        pipe, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
    )
    importances = (
        pd.DataFrame({
            "feature": feat_names,
            "importance_mean": perm.importances_mean,
            "importance_std": perm.importances_std,
        })
        .sort_values("importance_mean", ascending=False)
        .reset_index(drop=True)
    )

    top_imp = importances.head(30).to_dict(orient="records")
    report["permutation_importance_top30"] = top_imp
    print("\nTop features (permutation importance):")
    for r in top_imp[:10]:
        print(f"  {r['feature']}: {r['importance_mean']:.6f} ± {r['importance_std']:.6f}")
except Exception as e:
    # Deliberately best-effort: this diagnostic is optional, so a failure here
    # is reported but must not prevent the model and report from being saved.
    print(f"(Permutation importance skipped: {e})")

# -------------------------
# 8) Save artifacts
# -------------------------
# Create the destination directory of *each* artifact; previously only the
# model's directory was created, so the report write relied on both paths
# happening to share a parent.
for artifact_path in (MODEL_PATH, REPORT_PATH):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)

dump(pipe, MODEL_PATH)
# Explicit encoding keeps the report independent of the platform default.
with open(REPORT_PATH, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

print(f"\nModel saved to: {MODEL_PATH}")
print(f"Report saved to: {REPORT_PATH}")


# Output of the last recorded run:
# FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/standardized_all_data.csv'