In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split


def _tail_metrics(name: str, y_true: np.ndarray, y_pred: np.ndarray, thresholds=(3.0, 5.0, 10.0)):
    print(f"\n{name} tail metrics:")
    for t in thresholds:
        mask = y_true >= t
        n = int(mask.sum())
        if n < 5:
            print(f"  y >= {t}: n={n} (too few)")
            continue
        mae = mean_absolute_error(y_true[mask], y_pred[mask])
        r2 = r2_score(y_true[mask], y_pred[mask])
        print(f"  y >= {t}: n={n} | MAE={mae:.3f} | R2={r2:.4f}")


def train_fraxplus_surrogates_final(
    df: pd.DataFrame,
    *,
    test_size: float = 0.2,
    random_state: int = 42,
    hip_high_threshold: float = 2.0,
    clamp_0_100: bool = True,
    tail_thresholds=(3.0, 5.0, 10.0),
):
    """Train gradient-boosted surrogate models for ``mof_risk`` and ``hip_risk``.

    Steps:
      1. Drop the "continent", "bmi_units" and "scanner" columns when present,
         then drop rows with a missing ``mof_risk`` / ``hip_risk`` (and a missing
         ``us_group`` when that column exists).
      2. One-hot encode ``us_group`` (all levels kept). Every other remaining
         column is passed to the estimators unchanged.
      3. Make one train/holdout split shared by both targets.
      4. MOF: a single regressor fitted on ``log1p(mof_risk)``.
      5. Hip: a classifier for ``hip_risk >= hip_high_threshold`` plus two
         ``log1p`` regressors, one fitted on training rows below the threshold
         and one on rows at or above it. The holdout prediction is the
         probability-weighted blend of the two regressors in log1p space,
         mapped back with ``expm1``.
      6. Print holdout MAE / R2 (original and log1p scale) and tail metrics.

    Args:
        df: Input frame; must contain ``mof_risk`` and ``hip_risk``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed for the split and for every estimator.
        hip_high_threshold: Cutoff separating the "low" and "high" hip regimes.
        clamp_0_100: When True, clip original-scale holdout predictions to
            [0, 100] before computing metrics. The fitted models are unaffected.
        tail_thresholds: Cutoffs forwarded to ``_tail_metrics``.

    Returns:
        dict with the fitted estimators (``mof_model``, ``hip_clf``,
        ``hip_reg_low``, ``hip_reg_high``), the ordered ``feature_columns``
        used for training, and the ``hip_high_threshold`` that was applied.

    Raises:
        ValueError: If the training split has no rows on one side of
            ``hip_high_threshold``; the two-stage hip model cannot be fitted.
    """
    # Only drop the non-feature columns that are actually present.
    unused_columns = [c for c in ["continent", "bmi_units", "scanner"] if c in df.columns]
    base = df.drop(columns=unused_columns, errors="ignore").copy()

    base = base.dropna(subset=["mof_risk", "hip_risk"]).copy()
    if "us_group" in base.columns:
        base = base.dropna(subset=["us_group"]).copy()

    y_mof = base["mof_risk"].astype(float).to_numpy()
    y_hip = base["hip_risk"].astype(float).to_numpy()

    X = base.drop(columns=["mof_risk", "hip_risk"])
    if "us_group" in X.columns:
        X = pd.get_dummies(X, columns=["us_group"], drop_first=False)

    X_train, X_test, y_mof_train, y_mof_test, y_hip_train, y_hip_test = train_test_split(
        X, y_mof, y_hip, test_size=test_size, random_state=random_state
    )

    # Fail fast, before any model is fitted: with an empty regime the classifier
    # would see a single class and one of the regressors would get zero rows,
    # which otherwise surfaces as an opaque error deep inside the pipeline.
    low_mask = y_hip_train < hip_high_threshold
    high_mask = ~low_mask
    n_low = int(low_mask.sum())
    n_high = int(high_mask.sum())
    if n_low == 0 or n_high == 0:
        raise ValueError(
            f"hip_high_threshold={hip_high_threshold} leaves the training split with "
            f"n_low={n_low} and n_high={n_high}; both sides need at least one row "
            "to fit the two-stage hip model."
        )

    # ---- MOF model ----
    # Kept in one dict so the log line below always reports the values in use.
    mof_tree_params = {"max_depth": 4, "min_samples_leaf": 20}
    mof_model = HistGradientBoostingRegressor(
        learning_rate=0.05,
        max_iter=2000,
        l2_regularization=0.5,
        early_stopping=True,
        random_state=random_state,
        **mof_tree_params,
    )

    print(f"\nTraining MOF (log1p)... {mof_tree_params}")
    mof_model.fit(X_train, np.log1p(y_mof_train))

    mof_pred_t = mof_model.predict(X_test)  # log1p scale
    mof_pred = np.expm1(mof_pred_t)         # original scale

    # ---- HIP two-stage ----
    y_high = (y_hip_train >= hip_high_threshold).astype(int)

    hip_clf = HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_iter=2000,
        l2_regularization=0.5,
        early_stopping=True,
        random_state=random_state,
        max_depth=3,
        min_samples_leaf=40,
    )

    print(f"Training Hip classifier (hip_risk >= {hip_high_threshold})...")
    hip_clf.fit(X_train, y_high)

    hip_reg_low = HistGradientBoostingRegressor(
        learning_rate=0.05,
        max_iter=3000,
        l2_regularization=0.5,
        early_stopping=True,
        random_state=random_state,
        max_depth=3,
        min_samples_leaf=40,
    )

    hip_reg_high = HistGradientBoostingRegressor(
        learning_rate=0.03,
        max_iter=4000,
        l2_regularization=0.2,
        early_stopping=True,
        random_state=random_state,
        max_depth=4,
        min_samples_leaf=15,
    )

    print(f"Training Hip regressor LOW (n={n_low})...")
    hip_reg_low.fit(X_train[low_mask], np.log1p(y_hip_train[low_mask]))

    print(f"Training Hip regressor HIGH (n={n_high})...")
    hip_reg_high.fit(X_train[high_mask], np.log1p(y_hip_train[high_mask]))

    # Both labels are guaranteed present (guard above), so classes_ is [0, 1]
    # and column 1 is the probability of the "high" regime.
    p_high = hip_clf.predict_proba(X_test)[:, 1]  # keep T=1.0 (best)
    hip_pred_low_t = hip_reg_low.predict(X_test)
    hip_pred_high_t = hip_reg_high.predict(X_test)

    # Blend the two regime regressors in log1p space, then map back.
    hip_pred_t = (1 - p_high) * hip_pred_low_t + p_high * hip_pred_high_t
    hip_pred = np.expm1(hip_pred_t)

    # ---- safety clamp ----
    # Applies to the original-scale predictions only; the log1p metrics below
    # are computed from the unclamped transformed predictions.
    if clamp_0_100:
        mof_pred = np.clip(mof_pred, 0.0, 100.0)
        hip_pred = np.clip(hip_pred, 0.0, 100.0)

    # ---- metrics ----
    print("\n=== PERFORMANCE (holdout) ===")
    print(
        f"MOF  original: MAE={mean_absolute_error(y_mof_test, mof_pred):.3f} | R2={r2_score(y_mof_test, mof_pred):.4f}")
    print(
        f"MOF  log1p:    MAE={mean_absolute_error(np.log1p(y_mof_test), mof_pred_t):.3f} | R2={r2_score(np.log1p(y_mof_test), mof_pred_t):.4f}")

    print(
        f"Hip  original: MAE={mean_absolute_error(y_hip_test, hip_pred):.3f} | R2={r2_score(y_hip_test, hip_pred):.4f}")
    print(
        f"Hip  log1p:    MAE={mean_absolute_error(np.log1p(y_hip_test), hip_pred_t):.3f} | R2={r2_score(np.log1p(y_hip_test), hip_pred_t):.4f}")

    _tail_metrics("MOF", y_mof_test, mof_pred, thresholds=tail_thresholds)
    _tail_metrics("Hip", y_hip_test, hip_pred, thresholds=tail_thresholds)

    return {
        "mof_model": mof_model,
        "hip_clf": hip_clf,
        "hip_reg_low": hip_reg_low,
        "hip_reg_high": hip_reg_high,
        # Column order the models were fitted on; needed to align inference input.
        "feature_columns": list(X.columns),
        "hip_high_threshold": hip_high_threshold,
    }


def save_models(models, path="fraxplus_models.pkl"):
    """Persist ``models`` to ``path`` with joblib and print the destination.

    Args:
        models: Object to serialize (here, the dict returned by training).
        path: Output file path.

    Returns:
        None.
    """
    # NOTE: joblib files are pickle-based; only load them from trusted sources.
    joblib.dump(value=models, filename=path)
    print(f"Models saved to {path}")

In [2]:
# Training CSV location. Set the FRAX_DATA_PATH environment variable to point
# at your own copy; the default is the original author's local path.
DATA_PATH = os.environ.get("FRAX_DATA_PATH", "/Users/hariprasannaa/fraxdata.csv")
data = pd.read_csv(DATA_PATH)

In [3]:
# Fit the MOF and two-stage hip surrogates, then persist the whole bundle.
models = train_fraxplus_surrogates_final(df=data, hip_high_threshold=2.0)
save_models(models, path="fraxplus_models.pkl")


Training MOF (log1p)... {'max_depth': 4, 'min_samples_leaf': 20}
Training Hip classifier (hip_risk >= 2.0)...
Training Hip regressor LOW (n=146)...
Training Hip regressor HIGH (n=158)...

=== PERFORMANCE (holdout) ===
MOF  original: MAE=0.774 | R2=0.9353
MOF  log1p:    MAE=0.089 | R2=0.9675
Hip  original: MAE=0.544 | R2=0.9195
Hip  log1p:    MAE=0.175 | R2=0.9239

MOF tail metrics:
  y >= 3.0: n=58 | MAE=0.944 | R2=0.9031
  y >= 5.0: n=42 | MAE=1.161 | R2=0.8471
  y >= 10.0: n=19 | MAE=1.754 | R2=0.6268

Hip tail metrics:
  y >= 3.0: n=26 | MAE=0.924 | R2=0.8445
  y >= 5.0: n=12 | MAE=1.376 | R2=0.7820
  y >= 10.0: n=2 (too few)
Models saved to fraxplus_models.pkl
