In [None]:
"""Cell 1:  imports & config: pulls in pandas/numpy/sklearn, defines a simple RMSE helper and sets a random seed"""

# Core
import re, numpy as np, pandas as pd

# Modeling
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Utils
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Thin wrapper: the square root of scikit-learn's `mean_squared_error`
    evaluated on the same pair of arguments.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Seed passed as `random_state` to the estimators defined further down.
RANDOM_STATE = 42

# Let DataFrame displays show up to 100 columns.
pd.options.display.max_columns = 100


In [None]:
"""Cell 2:  load & inspect: reads cleaned CSV, standardizes column names, picks the target SpO2 column, drops NaNs/Infs, extracts an integer Subject ID for grouping, and prints basic info"""

# Input: the cleaned, manually synced CSV
CSV = "DOVE_Hypoxia_Data_Manually_Synced.csv"  # change if needed

# Commercial SpO2 reading used as ground truth (Rad97 or Nellcor)
TARGET_COL = "Rad97-60/SpO2"             # or "Nellcor PM1000N-1/SpO2"

# Only these columns are carried forward
use_cols = ["SubjectID", "r_value", "skintone", TARGET_COL]

df = (
    pd.read_csv(CSV)
    .rename(columns=str.strip)             # trim whitespace around header names
    .loc[:, use_cols]
    .replace([np.inf, -np.inf], np.nan)    # treat +/-inf as missing ...
    .dropna()                              # ... then drop every incomplete row
    .assign(
        # "Subject7" -> 7: integer subject number used for grouping
        SubjectNum=lambda frame: frame["SubjectID"]
        .str.extract(r"Subject(\d+)", expand=False)
        .astype(int)
    )
)

print(df.head(5))
print("n rows:", len(df), "| n subjects:", df["SubjectNum"].nunique())


  SubjectID   r_value  skintone  Rad97-60/SpO2  SubjectNum
0  Subject1  0.858301     1.992             91           1
1  Subject1  0.831904     1.992             91           1
2  Subject1  0.922036     1.992             90           1
3  Subject1  0.925349     1.992             90           1
4  Subject1  0.990698     1.992             89           1
n rows: 604 | n subjects: 11


In [None]:
"""Cell 3:  LOSO helper: defines a generic “leave-one-subject-out” evaluation loop that trains on all-but-one subject and aggregates MAE/RMSE and correlation"""

from typing import Callable, Dict, List, Tuple

def loso_eval(
    X: np.ndarray,
    y: np.ndarray,
    groups: np.ndarray,
    fit_predict_fn: Callable[[np.ndarray, np.ndarray, np.ndarray], np.ndarray],
) -> Dict[str, float]:
    """
    Leave-one-subject-out (LOSO) evaluation.

    Each distinct value in `groups` is held out once; `fit_predict_fn` is
    called as `fit_predict_fn(X_train, y_train, X_test)` and must return the
    predictions for `X_test`.

    Returns a dict with:
      - "MAE" / "RMSE": mean of the per-fold MAE / RMSE values
      - "Fold-r (mean)": mean per-fold Pearson r over the folds where it is
        defined (NaN if it is defined in no fold)
      - "Overall-r": Pearson r over all held-out predictions pooled together
        (NaN if either pooled array is constant)
      - "n_folds": number of folds evaluated
    """
    def _both_vary(a, b):
        # Pearson r is undefined for (near-)constant arrays, so guard on the std.
        return np.std(a) > 1e-9 and np.std(b) > 1e-9

    fold_mae, fold_rmse, fold_r = [], [], []
    held_out_true, held_out_pred = [], []

    splitter = LeaveOneGroupOut()
    for train_idx, test_idx in splitter.split(X, y, groups):
        truth = y[test_idx]
        pred = fit_predict_fn(X[train_idx], y[train_idx], X[test_idx])

        fold_mae.append(mean_absolute_error(truth, pred))
        fold_rmse.append(rmse(truth, pred))
        # Folds with a constant truth or prediction contribute no correlation.
        if _both_vary(pred, truth):
            fold_r.append(np.corrcoef(truth, pred)[0, 1])

        held_out_true.append(truth)
        held_out_pred.append(pred)

    pooled_true = np.concatenate(held_out_true)
    pooled_pred = np.concatenate(held_out_pred)
    if _both_vary(pooled_pred, pooled_true):
        overall_r = np.corrcoef(pooled_true, pooled_pred)[0, 1]
    else:
        overall_r = np.nan

    mean_fold_r = float(np.mean(fold_r)) if fold_r else np.nan
    return {
        "MAE": float(np.mean(fold_mae)),
        "RMSE": float(np.mean(fold_rmse)),
        "Fold-r (mean)": mean_fold_r,
        "Overall-r": float(overall_r),
        "n_folds": int(len(fold_mae)),
    }


In [None]:

"""Cell 4: baseline (quadratic): reproduces the Beer–Lambert style quadratic fit using only r_value, evaluated with LOSO so it’s a fair, per-subject baseline"""

# Per-row subject numbers (the LOSO grouping key) and the regression target
groups = df["SubjectNum"].to_numpy()
y = df[TARGET_COL].to_numpy()

def quad_baseline_fit_predict(Xtr, ytr, Xte):
    """Fit y = a*r**2 + b*r + c on the training rows and evaluate it on the test rows.

    Only column 0 of `Xtr` / `Xte` (the r_value feature) is used. Training
    pairs where either r or y is NaN or infinite are left out of the
    least-squares fit; test rows are evaluated as given.
    """
    train_r, test_r = Xtr[:, 0], Xte[:, 0]
    # np.isfinite is False for NaN and +/-inf, so this keeps only usable pairs
    usable = np.isfinite(train_r) & np.isfinite(ytr)
    quad, lin, const = np.polyfit(train_r[usable], ytr[usable], deg=2)
    return quad * test_r**2 + lin * test_r + const

# Baseline design matrix: a single column holding r_value
X_base = df[["r_value"]].to_numpy()

# LOSO metrics for the quadratic baseline; the bare name on the last line displays them
res_baseline = loso_eval(X_base, y, groups, quad_baseline_fit_predict)
res_baseline


{'MAE': 2.616163582991273,
 'RMSE': 3.2469400911066675,
 'Fold-r (mean)': 0.966660491044379,
 'Overall-r': 0.9174718236785232,
 'n_folds': 11}

In [None]:
"""Cell 5 — ML #1 (Gradient Boosting): compares models with r_value alone vs r_value + skintone under LOSO to quantify the gain from adding skin tone"""

def gbr_fit_predict(Xtr, ytr, Xte):
    """Fit a gradient-boosting regressor on (Xtr, ytr) and return its predictions for Xte.

    The estimator uses scikit-learn's default hyperparameters, seeded with
    RANDOM_STATE.
    """
    regressor = GradientBoostingRegressor(random_state=RANDOM_STATE).fit(Xtr, ytr)
    return regressor.predict(Xte)

# Two feature sets: (a) r_value alone, (b) r_value plus skintone
X_no_tone = df[["r_value"]].to_numpy()
X_with_tone = df[["r_value", "skintone"]].to_numpy()

# Same model, same LOSO protocol; only the feature set differs
res_gbr_no = loso_eval(X_no_tone, y, groups, gbr_fit_predict)
res_gbr_with = loso_eval(X_with_tone, y, groups, gbr_fit_predict)

res_gbr_no, res_gbr_with


({'MAE': 2.731902672314914,
  'RMSE': 3.3757012123988632,
  'Fold-r (mean)': 0.952601861995917,
  'Overall-r': 0.9120807646422168,
  'n_folds': 11},
 {'MAE': 3.6983185003838,
  'RMSE': 4.40310209900018,
  'Fold-r (mean)': 0.9609352808423934,
  'Overall-r': 0.8648947465091292,
  'n_folds': 11})

In [None]:
"""Cell 6: ML #2 (Random Forest): repeats the same comparison with a tree ensemble to check robustness to nonlinearity and interactions"""

def rf_fit_predict(Xtr, ytr, Xte):
    """Fit a random-forest regressor on (Xtr, ytr) and return its predictions for Xte.

    Forest settings: 400 trees, unlimited depth, at least 2 samples per leaf,
    seeded with RANDOM_STATE, trained with n_jobs=-1.
    """
    forest_params = dict(
        n_estimators=400,
        max_depth=None,
        min_samples_leaf=2,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    forest = RandomForestRegressor(**forest_params)
    return forest.fit(Xtr, ytr).predict(Xte)

# Evaluate the forest on both feature sets, in order: r_value only, then r_value + skintone
res_rf_no, res_rf_with = (
    loso_eval(features, y, groups, rf_fit_predict)
    for features in (X_no_tone, X_with_tone)
)

res_rf_no, res_rf_with


({'MAE': 2.8180136547536083,
  'RMSE': 3.49785187910624,
  'Fold-r (mean)': 0.9446105353733633,
  'Overall-r': 0.9071317354078964,
  'n_folds': 11},
 {'MAE': 3.2447717388620605,
  'RMSE': 3.9250800672171233,
  'Fold-r (mean)': 0.9560567549351021,
  'Overall-r': 0.885240788698983,
  'n_folds': 11})

In [None]:
"""Cell 7: ML #3 (Ridge + polynomial r): fits a linear model on polynomial features of r_value (degree 3), with and without raw skintone, again under LOSO"""

# Cubic polynomial in r_value, optionally extended with the raw skintone column
def ridge_poly_fit_predict(Xtr, ytr, Xte, include_tone: bool):
    """Fit Ridge regression on degree-3 polynomial features of r_value and predict Xte.

    Column 0 of `Xtr` / `Xte` is r_value. When `include_tone` is true and a
    second column is present, that column (skintone) is appended unchanged
    after the polynomial terms.
    """
    # The expansion is fitted on the training r_value only and reused for the test rows.
    expander = PolynomialFeatures(degree=3, include_bias=False)
    design_tr = expander.fit_transform(Xtr[:, :1])
    design_te = expander.transform(Xte[:, :1])

    tone_available = Xtr.shape[1] > 1
    if include_tone and tone_available:
        design_tr = np.concatenate([design_tr, Xtr[:, 1:2]], axis=1)
        design_te = np.concatenate([design_te, Xte[:, 1:2]], axis=1)

    ridge = Ridge(alpha=1.0, random_state=RANDOM_STATE)
    return ridge.fit(design_tr, ytr).predict(design_te)

# loso_eval expects a 3-argument callable, so include_tone is bound via a lambda
res_ridge_no = loso_eval(
    X_no_tone, y, groups,
    lambda Xtr, ytr, Xte: ridge_poly_fit_predict(Xtr, ytr, Xte, include_tone=False),
)
res_ridge_with = loso_eval(
    X_with_tone, y, groups,
    lambda Xtr, ytr, Xte: ridge_poly_fit_predict(Xtr, ytr, Xte, include_tone=True),
)

res_ridge_no, res_ridge_with


({'MAE': 2.6188237248363344,
  'RMSE': 3.2153055778345423,
  'Fold-r (mean)': 0.9727555394271806,
  'Overall-r': 0.9152727187442256,
  'n_folds': 11},
 {'MAE': 2.763944350481047,
  'RMSE': 3.327641910428741,
  'Fold-r (mean)': 0.9728528176281518,
  'Overall-r': 0.9117881485005682,
  'n_folds': 11})

In [None]:
"""Cell 8:  summary table: collates all LOSO metrics into one sortable DataFrame so you can see which approach (and whether adding skin tone) wins on MAE/RMSE/correlation"""

# (label, LOSO metrics dict) for every model / feature-set combination evaluated above
rows = [
    ("Baseline Quad (r only)", res_baseline),
    ("GBR (r only)", res_gbr_no),
    ("GBR (+skintone)", res_gbr_with),
    ("RF (r only)", res_rf_no),
    ("RF (+skintone)", res_rf_with),
    ("Ridge Poly (r only)", res_ridge_no),
    ("Ridge Poly (+skintone)", res_ridge_with),
]

# One row per model, lowest MAE first, with a fresh 0..n-1 index
summary = (
    pd.DataFrame([{"Model": name, **metrics} for name, metrics in rows])
    .sort_values("MAE")
    .reset_index(drop=True)
)
summary


Unnamed: 0,Model,MAE,RMSE,Fold-r (mean),Overall-r,n_folds
0,Baseline Quad (r only),2.616164,3.24694,0.96666,0.917472,11
1,Ridge Poly (r only),2.618824,3.215306,0.972756,0.915273,11
2,GBR (r only),2.731903,3.375701,0.952602,0.912081,11
3,Ridge Poly (+skintone),2.763944,3.327642,0.972853,0.911788,11
4,RF (r only),2.818014,3.497852,0.944611,0.907132,11
5,RF (+skintone),3.244772,3.92508,0.956057,0.885241,11
6,GBR (+skintone),3.698319,4.403102,0.960935,0.864895,11


In [None]:
"""Cell 9: fit best model: trains your chosen best model on all data (often r_value + skintone) and prints in-sample error for quick sanity-check/plotting."""

# Choose the best performing setting from the table above.
# The choice must follow the LOSO summary, not in-sample fit. In the recorded run,
# "Ridge Poly (r only)" has the lowest LOSO RMSE and an MAE within ~0.003 of the top row,
# whereas "GBR (+skintone)" (the previous hard-coded choice) ranks last on MAE and RMSE.
best_features = ["r_value"]  # update if a +skintone row wins after re-running the table
best_X = df[best_features].values

# Same estimator as ridge_poly_fit_predict(..., include_tone=False):
# degree-3 polynomial features of r_value followed by Ridge(alpha=1.0).
best_model = make_pipeline(
    PolynomialFeatures(degree=3, include_bias=False),
    Ridge(alpha=1.0, random_state=RANDOM_STATE),
)  # swap if another model won
best_model.fit(best_X, y)
y_pred_full = best_model.predict(best_X)

# In-sample error is measured on the rows the model was trained on, so treat it only
# as a sanity check; the LOSO numbers in the summary are the generalization estimate.
print("In-sample MAE:", mean_absolute_error(y, y_pred_full), "RMSE:", rmse(y, y_pred_full))


In-sample MAE: 1.1006769002671426 RMSE: 1.4809196236707376
