In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# ===============================
# 1. LOAD DATA
# ===============================
df = pd.read_csv("wearable_ml_ready.csv")

# ===============================
# 2. FEATURES & TARGET
# ===============================
X = df.drop(columns=["sleep_efficiency", "user_id", "date"])
y = df["sleep_efficiency"]

# ===============================
# 3. TRAIN TEST SPLIT
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ===============================
# 4. MODEL
# ===============================
rf = RandomForestRegressor(
    n_estimators=50,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

# ===============================
# 5. TRAIN
# ===============================
rf.fit(X_train, y_train)

# ===============================
# 6. EVALUATION
# ===============================
y_pred = rf.predict(X_test)

print("Random Forest Results")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2 :", r2_score(y_test, y_pred))

# ===============================
# 7. FEATURE IMPORTANCE (ROOT CAUSE)
# ===============================
importance = pd.Series(
    rf.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

print("\nTop Root Causes:")
print(importance.head(10))


Random Forest Results
MAE: 0.0328996004097446
R2 : 0.754675112159086

Top Root Causes:
activity_load               0.951238
sleep_7d_avg                0.004606
spo2_avg_pct                0.004406
sleep_architecture_score    0.004070
hr_strain                   0.003874
hrv_deviation               0.003794
rhr_deviation               0.003700
hrv_7d_avg                  0.003395
stress_score                0.003091
avg_hr_day_bpm              0.003021
dtype: float64


In [5]:
pip install lightgbm

Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Using cached lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

df = pd.read_csv("wearable_ml_ready.csv")

X = df.drop(columns=["sleep_efficiency", "user_id", "date"])
y = df["sleep_efficiency"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LGBMRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2 :", r2_score(y_test, y_pred))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012008 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3888
[LightGBM] [Info] Number of data points in the train set: 44160, number of used features: 18
[LightGBM] [Info] Start training from score 0.880240
MAE: 0.032974029686690644
R2 : 0.7557221865346598


In [7]:
import joblib

# model = trained LightGBM model
joblib.dump(model, "sleep_root_cause_lightgbm.pkl")

print("✅ Model saved as sleep_root_cause_lightgbm.pkl")


✅ Model saved as sleep_root_cause_lightgbm.pkl
