# In [4]:
import pandas as pd
import numpy as np

# ===============================
# 1. LOAD DATA
# ===============================
# Read the raw daily export; the path is relative to the working directory.
df = pd.read_csv("wearables_health_6mo_daily.csv")

# ===============================
# 2. BASIC CLEANING
# ===============================
# Normalise headers so every later lookup can use lower-case, trimmed names.
df.columns = df.columns.str.lower().str.strip()
df["date"] = pd.to_datetime(df["date"])

# Order rows chronologically within each user, keep the first row for every
# (user_id, date) pair, and renumber only AFTER dropping duplicates so the
# resulting index is contiguous (resetting before the drop leaves gaps).
df = (
    df.sort_values(["user_id", "date"])
    .drop_duplicates(subset=["user_id", "date"])
    .reset_index(drop=True)
)

# ===============================
# 3. HANDLE MISSING VALUES (NUMERIC)
# ===============================
# Impute missing numeric readings with each user's own median.
# "number" matches every numeric dtype (not only int64/float64), and the
# grouping key itself is excluded because it is an identifier, not a signal.
num_cols = df.select_dtypes(include="number").columns.drop(
    "user_id", errors="ignore"
)
for col in num_cols:
    if not df[col].isna().any():
        # Nothing to fill; skipping also keeps integer columns integer.
        continue
    user_median = df.groupby("user_id")[col].transform("median")
    # A user with no readings at all in this column has a NaN median, so the
    # remaining gaps fall back to the column-wide median.
    df[col] = df[col].fillna(user_median).fillna(df[col].median())

# ===============================
# 4. OUTLIER CAPPING (SAFE)
# ===============================
def clip_series(s: pd.Series, lower: float = 0.01, upper: float = 0.99) -> pd.Series:
    """Cap the values of *s* to its own quantile range.

    Args:
        s: Numeric Series to cap.
        lower: Quantile in [0, 1] used as the floor (default: 0.01).
        upper: Quantile in [0, 1] used as the ceiling (default: 0.99).

    Returns:
        A new Series where values below the lower quantile or above the
        upper quantile are replaced by that quantile. Missing values are
        left untouched.

    Raises:
        ValueError: If ``lower`` is greater than ``upper``.
    """
    if lower > upper:
        raise ValueError(
            "lower quantile must not exceed upper quantile: "
            f"{lower} > {upper}"
        )
    # One quantile call yields both bounds, in the order requested.
    bounds = s.quantile([lower, upper])
    return s.clip(bounds.iloc[0], bounds.iloc[1])

# Cap each listed signal to its 1st-99th percentile range, computed over the
# whole column. Signals that the export does not contain are skipped.
capped_signals = (
    "avg_hr_day_bpm",
    "resting_hr_bpm",
    "hrv_rmssd_ms",
    "sleep_duration_hours",
    "spo2_avg_pct",
)
for signal in capped_signals:
    if signal not in df.columns:
        continue
    df[signal] = clip_series(df[signal])

# ===============================
# 5. TIME FEATURES
# ===============================
# Weekday index from the parsed date: Monday is 0, Sunday is 6.
df["day_of_week"] = df["date"].dt.weekday
# Saturday (5) and Sunday (6) are flagged with 1, every other day with 0.
df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)

# ===============================
# 6. ACTIVITY LOAD (NO FAILURE)
# ===============================
# Use whichever calorie column the export provides, preferring the one with
# the "_kcal" suffix; None means neither is present.
calorie_col = next(
    (
        name
        for name in ("calories_burned_kcal", "calories_burned")
        if name in df.columns
    ),
    None,
)
if calorie_col is None:
    # No calorie data: the load is simply the step count.
    df["activity_load"] = df["steps"]
else:
    # Weighted blend: 60% steps, 40% calories.
    df["activity_load"] = 0.6 * df["steps"] + 0.4 * df[calorie_col]

# ===============================
# 7. SLEEP STAGES (SAFE DEFAULTS)
# ===============================
# Stage percentages missing from the export are created as all-zero columns
# so the derived columns below can always be computed.
for stage_pct in ("sleep_stage_deep_pct", "sleep_stage_rem_pct"):
    if stage_pct not in df.columns:
        df[stage_pct] = 0

# Convert percentages to fractions.
df["deep_sleep_ratio"] = df["sleep_stage_deep_pct"].div(100)
df["rem_sleep_ratio"] = df["sleep_stage_rem_pct"].div(100)

# Equal-weight combination of the deep and REM fractions.
deep_part = 0.5 * df["deep_sleep_ratio"]
rem_part = 0.5 * df["rem_sleep_ratio"]
df["sleep_architecture_score"] = deep_part + rem_part

# ===============================
# 8. SLEEP EFFICIENCY (DERIVE IF MISSING)
# ===============================
# Source priority: an explicit "sleep_efficiency_pct" column, then an
# existing "sleep_efficiency" column (kept as it is), and only if neither
# exists an estimate from sleep duration relative to 8 hours, capped at 100.
if "sleep_efficiency_pct" in df.columns:
    df["sleep_efficiency"] = df["sleep_efficiency_pct"]
elif "sleep_efficiency" not in df.columns:
    estimated_pct = df["sleep_duration_hours"] / 8 * 100
    df["sleep_efficiency"] = estimated_pct.clip(upper=100)

# ===============================
# 9. RECOVERY & STRAIN
# ===============================
# Relative elevation of the daily average heart rate over the resting rate.
resting_hr = df["resting_hr_bpm"]
df["hr_strain"] = df["avg_hr_day_bpm"].sub(resting_hr).div(resting_hr)

# Hours of sleep short of 8; nights of 8 hours or more score 0.
sleep_shortfall = 8 - df["sleep_duration_hours"]
df["sleep_pressure"] = sleep_shortfall.clip(lower=0)

# ===============================
# 10. PERSONAL BASELINES
# ===============================
# Each user's baseline is the median of that user's own readings, broadcast
# back onto every row of the user.
per_user = df.groupby("user_id")
hrv_median = per_user["hrv_rmssd_ms"].transform("median")
rhr_median = per_user["resting_hr_bpm"].transform("median")
df["baseline_hrv"] = hrv_median
df["baseline_rhr"] = rhr_median

# Signed distance of each day's reading from the user's baseline.
df["hrv_deviation"] = df["hrv_rmssd_ms"].sub(df["baseline_hrv"])
df["rhr_deviation"] = df["resting_hr_bpm"].sub(df["baseline_rhr"])

# ===============================
# 11. ROLLING FEATURES
# ===============================
# Trailing mean over up to 7 consecutive rows per user; min_periods=1 means
# the first rows of each user average over whatever is available so far.
# The window counts rows in their current order, not calendar days.
rolling_specs = (
    ("hrv_7d_avg", "hrv_rmssd_ms"),
    ("sleep_7d_avg", "sleep_duration_hours"),
)
for target, source in rolling_specs:
    df[target] = df.groupby("user_id")[source].transform(
        lambda readings: readings.rolling(window=7, min_periods=1).mean()
    )

# ===============================
# 12. FINAL ML DATASET (AUTO-SAFE)
# ===============================
# Columns wanted in the exported dataset, in output order: identifiers, raw
# signals, derived features, then calendar features.
wanted_columns = (
    "user_id",
    "date",
    "avg_hr_day_bpm",
    "resting_hr_bpm",
    "hrv_rmssd_ms",
    "stress_score",
    "spo2_avg_pct",
    "sleep_duration_hours",
    "sleep_efficiency",
    "sleep_architecture_score",
    "activity_load",
    "hr_strain",
    "sleep_pressure",
    "baseline_hrv",
    "baseline_rhr",
    "hrv_deviation",
    "rhr_deviation",
    "hrv_7d_avg",
    "sleep_7d_avg",
    "day_of_week",
    "is_weekend",
)

# Keep only the wanted columns that actually exist in the frame, so a column
# missing from the export is dropped from the selection instead of raising.
final_features = [name for name in wanted_columns if name in df.columns]
df_final = df[final_features]

# ===============================
# 13. FINAL SAFETY CHECK
# ===============================
# Refuse to export a dataset that still has gaps. An explicit raise is used
# instead of `assert` because asserts are stripped when Python runs with -O,
# and the message names the offending columns with their missing counts.
missing_per_column = df_final.isnull().sum()
if missing_per_column.any():
    gaps = missing_per_column[missing_per_column > 0].to_dict()
    raise ValueError(f"ML dataset still contains missing values: {gaps}")

# ===============================
# 14. SAVE
# ===============================
# Write the feature table without the index column, then report what was saved.
output_csv = "wearable_ml_ready.csv"
df_final.to_csv(output_csv, index=False)

print("✅ ALL ERRORS FIXED — ML-ready dataset created successfully")
print("Final columns:", list(df_final.columns))


# Output:
# ✅ ALL ERRORS FIXED — ML-ready dataset created successfully
# Final columns: ['user_id', 'date', 'avg_hr_day_bpm', 'resting_hr_bpm', 'hrv_rmssd_ms', 'stress_score', 'spo2_avg_pct', 'sleep_duration_hours', 'sleep_efficiency', 'sleep_architecture_score', 'activity_load', 'hr_strain', 'sleep_pressure', 'baseline_hrv', 'baseline_rhr', 'hrv_deviation', 'rhr_deviation', 'hrv_7d_avg', 'sleep_7d_avg', 'day_of_week', 'is_weekend']
