# 🧠 AI Lactate Advisor – Final Training Notebook (Merged & Clean)

This notebook trains and versions two models:

1. **Lactate Model** – predicts blood lactate (mmol/L) from time-series features  
2. **Recovery Model** – predicts recovery/readiness score from biomarker data  

It is designed to be consistent with the **Streamlit app**, saving models into `models/`
and appending results to `models/training_log.csv`.


## ☑️ Cell 1 – Environment Setup & Imports

In [None]:
# =============================================================
# 🔧 Environment & Paths
# =============================================================
import os
import sys
import pathlib
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

# LightGBM (used inside train_lightgbm too)
from lightgbm import LGBMRegressor

# Locate the repository root from the current working directory: when the
# kernel was started inside a `notebooks/` folder the root is that folder's
# parent, otherwise the working directory itself is taken as the root.
NOTEBOOK_DIR = os.getcwd()
_parent_dir, _cwd_name = os.path.split(NOTEBOOK_DIR)
REPO_ROOT = _parent_dir if _cwd_name.lower() == "notebooks" else NOTEBOOK_DIR

# Input data and trained models live in sibling folders under the root.
DATA_DIR, MODELS_DIR = (
    os.path.join(REPO_ROOT, leaf) for leaf in ("data", "models")
)

# Create both folders (and any missing parents); existing ones are left alone.
for _folder in (DATA_DIR, MODELS_DIR):
    os.makedirs(_folder, exist_ok=True)

print(f"üìÅ REPO_ROOT: {REPO_ROOT}")
print(f"üìÇ DATA_DIR:  {DATA_DIR}")
print(f"üìÇ MODELS_DIR:{MODELS_DIR}")

# model_utils.py is expected in the repository root; register that folder
# with the import system (once) so the import below resolves regardless of
# where the kernel was started.
if all(entry != REPO_ROOT for entry in sys.path):
    sys.path.append(REPO_ROOT)

from model_utils import add_hr_slopes, add_rolling_features, train_lightgbm

print("‚úÖ Imported model_utils.")


## ☑️ Cell 2 – Train Lactate Model (features match app)

In [None]:
# =============================================================
# 🩸 Lactate Model – Data Preparation & Training
# =============================================================
# Loads the lactate dataset from DATA_DIR, applies the feature-engineering
# helpers imported from model_utils, splits the rows 60/20/20 into
# train/validation/test, trains via train_lightgbm and reports R² / MAE on
# the held-out test split. Results are kept in `lactate_metrics`.

lactate_csv = os.path.join(DATA_DIR, "athlete_training_dataset_1000.csv")

if not os.path.exists(lactate_csv):
    raise FileNotFoundError(
        f"‚ùå Lactate dataset not found at {lactate_csv}.\n"
        "Please ensure `athlete_training_dataset_1000.csv` exists in the data/ folder."
    )

df_lac = pd.read_csv(lactate_csv)
print(f"‚úÖ Loaded lactate dataset: {df_lac.shape[0]} rows, {df_lac.shape[1]} columns")

# Validate with an explicit exception rather than `assert`: assertions are
# removed when Python runs with -O, which would let a dataset without the
# label column fail later with a far less helpful KeyError.
if "lactate" not in df_lac.columns:
    raise ValueError("Dataset must contain 'lactate' column.")

# --- Apply SAME feature engineering as in app.py ---
# Work on a copy so the raw frame `df_lac` stays untouched.
df_feat = df_lac.copy()

# App pipeline: rename hr → heart_rate, then add slopes/rolling, then rename back
if "hr" in df_feat.columns:
    df_feat = df_feat.rename(columns={"hr": "heart_rate"})

df_feat = add_hr_slopes(df_feat)            # adds hr_slope_time, etc.
df_feat = add_rolling_features(df_feat, 30) # rolling 30s features

if "heart_rate" in df_feat.columns:
    df_feat = df_feat.rename(columns={"heart_rate": "hr"})

# Drop rows missing label
df_feat = df_feat.dropna(subset=["lactate"])

# Build features X and labels y. `recovery_score` is excluded from the
# features when present (errors="ignore" tolerates its absence).
X = df_feat.drop(columns=["lactate", "recovery_score"], errors="ignore")
y = df_feat["lactate"].values

print(f"‚úÖ Lactate feature matrix shape: {X.shape}")
print(f"‚úÖ Lactate labels shape: {y.shape}")
print("üìã Lactate feature columns:", list(X.columns))

# =============================================================
# 📚 Train/Validation/Test Split – Lactate
# =============================================================
# NOTE(review): rows are shuffled at random. If the dataset holds
# consecutive samples from the same session, the rolling/slope features
# may share information across splits - confirm whether a grouped or
# chronological split is required.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.25, random_state=42
)  # 0.25 of 0.8 = 0.2

print("‚úÖ Lactate splits:")
print("  X_train:", X_train.shape, "  y_train:", y_train.shape)
print("  X_val:  ", X_val.shape,   "  y_val:  ", y_val.shape)
print("  X_test: ", X_test.shape,  "  y_test: ", y_test.shape)

# =============================================================
# 🚀 Train & Save Lactate Model (with versioning)
# =============================================================
# train_lightgbm comes from model_utils; saving/versioning under MODELS_DIR
# happens inside that helper (not visible here - see model_utils.py).
lactate_model = train_lightgbm(
    X_train, y_train,
    X_val, y_val,
    model_dir=MODELS_DIR,
    model_name="lactate_lightgbm_model",
    github_repo="AI-Lactate-Advisor",
    github_user="indarss"
)

# Evaluate on held-out test set
y_pred_test_lac = lactate_model.predict(X_test)
r2_lac = r2_score(y_test, y_pred_test_lac)
mae_lac = mean_absolute_error(y_test, y_pred_test_lac)

print(f"üìà Lactate model TEST R¬≤ = {r2_lac:.3f}")
print(f"üìâ Lactate model TEST MAE = {mae_lac:.3f}")

# Keep references for later summary/logging
lactate_metrics = {
    "model": "lactate",
    "r2_test": r2_lac,
    "mae_test": mae_lac,
    "n_test": len(X_test),
}


## ☑️ Cell 3 – Train Recovery Model (biomarker dataset)

In [None]:
# =============================================================
# 🧬 Recovery Model – Data Preparation & Training
# =============================================================
# Loads the biomarker dataset from DATA_DIR, splits the rows 60/20/20 into
# train/validation/test, trains via train_lightgbm and reports R² / MAE on
# the held-out test split. Results are kept in `recovery_metrics`.

bio_csv = os.path.join(DATA_DIR, "athlete_training_dataset_with_biomarkers.csv")

if not os.path.exists(bio_csv):
    raise FileNotFoundError(
        f"‚ùå Biomarker dataset not found at {bio_csv}.\n"
        "Please ensure `athlete_training_dataset_with_biomarkers.csv` exists in the data/ folder."
    )

df_rec = pd.read_csv(bio_csv)
print(f"‚úÖ Loaded biomarker dataset: {df_rec.shape[0]} rows, {df_rec.shape[1]} columns")

# Validate with an explicit exception rather than `assert`: assertions are
# removed when Python runs with -O, which would defer the failure to a
# less helpful KeyError further down.
if "recovery_score" not in df_rec.columns:
    raise ValueError("Biomarker dataset must contain 'recovery_score' column.")

# Drop rows missing label, mirroring the lactate pipeline; a NaN target
# would otherwise reach model fitting and the test metrics. The raw frame
# `df_rec` is left untouched.
df_rec_labeled = df_rec.dropna(subset=["recovery_score"])

# Every remaining column is used as a feature.
Xr = df_rec_labeled.drop(columns=["recovery_score"], errors="ignore")
yr = df_rec_labeled["recovery_score"].values

print(f"‚úÖ Recovery feature matrix Xr shape: {Xr.shape}")
print(f"‚úÖ Recovery labels yr shape: {yr.shape}")

# =============================================================
# 📚 Train/Validation/Test Split – Recovery
# =============================================================
Xr_train_full, Xr_test, yr_train_full, yr_test = train_test_split(
    Xr, yr, test_size=0.2, random_state=42
)
Xr_train, Xr_val, yr_train, yr_val = train_test_split(
    Xr_train_full, yr_train_full, test_size=0.25, random_state=42
)  # 0.25 of 0.8 = 0.2

print("‚úÖ Recovery splits:")
print("  Xr_train:", Xr_train.shape, "  yr_train:", yr_train.shape)
print("  Xr_val:  ", Xr_val.shape,   "  yr_val:  ", yr_val.shape)
print("  Xr_test: ", Xr_test.shape,  "  yr_test: ", yr_test.shape)

# =============================================================
# 🚀 Train & Save Recovery Model (with versioning)
# =============================================================
# train_lightgbm comes from model_utils; saving/versioning under MODELS_DIR
# happens inside that helper (not visible here - see model_utils.py).
recovery_model = train_lightgbm(
    Xr_train, yr_train,
    Xr_val, yr_val,
    model_dir=MODELS_DIR,
    model_name="recovery_lightgbm_model",
    github_repo="AI-Lactate-Advisor",
    github_user="indarss"
)

# Evaluate on held-out test set
yr_pred_test_rec = recovery_model.predict(Xr_test)
r2_rec = r2_score(yr_test, yr_pred_test_rec)
mae_rec = mean_absolute_error(yr_test, yr_pred_test_rec)

print(f"üìà Recovery model TEST R¬≤ = {r2_rec:.3f}")
print(f"üìâ Recovery model TEST MAE = {mae_rec:.3f}")

# Keep references for later summary/logging
recovery_metrics = {
    "model": "recovery",
    "r2_test": r2_rec,
    "mae_test": mae_rec,
    "n_test": len(Xr_test),
}


## ☑️ Cell 4 – Combined Metrics Summary (Nice Table)

In [None]:
# =============================================================
# 📊 Combined Metrics Summary (Lactate & Recovery)
# =============================================================
# One table row per metrics dict that exists in the notebook namespace, so
# the cell still works when only one of the two training cells has been run.
metrics_rows = [
    {
        "Model": label,
        "R2 (test)": globals()[source]["r2_test"],
        "MAE (test)": globals()[source]["mae_test"],
        "n_test": globals()[source]["n_test"],
    }
    for label, source in (
        ("Lactate (LightGBM)", "lactate_metrics"),
        ("Recovery (LightGBM)", "recovery_metrics"),
    )
    if source in globals()
]

df_metrics = pd.DataFrame(metrics_rows)

# Render both score columns with three decimals.
display(
    df_metrics.style.format(
        {"R2 (test)": "{:.3f}", "MAE (test)": "{:.3f}"}
    )
)


## ☑️ Cell 5 – Append Metrics to models/training_log.csv

In [None]:
# =============================================================
# 🧾 Append Training Metrics to models/training_log.csv
# =============================================================
# Appends one CSV row per available metrics dict (lactate / recovery) to
# MODELS_DIR/training_log.csv, writing the column header first when the log
# does not yet contain anything.
import csv

log_path = os.path.join(MODELS_DIR, "training_log.csv")
os.makedirs(MODELS_DIR, exist_ok=True)

# Single timestamp shared by all rows written in this run (local time).
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Only models whose metrics exist in the notebook namespace are logged, so
# the cell still works when just one training cell has been executed.
rows_to_log = []
if "lactate_metrics" in globals():
    rows_to_log.append([
        "lactate",
        timestamp,
        lactate_metrics["r2_test"],
        lactate_metrics["mae_test"],
        lactate_metrics["n_test"],
    ])

if "recovery_metrics" in globals():
    rows_to_log.append([
        "recovery",
        timestamp,
        recovery_metrics["r2_test"],
        recovery_metrics["mae_test"],
        recovery_metrics["n_test"],
    ])

header = ["model", "timestamp", "r2_test", "mae_test", "n_test"]
file_exists = os.path.exists(log_path)
# A log that exists but is empty (zero bytes) still needs its header;
# checking existence alone would leave such a file header-less forever.
needs_header = not file_exists or os.path.getsize(log_path) == 0

# newline="" is what the csv module expects of the file object it writes to.
with open(log_path, "a", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    if needs_header:
        writer.writerow(header)
    writer.writerows(rows_to_log)

print(f"‚úÖ Logged {len(rows_to_log)} entries to {log_path}")
