# 🧠 AI Lactate Advisor – Final Training Notebook (Merged & Clean)

This notebook trains and versions two models:

1. **Lactate Model** – predicts blood lactate (mmol/L) from time-series features  
2. **Recovery Model** – predicts recovery/readiness score from biomarker data  

It is designed to be consistent with the **Streamlit app**, saving models into `models/`
and appending results to `models/training_log.csv`.


## 📘 Cell 1 — Imports & Global Config

In [None]:
# =============================================================
# 📘 Cell 1 — Imports & Global Config
# =============================================================

import os
import numpy as np
import pandas as pd
from datetime import datetime
import joblib

from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

# Install PyGithub if not already installed
try:
    from github import Github
except ImportError:
    print("Installing PyGithub...")
    !pip install PyGithub
    from github import Github  # optional; ok if not used


from model_utils import (
    add_hr_slopes,
    add_rolling_features,
)

# Working folders, given relative to the repo root (the notebook is expected
# to be run from there).
DATA_DIR, MODELS_DIR = "data", "models"

# Canonical "latest" artifact locations for the two models.
LACTATE_MODEL_PATH = os.path.join(MODELS_DIR, "lactate_lightgbm_model.joblib")
RECOVERY_MODEL_PATH = os.path.join(MODELS_DIR, "recovery_lightgbm_model.joblib")

# Create both folders up front; exist_ok makes re-running the cell harmless.
for _folder in (DATA_DIR, MODELS_DIR):
    os.makedirs(_folder, exist_ok=True)

print("üìÅ DATA_DIR:", DATA_DIR)
print("üìÅ MODELS_DIR:", MODELS_DIR)


## 📗 Cell 2 — Load & Merge Data (Auto-Retrain Logic)

In [None]:
# =============================================================
# 📗 Cell 2 — Load & Merge Data (Auto-Retrain Logic)
# =============================================================

# Master dataset that accumulates the raw CSVs dropped into data/.
MERGED_DATASET = os.path.join(DATA_DIR, "merged_training_data.csv")

# Load the existing master (if any) and remember when it was last written.
if os.path.exists(MERGED_DATASET):
    df_master = pd.read_csv(MERGED_DATASET)
    master_mtime = os.path.getmtime(MERGED_DATASET)
else:
    df_master = pd.DataFrame()
    master_mtime = 0

# Every CSV in data/ (the master itself included, once it exists).
csv_files = [f for f in os.listdir(DATA_DIR) if f.endswith(".csv")]
if not csv_files:
    raise FileNotFoundError("‚ùå No CSV files in data/ folder.")

# Stat each file once and reuse the result for all comparisons below.
csv_mtimes = {f: os.path.getmtime(os.path.join(DATA_DIR, f)) for f in csv_files}

latest_file = max(csv_files, key=csv_mtimes.get)
latest_path = os.path.join(DATA_DIR, latest_file)
latest_mtime = csv_mtimes[latest_file]

print("üìÑ Latest CSV:", latest_file)

# All raw CSVs modified after the master was last written, oldest first.
# Merging only the single newest file would permanently skip any other file
# added since the previous run: rewriting the master bumps its mtime past them.
master_name = os.path.basename(MERGED_DATASET)
new_files = sorted(
    (f for f in csv_files if f != master_name and csv_mtimes[f] > master_mtime),
    key=csv_mtimes.get,
)

if new_files:
    print("üì¶ Newer dataset detected ‚Üí merging into merged_training_data.csv")
    print("   files merged:", new_files)
    new_frames = [pd.read_csv(os.path.join(DATA_DIR, f)) for f in new_files]
    if len(new_frames) == 1:
        df_new = new_frames[0]
    else:
        df_new = pd.concat(new_frames, ignore_index=True).drop_duplicates()
    if not df_master.empty:
        df_merged = pd.concat([df_master, df_new], ignore_index=True).drop_duplicates()
    else:
        df_merged = df_new
    df_merged.to_csv(MERGED_DATASET, index=False)
    df_all = df_merged
else:
    print("‚úÖ No newer CSV; using existing merged_training_data.csv")
    # This branch is only reachable when the master exists, so df_master
    # already holds its contents; no need to read the file a second time.
    df_all = df_master

print("üìä Merged dataset shape:", df_all.shape)
df_all.head(3)


## 📗 Cell 3 — Feature Engineering for Both Models

In [None]:
# =============================================================
# 📗 Cell 3 — Feature Engineering (Lactate + Recovery)
# =============================================================

# Work on a copy so the merged dataset (df_all) stays untouched.
df = df_all.copy()

# Soft check: report every expected column that is absent.
for col in ["lactate", "recovery_score", "hr", "power"]:
    if col not in df.columns:
        print(f"‚ö†Ô∏è Warning: column '{col}' missing in merged data.")

# Hard check: both label columns are mandatory. Raised explicitly (instead of
# using `assert`) so the validation still runs when Python is started with -O;
# the exception type and messages are unchanged.
if "lactate" not in df.columns:
    raise AssertionError("‚ùå 'lactate' column required in merged_training_data.csv")
if "recovery_score" not in df.columns:
    raise AssertionError("‚ùå 'recovery_score' column required.")

# Rename hr -> heart_rate for consistency with add_hr_slopes
if "hr" in df.columns:
    df = df.rename(columns={"hr": "heart_rate"})

# Apply the shared feature engineering pipeline from model_utils
df = add_hr_slopes(df)
df = add_rolling_features(df, window=30)

# Rename back to hr for app compatibility
if "heart_rate" in df.columns:
    df = df.rename(columns={"heart_rate": "hr"})

print("‚úÖ Feature engineering complete. Columns now:", len(df.columns))

label_cols = ["lactate", "recovery_score"]

# --- Build lactate dataset ---
df_lac = df.dropna(subset=["lactate"]).copy()
# Keep only dtypes LightGBM accepts from a DataFrame (numeric, bool, category).
# Text/datetime columns would otherwise make model.fit() fail; the recovery
# dataset below already applies a numeric filter.
X_lac = df_lac.select_dtypes(include=["number", "bool", "category"]).drop(
    columns=label_cols, errors="ignore"
)
y_lac = df_lac["lactate"]

skipped_lac = [
    c for c in df_lac.columns if c not in X_lac.columns and c not in label_cols
]
if skipped_lac:
    print("   Lactate features skipped (unsupported dtype):", skipped_lac)

print("üìä Lactate X shape:", X_lac.shape, " y:", y_lac.shape)

# --- Build recovery dataset ---
df_rec = df.dropna(subset=["recovery_score"]).copy()
# Use all numeric columns except labels as features
numeric_cols = df_rec.select_dtypes(include=[np.number]).columns
feature_cols_rec = [c for c in numeric_cols if c not in label_cols]

X_rec = df_rec[feature_cols_rec]
y_rec = df_rec["recovery_score"]

print("üìä Recovery X shape:", X_rec.shape, " y:", y_rec.shape)
print("üß¨ Recovery features:", feature_cols_rec[:10], "...")


## 📗 Cell 4 — Train Both Models (with Feature Schema Embedded)

In [None]:
# =============================================================
# 📗 Cell 4 — Train Lactate & Recovery Models
# =============================================================

def train_lightgbm_model(X: pd.DataFrame, y: pd.Series, name: str):
    """Train a LightGBM regressor and return model + metrics + feature list.

    Parameters
    ----------
    X : feature table; its column names are returned as the feature schema.
    y : target values aligned row-for-row with ``X``.
    name : label used only in the progress and metric messages.

    Returns
    -------
    tuple
        ``(model, r2, mae, features)``: the fitted ``LGBMRegressor``, the R2
        and mean absolute error on the 20% hold-out split, and
        ``list(X.columns)``.
    """
    # NOTE(review): the hold-out split is random. If rows are consecutive
    # samples from the same session, validation scores may be optimistic;
    # confirm whether a grouped or time-ordered split is needed.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = LGBMRegressor(
        n_estimators=400,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; with the default of 0 the subsample=0.8 above is ignored.
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=42,
    )

    print(f"üöÄ Training {name} model...")
    model.fit(X_train, y_train)

    # Score on the rows the model has not seen during fitting.
    y_pred = model.predict(X_val)
    r2 = r2_score(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)

    print(f"üìà {name} VALID R¬≤ = {r2:.3f}")
    print(f"üìâ {name} VALID MAE = {mae:.3f}")

    return model, r2, mae, list(X.columns)

# Fit the two regressors one after the other. Each call yields the fitted
# model, its validation R2 and MAE, and the feature names it was trained on.

# ---- Lactate model ----
(
    lactate_model,
    r2_lac,
    mae_lac,
    lactate_features,
) = train_lightgbm_model(X=X_lac, y=y_lac, name="Lactate")

# ---- Recovery model ----
(
    recovery_model,
    r2_rec,
    mae_rec,
    recovery_features,
) = train_lightgbm_model(X=X_rec, y=y_rec, name="Recovery")

print("‚úÖ Training complete.")


## 📗 Cell 5 — Save Models (Wrapped with Schema) + Training Log + Optional GitHub Upload

In [None]:
# =============================================================
# 📗 Cell 5 — Save Models + Versioning + Log + Optional GitHub Upload
# =============================================================

# One timestamp (local time) shared by the versioned files written below.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Bundle each model with the feature names it was trained on.
lactate_wrapper = {
    "model": lactate_model,
    "features": lactate_features,
}
recovery_wrapper = {
    "model": recovery_model,
    "features": recovery_features,
}

# Timestamped file names for the versioned copies.
ver_lac = os.path.join(MODELS_DIR, f"lactate_lightgbm_model_{timestamp}.joblib")
ver_rec = os.path.join(MODELS_DIR, f"recovery_lightgbm_model_{timestamp}.joblib")

# Write the "latest" files first, then the versioned copies.
for _bundle, _destination in (
    (lactate_wrapper, LACTATE_MODEL_PATH),
    (recovery_wrapper, RECOVERY_MODEL_PATH),
    (lactate_wrapper, ver_lac),
    (recovery_wrapper, ver_rec),
):
    joblib.dump(_bundle, _destination)

print("üíæ Saved:")
for _written in (LACTATE_MODEL_PATH, RECOVERY_MODEL_PATH, ver_lac, ver_rec):
    print("  ", _written)

# ---- Training Log ----
# One row per training run, appended to models/training_log.csv.
log_path = os.path.join(MODELS_DIR, "training_log.csv")

_log_row = {
    "timestamp": timestamp,
    "r2_lactate": r2_lac,
    "mae_lactate": mae_lac,
    "r2_recovery": r2_rec,
    "mae_recovery": mae_rec,
    "rows": len(df_all),
}
log_entry = pd.DataFrame([_log_row])

# Append without a header when the log already exists; otherwise create the
# file and write the header row.
_log_exists = os.path.exists(log_path)
log_entry.to_csv(
    log_path,
    mode="a" if _log_exists else "w",
    header=not _log_exists,
    index=False,
)

print(f"üìù Logged training metrics ‚Üí {log_path}")

# ---- Optional: GitHub upload (models + log) ----
# The upload only runs when a token is provided through the environment.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
# NOTE(review): GITHUB_USER is not used below; the repository is resolved
# through the authenticated user of the token. Confirm this is intended.
GITHUB_USER = "indarss"
GITHUB_REPO = "AI-Lactate-Advisor"
# Branch used for both the existence lookup and the writes.
GITHUB_BRANCH = "main"

if GITHUB_TOKEN:
    try:
        # Imported here so a missing or old PyGithub is reported by the
        # handler below instead of aborting the cell.
        from github import UnknownObjectException

        g = Github(GITHUB_TOKEN)
        repo = g.get_user().get_repo(GITHUB_REPO)

        def upload_or_update(local_path, repo_path, message):
            """Create ``repo_path`` in the repo, or update it if it exists.

            ``local_path`` is read as bytes and committed to GITHUB_BRANCH
            with ``message`` as the commit message.
            """
            with open(local_path, "rb") as f:
                content = f.read()
            try:
                # Look the file up on the branch we are about to write to.
                existing = repo.get_contents(repo_path, ref=GITHUB_BRANCH)
            except UnknownObjectException:
                # Only "not found" means the file has to be created. Any other
                # failure propagates with its real cause instead of being
                # masked by a create attempt.
                repo.create_file(repo_path, message, content, branch=GITHUB_BRANCH)
                print(f"‚úÖ Uploaded {repo_path} to GitHub")
            else:
                repo.update_file(
                    existing.path, message, content, existing.sha, branch=GITHUB_BRANCH
                )
                print(f"‚úÖ Updated {repo_path} on GitHub")

        upload_or_update(LACTATE_MODEL_PATH, "models/lactate_lightgbm_model.joblib", "Update lactate model")
        upload_or_update(RECOVERY_MODEL_PATH, "models/recovery_lightgbm_model.joblib", "Update recovery model")
        upload_or_update(log_path, "models/training_log.csv", "Update training log")

        print("üåê GitHub sync complete.")
    except Exception as e:
        # Best-effort sync: report the failure but keep the local artifacts.
        print(f"‚ö†Ô∏è GitHub upload failed: {e}")
else:
    print("‚ÑπÔ∏è GITHUB_TOKEN not set ‚Äì skipping GitHub upload.")
