# 🧠 AI Lactate Advisor - AutoRetrain Versioned Notebook (Updated)

In [None]:
# =============================================================
# üìä DATA PREPARATION (Lactate Features & Labels)
# =============================================================

import pandas as pd
import numpy as np
# Download model_utils.py from GitHub
!wget -q -O model_utils.py https://raw.githubusercontent.com/indarss/AI-Lactate-Advisor/main/model_utils.py
# Install required libraries
%pip install -q streamlit

from model_utils import make_features, prepare_features

# Load your training dataset
# Adjust the path if your data is in a different location
df = pd.read_csv('/content/data/athlete_training_dataset.csv')  # or your dataset path

print(f"‚úÖ Loaded data: {df.shape[0]} rows, {df.shape[1]} columns")

# === Prepare features and labels ===
# Assuming 'lactate' column exists in your dataset
assert 'lactate' in df.columns, "Dataset must contain 'lactate' column"

# Prepare features, which may drop rows due to NaN values in feature engineering
processed_features_df = prepare_features(df)

# Align y with the processed features X using the index of processed_features_df
X_temp = processed_features_df
y_temp = df.loc[X_temp.index, 'lactate'] # Get as Series to easily drop NaNs

# Drop rows where y_temp is NaN, ensuring X and y are perfectly aligned and clean
nan_mask = y_temp.isna()
X = X_temp[~nan_mask]
y = y_temp[~nan_mask].values

print(f"‚úÖ Features prepared: X shape = {X.shape}")
print(f"‚úÖ Labels prepared: y shape = {y.shape}")
print(f"üìã Feature columns: {list(X.columns)}")


In [None]:
# =============================================================
# Train/Validation Split for Lactate Model
# =============================================================
from sklearn.model_selection import train_test_split

# X and y come from the data preparation cell. 20% of the rows are held out
# for validation; the fixed seed keeps the split repeatable.
_split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = _split_parts

# Report the shape of every piece of the split.
for _caption, _part in (
    ("Training features:", X_train),
    ("Validation features:", X_val),
    ("Training labels:", y_train),
    ("Validation labels:", y_val),
):
    print(_caption, _part.shape)


This notebook includes automatic retraining logic for both **Lactate** and **Recovery** models, including versioning, feature name storage, and optional GitHub uploads.

## 🚀 Added: LightGBM Training Cells for Lactate and Recovery Models

In [None]:

# =============================================================
# TRAINING CELL (Lactate Model + Auto-Save + Optional GitHub)
# =============================================================
# Splits X / y (from the data preparation cell) into train / validation / test,
# fits a LightGBM regressor, prints validation R2 and MAE, saves the model
# under models/ (a "latest" file plus a timestamped copy) and, when the
# GITHUB_TOKEN environment variable is set, pushes both files to GitHub.

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from lightgbm import LGBMRegressor
import joblib, os
from datetime import datetime

# === Prepare splits ===
# 20% test first, then 25% of the remaining 80% as validation -> 60/20/20.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)
print(f"‚úÖ Data split: Train={len(X_train)}, Val={len(X_val)}, Test={len(X_test)}")

# === Train model ===
# NOTE(review): in LightGBM, row subsampling is only active when
# subsample_freq > 0; with the default of 0, "subsample" presumably has no
# effect here - confirm against the LightGBM parameter docs before relying on it.
params = {
    "n_estimators": 400,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42
}

print("üöÄ Training LightGBM Lactate Model...")
model = LGBMRegressor(**params)
model.fit(X_train, y_train)

# === Evaluate ===
y_pred_val = model.predict(X_val)
r2 = r2_score(y_val, y_pred_val)
mae = mean_absolute_error(y_val, y_pred_val)
print(f"üìà Validation R¬≤ = {r2:.3f}, MAE = {mae:.3f}")

# === Save ===
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
latest_path = os.path.join(MODEL_DIR, "lactate_lightgbm_model.joblib")
versioned_path = os.path.join(MODEL_DIR, f"lactate_lightgbm_model_{timestamp}.joblib")

joblib.dump(model, latest_path)
joblib.dump(model, versioned_path)
print(f"üíæ Models saved:\n ‚î£‚îÅ {latest_path}\n ‚îó‚îÅ {versioned_path}")

# Feature names: LightGBM records the column names itself when it is fitted on
# a DataFrame (readable later via model.feature_name_). Assigning to
# model.feature_names_in_ raises AttributeError, so nothing is assigned here;
# the count is reported from the training frame, as in the recovery cell.
print(f"üß© Stored {len(X_train.columns)} feature names for alignment.")

# === Optional: GitHub Upload ===
GITHUB_USERNAME = "indarss"
GITHUB_REPO = "AI-Lactate-Advisor"
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")  # Set this in Colab: Runtime -> Secrets

if GITHUB_TOKEN:
    # Best-effort: any upload problem is reported, never raised, so a failed
    # push does not invalidate the locally saved model.
    try:
        from github import Github, GithubException
        g = Github(GITHUB_TOKEN)
        repo = g.get_user().get_repo(GITHUB_REPO)

        def upload_or_update(local_path, repo_path, message):
            """Push local_path to repo_path on branch "main".

            The file is updated in place when it already exists in the
            repository and created otherwise. Only a 404 from the lookup is
            treated as "file does not exist"; every other GitHub error
            propagates to the caller so the real cause is reported.
            """
            with open(local_path, "rb") as f:
                content = f.read()
            try:
                existing = repo.get_contents(repo_path)
            except GithubException as err:
                if err.status != 404:
                    raise
                repo.create_file(repo_path, message, content, branch="main")
                print(f"‚úÖ Uploaded new file: {repo_path}")
            else:
                repo.update_file(existing.path, message, content, existing.sha, branch="main")
                print(f"‚úÖ Updated on GitHub: {repo_path}")

        upload_or_update(latest_path, "models/lactate_lightgbm_model.joblib", "Auto-update lactate model")
        upload_or_update(versioned_path, f"models/lactate_lightgbm_model_{timestamp}.joblib", "Auto-version lactate model")
        print("üåê GitHub upload complete.")
    except Exception as e:
        print(f"‚ö†Ô∏è GitHub upload failed: {e}")
else:
    print("‚ö†Ô∏è GITHUB_TOKEN not set ‚Äî skipping GitHub upload.")


In [None]:

# =============================================================
# TRAINING CELL (Recovery Model + Biomarker Data)
# =============================================================
# Loads the biomarker dataset (unless df_rec already exists), splits it into
# train / validation / test, fits a LightGBM regressor on 'recovery_score',
# prints validation R2 and MAE, saves the model under models/ and optionally
# pushes it to GitHub. The cell brings its own imports and GitHub settings so
# it can also run without the lactate training cell.

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from lightgbm import LGBMRegressor
import joblib, os
from datetime import datetime
import pandas as pd

# Load biomarker dataset (ensure it's preloaded as df_rec or similar)
if 'df_rec' not in globals():
    df_rec = pd.read_csv('data/athlete_training_dataset_with_biomarkers.csv')

# The target column is mandatory. Raise explicitly instead of using `assert`,
# which is skipped when Python runs with -O.
if 'recovery_score' not in df_rec.columns:
    raise ValueError("Biomarker dataset must contain 'recovery_score' column.")
# Every column except the target is used as a feature.
# NOTE(review): assumes all remaining columns are numeric/categorical dtypes
# LightGBM accepts - confirm against the CSV schema.
Xr = df_rec.drop(columns=['recovery_score'])
yr = df_rec['recovery_score']

# Split data: 20% test first, then 25% of the remaining 80% -> 60/20/20.
Xr_train_full, Xr_test, yr_train_full, yr_test = train_test_split(Xr, yr, test_size=0.2, random_state=42)
Xr_train, Xr_val, yr_train, yr_val = train_test_split(Xr_train_full, yr_train_full, test_size=0.25, random_state=42)
print(f"‚úÖ Data split: Train={len(Xr_train)}, Val={len(Xr_val)}, Test={len(Xr_test)}")

params = {
    "n_estimators": 400,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42
}

print("üöÄ Training LightGBM Recovery Model...")
recovery_model = LGBMRegressor(**params)
recovery_model.fit(Xr_train, yr_train)

# Evaluate
yr_pred_val = recovery_model.predict(Xr_val)
r2 = r2_score(yr_val, yr_pred_val)
mae = mean_absolute_error(yr_val, yr_pred_val)
print(f"üìà Validation R¬≤ = {r2:.3f}, MAE = {mae:.3f}")

# Save model
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
latest_path = os.path.join(MODEL_DIR, "recovery_lightgbm_model.joblib")
versioned_path = os.path.join(MODEL_DIR, f"recovery_lightgbm_model_{timestamp}.joblib")

joblib.dump(recovery_model, latest_path)
joblib.dump(recovery_model, versioned_path)
print(f"üíæ Models saved:\n ‚î£‚îÅ {latest_path}\n ‚îó‚îÅ {versioned_path}")

# Feature names are not stored by hand: LightGBM keeps them itself when it is
# fitted on a DataFrame. Only the count is reported.
print(f"üß© Stored {len(Xr_train.columns)} feature names for alignment.")

# Optional GitHub upload
# Reuse the settings of an earlier cell when present; otherwise fall back to
# the same defaults, so this cell does not fail with NameError when run alone.
GITHUB_REPO = globals().get("GITHUB_REPO", "AI-Lactate-Advisor")
GITHUB_TOKEN = globals().get("GITHUB_TOKEN") or os.getenv("GITHUB_TOKEN")

if GITHUB_TOKEN:
    # Best-effort: any upload problem is reported, never raised, so a failed
    # push does not invalidate the locally saved model.
    try:
        from github import Github, GithubException
        g = Github(GITHUB_TOKEN)
        repo = g.get_user().get_repo(GITHUB_REPO)

        def upload_or_update(local_path, repo_path, message):
            """Push local_path to repo_path on branch "main".

            The file is updated in place when it already exists in the
            repository and created otherwise. Only a 404 from the lookup is
            treated as "file does not exist"; every other GitHub error
            propagates to the caller so the real cause is reported.
            """
            with open(local_path, "rb") as f:
                content = f.read()
            try:
                existing = repo.get_contents(repo_path)
            except GithubException as err:
                if err.status != 404:
                    raise
                repo.create_file(repo_path, message, content, branch="main")
                print(f"‚úÖ Uploaded new file: {repo_path}")
            else:
                repo.update_file(existing.path, message, content, existing.sha, branch="main")
                print(f"‚úÖ Updated on GitHub: {repo_path}")

        upload_or_update(latest_path, "models/recovery_lightgbm_model.joblib", "Auto-update recovery model")
        upload_or_update(versioned_path, f"models/recovery_lightgbm_model_{timestamp}.joblib", "Auto-version recovery model")
        print("üåê GitHub upload complete.")
    except Exception as e:
        print(f"‚ö†Ô∏è GitHub upload failed: {e}")
else:
    print("‚ö†Ô∏è GITHUB_TOKEN not set ‚Äî skipping GitHub upload.")


## 📊 Combined Metrics Summary (Lactate & Recovery Models)

In [None]:

# =============================================
# Combined Metrics Summary (Test Set)
# Recomputes R2 / MAE for both models on held-out test sets
# =============================================
# Builds one summary row per model and displays them as a table. Each model is
# evaluated best-effort: if its evaluation raises (for example because the
# training cell was not run), a placeholder row with missing metrics is added
# and the reason is printed instead of aborting the cell.
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error

rows = []

# ---- Lactate model metrics on test ----
try:
    y_pred_test_lac = model.predict(X_test)
    r2_lac = r2_score(y_test, y_pred_test_lac)
    mae_lac = mean_absolute_error(y_test, y_pred_test_lac)
    rows.append({"Model": "Lactate (LightGBM)", "R2 (test)": r2_lac, "MAE (test)": mae_lac, "n_test": len(X_test)})
except Exception as e:
    rows.append({"Model": "Lactate (LightGBM)", "R2 (test)": None, "MAE (test)": None, "n_test": 0})
    print(f"‚ö†Ô∏è Could not compute lactate test metrics: {e}")

# ---- Recovery model metrics on test ----
try:
    y_pred_test_rec = recovery_model.predict(Xr_test)
    r2_rec = r2_score(yr_test, y_pred_test_rec)
    mae_rec = mean_absolute_error(yr_test, y_pred_test_rec)
    rows.append({"Model": "Recovery (LightGBM)", "R2 (test)": r2_rec, "MAE (test)": mae_rec, "n_test": len(Xr_test)})
except Exception as e:
    rows.append({"Model": "Recovery (LightGBM)", "R2 (test)": None, "MAE (test)": None, "n_test": 0})
    print(f"‚ö†Ô∏è Could not compute recovery test metrics: {e}")

df_metrics = pd.DataFrame(rows)
# na_rep keeps the table renderable when metrics are missing: without it the
# "{:.3f}" formatter is applied to the None placeholders and raises TypeError
# when both models failed (the metric columns are then object dtype).
display(df_metrics.style.format({"R2 (test)": "{:.3f}", "MAE (test)": "{:.3f}"}, na_rep="n/a"))


## 🧾 Save Training Metrics to CSV Log

In [None]:

# =====================================================
# Save Training Metrics to models/training_log.csv
# =====================================================
# Appends one row per model whose test metrics exist in this session to a CSV
# log, creating the models/ directory and the header row when needed.
import csv, os
from datetime import datetime

LOG_PATH = os.path.join("models", "training_log.csv")
os.makedirs("models", exist_ok=True)

timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

rows_to_log = []

# Only log models whose metrics were computed by the metrics cell.
# NOTE(review): these names persist across re-runs, so if the metrics cell
# failed on this run the values logged here may come from an earlier run.
if 'r2_lac' in locals():
    rows_to_log.append(["lactate", timestamp, r2_lac, mae_lac, len(X_test)])
if 'r2_rec' in locals():
    rows_to_log.append(["recovery", timestamp, r2_rec, mae_rec, len(Xr_test)])

header = ["model", "timestamp", "r2_test", "mae_test", "n_test"]

# Write the header when the log is new OR exists but is still empty (for
# example after an interrupted run); otherwise the file would stay headerless.
file_exists = os.path.exists(LOG_PATH)
needs_header = not file_exists or os.path.getsize(LOG_PATH) == 0
# newline="" is what the csv module expects so it controls line endings itself.
with open(LOG_PATH, "a", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    if needs_header:
        writer.writerow(header)
    writer.writerows(rows_to_log)

print(f"‚úÖ Logged {len(rows_to_log)} entries to {LOG_PATH}")
