# 🧠 Train Recovery Estimator (LightGBM)

This notebook trains a **LightGBM regression model** to predict `recovery_score` from biomarkers.

### What you'll get
- Auto-detect & load dataset (sample or your real CSV)
- Train/validate LightGBM model
- Evaluate (R², MAE) and plot predictions vs truth
- **Compare AI model vs rule-based score**
- Optional **SHAP** feature importance plots
- Save model → `models/recovery_lightgbm_model.joblib`


In [0]:
import os, sys, pandas as pd, numpy as np
from pathlib import Path

# Put the current working directory on sys.path (appended only when absent).
repo_root = Path.cwd()
root_entry = str(repo_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

# Report where we are and preview at most the first 15 directory entries.
print('Working dir:', repo_root)
listing_preview = [entry.name for entry in list(repo_root.iterdir())[:15]]
print('Listing files:', listing_preview)

## 1) Load dataset (auto-detect)
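Looks for `data/athlete_training_dataset_with_biomarkers_SAMPLE.csv` first, then for the same sample file and finally `athlete_training_dataset_with_biomarkers.csv` in the working directory.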
Required columns: `ck, cortisol, tc_ratio, hscrp, glucose, rbc, recovery_score`

In [0]:
# Candidate CSV locations in priority order: the sample under data/, then the
# sample and the non-sample file in the working directory.
candidates = [
    Path('data/athlete_training_dataset_with_biomarkers_SAMPLE.csv'),
    Path('athlete_training_dataset_with_biomarkers_SAMPLE.csv'),
    Path('athlete_training_dataset_with_biomarkers.csv'),
]

# Take the first candidate that exists on disk; None when nothing matches.
csv_path = next((candidate for candidate in candidates if candidate.exists()), None)
if csv_path is None:
    raise FileNotFoundError('No dataset found. Please upload a CSV with biomarkers + recovery_score.')

# Load the chosen file and print a quick preview plus its dimensions.
df = pd.read_csv(csv_path)
print('✅ Loaded:', csv_path)
print(df.head())
print('Shape:', df.shape)

## 2) Train/Validation split & LightGBM training

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from lightgbm import LGBMRegressor
import joblib

# Biomarker inputs and the regression target that must be present in `df`.
feature_cols = ['ck','cortisol','tc_ratio','hscrp','glucose','rbc']
target_col = 'recovery_score'

# Validate the schema up front and report every missing column in one error,
# so a malformed CSV can be fixed in a single pass.
missing_cols = [col for col in feature_cols + [target_col] if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing required column: {', '.join(missing_cols)}")

X = df[feature_cols].copy()
y = df[target_col].astype(float)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: LightGBM only applies row subsampling (`subsample` / bagging_fraction)
# when `subsample_freq` (bagging_freq) is > 0. With the default of 0 the
# `subsample=0.9` setting is silently ignored, so bagging is enabled explicitly
# here (re-sample the rows at every boosting iteration).
model = LGBMRegressor(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.9,
    subsample_freq=1,
    colsample_bytree=0.9,
    random_state=42,
)
model.fit(X_train, y_train)

# Score the held-out split.
pred_test = model.predict(X_test)
r2 = r2_score(y_test, pred_test)
mae = mean_absolute_error(y_test, pred_test)
print(f'R²: {r2:.3f}  |  MAE: {mae:.2f}')

# Save model
models_dir = Path('models'); models_dir.mkdir(exist_ok=True)
out_path = models_dir / 'recovery_lightgbm_model.joblib'
joblib.dump(model, out_path)
print('💾 Saved model to', out_path)

## 3) Plot predictions vs truth

In [0]:
import matplotlib.pyplot as plt
# Scatter the held-out true scores against the LightGBM predictions; the
# dashed y = x line over 0-100 marks where prediction equals truth.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, pred_test, alpha=0.5)
ax.plot([0, 100], [0, 100], 'k--', lw=1)
ax.set_title('Recovery: Predictions vs Truth')
ax.set_xlabel('True Recovery Score')
ax.set_ylabel('Predicted Recovery Score')
ax.grid(alpha=0.3)
plt.show()

## 4) Compare with rule-based score
We recreate the simple handcrafted score and compare against model predictions.

In [0]:
# Rule-based score (as used earlier)
def rule_based_score(row):
    """Compute the handcrafted recovery score for one record.

    Starts from 100 and subtracts a weighted offset for each marker relative
    to a fixed reference value (ck: 100, cortisol: 250, hscrp: 0.5,
    tc_ratio: 1), then clips the result to the range [0, 100].

    Parameters
    ----------
    row : mapping-like
        Object supporting item lookup for the keys 'ck', 'cortisol',
        'hscrp' and 'tc_ratio'.

    Returns
    -------
    The score after clipping to [0, 100] via ``np.clip``.
    """
    ck_term = (row['ck'] - 100) * 0.05
    cortisol_term = (row['cortisol'] - 250) * 0.03
    hscrp_term = (row['hscrp'] - 0.5) * 10
    tc_term = (1 - row['tc_ratio']) * 20

    # Terms are summed in the listed order before being subtracted from 100.
    penalty = ck_term + cortisol_term + hscrp_term + tc_term
    return np.clip(100 - penalty, 0, 100)

# Score every held-out row with the handcrafted rule and keep the result
# alongside a copy of the test features.
rb = X_test.assign(rule_score=X_test.apply(rule_based_score, axis=1))

from sklearn.metrics import mean_squared_error
rule_pred = rb['rule_score']
rb_mae = mean_absolute_error(y_test, rule_pred)
rb_r2 = r2_score(y_test, rule_pred)
print(f'Rule-based — R²: {rb_r2:.3f}  |  MAE: {rb_mae:.2f}')

# Overlay both predictors against the true score; the dashed y = x line over
# 0-100 marks where prediction equals truth.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, rule_pred, alpha=0.5, label='Rule-based')
ax.scatter(y_test, pred_test, alpha=0.5, label='LightGBM')
ax.plot([0, 100], [0, 100], 'k--', lw=1)
ax.set_title('Model vs Rule-based Recovery')
ax.set_xlabel('True Recovery Score')
ax.set_ylabel('Predicted Score')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

## 5) SHAP interpretation (optional)
If `shap` is installed, plot global feature importance and contribution summary.

In [0]:
# Optional step: any failure here (including shap not being importable) is
# reported with a message instead of stopping the notebook.
try:
    import shap

    explainer = shap.TreeExplainer(model)

    # Explain at most 512 held-out rows to keep the computation quick.
    n_explain = min(512, len(X_test))
    Xs = X_test.sample(n=n_explain, random_state=42)

    sv = explainer.shap_values(Xs)
    shap.summary_plot(sv, Xs, show=True)
except Exception as e:
    print('SHAP not available or failed:', e)

## 6) Save evaluation table to CSV
Useful for reporting or for your Streamlit demo.

In [0]:
# Side-by-side table of the true target, LightGBM predictions, and rule-based
# scores for the held-out rows. Indexes are reset so the Series line up
# positionally with the prediction array.
results = pd.DataFrame({
    'true': y_test.reset_index(drop=True),
    'pred_lightgbm': pred_test,
    'pred_rule_based': rb['rule_score'].reset_index(drop=True)
})

# Write relative to the working directory, like the data/ and models/ paths
# used elsewhere in this notebook. The previous absolute '/content/...' path
# only exists on Colab and made this cell fail everywhere else.
out_csv = Path('recovery_eval_results.csv')
results.to_csv(out_csv, index=False)
print('💾 Saved:', out_csv.resolve())
results.head()

## 7) (Optional) Push model to GitHub
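Set a GitHub Personal Access Token (classic, `repo` scope) in the `GITHUB_TOKEN` environment variable before running (e.g. `%env GITHUB_TOKEN=...` in Colab); the upload is skipped when the variable is not set.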

In [0]:
import os, base64, json, requests
# Upload settings. The token is read from the environment so it never has to
# be written into the notebook itself.
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')  # set in Colab: %env GITHUB_TOKEN=xxxx
REPO = 'indarss/AI-Lactate-Advisor'
BRANCH = 'main'
TARGET_PATH = 'models/recovery_lightgbm_model.joblib'   # destination path inside the repo
LOCAL_MODEL_PATH = 'models/recovery_lightgbm_model.joblib'  # file written by the training cell
REQUEST_TIMEOUT = 30  # seconds; stops the cell hanging forever on a stalled connection

if GITHUB_TOKEN:
    # The Contents API expects the file body as a base64-encoded string.
    with open(LOCAL_MODEL_PATH, 'rb') as f:
        content = base64.b64encode(f.read()).decode('utf-8')

    url = f'https://api.github.com/repos/{REPO}/contents/{TARGET_PATH}'
    headers = {'Authorization': f'token {GITHUB_TOKEN}', 'Accept': 'application/vnd.github+json'}
    data = {
        'message': 'Add recovery LightGBM model (auto-upload)',
        'content': content,
        'branch': BRANCH
    }

    # Updating a file that already exists requires the current blob `sha`;
    # without it the PUT is rejected, so every re-upload after retraining
    # would fail. Look the file up first and pass its sha when present.
    # The "object" media type is requested so only metadata is needed even
    # for larger files.
    lookup = requests.get(
        url,
        headers={**headers, 'Accept': 'application/vnd.github.object+json'},
        params={'ref': BRANCH},
        timeout=REQUEST_TIMEOUT,
    )
    if lookup.status_code == 200:
        existing = lookup.json()
        if isinstance(existing, dict) and existing.get('sha'):
            data['sha'] = existing['sha']
    elif lookup.status_code != 404:
        # 404 just means "first upload"; anything else is worth surfacing.
        print('GitHub lookup status:', lookup.status_code)
        print(lookup.text[:500])

    r = requests.put(url, headers=headers, data=json.dumps(data), timeout=REQUEST_TIMEOUT)
    print('GitHub upload status:', r.status_code)
    print(r.text[:500])
else:
    print('Skipping GitHub upload (no GITHUB_TOKEN set).')