
# COMPETITIVE PACING — Model Training & Inference Notebook

This notebook:
1. Verifies datasets and the leaderboard of best models.
2. (Re)builds `best_models_by_stroke.csv` if it's missing.
3. Trains the per-stroke, per-target models using the chosen estimator+params.
4. Saves models under `models/<stroke>/` (relative to the working directory), with an `index.json` per stroke and a detailed global `models/build_report.json`.
5. Shows how to load and run inference with a trained model.


In [None]:

import os, json, ast, re, warnings
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score

# Estimators
from sklearn.ensemble import RandomForestRegressor as RandomForest
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.svm import SVR

# All locations are expressed relative to the current working directory.
ROOT = Path(".")
DATASETS_DIR = ROOT.joinpath("datasets")
MODELS_DIR = ROOT.joinpath("models")
# Make sure the output directory exists before any cell tries to write to it.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Combined "best model per stroke/target" leaderboard.
LEADERBOARD_COMBINED = ROOT.joinpath("best_models_by_stroke.csv")

# Individual per-stroke leaderboards the combined file is rebuilt from.
LB_FILES = {
    stroke: ROOT / filename
    for stroke, filename in (
        ("Freestyle", "Freestyle_leaderboard.csv"),
        ("Backstroke", "Backstroke_leaderboard.csv"),
        ("Breaststroke", "Breaststroke_leaderboard.csv"),
        ("Butterfly", "Butterfly_leaderboard.csv"),
        ("IM", "IM_leaderboard.csv"),
    )
}

# Stroke name -> dataset file name inside DATASETS_DIR.
STROKE_TO_FILE = dict(
    [
        ("Freestyle", "freestyle_dataset.csv"),
        ("Backstroke", "backstroke_dataset.csv"),
        ("Breaststroke", "breaststroke_dataset.csv"),
        ("Butterfly", "butterfly_dataset.csv"),
        ("IM", "im_dataset.csv"),
    ]
)

# Registry: model label as it appears in the leaderboard -> estimator class.
MODEL_MAP = {
    "RandomForest": RandomForest,
    "GBR": GBR,
    "Ridge": Ridge,
    "Lasso": Lasso,
    "ElasticNet": ElasticNet,
    "LinearRegression": LinearRegression,
    "SVR": SVR,
}

# Model labels whose features are standardised before fitting; every other
# model receives the feature columns unchanged.
SCALER_MODELS = {
    "Ridge",
    "Lasso",
    "ElasticNet",
    "LinearRegression",
    "SVR",
}

# Target columns are named "frac_<digits>" (matched case-insensitively).
TARGET_PATTERN = re.compile(r"^frac_\d+$", re.IGNORECASE)


## 1) Verify datasets are present

In [None]:

# Names of every CSV found in the datasets directory (empty when the
# directory itself is absent).
available = (
    [csv_path.name for csv_path in DATASETS_DIR.glob("*.csv")]
    if DATASETS_DIR.exists()
    else []
)
print("Found dataset files:", available)

# Expected per-stroke dataset files that were not found on disk.
missing = [name for name in STROKE_TO_FILE.values() if name not in available]
if not missing:
    print("[OK] All expected datasets present.")
else:
    print("[WARN] Missing expected dataset files:", missing)


## 2) Build/Load combined leaderboard of best models

In [None]:

def pick_best(df):
    """Return the single best leaderboard row for every target.

    Column names are stripped and lower-cased on a copy of ``df``, so the
    input may use any capitalisation. Rows are ranked by ``r2`` (descending)
    and then ``mse`` (ascending); the top-ranked row of each target is kept
    *as a whole row*.

    Args:
        df: Leaderboard frame with at least ``target``, ``r2`` and ``mse``
            columns (typically also ``model`` and ``bestparams``).

    Returns:
        A new DataFrame with one row per target, sorted by target, with
        ``target`` as the first column and a fresh 0..n-1 index.

    Raises:
        KeyError: If a required column is missing.
    """
    df = df.copy()
    df.columns = [str(c).strip().lower() for c in df.columns]

    absent = [c for c in ("target", "r2", "mse") if c not in df.columns]
    if absent:
        raise KeyError(f"Leaderboard is missing required column(s): {absent}")

    # Rows without a target cannot be attributed to any model; drop them.
    ranked = df.dropna(subset=["target"]).sort_values(
        by=["r2", "mse"], ascending=[False, True]
    )

    # NOTE: GroupBy.first() must not be used here. It returns the first
    # non-null value *per column*, so a winning row with an empty
    # ``bestparams`` would silently inherit the parameters of a lower-ranked
    # model. drop_duplicates keeps the complete top-ranked row instead.
    best = (
        ranked.drop_duplicates(subset="target", keep="first")
        .sort_values("target")
        .reset_index(drop=True)
    )

    ordered = ["target"] + [c for c in best.columns if c != "target"]
    return best.reindex(columns=ordered)

# Rebuild the combined leaderboard from the per-stroke files when it is
# missing; otherwise reuse the file already on disk.
if not LEADERBOARD_COMBINED.exists():
    print("[INFO] Combined leaderboard not found. Recomputing from individual files...")
    frames = []
    for stroke, path in LB_FILES.items():
        if path.exists():
            df = pd.read_csv(path)
            best = pick_best(df)
            # Tag every row with the stroke it belongs to (first column).
            best.insert(0, "stroke", stroke)
            frames.append(best)
        else:
            print(f"[WARN] Missing leaderboard for {stroke}: {path}")
    if not frames:
        raise FileNotFoundError("No leaderboards found to build combined leaderboard.")
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(LEADERBOARD_COMBINED, index=False)
    print("[OK] Wrote combined leaderboard to", LEADERBOARD_COMBINED)
else:
    combined = pd.read_csv(LEADERBOARD_COMBINED)
    # Apply the same column normalisation the rebuild path gets from
    # pick_best, so later cells can rely on lower-case column names.
    combined.columns = [str(c).strip().lower() for c in combined.columns]
    print("[OK] Loaded existing combined leaderboard from", LEADERBOARD_COMBINED)

display(combined.head())


## 3) Helpers for training

In [None]:

# numpy >= 2 prints scalars as e.g. ``np.float64(0.1)`` or ``np.True_``.
# These patterns unwrap such reprs back into plain Python literals.
_NUMPY_SCALAR_CALL = re.compile(r"\b(?:np|numpy)\.\w+\(([^()]*)\)")
_NUMPY_BOOL = re.compile(r"\b(?:np|numpy)\.(True|False)_\b")

# Everything ast.literal_eval / json.loads may raise on malformed input.
_LITERAL_ERRORS = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)


def _split_top_level(text: str) -> list:
    """Split ``text`` on commas that are not nested inside brackets."""
    parts = []
    depth = 0
    start = 0
    for pos, ch in enumerate(text):
        if ch in "([{":
            depth += 1
        elif ch in ")]}":
            depth -= 1
        elif ch == "," and depth == 0:
            parts.append(text[start:pos])
            start = pos + 1
    parts.append(text[start:])
    return parts


def _coerce_param_value(text: str):
    """Parse ``text`` as a Python literal, falling back to the bare string."""
    text = text.strip()
    try:
        return ast.literal_eval(text)
    except _LITERAL_ERRORS:
        # Bare words such as ``rbf`` or ``auto`` are kept as strings.
        return text.strip("'\"")


def parse_bestparams(s: str) -> dict:
    """Turn a leaderboard ``bestparams`` cell into a parameter dict.

    Accepted inputs, tried in this order:

    * a dict (returned as a shallow copy);
    * NaN / ``None`` / blank text -> ``{}``;
    * a Python dict literal, including numpy-style scalar reprs such as
      ``np.float64(0.1)``;
    * a JSON object;
    * a loose ``key: value, key: value`` listing (quotes and outer braces
      optional), where values that are not literals are kept as strings.

    Args:
        s: Raw cell value from the leaderboard.

    Returns:
        A dict of parameter name -> value; empty when nothing could be parsed.
    """
    if isinstance(s, dict):
        return dict(s)
    if pd.isna(s) or not str(s).strip():
        return {}

    text = str(s).strip()
    text = _NUMPY_SCALAR_CALL.sub(r"\1", text)
    text = _NUMPY_BOOL.sub(r"\1", text)

    for loader in (ast.literal_eval, json.loads):
        try:
            parsed = loader(text)
        except _LITERAL_ERRORS:
            continue
        if isinstance(parsed, dict):
            return parsed

    # Naive fallback for loosely formatted text: remove one pair of outer
    # braces, then read "key: value" pairs separated by top-level commas.
    if text.startswith("{") and text.endswith("}"):
        text = text[1:-1]
    params = {}
    for part in _split_top_level(text):
        if not part.strip() or ":" not in part:
            continue
        key, value = part.split(":", 1)
        params[key.strip().strip("'").strip('"')] = _coerce_param_value(value)
    return params

def make_pipeline(model_name: str, params: dict, feature_cols):
    """Assemble the preprocessing + estimator pipeline for one model.

    Args:
        model_name: Key into ``MODEL_MAP`` selecting the estimator class.
        params: Hyper-parameters for the estimator. Names that do not already
            start with ``model__`` are prefixed so that they address the
            pipeline's ``model`` step.
        feature_cols: Columns fed to the estimator; all other columns are
            dropped by the preprocessing step.

    Returns:
        An unfitted ``Pipeline`` with steps ``pre`` and ``model``.
    """
    estimator = MODEL_MAP[model_name]()

    # Address every hyper-parameter to the "model" step of the pipeline.
    namespaced = {
        (name if name.startswith("model__") else f"model__{name}"): value
        for name, value in params.items()
    }

    # Models listed in SCALER_MODELS get standardised inputs; the rest see
    # the selected columns untouched.
    if model_name in SCALER_MODELS:
        numeric_step = StandardScaler()
    else:
        numeric_step = "passthrough"
    preprocessor = ColumnTransformer(
        [("num", numeric_step, list(feature_cols))],
        remainder="drop",
    )

    pipeline = Pipeline([("pre", preprocessor), ("model", estimator)])
    if namespaced:
        pipeline.set_params(**namespaced)
    return pipeline

def kfold_metrics(model, X, y, n_splits=5):
    """Cross-validate ``model`` and summarise R^2 and MSE across folds.

    Both metrics are computed from the *same* fitted model in each fold via a
    single ``cross_validate`` call. Scoring them in two separate passes would
    refit every fold twice and, for estimators without a fixed random seed,
    report R^2 and MSE from different fitted models.

    Args:
        model: Unfitted estimator or pipeline.
        X: Feature matrix.
        y: Target values.
        n_splits: Number of shuffled K-fold splits (seeded with 42).

    Returns:
        Dict with ``r2_mean``, ``r2_std``, ``mse_mean`` and ``mse_std`` as
        plain floats.
    """
    # Local import: the notebook's import cell only brings in cross_val_score.
    from sklearn.model_selection import cross_validate

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    with warnings.catch_warnings():
        # Fitting warnings are deliberately silenced during CV.
        warnings.simplefilter("ignore")
        scores = cross_validate(
            model, X, y, cv=cv, scoring=("r2", "neg_mean_squared_error")
        )
    r2 = scores["test_r2"]
    # The scorer reports negated MSE (higher is better); flip the sign back.
    mse = -scores["test_neg_mean_squared_error"]
    return {
        "r2_mean": float(np.mean(r2)), "r2_std": float(np.std(r2)),
        "mse_mean": float(np.mean(mse)), "mse_std": float(np.std(mse))
    }


## 4) Train and save models

In [None]:

import joblib

# One metadata dict per trained model, across all strokes.
build_report = []

for stroke, sub in combined.groupby("stroke"):
    ds_name = STROKE_TO_FILE.get(stroke)
    ds_path = DATASETS_DIR / ds_name if ds_name else None

    if not ds_path or not ds_path.exists():
        print(f"[WARN] Dataset for {stroke} not found at {ds_path}. Skipping this stroke.")
        continue

    df = pd.read_csv(ds_path)

    target_cols = [c for c in df.columns if TARGET_PATTERN.match(str(c))]
    # features: numeric that are not targets
    feature_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c not in target_cols]
    if not feature_cols:
        # No numeric columns at all: fall back to every non-target column.
        feature_cols = [c for c in df.columns if c not in target_cols]

    print(f"\n[INFO] {stroke}: Using {len(feature_cols)} features and {len(target_cols)} targets: {target_cols}")

    stroke_dir = MODELS_DIR / stroke
    stroke_dir.mkdir(parents=True, exist_ok=True)

    # Metadata for this stroke only; written to <stroke>/index.json below.
    stroke_report = []

    for _, row in sub.iterrows():
        target = row["target"]
        model_name = row["model"]

        if model_name not in MODEL_MAP:
            # Same policy as a missing dataset/target: warn and move on
            # rather than abort the whole training run.
            print(f"[WARN] Unknown model '{model_name}' for {stroke}/{target}; skipping.")
            continue

        if target not in df.columns:
            print(f"[WARN] Target '{target}' missing in dataset for {stroke}; skipping.")
            continue

        params = parse_bestparams(row["bestparams"])

        # A leaderboard target that does not match TARGET_PATTERN would
        # otherwise sit inside its own feature matrix (target leakage).
        model_features = [c for c in feature_cols if c != target]

        X = df[model_features].copy()
        y = df[target].values

        pipe = make_pipeline(model_name, params, model_features)

        # CV metrics on training data (for record)
        try:
            cvm = kfold_metrics(pipe, X, y, n_splits=5)
        except Exception as e:
            print(f"[WARN] CV failed for {stroke}/{target} with {model_name}: {e}")
            cvm = None

        # Final model is fitted on the full dataset.
        pipe.fit(X, y)

        model_path = stroke_dir / f"{target}_{model_name}.joblib"
        joblib.dump(pipe, model_path)

        meta = {
            "stroke": stroke,
            "target": target,
            "model": model_name,
            "bestparams": params,
            "features": model_features,
            "dataset": str(ds_path),
            "model_path": str(model_path),
            "cv_metrics": cvm,
        }
        stroke_report.append(meta)
        build_report.append(meta)
        print(f"[OK] Saved {stroke}/{target} -> {model_path.name}")

    with open(stroke_dir / "index.json", "w") as f:
        json.dump(stroke_report, f, indent=2)

# global report
with open(MODELS_DIR / "build_report.json", "w") as f:
    json.dump(build_report, f, indent=2)

print("\n[DONE] Training complete. Summary:")
# Passing ``columns=`` keeps this working when nothing was trained
# (an empty report has no columns to select from).
pd.DataFrame(build_report, columns=["stroke", "target", "model", "model_path"]).head()


## 5) Build report summary

In [None]:

# Tabular view of the build report, previewed inline and persisted as CSV
# in the models directory.
report_csv = MODELS_DIR / "build_report.csv"
report_df = pd.DataFrame(build_report)

display(report_df.head(20))

report_df.to_csv(report_csv, index=False)
print("Saved:", report_csv)


## 6) Load a trained model and predict (registry-like helpers)

In [None]:

import json, joblib

def list_trained(stroke: str):
    """Return the metadata entries stored in a stroke's ``index.json``.

    Args:
        stroke: Name of the stroke sub-directory under ``MODELS_DIR``.

    Returns:
        The parsed content of ``MODELS_DIR/<stroke>/index.json``, or an empty
        list when that file does not exist.
    """
    index_file = MODELS_DIR / stroke / "index.json"
    if index_file.exists():
        with open(index_file, "r") as handle:
            return json.load(handle)
    return []

def load_model(stroke: str, target: str):
    """Load the trained model registered for ``stroke``/``target``.

    The stroke's index is searched for entries whose ``target`` matches; the
    first match wins.

    Args:
        stroke: Stroke name used to locate the index.
        target: Target column the model was trained for.

    Returns:
        A ``(model, metadata)`` tuple: the object loaded with ``joblib`` and
        the index entry describing it.

    Raises:
        FileNotFoundError: If no entry matches, or the recorded model file is
            not on disk.
    """
    matching = [entry for entry in list_trained(stroke) if entry["target"] == target]
    if not matching:
        raise FileNotFoundError(f"No trained model for {stroke}/{target}")

    entry = matching[0]
    model_file = Path(entry["model_path"])
    if not model_file.exists():
        raise FileNotFoundError(f"Model file missing at {model_file}")

    # joblib.load unpickles the file: only load model files you trust.
    fitted = joblib.load(model_file)
    return fitted, entry


### Inference demo

In [None]:

# Pick a stroke & target that were trained (adjust as needed)
example_stroke = "Freestyle"
example_target = "frac_1"

pipe, meta = load_model(example_stroke, example_target)

# Use the exact feature list recorded when the model was trained instead of
# re-deriving it from the dataset, so inference cannot drift from training.
feature_cols = meta["features"]

# Load the stroke's dataset to borrow a few rows as sample input
ds_path = DATASETS_DIR / STROKE_TO_FILE[example_stroke]
df = pd.read_csv(ds_path)

# Use a few samples from the dataset for a quick check
X_sample = df[feature_cols].head(5)
y_pred = pipe.predict(X_sample)

print("Loaded model:", meta["model"], "for", example_stroke, example_target)
print("Predictions:", y_pred)
