In [None]:
# Block 1 — Imports & constants
import os, json, ast, math
from pathlib import Path
import numpy as np
import pandas as pd
import joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score

# Base path = same folder as notebook & CSVs.
# NOTE: "." is resolved against the current working directory when this cell runs.
BASE = Path(".").resolve()

# Tunables for realism constraints
MAX_CONSEC_DELTA = 0.20      # default cap in enforce_constraints: each 50 may differ from the previous one by at most ±20%
PB_MIN_MARGIN_S  = 0.20      # default floor margin in enforce_constraints: every split must be at least PB50 + this many seconds
TARGET_LOWER_PAD = 0.02      # CLI check: target total >= PB50 * n_splits * (1 + this)
TARGET_UPPER_PAD = 1.50      # CLI check: target total <= PB50 * n_splits * this


In [None]:
# Block 2 — Utilities: parsing, prep, estimators

def time_to_seconds(t):
    """Convert a time value to seconds.

    Accepted inputs:
      - None                      -> NaN
      - int / float               -> returned as float
      - numeric strings           -> "62.5", "62,5" (comma is a decimal separator)
      - clock strings             -> "M:SS.ss" or "H:MM:SS.ss"

    Anything that cannot be parsed yields NaN instead of raising.
    """
    if t is None:
        return np.nan
    if isinstance(t, (int, float)):
        return float(t)

    s = str(t).strip().replace(",", ".")
    try:
        if ":" in s:
            fields = s.split(":")
            if len(fields) > 3:  # more than H:MM:SS is not a time we understand
                return np.nan
            # Base-60 accumulation handles both M:SS and H:MM:SS.
            total = 0.0
            for field in fields:
                total = total * 60.0 + float(field)
            return total
        return float(s)
    except ValueError:
        # float() rejects empty or non-numeric fields; treat as missing.
        return np.nan

def parse_splits(raw):
    """Parse split times into a list of float seconds.

    `raw` may be a delimited string ('33.46;35.67;...', with ';' or ',' as
    separators), a list/tuple/ndarray/Series whose items are themselves
    numbers or delimited strings, or a single scalar. Missing values and
    tokens that do not parse are dropped.
    """
    def _missing(value):
        return value is None or (isinstance(value, float) and math.isnan(value))

    def _tokens(text):
        # Commas are treated as separators here, exactly like semicolons.
        pieces = text.replace(",", ";").split(";")
        return [piece.strip() for piece in pieces if piece.strip()]

    if _missing(raw):
        return []

    if isinstance(raw, (list, tuple, np.ndarray, pd.Series)):
        collected = []
        for item in list(raw):
            if _missing(item):
                continue
            text = str(item).strip()
            if not text:
                continue
            for token in _tokens(text):
                value = time_to_seconds(token)
                if not math.isnan(value):
                    collected.append(float(value))
        return collected

    if isinstance(raw, str):
        parsed = [time_to_seconds(token) for token in _tokens(raw)]
        return [float(value) for value in parsed if not math.isnan(value)]

    # Any other scalar: try to read it as one time value.
    single = time_to_seconds(raw)
    if math.isnan(single):
        return []
    return [float(single)]

def normalize_splits(splits):
    """Return each split as a fraction of the total, or [] when the total is not positive."""
    values = np.asarray(splits, dtype=float)
    total = values.sum()
    if total > 0:
        return (values / total).tolist()
    return []

def find_splits_column(df):
    """Return the first column label whose name contains "split" (case-insensitive).

    Labels are stringified before matching so that non-string headers
    (e.g. integer column labels) are skipped instead of raising.

    Raises:
        ValueError: if no column name contains "split".
    """
    for c in df.columns:
        if "split" in str(c).lower():
            return c
    raise ValueError(f"No 'split' column found. Headers = {list(df.columns)}")

def prepare_training_frame(csv_path: Path):
    """Read a stroke CSV and build the training frame.

    Steps:
      1. Auto-detect the splits column and parse it into ``splits_50``
         (list of float seconds per row).
      2. Add ``Distance`` (number of splits * 50) when the CSV has no such column.
      3. Add ``TotalTime_s`` from a recognised total-time column (values are
         converted with ``time_to_seconds`` so clock strings work too), or
         from the sum of the splits when no such column exists.
      4. Add ``PB50_s`` (fastest split of the row) when the CSV has no such column.
      5. Add ``frac_1 .. frac_n`` target columns: each split as a fraction of
         the row's split total; NaN where a row has fewer splits.

    Rows without a ``frac_1`` value are dropped.

    Returns:
        (DataFrame with a fresh 0..n-1 index, list of ``frac_i`` column names).

    Raises:
        ValueError: propagated from ``find_splits_column`` when no splits
        column is present.
    """
    df = pd.read_csv(csv_path)

    split_col = find_splits_column(df)
    df["splits_50"] = df[split_col].apply(parse_splits)

    # Distance (infer if missing)
    if "Distance" not in df.columns:
        df["Distance"] = df["splits_50"].apply(lambda s: len(s) * 50 if s else np.nan)

    # Total time: first column whose normalised header is a known alias.
    time_aliases = {"final time in seconds", "final_time", "time", "totaltime_s"}
    time_col = next(
        (c for c in df.columns if str(c).strip().lower() in time_aliases),
        None,
    )
    if time_col is not None:
        # Convert so "1:02.33"-style strings become seconds; numbers pass through.
        df["TotalTime_s"] = df[time_col].apply(time_to_seconds)
    else:
        df["TotalTime_s"] = df["splits_50"].apply(lambda s: sum(s) if s else np.nan)

    # PB50 estimate if missing
    if "PB50_s" not in df.columns:
        df["PB50_s"] = df["splits_50"].apply(lambda s: min(s) if s else np.nan)

    # Build frac targets in one matrix instead of cell-by-cell DataFrame writes.
    n_max = max((len(s) for s in df["splits_50"]), default=0)
    targets = [f"frac_{i}" for i in range(1, n_max + 1)]
    frac_matrix = np.full((len(df), n_max), np.nan)
    for row_pos, s in enumerate(df["splits_50"]):
        fr = normalize_splits(s)
        if fr:
            frac_matrix[row_pos, :len(fr)] = fr
    for j, col in enumerate(targets):
        df[col] = frac_matrix[:, j]

    # Clean rows with no targets at all
    if targets:
        df = df.dropna(subset=[targets[0]], how="any")
    return df.reset_index(drop=True), targets

def make_preprocessor(feat_cols):
    """Build the feature preprocessor for the given feature columns.

    Numeric features present in `feat_cols` are standard-scaled; categorical
    ones are one-hot encoded with unknown categories ignored.
    """
    numeric_names = ("Distance", "PB50_s", "TotalTime_s")
    categorical_names = ("Stroke", "Pool")

    numeric_cols = [col for col in feat_cols if col in numeric_names]
    categorical_cols = [col for col in feat_cols if col in categorical_names]

    transformers = [
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
    return ColumnTransformer(transformers)

# Supported estimators (use one winner per stroke)
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

def make_estimator(name: str, params: dict):
    """Instantiate the regressor called `name` with keyword arguments `params`.

    Supported names: "Ridge", "Lasso", "RandomForest", "GBR" /
    "GradientBoosting", "SVR". `params` may be None (treated as empty) and
    is never mutated.

    Raises:
        ValueError: if `name` is not one of the supported names.
    """
    kwargs = dict(params or {})
    if name == "Ridge":
        return Ridge(**kwargs)
    if name == "Lasso":
        # Default to 5000 iterations, but let tuned params override it;
        # passing max_iter twice would raise TypeError.
        kwargs.setdefault("max_iter", 5000)
        return Lasso(**kwargs)
    if name == "RandomForest":
        return RandomForestRegressor(**kwargs)
    if name in ("GBR", "GradientBoosting"):
        return GradientBoostingRegressor(**kwargs)
    if name == "SVR":
        return SVR(**kwargs)
    raise ValueError(f"Unknown model name: {name}")

def parse_best_params(cell):
    """Parse a leaderboard "best params" cell into a dict.

    A dict is returned unchanged. Otherwise the cell is stringified and
    parsed first as a Python literal, then (on failure) as JSON with single
    quotes swapped for double quotes. Anything that does not parse to a
    dict (NaN, numbers, lists, free text) yields an empty dict, so callers
    can always use the result as keyword arguments.
    """
    if isinstance(cell, dict):
        return cell

    s = str(cell).strip()
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. JSON true/false/null): try JSON instead.
        try:
            parsed = json.loads(s.replace("'", '"'))
        except (ValueError, RecursionError):
            parsed = None
    return parsed if isinstance(parsed, dict) else {}


In [None]:
# Block 3 — Pick best model per stroke (Option A)
def pick_best_model_from_leaderboard(leaderboard_csv: Path):
    """Return ``(model_name, params_dict)`` for the top-scoring leaderboard row.

    Accepts different leaderboard schemas (column names are matched
    case-insensitively, surrounding whitespace ignored):
      - ['Model', 'BestParams', 'R2']             (our earlier export)
      - ['model', 'params', 'mean_test_score']    (alt schema)
    When both score columns exist, R2 wins. The params column is optional;
    without it an empty dict is returned. A leading ``model__`` prefix on
    parameter names is stripped.

    Raises:
        ValueError: if the leaderboard has no rows or no supported
        model/score column combination.
    """
    lb = pd.read_csv(leaderboard_csv)

    # Normalize column names for detection
    lower_cols = {str(c).strip().lower(): c for c in lb.columns}

    # (score column, params columns in order of preference) per schema.
    schemas = (
        ("r2", ("bestparams", "params")),
        ("mean_test_score", ("params", "bestparams")),
    )

    if "model" in lower_cols:
        model_col = lower_cols["model"]
        for score_key, param_keys in schemas:
            if score_key not in lower_cols:
                continue
            if lb.empty:
                raise ValueError(f"Leaderboard {leaderboard_csv} has no rows.")

            params_col = next(
                (lower_cols[k] for k in param_keys if k in lower_cols), None
            )
            # Highest score first; NaN scores sort last.
            row = lb.sort_values(lower_cols[score_key], ascending=False).iloc[0]
            model = str(row[model_col]).strip()

            params = parse_best_params(row[params_col]) if params_col is not None else {}
            if not isinstance(params, dict):
                params = {}
            # strip "model__" prefixes if any
            params = {str(k).split("model__")[-1]: v for k, v in params.items()}
            return model, params

    raise ValueError(f"Leaderboard {leaderboard_csv} has unsupported columns: {list(lb.columns)}")


In [None]:
# Block 4 — Train & save per-split models using the stroke's single best estimator
def train_stroke_models_optionA(stroke_name: str, leaderboard_csv: Path, data_csv: Path, out_dir: Path):
    """Train one model per ``frac_i`` target for a stroke and save them to disk.

    The estimator type and hyper-parameters come from the top row of the
    stroke's leaderboard. For each target, rows lacking that target are
    dropped, a preprocessing + regressor pipeline is fitted, its training
    R2 is printed, and the pipeline is written to
    ``<out_dir>/<stroke_name lower-cased>_<target>.joblib``. A
    ``manifest.json`` mapping each trained target to its model name and
    params is written alongside.

    Returns None; does nothing beyond printing when the dataset yields no targets.
    """
    # Pick one winner model for the stroke
    model_name, params = pick_best_model_from_leaderboard(leaderboard_csv)
    print(f"[{stroke_name}] Best model from leaderboard: {model_name}  params={params}")

    # Prepare dataset and frac targets
    df, targets = prepare_training_frame(data_csv)
    if not targets:
        print(f"[{stroke_name}] No frac targets found in {data_csv}.")
        return

    feat_pool = ["Distance", "PB50_s", "TotalTime_s", "Stroke", "Pool"]
    present = [c for c in feat_pool if c in df.columns]

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    manifest = {}

    for tgt in targets:
        sub = df.dropna(subset=[tgt])
        if sub.empty:
            print(f"[{stroke_name}] skip {tgt}: no rows.")
            continue

        X = sub[present]
        y = sub[tgt].astype(float).values

        # Build a fresh preprocessor per target: Pipeline fits its steps in
        # place, so a shared instance would be refit by every later target.
        pipe = Pipeline([
            ("pre", make_preprocessor(present)),
            ("reg", make_estimator(model_name, params)),
        ])
        pipe.fit(X, y)

        r2 = r2_score(y, pipe.predict(X))
        print(f"[{stroke_name}] {tgt} → R2(train)={r2:.4f}")

        path = out_dir / f"{stroke_name.lower()}_{tgt}.joblib"
        joblib.dump(pipe, path)
        manifest[tgt] = {"model": model_name, "params": params}

    with open(out_dir / "manifest.json", "w") as f:
        json.dump(manifest, f, indent=2)

    print(f"✅ [{stroke_name}] Saved {len(manifest)} models to {out_dir}")


In [None]:
# Block 5 — Train ALL strokes (run once to export models)

# One (stroke, leaderboard CSV, dataset CSV, output folder) tuple per stroke;
# all paths follow the same naming pattern under BASE.
CONFIG = [
    (
        name,
        BASE / f"{name}_leaderboard.csv",
        BASE / f"{name}_dataset.csv",
        BASE / f"models_{name.lower()}",
    )
    for name in ("Freestyle", "Backstroke", "Breaststroke", "Butterfly", "IM")
]

# Train and export the per-split models for every configured stroke.
for stroke, lb_csv, data_csv, out_dir in CONFIG:
    banner = "\n" + "=" * 90
    print(banner)
    print(f"Training from leaderboard → {stroke}")
    train_stroke_models_optionA(stroke, lb_csv, data_csv, Path(out_dir))


In [None]:
# Block 6 — Load models per stroke + realism constraints

def load_models_for_stroke(stroke: str):
    """Load every saved per-split model for `stroke`.

    Reads ``BASE/models_<stroke lower-cased>/manifest.json`` and loads one
    joblib file per manifest key.

    Returns:
        (models, manifest): `models` maps each manifest key ('frac_i') to
        its loaded pipeline; `manifest` is the parsed JSON.

    Raises:
        FileNotFoundError: if the manifest file does not exist.
    """
    key = stroke.lower()
    folder = BASE / f"models_{key}"
    manifest_path = folder / "manifest.json"
    if not manifest_path.exists():
        raise FileNotFoundError(f"No manifest found for stroke '{stroke}' in {folder}")

    with open(manifest_path) as fh:
        manifest = json.load(fh)

    # NOTE(review): joblib.load unpickles; only load model folders you trust.
    models = {}
    for target in manifest.keys():
        models[target] = joblib.load(folder / f"{key}_{target}.joblib")
    return models, manifest

def _rescale_to_total(s, total_time):
    """Scale `s` so it sums to `total_time` (even split if the sum is exactly 0)."""
    current = s.sum()
    if current == 0:
        return np.full(len(s), total_time / len(s))
    return s * (total_time / current)


def _raise_to_floor_keeping_total(s, floor, total_time):
    """Lift entries below `floor` up to it while keeping the sum at `total_time`.

    Entries under the floor are pinned to it and the remaining time is shared
    among the other entries in proportion to their current values; this
    repeats until nothing new drops under the floor. If the floor cannot be
    met at all (floor * n > total_time), falls back to clamp-then-rescale.
    """
    n = len(s)
    if floor * n > total_time:
        # Infeasible: best effort only, the floor cannot hold for every split.
        return _rescale_to_total(np.maximum(s, floor), total_time)

    s = s.copy()
    pinned = np.zeros(n, dtype=bool)
    for _ in range(n):  # each pass pins at least one entry, so n passes suffice
        newly_low = ~pinned & (s < floor)
        if not newly_low.any():
            break
        pinned |= newly_low
        s[pinned] = floor
        free = ~pinned
        free_sum = s[free].sum()
        if free_sum > 0:
            s[free] *= (total_time - floor * pinned.sum()) / free_sum
    return s


def enforce_constraints(splits, pb50, total_time,
                        max_delta=MAX_CONSEC_DELTA,
                        pb_min_margin_s=PB_MIN_MARGIN_S):
    """
    Make raw predicted splits realistic and return them as a float array.

    1) If pb50 is provided (non-IM): each 50m >= PB50 + margin
       If pb50 is None (IM): skip PB-based floor entirely.
    2) Consecutive 50s vary by at most ±max_delta relative to previous
    3) Normalize back to total_time
    4) If a floor applies, re-apply it with a sum-preserving redistribution,
       so the result both sums to total_time and respects the floor whenever
       total_time >= n * (PB50 + margin). Entries pinned to the floor in this
       step may deviate from the ±max_delta rule.

    An empty `splits` yields an empty array.
    """
    s = np.array(splits, dtype=float)
    if s.size == 0:
        return s

    floor = None if pb50 is None else pb50 + pb_min_margin_s

    # (1) PB floor only when pb50 is known (non-IM)
    if floor is not None:
        s = np.maximum(s, floor)

    # (2) Smooth consecutive jumps (each split is clamped against the
    #     already-smoothed previous one)
    for i in range(1, len(s)):
        prev = s[i - 1]
        upper = prev * (1.0 + max_delta)
        lower = prev * (1.0 - max_delta)
        s[i] = min(upper, max(lower, s[i]))

    # (3) Normalize to exact total
    s = _rescale_to_total(s, total_time)

    # (4) Scaling down in (3) can push splits under the floor again; fix that
    #     without changing the total.
    if floor is not None:
        s = _raise_to_floor_keeping_total(s, floor, total_time)

    return s


In [None]:
# Block 7 — Pre-race and Post-race functions (robust IM handling + auto PB proxy + auto-plot)
import matplotlib.pyplot as plt

def _is_im(stroke: str) -> bool:
    """Return True when `stroke` is "IM", ignoring case and surrounding whitespace."""
    normalized = str(stroke).strip().upper()
    return normalized == "IM"

def predict_splits_pre_race(models: dict, stroke: str, distance: int, pb50: "float | None", target_time: float):
    """Predict a per-50m split plan that sums to `target_time`.

    Args:
        models: mapping 'frac_i' -> fitted pipeline; a missing entry falls
            back to an equal share for that split.
        stroke: stroke name; "IM" (any case) disables the PB-based floor.
        distance: race distance in metres; ``distance // 50`` splits are produced.
        pb50: 50m personal best in seconds, or None. Ignored for IM.
        target_time: target total time in seconds.

    Returns:
        Array of ``distance // 50`` split times from ``enforce_constraints``.

    Raises:
        ValueError: if `distance` is shorter than 50m.
    """
    n = distance // 50
    if n < 1:
        raise ValueError(f"Distance must be at least 50m, got {distance}")

    is_im = _is_im(stroke)
    use_pb = pb50 is not None and not is_im

    # Without a usable PB (IM, or none given) inject a neutral PB50 proxy so
    # the model's preprocessor gets a numeric value (NOT used for constraints).
    pb50_feature = pb50 if use_pb else max(0.01, target_time / n)

    X = pd.DataFrame([{
        "Distance": distance,
        "TotalTime_s": target_time,
        "Stroke": "IM" if is_im else str(stroke).strip().title(),
        "Pool": "LCM",
        "PB50_s": pb50_feature,  # always include for the preprocessor
    }])

    # Predict fractions with per-split models (fallback equal if missing);
    # negative predictions are clipped to 0.
    fracs = []
    for i in range(1, n + 1):
        tgt = f"frac_{i}"
        if tgt not in models:
            fracs.append(1.0 / n)
        else:
            fr = models[tgt].predict(X)[0]
            fracs.append(max(0.0, float(fr)))
    fracs = np.array(fracs)
    fracs = fracs / fracs.sum() if fracs.sum() > 0 else np.ones(n) / n

    raw_splits = fracs * target_time
    # Constraints: skip PB floor when no PB applies (pb50=None)
    return enforce_constraints(raw_splits, pb50=pb50 if use_pb else None, total_time=target_time)

def analyze_post_race(models: dict, stroke: str, distance: int, pb50: "float | None", actual_splits_50: np.ndarray):
    """Compare swum 50m splits with the model's ideal splits for the same total.

    Prints a per-split summary with suggestions, shows an actual-vs-ideal
    plot, and returns the numbers.

    Args:
        models: mapping 'frac_i' -> fitted pipeline; a missing entry falls
            back to an equal share for that split.
        stroke: stroke name; "IM" (any case) disables the PB-based floor.
        distance: race distance in metres.
        pb50: 50m personal best in seconds, or None. Ignored for IM.
        actual_splits_50: the swum 50m splits (any sequence of numbers).

    Returns:
        dict with keys "ideal_50", "delta_50" (actual - ideal),
        "suggestions", "total_actual", "total_ideal".

    Raises:
        ValueError: if the number of splits is not ``distance // 50`` or the
        distance is shorter than 50m.
    """
    n = distance // 50
    actual_splits_50 = np.asarray(actual_splits_50, dtype=float)
    if len(actual_splits_50) != n:
        raise ValueError(f"Expected {n} x 50m splits, got {len(actual_splits_50)}")
    if n < 1:
        raise ValueError(f"Distance must be at least 50m, got {distance}")

    is_im = _is_im(stroke)
    use_pb = pb50 is not None and not is_im
    total_time = float(np.sum(actual_splits_50))

    # Without a usable PB (IM, or none given) inject a PB50 proxy for the
    # preprocessor only.
    pb50_feature = pb50 if use_pb else max(0.01, total_time / n)

    X = pd.DataFrame([{
        "Distance": distance,
        "TotalTime_s": total_time,
        "Stroke": "IM" if is_im else str(stroke).strip().title(),
        "Pool": "LCM",
        "PB50_s": pb50_feature,  # always included for the preprocessor
    }])

    # Predict fractional pattern (equal share when a split has no model)
    fracs = []
    for i in range(1, n + 1):
        tgt = f"frac_{i}"
        if tgt not in models:
            fracs.append(1.0 / n)
        else:
            fr = models[tgt].predict(X)[0]
            fracs.append(max(0.0, float(fr)))
    fracs = np.array(fracs)
    fracs = fracs / fracs.sum() if fracs.sum() > 0 else np.ones(n) / n

    ideal_splits = enforce_constraints(
        fracs * total_time,
        pb50=pb50 if use_pb else None,   # skip PB-based floor when no PB applies
        total_time=total_time
    )

    # Positive delta = slower than ideal on that 50.
    deltas = actual_splits_50 - ideal_splits
    suggestions = []
    for i, d in enumerate(deltas, 1):
        if d > 0.30:
            suggestions.append(f"50m #{i}: +{d:.2f}s vs ideal — speed up slightly here.")
        elif d < -0.30:
            suggestions.append(f"50m #{i}: {d:.2f}s vs ideal — consider easing earlier effort.")
        else:
            suggestions.append(f"50m #{i}: within ±0.30s of ideal — maintain.")

    total_ideal = float(np.sum(ideal_splits))

    # ---- Print summary ----
    label_stroke = "IM" if is_im else str(stroke).strip().title()
    print(f"\n=== Post-Race Analysis: {label_stroke} {distance}m ===")
    for i, (a, p, d) in enumerate(zip(actual_splits_50, ideal_splits, deltas), 1):
        faster_slower = "slower" if d > 0 else "faster"
        print(f"50m {i}: Actual {a:.2f} | Ideal {p:.2f} | You were {abs(d):.2f}s {faster_slower}")
    print("\nSuggestions:")
    for s in suggestions:
        print(" - " + s)
    print(f"\nTotals — Actual: {total_time:.2f}s | Ideal: {total_ideal:.2f}s")

    # ---- Auto-Plot ----
    x = np.arange(1, n + 1)
    plt.figure(figsize=(8, 5))
    plt.plot(x, actual_splits_50, marker='o', label="Actual Splits")
    plt.plot(x, ideal_splits, marker='s', label="Ideal Splits")
    plt.title(f"{label_stroke} {distance}m — Splits (Actual vs Ideal)")
    plt.xlabel("Split (per 50m)")
    plt.ylabel("Time (s)")
    plt.xticks(x)
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.legend()
    plt.show()

    return {
        "ideal_50": ideal_splits,
        "delta_50": deltas,
        "suggestions": suggestions,
        "total_actual": total_time,
        "total_ideal": total_ideal
    }


In [None]:
# Block 8 — Interactive CLI (never asks PB for IM)
mode_raw   = input("Select mode (pre/post): ").strip().lower()
stroke_raw = input("Enter stroke (Freestyle/Backstroke/Breaststroke/Butterfly/IM): ").strip()

# Normalize stroke in three forms:
stroke_key   = stroke_raw.lower()                                   # for folders
stroke_label = "IM" if stroke_key == "im" else stroke_raw.title()   # for display/features
is_im        = (stroke_key == "im")

distance = int(input("Enter race distance (e.g., 100, 200, 400): "))

# Load models once (using folder key)
models, manifest = load_models_for_stroke(stroke_key)

if mode_raw == "pre":
    # PB only for non-IM
    pb50 = None
    if not is_im:
        pb50 = float(input("Enter your 50m personal best (in seconds): "))

    target_time = float(input("Enter your target total race time (in seconds): "))

    # Validate only for non-IM (PB-based bounds); IM has no PB to validate against.
    error_msg = None
    if not is_im:
        n = distance // 50
        min_possible = pb50 * n * (1.0 + TARGET_LOWER_PAD)
        max_reasonable = pb50 * n * TARGET_UPPER_PAD
        if target_time < min_possible:
            error_msg = (f"\n[Error] Target {target_time:.2f}s too fast for your PB50 ({pb50:.2f}s). "
                         f"Minimum allowed ≈ {min_possible:.2f}s.")
        elif target_time > max_reasonable:
            error_msg = (f"\n[Error] Target {target_time:.2f}s too slow. "
                         f"Maximum allowed ≈ {max_reasonable:.2f}s.")

    if error_msg is not None:
        print(error_msg)
    else:
        # Same prediction and report for IM and non-IM; pb50 is None for IM.
        splits = predict_splits_pre_race(models, stroke=stroke_label, distance=distance, pb50=pb50, target_time=target_time)
        print(f"\n=== Optimal Predicted Splits for {stroke_label} {distance}m ===")
        for i, s in enumerate(splits, 1):
            print(f"50m {i}: {s:.2f} s")
        print(f"Target Total: {target_time:.2f} s | Predicted Sum: {np.sum(splits):.2f} s")

elif mode_raw == "post":
    step = int(input("Enter split interval (50 or 100): "))
    if step not in (50, 100):
        raise ValueError("Split interval must be 50 or 100.")
    n = distance // step

    # PB only for non-IM
    pb50 = None
    if not is_im:
        pb50 = float(input("Enter your 50m personal best (in seconds): "))

    # Collect splits
    user = []
    print(f"Enter your actual {step}m splits:")
    for i in range(n):
        val = float(input(f"Split {i+1}: "))
        if step == 100:
            user.extend([val/2.0, val/2.0])  # expand to 50s (assumes an even 100)
        else:
            user.append(val)
    user = np.array(user, dtype=float)

    # analyze_post_race prints the full report (per-split lines, suggestions,
    # totals) and shows the plot itself, so nothing is re-printed here.
    rep = analyze_post_race(models, stroke=stroke_label, distance=distance, pb50=pb50, actual_splits_50=user)

else:
    print("Invalid mode. Please choose 'pre' or 'post'.")
