# Competitive Pacing — Jupyter Demo

This notebook writes the `competitive_pacing_updated.py` module to disk, creates small dummy datasets, trains one residual model per stroke, and demonstrates pre-race split generation and post-race analysis with plots.

In [ ]:
# Write the module file locally so the notebook is self-contained
from pathlib import Path
module_path = Path('competitive_pacing_updated.py')
module_path.write_text('''
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Any
import json
import re

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from joblib import dump, load

import matplotlib.pyplot as plt

# ------------------------------
# Constants & Paths
# ------------------------------

# All locations are anchored to the directory the process was started in.
ROOT = Path(".").resolve()
DATA_DIR, MODELS_DIR = ROOT / "data", ROOT / "models"
# Import-time side effect: the models directory is created if it is missing.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Display names of the supported strokes.
VALID_STROKES = ["Freestyle", "Backstroke", "Breaststroke", "Butterfly", "IM"]

# Accepted lower-case spellings, grouped by the canonical key they map to.
_STROKE_ALIASES = (
    ("FREE", ("freestyle", "free", "fs")),
    ("BACK", ("backstroke", "back", "bk")),
    ("BRST", ("breaststroke", "breast", "br")),
    ("FLY", ("butterfly", "fly")),
    ("IM", ("im", "individual medley")),
)

# Flat lookup: lower-case alias -> canonical stroke key.
STROKE_KEY = {
    alias: canonical
    for canonical, aliases in _STROKE_ALIASES
    for alias in aliases
}

# Default model registry (can be overridden per stroke).
# Every stroke uses gradient boosting with depth-3 trees and a fixed seed;
# IM differs only by using more trees and a smaller learning rate.
_SHARED_GBR_PARAMS = {"n_estimators": 400, "learning_rate": 0.05, "max_depth": 3, "random_state": 42}

DEFAULT_MODEL_CFG = {
    # dict(...) gives each stroke its own params dict, as in a literal table.
    stroke: {"estimator": "GBR", "params": dict(_SHARED_GBR_PARAMS)}
    for stroke in ("FREE", "BACK", "FLY", "BRST")
}
DEFAULT_MODEL_CFG["IM"] = {
    "estimator": "GBR",
    "params": {**_SHARED_GBR_PARAMS, "n_estimators": 500, "learning_rate": 0.04},
}

def norm_stroke_label(s: str) -> str:
    """Map a user-supplied stroke label to its canonical key.

    Matching ignores case and surrounding whitespace. Every alias in
    STROKE_KEY is accepted, and so is each canonical key itself (for
    example "BRST"), so a value returned by this function can be passed
    back in unchanged.

    Args:
        s: Stroke label such as "Freestyle", "bk" or "BRST".

    Returns:
        One of the canonical keys stored as values in STROKE_KEY.

    Raises:
        ValueError: If the label is None or is not recognised.
    """
    if s is None:
        raise ValueError("Stroke cannot be None")
    label = s.strip()
    alias = label.lower()
    if alias in STROKE_KEY:
        return STROKE_KEY[alias]
    # Canonical keys are not all listed as aliases, so accept them directly.
    canonical = label.upper()
    if canonical in STROKE_KEY.values():
        return canonical
    raise ValueError(f"Unknown stroke label: {s}")

def time_to_seconds(t: str) -> float:
    """Convert a time written as seconds, m:ss.xx or h:mm:ss.xx into seconds.

    Args:
        t: The time; it is converted with str() and stripped before parsing.

    Returns:
        The time in seconds.

    Raises:
        ValueError: If the text is neither a plain non-negative decimal
            number nor two or three colon-separated fields, or if a field
            cannot be converted to a number.
    """
    text = str(t).strip()
    # Plain decimal seconds: digits with an optional fractional part.
    if re.fullmatch(r"\d+(\.\d+)?", text) is not None:
        return float(text)

    fields = text.split(":")
    if len(fields) not in (2, 3):
        raise ValueError(f"Unrecognized time format: {text}")

    # Fold the leading integer fields (hours, minutes) into whole minutes.
    *leading, seconds = fields
    whole_minutes = 0
    for field in leading:
        whole_minutes = whole_minutes * 60 + int(field)
    return whole_minutes * 60 + float(seconds)

def seconds_to_time(s: float) -> str:
    """Format a duration in seconds as "m:ss.xx".

    Negative durations are clamped to zero. The remainder is rounded to
    hundredths before formatting so that a value such as 59.999 carries
    into the minutes ("1:00.00") instead of printing as "0:60.00".

    Args:
        s: Duration in seconds.

    Returns:
        The formatted time; minutes are not zero-padded and are not
        folded into hours.
    """
    s = float(s)
    if s < 0:
        s = 0.0
    minutes = int(s // 60)
    rem = round(s - minutes * 60, 2)
    if rem >= 60.0:
        # Rounding pushed the remainder up to a full minute: carry it.
        minutes += 1
        rem -= 60.0
    return f"{minutes}:{rem:05.2f}"

def parse_splits_str(s: str) -> List[float]:
    """Parse a comma- and/or whitespace-separated list of times.

    Args:
        s: Text such as "27.8, 28.4 29.1"; None yields an empty list.

    Returns:
        Each non-empty token converted with time_to_seconds, in order.
    """
    if s is None:
        return []
    tokens = (tok.strip() for tok in re.split(r"[,\s]+", str(s)))
    return [time_to_seconds(tok) for tok in tokens if tok]

def count_split_columns(df: pd.DataFrame) -> Tuple[int, List[str]]:
    """Find the per-lap split columns of a frame, ordered by lap number.

    A column qualifies when its label, after str(), stripping and
    lower-casing, is exactly "split_<n>" or "lap_<n>". The lap number used
    for ordering comes from that same normalised match, so labels with
    stray whitespace still sort correctly, and non-string labels are
    skipped instead of raising.

    Args:
        df: Frame whose columns are inspected.

    Returns:
        (number of split columns, original column labels sorted by lap
        number; ties keep their original column order).
    """
    pattern = re.compile(r"(split|lap)_(\d+)")
    numbered = []
    for col in df.columns:
        match = pattern.fullmatch(str(col).strip().lower())
        if match:
            numbered.append((int(match.group(2)), col))
    # list.sort is stable, so equal lap numbers keep column order.
    numbered.sort(key=lambda pair: pair[0])
    ordered = [col for _, col in numbered]
    return len(ordered), ordered

def fatigue_coeff(stroke_key: str, distance_m: int) -> float:
    """Return the multiplier applied to n x (50 PB) to get a race target.

    The multiplier is a distance-dependent base plus a small stroke tweak.

    Args:
        stroke_key: Canonical stroke key; unknown keys get no tweak.
        distance_m: Race distance. Distances other than 100, 200, 400,
            800 and 1500 (including 50) use the fallback base of 1.10.

    Returns:
        base + tweak.
    """
    distance_bases = (
        ((100,), 1.045),
        ((200,), 1.090),
        ((400,), 1.150),
        ((800, 1500), 1.180),
    )
    base = 1.10
    for distances, value in distance_bases:
        if distance_m in distances:
            base = value
            break

    stroke_tweaks = {
        "FREE": 0.000,
        "BACK": 0.005,
        "FLY":  0.007,
        "BRST": 0.015,
        "IM":   0.010,
    }
    return base + stroke_tweaks.get(stroke_key, 0.0)

def target_time_from_pb50(stroke_key: str, distance_m: int, pb50_sec: float) -> float:
    """Estimate a race target time from a 50 PB.

    Args:
        stroke_key: Canonical stroke key, passed to fatigue_coeff.
        distance_m: Race distance; distance_m / 50 lengths are assumed.
        pb50_sec: Personal best for 50 in seconds.

    Returns:
        (distance_m / 50) * pb50_sec * fatigue_coeff(...), as a float.
    """
    lengths = distance_m / 50.0
    return float(lengths * pb50_sec * fatigue_coeff(stroke_key, distance_m))

def prepare_training_long(df: pd.DataFrame, stroke_key: str) -> Tuple[pd.DataFrame, pd.Series]:
    """Turn one-row-per-race data into one-row-per-lap training data.

    For each race the leading non-missing split columns are read (stopping
    at the first missing value). Races whose lap count differs from
    distance_m // 50 are skipped. The target for each lap is its split
    minus the mean split of that race.

    Args:
        df: Race table with a "distance_m" column and split/lap columns as
            recognised by count_split_columns.
        stroke_key: Value stored in the "stroke_key" feature column.

    Returns:
        (X, y): X has columns stroke_key, distance_m, n_laps, lap_idx,
        lap_idx_norm and lap_idx_sq; y is the Series "residual".

    Raises:
        ValueError: If "distance_m" or all split columns are missing, or if
            no race produced any rows.
    """
    if "distance_m" not in df.columns:
        raise ValueError("Training data must include 'distance_m' per race.")
    n_split_cols, split_cols = count_split_columns(df)
    if not n_split_cols:
        raise ValueError("No split columns detected. Use 'split_1', 'split_2', ... or 'lap_1', ...")

    feature_rows = []
    residual_targets = []
    for _, race in df.iterrows():
        distance = int(race["distance_m"])

        # Collect splits up to (not including) the first missing one.
        laps = []
        for col in split_cols:
            value = race.get(col, np.nan)
            if pd.isna(value):
                break
            laps.append(float(value))

        # Skip races whose recorded laps do not match the distance.
        if len(laps) != int(distance // 50):
            continue

        lap_times = np.array(laps, dtype=float)
        deviations = lap_times - lap_times.mean()
        lap_count = len(lap_times)
        for lap_no, deviation in enumerate(deviations, start=1):
            feature_rows.append({
                "stroke_key": stroke_key,
                "distance_m": distance,
                "n_laps": lap_count,
                "lap_idx": lap_no,
                "lap_idx_norm": lap_no / lap_count,
                "lap_idx_sq": (lap_no / lap_count) ** 2,
            })
            residual_targets.append(deviation)

    X = pd.DataFrame(feature_rows)
    y = pd.Series(residual_targets, name="residual")
    if len(X) == 0:
        raise ValueError(f"No valid rows produced for stroke {stroke_key}. Check your data.")
    return X, y

def build_estimator(estimator: str, params: Dict[str, Any]):
    """Instantiate the regressor named by a config entry.

    Args:
        estimator: "GBR" or "RF" (case-insensitive).
        params: Keyword arguments forwarded to the estimator constructor.

    Returns:
        A GradientBoostingRegressor for "GBR" or a RandomForestRegressor
        for "RF".

    Raises:
        ValueError: For any other estimator name.
    """
    kind = estimator.upper()
    if kind == "GBR":
        return GradientBoostingRegressor(**params)
    if kind == "RF":
        return RandomForestRegressor(**params)
    raise ValueError(f"Unsupported estimator: {estimator}")

def train_stroke_model(stroke: str, df: pd.DataFrame, cfg: Optional[Dict[str, Any]] = None) -> Tuple[Pipeline, Dict[str, Any], float]:
    """Fit the per-lap residual model for one stroke.

    Args:
        stroke: Stroke label accepted by norm_stroke_label.
        df: Race table accepted by prepare_training_long.
        cfg: Optional {"estimator": ..., "params": ...}; defaults to the
            DEFAULT_MODEL_CFG entry for the stroke.

    Returns:
        (pipe, meta, mae): the fitted scaler + estimator pipeline, a
        metadata dict (stroke_key, features, target, mae_valid) and the
        mean absolute error on the held-out rows. The error is NaN when
        there are too few rows to hold any out.

    Raises:
        ValueError: Propagated from label normalisation, estimator
            construction or data preparation.
    """
    key = norm_stroke_label(stroke)
    cfg = cfg or DEFAULT_MODEL_CFG[key]
    est = build_estimator(cfg["estimator"], cfg["params"])
    X, y = prepare_training_long(df, key)
    # Keep numeric features only: StandardScaler cannot convert the constant
    # string column "stroke_key", and fitting would fail on it.
    X = X.select_dtypes(include="number")
    pipe = Pipeline([
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("est", est)
    ])
    # Ordered 80/20 hold-out: the last rows form the validation set.
    split_idx = int(0.8 * len(X))
    if split_idx == 0:
        # A single row cannot be split; train on it and report no score.
        pipe.fit(X, y)
        mae = float("nan")
    else:
        X_tr, X_va = X.iloc[:split_idx], X.iloc[split_idx:]
        y_tr, y_va = y.iloc[:split_idx], y.iloc[split_idx:]
        pipe.fit(X_tr, y_tr)
        mae = float(mean_absolute_error(y_va, pipe.predict(X_va)))
    meta = {"stroke_key": key, "features": list(X.columns), "target": "residual", "mae_valid": mae}
    return pipe, meta, mae

from joblib import dump, load
def save_stroke_model(stroke_key: str, pipe: Pipeline, meta: Dict[str, Any]):
    """Persist a fitted pipeline and its metadata with joblib.

    The bundle {"pipe": ..., "meta": ...} is written to
    MODELS_DIR / "<stroke_key>_residual_model.joblib".
    """
    bundle = {"pipe": pipe, "meta": meta}
    dump(bundle, MODELS_DIR / f"{stroke_key}_residual_model.joblib")

def load_stroke_model(stroke_key: str) -> Optional[Dict[str, Any]]:
    """Load the saved model bundle for a stroke.

    Returns:
        The object stored in MODELS_DIR / "<stroke_key>_residual_model.joblib",
        or None when that file does not exist.
    """
    model_file = MODELS_DIR / f"{stroke_key}_residual_model.joblib"
    # NOTE(review): joblib.load unpickles the file, so only load model files
    # from a trusted models directory.
    return load(model_file) if model_file.exists() else None

def predict_residuals(stroke_key: str, distance_m: int, n_laps: int) -> np.ndarray:
    """Predict the per-lap deviation from the mean split.

    Args:
        stroke_key: Canonical stroke key selecting the saved model.
        distance_m: Race distance used as a feature.
        n_laps: Number of laps to predict.

    Returns:
        Float array of length n_laps. All zeros when no model has been
        saved for the stroke.
    """
    pack = load_stroke_model(stroke_key)
    if pack is None:
        # No trained model: fall back to a flat (even-split) shape.
        return np.zeros(n_laps, dtype=float)

    X = pd.DataFrame({
        "stroke_key": [stroke_key] * n_laps,
        "distance_m": [distance_m] * n_laps,
        "n_laps": [n_laps] * n_laps,
        "lap_idx": list(range(1, n_laps + 1)),
    })
    X["lap_idx_norm"] = X["lap_idx"] / n_laps
    X["lap_idx_sq"] = X["lap_idx_norm"] ** 2

    # Present exactly the columns the model was trained on, in that order;
    # features this function does not build are filled with 0.0.
    wanted = pack["meta"]["features"]
    for name in wanted:
        if name not in X.columns:
            X[name] = 0.0
    predictions = pack["pipe"].predict(X[wanted])
    return predictions.astype(float)

def shape_splits_to_target(residuals: np.ndarray, target_total: float, clip_sigma: float = 3.0) -> np.ndarray:
    """Turn predicted residuals into lap splits that sum to a target time.

    The residuals are centred first and only then clipped to
    +/- clip_sigma standard deviations, so a constant offset in the
    predictions cannot flatten the pacing shape. The clipped shape is
    re-centred, added to the even split, floored at 0.01 s and rescaled so
    the splits sum to target_total.

    Args:
        residuals: Per-lap deviations (any sequence convertible to a float
            array); the input is not modified.
        target_total: Desired total race time in seconds.
        clip_sigma: Clipping width in standard deviations.

    Returns:
        Float array of splits with the same length as residuals; an empty
        array for empty input.
    """
    # np.array copies, so the caller's data is never mutated, and it also
    # accepts lists and integer arrays.
    shape = np.array(residuals, dtype=float)
    n = shape.size
    if n == 0:
        return shape

    shape -= shape.mean()
    std = shape.std()
    if std <= 1e-9:
        std = 1.0
    shape = np.clip(shape, -clip_sigma * std, clip_sigma * std)
    # Clipping can be asymmetric, so restore a zero mean afterwards.
    shape -= shape.mean()

    splits = np.clip(target_total / n + shape, 0.01, None)
    # Flooring may have changed the sum; rescale to hit the target exactly.
    splits *= target_total / splits.sum()
    return splits

def _whole_race_splits(model_key: str, distance_m: int, n_laps: int,
                       pb50_sec: Optional[float], default_50: float) -> List[float]:
    """Shape one model's residuals to a whole-race target time."""
    if pb50_sec is None:
        target_total = float(distance_m / 50 * default_50)
    else:
        target_total = target_time_from_pb50(model_key, distance_m, pb50_sec)
    residuals = predict_residuals(model_key, distance_m, n_laps)
    return shape_splits_to_target(residuals, target_total).tolist()


def generate_ideal_splits(stroke: str, distance_m: int, pb50_sec: Optional[float] = None,
                          per_leg_pb50: Optional[Dict[str, float]] = None) -> List[float]:
    """Generate ideal per-50 splits for a race.

    Non-IM races use the stroke's residual model and a target derived from
    pb50_sec (or 40 s per 50 when it is omitted).

    200 and 400 IM are built leg by leg (FLY, BACK, BRST, FREE), each leg
    using its own stroke model and the matching per_leg_pb50 entry (45 s
    per 50 for legs without one). If only pb50_sec is supplied for such a
    race it is honoured through the whole-race IM model rather than being
    ignored. Other IM distances always use the whole-race IM model
    (45 s per 50 when pb50_sec is omitted).

    Args:
        stroke: Stroke label accepted by norm_stroke_label.
        distance_m: Race distance; int(distance_m // 50) laps are produced.
        pb50_sec: Optional 50 PB in seconds.
        per_leg_pb50: Optional 50 PBs keyed by "FLY", "BACK", "BRST",
            "FREE"; only used for 200 and 400 IM.

    Returns:
        The per-50 splits in seconds.

    Raises:
        ValueError: If the stroke is unknown or distance_m is below 50.
    """
    key = norm_stroke_label(stroke)
    n_laps = int(distance_m // 50)
    if n_laps < 1:
        raise ValueError(f"distance_m must be at least 50, got {distance_m}")

    if key != "IM":
        return _whole_race_splits(key, distance_m, n_laps, pb50_sec, 40.0)

    # Leg-by-leg shaping needs per-leg PBs, or no PB at all (pure defaults).
    leg_by_leg = distance_m in (200, 400) and (bool(per_leg_pb50) or pb50_sec is None)
    if not leg_by_leg:
        return _whole_race_splits("IM", distance_m, n_laps, pb50_sec, 45.0)

    leg_size = 1 if distance_m == 200 else 2
    leg_targets = []
    leg_splits_all = []
    for leg in ["FLY", "BACK", "BRST", "FREE"]:
        if per_leg_pb50 and leg in per_leg_pb50:
            leg_target = target_time_from_pb50(leg, leg_size * 50, per_leg_pb50[leg])
        else:
            leg_target = float(leg_size * 50 / 50 * 45.0)
        leg_targets.append(leg_target)
        res_leg = predict_residuals(leg, leg_size * 50, leg_size)
        leg_splits_all.extend(shape_splits_to_target(res_leg, leg_target).tolist())

    # Make the concatenated legs sum exactly to the combined leg targets.
    total = sum(leg_splits_all)
    desired = sum(leg_targets)
    scale = desired / total if total > 1e-9 else 1.0
    return [s * scale for s in leg_splits_all]

def _plot_actual_vs_ideal(stroke, distance_m, ideal_arr, actual_arr):
    """Draw ideal and actual per-50 splits on one Matplotlib figure."""
    plt.figure(figsize=(9, 4.5))
    lap_numbers = np.arange(1, len(ideal_arr) + 1)
    plt.plot(lap_numbers, ideal_arr, marker="o", label="Ideal (model)")
    plt.plot(lap_numbers, actual_arr, marker="s", label="Actual")
    plt.title(f"{stroke} {distance_m}m: Actual vs Ideal per-50 splits")
    plt.xlabel("Lap (50m)")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.tight_layout()
    plt.show()


def analyze_post_race(stroke: str, distance_m: int, actual_splits: List[float],
                      pb50_sec: Optional[float] = None,
                      per_leg_pb50: Optional[Dict[str, float]] = None,
                      plot: bool = True) -> Dict[str, Any]:
    """Compare swum splits with the model's ideal splits.

    Args:
        stroke: Stroke label accepted by generate_ideal_splits.
        distance_m: Race distance.
        actual_splits: Swum per-50 splits in seconds.
        pb50_sec: Optional 50 PB forwarded to generate_ideal_splits.
        per_leg_pb50: Optional per-leg 50 PBs forwarded likewise.
        plot: When true, show an actual-vs-ideal line plot.

    Returns:
        Dict with ideal_splits, actual_splits, delta_splits
        (actual - ideal), ideal_total, actual_total and delta_total.

    Raises:
        ValueError: If the number of actual splits differs from the number
            of ideal splits.
    """
    ideal = generate_ideal_splits(stroke, distance_m, pb50_sec, per_leg_pb50)
    if len(actual_splits) != len(ideal):
        raise ValueError(f"Lap count mismatch. Expected {len(ideal)} splits for {distance_m}m, got {len(actual_splits)}.")

    ideal_arr = np.array(ideal, dtype=float)
    actual_arr = np.array(actual_splits, dtype=float)
    delta = actual_arr - ideal_arr

    report = {
        "ideal_splits": ideal,
        "actual_splits": actual_splits,
        "delta_splits": delta.tolist(),
        "ideal_total": float(ideal_arr.sum()),
        "actual_total": float(actual_arr.sum()),
        "delta_total": float(delta.sum()),
    }

    if plot:
        _plot_actual_vs_ideal(stroke, distance_m, ideal_arr, actual_arr)
    return report

def load_training_csv_for_stroke(stroke: str) -> pd.DataFrame:
    """Read and normalise the training CSV of one stroke from DATA_DIR.

    Split columns are converted to float seconds (string cells containing
    ":" go through time_to_seconds) and "distance_m", when present, is
    cast to int.

    Args:
        stroke: Stroke label accepted by norm_stroke_label.

    Returns:
        The cleaned race table.

    Raises:
        FileNotFoundError: If the CSV for the stroke does not exist.
    """
    dataset_files = {
        "FREE": "Freestyle_dataset.csv",
        "BACK": "Backstroke_dataset.csv",
        "BRST": "Breaststroke_dataset.csv",
        "FLY":  "Butterfly_dataset.csv",
        "IM":   "IM_dataset.csv",
    }
    csv_path = DATA_DIR / dataset_files[norm_stroke_label(stroke)]
    if not csv_path.exists():
        raise FileNotFoundError(f"Missing training CSV for {stroke}: {csv_path}")
    df = pd.read_csv(csv_path)

    def _cell_to_seconds(cell):
        # Clock-style strings need parsing; everything else is numeric.
        if isinstance(cell, str) and ":" in cell:
            return time_to_seconds(cell)
        return float(cell)

    for col in count_split_columns(df)[1]:
        df[col] = df[col].apply(_cell_to_seconds)
    if "distance_m" in df.columns:
        df["distance_m"] = df["distance_m"].astype(int)
    return df

def train_all_strokes(save: bool = True) -> Dict[str, Dict[str, Any]]:
    """Train a residual model for every stroke that has a training CSV.

    Strokes whose CSV is missing are skipped without error.

    Args:
        save: When true, each fitted model is also written to disk with
            save_stroke_model.

    Returns:
        Mapping of canonical stroke key to
        {"pipe": ..., "meta": ..., "mae_valid": ...}.
    """
    trained = {}
    for stroke in VALID_STROKES:
        key = norm_stroke_label(stroke)
        try:
            frame = load_training_csv_for_stroke(stroke)
        except FileNotFoundError:
            # Best effort: no data for this stroke, move on to the next.
            continue
        pipe, meta, mae = train_stroke_model(stroke, frame)
        trained[key] = dict(pipe=pipe, meta=meta, mae_valid=mae)
        if save:
            save_stroke_model(key, pipe, meta)
    return trained

def _prompt_float(msg: str) -> float:
    """Prompt on stdin and return the reply as seconds.

    Replies containing ":" are parsed with time_to_seconds; anything else
    is converted with float().
    """
    reply = input(msg).strip()
    if ":" in reply:
        return time_to_seconds(reply)
    return float(reply)

def _prompt_pb_inputs(stroke: str):
    """Ask for the optional 50 PB(s); return (pb50, per_leg_pb50)."""
    if norm_stroke_label(stroke) == "IM":
        per_legs = None
        if input("Provide per-leg 50 PBs? [y/n]: ").strip().lower() == "y":
            per_legs = {
                leg: _prompt_float(f"Enter 50 PB for {leg} (sec or mm:ss.xx): ")
                for leg in ["FLY", "BACK", "BRST", "FREE"]
            }
        return None, per_legs
    have_pb = input("Provide 50 PB? [y/n]: ").strip().lower() == "y"
    pb50 = _prompt_float("Enter 50 PB (sec or mm:ss.xx): ") if have_pb else None
    return pb50, None


def interactive_cli():
    """Run the interactive pre-race / post-race prompt loop once.

    "pre" prints the predicted ideal per-50 splits; "post" asks for the
    swum splits, plots them against the ideal splits and prints total-time
    diagnostics. Any other mode is reported and nothing further is asked,
    so a typo cannot silently start the post-race flow.
    """
    print("== Competitive Pacing ==")
    mode = input("Choose mode [pre/post]: ").strip().lower()
    if mode not in ("pre", "post"):
        print(f"Unknown mode: {mode!r} (expected pre or post)")
        return
    stroke = input("Stroke (Freestyle/Backstroke/Breaststroke/Butterfly/IM): ").strip()
    distance_m = int(input("Distance (e.g., 100/200/400): ").strip())

    # Blank lines are emitted with a bare print() instead of a newline escape
    # inside the message, so this source stays valid when it is embedded in
    # another string literal.
    if mode == "pre":
        pb50, per_legs = _prompt_pb_inputs(stroke)
        splits = generate_ideal_splits(stroke, distance_m, pb50, per_legs)
        total = sum(splits)
        print()
        print(f"Predicted optimal per-50 splits for {stroke} {distance_m}m (Total {seconds_to_time(total)}):")
        print(", ".join(seconds_to_time(s) for s in splits))
        return

    actual = parse_splits_str(input("Enter actual per-50 splits (comma/space separated, sec or mm:ss.xx): "))
    pb50, per_legs = _prompt_pb_inputs(stroke)
    out = analyze_post_race(stroke, distance_m, actual, pb50, per_legs, plot=True)
    print()
    print("Diagnostics:")
    print(f"Ideal total:  {seconds_to_time(out['ideal_total'])}")
    print(f"Actual total: {seconds_to_time(out['actual_total'])}")
    d = out['delta_total']
    print(f"Delta total:  {'+' if d>=0 else ''}{d:.2f} s")

if __name__ == "__main__":
    # Running the file directly only prints a short usage banner.
    for banner_line in (
        "This module provides training, prediction, and analysis utilities for Competitive Pacing.",
        "Functions available: train_all_strokes(), generate_ideal_splits(), analyze_post_race(), interactive_cli()",
    ):
        print(banner_line)
''')
print('Wrote', module_path)


In [ ]:
# Imports
import pandas as pd
from pathlib import Path
import numpy as np
from competitive_pacing_updated import (
    train_all_strokes, generate_ideal_splits, analyze_post_race, seconds_to_time
)

# Ensure the input and output folders exist (relative to the working directory)
for folder in ('data', 'models'):
    Path(folder).mkdir(exist_ok=True)
print('Environment ready.')


## Create tiny dummy datasets
We synthesize a few races per stroke so you can train immediately. Replace these with your real CSVs later.

In [ ]:
import random
def make_dataset(stroke_name, distances=(100, 200, 400), n_races=6, base_50=28.0, seed=None):
    """Synthesize a small race table with per-50 splits.

    Each race gets a linear ramp from -0.6 s to +0.6 s around base_50 plus
    Gaussian noise (sigma 0.15 s) on every lap.

    Args:
        stroke_name: Label for the caller's benefit only; it is not used
            when generating the data.
        distances: Race distances in metres; each yields distance // 50 laps.
        n_races: Number of races generated per distance.
        base_50: Mean 50 split in seconds.
        seed: Optional seed for reproducible noise. When None, the global
            NumPy random state is used.

    Returns:
        DataFrame with columns split_1..split_k and distance_m; races
        shorter than the longest distance have NaN in the unused splits.
    """
    rng = np.random if seed is None else np.random.default_rng(seed)
    rows = []
    for d in distances:
        laps = int(d) // 50
        for _ in range(n_races):
            # create a gentle shape around base
            shape = np.linspace(-0.6, 0.6, laps) + rng.normal(0, 0.15, laps)
            splits = base_50 + shape
            row = {f'split_{i+1}': float(s) for i, s in enumerate(splits)}
            row['distance_m'] = int(d)
            rows.append(row)
    return pd.DataFrame(rows)

# (stroke, distances, races per distance, mean 50 split) for the single-stroke files
single_stroke_specs = [
    ('Freestyle', [100,200,400], 8, 27.5),
    ('Backstroke', [100,200], 8, 30.0),
    ('Butterfly', [100,200], 8, 29.8),
    ('Breaststroke', [100,200], 8, 33.0),
]
for stroke_name, stroke_distances, races, mean_50 in single_stroke_specs:
    stroke_df = make_dataset(stroke_name, stroke_distances, races, base_50=mean_50)
    stroke_df.to_csv(f'data/{stroke_name}_dataset.csv', index=False)
# IM: synthesize 200 IM as 4x50
im_df = make_dataset('IM', [200], 10, base_50=32.0)
im_df.to_csv('data/IM_dataset.csv', index=False)
print('Dummy datasets written to ./data')


## Train models (shape learning)
This will save per-stroke residual models into `./models`.

In [ ]:
# Fit a residual model for every stroke that has a CSV under ./data and save each one
results = train_all_strokes(save=True)
# Bare trailing expression so the notebook displays the returned dict
results


## Pre-race: generate ideal splits
Example: 200m Freestyle with a 50 PB of 26.5s.

In [ ]:
# Ideal per-50 splits for a 200 m freestyle; the target total is derived from the 26.5 s 50 PB
splits_fr_200 = generate_ideal_splits('Freestyle', 200, pb50_sec=26.5)
# Display the formatted splits together with their summed total in seconds
[seconds_to_time(s) for s in splits_fr_200], sum(splits_fr_200)


## IM example with per-leg PBs
We pass 50 PBs for Fly/Back/Breast/Free to shape each leg for a 200 IM.

In [ ]:
# 50 PBs in seconds, keyed by the canonical leg keys the module expects
legs = {'FLY': 27.4, 'BACK': 29.2, 'BRST': 32.9, 'FREE': 25.1}
# A 200 IM is built as four single-50 legs, one per entry above
splits_im_200 = generate_ideal_splits('IM', 200, per_leg_pb50=legs)
# Display the formatted splits together with their summed total in seconds
[seconds_to_time(s) for s in splits_im_200], sum(splits_im_200)


## Post-race analysis with plot
Compare Actual vs Ideal splits for a sample 200 Free.

In [ ]:
# Swum per-50 splits in seconds (four laps for a 200)
actual_200fr = [27.8, 28.4, 29.1, 29.5]
# Compare against the ideal splits for the same 50 PB and draw the plot
diag = analyze_post_race('Freestyle', 200, actual_200fr, pb50_sec=26.5, plot=True)
# Display the diagnostics dict (ideal/actual/delta splits and totals)
diag


### Notes
- Replace the dummy CSVs in `data/` with your real race datasets.
- Re-run the **Train models** cell to fit on your data.
- All plots use pure Matplotlib (no seaborn), with a single figure per plot.