In [1]:
# Competitive Pacing — updated single-cell notebook
# -------------------------------------------------
# What changed vs last version?
# • Reads your uploaded files by default:
#     /mnt/data/freestyle_dataset.csv
#     /mnt/data/backstroke_dataset.csv
#     /mnt/data/breaststroke_dataset.csv
#     /mnt/data/butterfly_dataset.csv
#     /mnt/data/im_dataset.csv
#     /mnt/data/best_models_by_stroke.csv   (leaderboard/best-model hints)
#   ...and still falls back to ./datasets/*.csv if you move them there.
# • Uses "best model by stroke" (if provided) to choose a regressor for each stroke.
#   Supported strings (case-insensitive): randomforest, gradientboosting, ridge, lasso,
#   elasticnet, svr, knn. Unknown → defaults to RandomForest.
# • Leaderboard file ALSO doubles as pacing baseline if it contains ratio_* or split_* columns
#   per (stroke, distance), or a "splits" semicolon string. Otherwise it’s treated only as model hints.
#
# Usage:
#   1) Put CSVs in /mnt/data or ./datasets.
#   2) Run this cell.
#   3) list_available_events()
#   4) cli_loop()   # interactive Pre-race / Post-race CLI
#
# Requirements:
#   pip install pandas numpy scikit-learn matplotlib

import os, re, sys, math, textwrap, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import Dict, Tuple, List, Optional
from collections import defaultdict

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor

warnings.filterwarnings("ignore")

# -------------------------------
# Utilities: parsing + formatting
# -------------------------------

# Accepted stroke spellings (already lowercased / punctuation-free, see
# normalize_stroke) mapped to one of the five canonical stroke names:
# freestyle, backstroke, breaststroke, butterfly, im.
STROKE_ALIASES = {
    "free": "freestyle", "freestyle": "freestyle", "fr": "freestyle", "fs": "freestyle",
    "back": "backstroke", "backstroke": "backstroke", "bk": "backstroke",
    "breast": "breaststroke", "breaststroke": "breaststroke", "br": "breaststroke",
    "fly": "butterfly", "butterfly": "butterfly", "bf": "butterfly", "fl": "butterfly",
    "im": "im", "medley": "im", "individual medley": "im", "individualmedley": "im"
}

# Candidate column names for each logical field. The extract_*_column helpers
# scan these lists in order and return the first name present in a frame.
TOTAL_TIME_CANDIDATES = ["time_total","total_time","final_time","result_time","time","seed_time","pb_time","official_time"]
DISTANCE_CANDIDATES   = ["distance","event_distance","dist","meters","metres"]
STROKE_CANDIDATES     = ["stroke","event_stroke","style","st"]
# Column names whose cells hold all splits of a race in one delimited string.
SPLITS_BUNDLE         = ["splits","split_string","lap_splits"]

def normalize_stroke(x: str) -> Optional[str]:
    """Map a free-form stroke label to its canonical name.

    The label is lowercased, hyphens become spaces, every character other
    than a-z and space is dropped, and whitespace is collapsed and trimmed
    before the lookup in STROKE_ALIASES.

    Args:
        x: Raw label (e.g. "Free", "individual-medley", "100 Fly").

    Returns:
        One of the canonical names in STROKE_ALIASES' values, or None when the
        input is None/NaN or the cleaned label is not a known alias.
    """
    if x is None or (isinstance(x, float) and math.isnan(x)):
        return None
    s = str(x).strip().lower().replace("-", " ")
    s = re.sub(r'[^a-z ]+', '', s)
    # Removing digits/punctuation (or a trailing hyphen) can leave new leading
    # or trailing spaces, e.g. "100 free" -> " free"; trim again so such
    # labels still match an alias.
    s = re.sub(r'\s+', ' ', s).strip()
    return STROKE_ALIASES.get(s, s if s in STROKE_ALIASES.values() else None)

def parse_time_to_seconds(t) -> Optional[float]:
    """Parse a swim time into seconds.

    Accepted forms, tried in this order:
      * a plain number ("65.23", 65.23)
      * "M:SS(.xx)" or "H:MM:SS(.xx)"
      * a number with stray non-numeric characters ("32.5s"), recovered by
        keeping only digits and dots

    Args:
        t: Value to parse; anything is stringified first.

    Returns:
        Seconds as a float, or None when the value is None/NaN/blank, cannot
        be parsed, or parses to a non-finite number ("inf", "nan").
    """
    if t is None or (isinstance(t, float) and math.isnan(t)):
        return None
    s = str(t).strip()
    if not s:
        return None

    def _finite(v: float) -> Optional[float]:
        # float("inf") / float("nan") parse successfully but are not usable times.
        return v if math.isfinite(v) else None

    # direct float
    try:
        return _finite(float(s))
    except ValueError:
        pass

    # M:SS(.xx) or H:MM:SS(.xx)
    if ":" in s:
        parts = s.split(":")
        # Anything else containing ':' is rejected outright; stripping the
        # colons would silently glue the fields into one bogus number.
        if len(parts) not in (2, 3):
            return None
        try:
            fields = [float(p) for p in parts]
        except ValueError:
            return None
        total = 0.0
        for value in fields:
            total = total * 60.0 + value
        return _finite(total)

    # Last resort: drop units or other decoration around a number.
    s2 = re.sub(r'[^0-9\.]', '', s)
    try:
        return _finite(float(s2))
    except ValueError:
        return None

def seconds_to_time_str(sec: float) -> str:
    """Format a duration in seconds as "SS.xx" or "M:SS.xx".

    Args:
        sec: Duration in seconds.

    Returns:
        "NA" for None or a non-finite value; "SS.xx" when the value rounded to
        hundredths is below 60; otherwise "M:SS.xx".
    """
    if sec is None or not np.isfinite(sec):
        return "NA"
    # Round to hundredths before picking the layout. Otherwise 59.999 prints
    # as "60.00" and 119.999 as "1:60.00", because the format spec rounds
    # only after the minutes/seconds split has been made.
    sec = round(float(sec), 2)
    if sec < 60:
        return f"{sec:0.2f}"
    m = int(sec // 60)
    s = sec - 60*m
    return f"{m}:{s:05.2f}"

def parse_splits_from_string(s: str) -> List[float]:
    """Split a ';'- or ','-delimited string into positive split times (seconds).

    Blank tokens, tokens that parse_time_to_seconds cannot read, and
    non-positive values are skipped. None/NaN input yields an empty list.
    """
    is_missing = s is None or (isinstance(s, float) and math.isnan(s))
    if is_missing:
        return []

    parsed = []
    # Commas are treated as just another separator.
    for token in str(s).replace(",", ";").split(";"):
        token = token.strip()
        if not token:
            continue
        seconds = parse_time_to_seconds(token)
        if seconds is not None and seconds > 0:
            parsed.append(seconds)
    return parsed

def infer_split_columns(df: pd.DataFrame) -> List[str]:
    """Return the columns of `df` that carry split times.

    If any SPLITS_BUNDLE name is a column, only the first such name (in
    SPLITS_BUNDLE order) is returned. Otherwise every column whose name starts
    with split/lap/l50/s50/fifty (case-insensitive) is returned in frame order.
    """
    # bundled?
    bundled = next((name for name in SPLITS_BUNDLE if name in df.columns), None)
    if bundled is not None:
        return [bundled]

    # split-like columns
    split_like = re.compile(r'^(split|lap|l50|s50|fifty)(_)?(\d+)?', re.IGNORECASE)
    return [col for col in df.columns if split_like.match(str(col))]

def extract_total_time_column(df: pd.DataFrame) -> Optional[str]:
    """Pick the column holding the total race time, or None if there is none.

    TOTAL_TIME_CANDIDATES are tried in order; failing that, the first column
    whose lowercased name equals "result" is used.
    """
    known = [name for name in TOTAL_TIME_CANDIDATES if name in df.columns]
    if known:
        return known[0]
    return next((col for col in df.columns if str(col).lower() == "result"), None)

def extract_distance_column(df: pd.DataFrame) -> Optional[str]:
    """Return the first DISTANCE_CANDIDATES name present in `df`, else None."""
    present = [name for name in DISTANCE_CANDIDATES if name in df.columns]
    return present[0] if present else None

def extract_stroke_column(df: pd.DataFrame) -> Optional[str]:
    """Return the first STROKE_CANDIDATES name present in `df`, else None."""
    present = [name for name in STROKE_CANDIDATES if name in df.columns]
    return present[0] if present else None

def n50s_for_distance(distance: int) -> int:
    """Number of whole 50s in `distance`; never negative."""
    whole_fifties = int(distance // 50)
    return whole_fifties if whole_fifties > 0 else 0

def softmax_like(x: np.ndarray, eps: float = 1e-9) -> np.ndarray:
    """Floor the values at `eps` and rescale them so they sum to 1.

    No exponential is applied; this is a plain normalization. When the floored
    sum does not exceed `eps`, equal weights are returned instead.
    """
    floored = np.maximum(np.array(x, dtype=float), eps)
    denom = floored.sum()
    if denom <= eps:
        return np.ones_like(floored) / len(floored)
    return floored / denom

# ---------------------------------------
# Data loading, cleaning, and harmonizing
# ---------------------------------------

def load_csv_if_exists(path) -> Optional[pd.DataFrame]:
    """Read a CSV and normalize its header, or return None.

    Column names are stripped, lowercased, and internal whitespace runs are
    replaced by a single underscore. A missing file returns None silently; any
    error while checking or reading prints a warning and returns None.
    """
    try:
        if not os.path.exists(path):
            return None
        frame = pd.read_csv(path)
        frame.columns = [re.sub(r'\s+', '_', str(name).strip().lower()) for name in frame.columns]
    except Exception as e:
        print(f"[warn] Could not read {path}: {e}")
        return None
    return frame

def resample_splits(splits: List[float], n_target: int) -> List[float]:
    """Redistribute the total of `splits` evenly over `n_target` splits.

    Only the total time is preserved; the original pacing shape is not.
    Returns [] when `n_target` is not positive or the total is not positive.
    """
    combined = sum(splits) if splits else 0.0
    if combined <= 0 or n_target <= 0:
        return []
    # equal redistribution; avoids distortion when input counts differ
    even_share = combined / n_target
    return [even_share for _ in range(n_target)]

def extract_splits_from_row(df: pd.DataFrame, row: pd.Series) -> List[float]:
    """Collect the positive split times (seconds) recorded in one row.

    A bundled-splits column is parsed as a delimited string. Otherwise the
    split-like columns are read in order of their trailing number (columns
    without one go last), skipping missing, unparseable and non-positive cells.
    """
    candidates = infer_split_columns(df)
    if not candidates:
        return []

    if len(candidates) == 1:
        only = candidates[0]
        if only in SPLITS_BUNDLE and only in df.columns:
            return parse_splits_from_string(row[only])

    def trailing_index(name):
        # sort by trailing number if present
        digits = re.search(r'(\d+)$', str(name))
        return 9999 if digits is None else int(digits.group(1))

    collected = []
    for name in sorted(candidates, key=trailing_index):
        cell = row.get(name, None)
        cell_missing = cell is None or (isinstance(cell, float) and math.isnan(cell))
        if cell_missing:
            continue
        seconds = parse_time_to_seconds(cell)
        if seconds is not None and seconds > 0:
            collected.append(seconds)
    return collected

def standardize_event_rows(df: pd.DataFrame, default_stroke: Optional[str]) -> pd.DataFrame:
    """Harmonize one raw results table into the schema used for training.

    Columns added to the returned copy:
      * ``stroke``          canonical stroke name (falls back to
                            `default_stroke` when the table has no stroke
                            column or none of its values can be normalized)
      * ``distance``        numeric distance; inferred as 50 x number of
                            splits when the table has no distance column
      * ``time_total_sec``  total time in seconds; the sum of the splits when
                            the table has no total-time column
      * ``split_<i>_sec`` / ``ratio_<i>``  per-50 split time and its share of
                            ``time_total_sec``
      * ``_feature_cols``   the same list on every row: ``time_total_sec``
                            plus the remaining numeric columns

    Rows with a missing stroke, distance or total, a distance that is not a
    multiple of 50, or a non-positive total are dropped. Rows without usable
    splits get equal splits; rows whose split count differs from distance/50
    are replaced by equal splits with the same sum (see resample_splits).

    Args:
        df: Raw table; not modified.
        default_stroke: Stroke to assume when the table cannot provide one.

    Returns:
        A new DataFrame with the columns described above.
    """
    df = df.copy()

    # Raw columns consumed by the harmonization below. They are kept out of
    # the feature list: raw split/lap columns would leak the targets into the
    # inputs, and the raw distance/total columns merely duplicate harmonized
    # ones. None of them can be supplied at prediction time.
    consumed = set(infer_split_columns(df))

    # stroke
    sc = extract_stroke_column(df)
    if sc is None:
        df["stroke"] = default_stroke
    else:
        df["stroke"] = df[sc].apply(normalize_stroke)
        if default_stroke and df["stroke"].isna().all():
            df["stroke"] = default_stroke

    # distance
    dc = extract_distance_column(df)
    if dc is None:
        split_cols = infer_split_columns(df)
        if split_cols:
            dists = []
            for _, r in df.iterrows():
                n = len(extract_splits_from_row(df, r))
                dists.append(n*50 if n > 0 else np.nan)
            df["distance"] = dists
        else:
            df["distance"] = np.nan
    else:
        df["distance"] = pd.to_numeric(df[dc], errors="coerce")

    # total time
    tc = extract_total_time_column(df)
    if tc is None:
        totals = []
        for _, r in df.iterrows():
            splits = extract_splits_from_row(df, r)
            totals.append(sum(splits) if splits else np.nan)
        df["time_total_sec"] = totals
    else:
        df["time_total_sec"] = df[tc].apply(parse_time_to_seconds)

    consumed.update(c for c in (sc, dc, tc) if c is not None)

    # validity
    df = df[~df["stroke"].isna()]
    df = df[~df["distance"].isna()]
    df = df[~df["time_total_sec"].isna()]
    df = df[df["distance"] % 50 == 0]
    df = df[df["time_total_sec"] > 0]
    # Detach from the filtered parent so the column assignments below write to
    # an independent frame rather than to a slice.
    df = df.copy()

    # build split & ratio columns
    split_list, ratio_list, max_s = [], [], 0
    for _, row in df.iterrows():
        splits = extract_splits_from_row(df, row)
        total = row["time_total_sec"]
        n_expected = n50s_for_distance(int(row["distance"]))
        if not splits:
            # No split data at all: assume even pacing.
            splits = [total / n_expected]*n_expected if n_expected > 0 else []
        elif n_expected > 0 and len(splits) != n_expected:
            splits = resample_splits(splits, n_expected)
        ratios = [s/total for s in splits] if total > 0 else []
        split_list.append(splits)
        ratio_list.append(ratios)
        max_s = max(max_s, len(splits))

    for i in range(max_s):
        df[f"split_{i+1}_sec"] = [(v[i] if i < len(v) else np.nan) for v in split_list]
        df[f"ratio_{i+1}"]     = [(v[i] if i < len(v) else np.nan) for v in ratio_list]

    # feature columns (numeric, excluding targets and consumed raw columns)
    exclude_prefixes = ("split_", "ratio_")
    numeric_cols = []
    for c in df.columns:
        if c in ["stroke", "distance", "time_total_sec"]: continue
        if c in consumed: continue
        if c.startswith(exclude_prefixes): continue
        if pd.api.types.is_numeric_dtype(df[c]): numeric_cols.append(c)
    feature_cols = ["time_total_sec"] + numeric_cols
    df["_feature_cols"] = [feature_cols]*len(df)
    return df

# -----------------------------------
# Leaderboards / Best-models parsing
# -----------------------------------

def load_leaderboard_and_models(path: str):
    """
    Load pacing baselines and per-stroke model hints from one CSV.

    Returns:
      leaderboard: Dict[(stroke, distance) -> ratio list]  # if present
      model_hints: Dict[stroke -> model_name]              # if present
    Both are empty when the file is missing, unreadable or empty.

    Accepts flexible schema:
      • stroke, distance, ratio_1..ratio_k   (optionally prefixed avg_)
      • stroke, distance, split_1..split_k   (converted to ratios)
      • stroke, distance, splits (semicolon string)
      • stroke, model

    Numbered ratio/split columns are read in order of their index, not in the
    order they appear in the file. When several rows share a
    (stroke, distance), the last one wins. If ratio columns exist, split
    columns and the bundled "splits" column are ignored.
    """
    leaderboard = {}
    model_hints = {}
    df = load_csv_if_exists(path)
    if df is None or df.empty:
        return leaderboard, model_hints

    # normalize stroke
    has_stroke = "stroke" in df.columns
    if has_stroke:
        df["stroke"] = df["stroke"].apply(normalize_stroke)

    # model hints
    if has_stroke and "model" in df.columns:
        for _, r in df.dropna(subset=["stroke", "model"]).iterrows():
            model_hints[r["stroke"]] = str(r["model"]).strip().lower()

    # ratios / splits per (stroke, distance)
    if not (has_stroke and "distance" in df.columns):
        return leaderboard, model_hints

    tmp = df.dropna(subset=["stroke", "distance"]).copy()
    tmp["distance"] = pd.to_numeric(tmp["distance"], errors="coerce")
    tmp = tmp.dropna(subset=["distance"])

    def _numbered(pattern):
        # Columns matching `pattern`, ordered by their embedded index so that
        # e.g. ratio_10 follows ratio_2 whatever the CSV column order is.
        hits = [c for c in tmp.columns if re.match(pattern, str(c))]
        return sorted(hits, key=lambda c: int(re.search(r'\d+', str(c)).group()))

    def _store(r, values):
        # Normalize to shares summing to 1 and record under the event key.
        if values:
            leaderboard[(r["stroke"], int(r["distance"]))] = softmax_like(np.array(values)).tolist()

    ratio_cols = _numbered(r'^(avg_)?ratio_\d+$')
    if ratio_cols:
        for _, r in tmp.iterrows():
            # Coerce first: a stray text cell would make np.isfinite raise.
            cells = pd.to_numeric(pd.Series([r[c] for c in ratio_cols]), errors="coerce")
            _store(r, [float(x) for x in cells if np.isfinite(x)])
    else:
        # split columns?
        split_cols = _numbered(r'^(avg_)?split_\d+(_sec)?$')
        if split_cols:
            for _, r in tmp.iterrows():
                splits = [parse_time_to_seconds(r[c]) for c in split_cols]
                _store(r, [x for x in splits if x is not None and x > 0])
        # bundled splits? (overrides split columns for rows where it parses)
        if "splits" in tmp.columns:
            for _, r in tmp.iterrows():
                _store(r, parse_splits_from_string(r["splits"]))

    return leaderboard, model_hints

# -------------------------
# Model store + training
# -------------------------

def make_base_estimator(name: str):
    """Build the single-output regressor named by `name`.

    The name is matched case-insensitively after removing every character that
    is not a letter or digit and dropping a trailing "regressor", so
    "random_forest", "Random Forest" and "RandomForestRegressor" all resolve
    to the same estimator.

    Recognized names: randomforest/rf, gradientboosting/gb/gbr, ridge, lasso,
    elasticnet/enet, svr/svm, knn/kneighbors.

    Args:
        name: Model name; None or empty is allowed.

    Returns:
        A new, unfitted estimator. Unknown or empty names yield the
        RandomForest default.
    """
    key = re.sub(r'[^a-z0-9]+', '', str(name or "").lower())
    # Accept class-style names such as "GradientBoostingRegressor".
    if key.endswith("regressor"):
        key = key[:-len("regressor")]

    def _random_forest():
        return RandomForestRegressor(n_estimators=250, random_state=42, min_samples_leaf=3, n_jobs=-1)

    # Factories rather than instances, so only the requested estimator is built.
    factories = {
        ("randomforest", "rf"): _random_forest,
        ("gradientboosting", "gb", "gbr"): lambda: GradientBoostingRegressor(random_state=42),
        ("ridge",): lambda: Ridge(alpha=1.0, random_state=42),
        ("lasso",): lambda: Lasso(alpha=0.0005, random_state=42, max_iter=20000),
        ("elasticnet", "enet"): lambda: ElasticNet(alpha=0.0005, l1_ratio=0.3, random_state=42, max_iter=20000),
        ("svr", "svm"): lambda: SVR(kernel="rbf", C=10.0, gamma="scale"),
        ("knn", "kneighbors"): lambda: KNeighborsRegressor(n_neighbors=7, weights="distance"),
    }
    for aliases, factory in factories.items():
        if key in aliases:
            return factory()
    # default
    return _random_forest()

class SplitModelStore:
    """
    Stores per-(stroke, distance) multi-output models that predict split ratios.
    Chooses regressor per stroke using model_hints if available.

    predict_ratios uses, in order of preference: the trained model for the
    event, the leaderboard baseline for the event, then a linear heuristic.
    """
    def __init__(self):
        self.models: Dict[Tuple[str,int], MultiOutputRegressor] = {}
        self.feature_cols: Dict[Tuple[str,int], List[str]] = {}
        self.output_dims: Dict[Tuple[str,int], int] = {}
        self.leaderboard: Dict[Tuple[str,int], List[float]] = {}
        self.model_hints: Dict[str, str] = {}
        # Per-event medians of the training features; used to fill in any
        # feature the caller cannot supply at prediction time.
        self.feature_defaults: Dict[Tuple[str,int], Dict[str,float]] = {}

    def set_leaderboard(self, lb: Dict[Tuple[str,int], List[float]]):
        """Replace the leaderboard baselines (None clears them)."""
        self.leaderboard = lb or {}

    def set_model_hints(self, hints: Dict[str, str]):
        """Replace the per-stroke model hints; names are stripped and lowercased."""
        self.model_hints = {k: (v or "").strip().lower() for k,v in (hints or {}).items()}

    def _estimator_for_stroke(self, stroke: str):
        """Build the base estimator hinted for `stroke` (default when unhinted)."""
        return make_base_estimator(self.model_hints.get(stroke, ""))

    def train_from_dataframe(self, df: pd.DataFrame):
        """Fit one model per (stroke, distance) group of `df`.

        Expects the columns produced by standardize_event_rows. A group is
        skipped when its stroke is unknown, its distance is not a positive
        multiple of 50, its ratio columns or feature columns are missing, or
        no row is complete after dropping NaN/inf values.
        """
        if df is None or df.empty: return
        for (stroke, dist), g in df.groupby(["stroke","distance"]):
            stroke = normalize_stroke(stroke)
            if not stroke: continue
            dist = int(dist) if pd.notna(dist) else None
            if not dist or dist % 50 != 0: continue
            n_out = n50s_for_distance(dist)
            if n_out <= 0: continue  # negative distances have no splits to learn
            feat_cols = g["_feature_cols"].iloc[0] if "_feature_cols" in g.columns else ["time_total_sec"]
            feat_cols = [c for c in feat_cols if c in g.columns]
            target_cols = [f"ratio_{i+1}" for i in range(n_out)]
            # Skip rather than raise KeyError / fit on nothing.
            if not feat_cols or any(c not in g.columns for c in target_cols):
                continue
            X = g[feat_cols].apply(pd.to_numeric, errors="coerce").replace([np.inf,-np.inf], np.nan)
            Y = g[target_cols].apply(pd.to_numeric, errors="coerce").replace([np.inf,-np.inf], np.nan)

            XY = X.join(Y).dropna()
            if XY.empty: continue
            Xc = XY[feat_cols].values
            Yc = XY[target_cols].values
            # Re-normalize targets so every row's ratios sum to 1.
            row_sums = Yc.sum(axis=1, keepdims=True)
            row_sums[row_sums==0] = 1.0
            Yc = Yc / row_sums

            base = self._estimator_for_stroke(stroke)
            model = MultiOutputRegressor(base)
            model.fit(Xc, Yc)
            key = (stroke, dist)
            self.models[key] = model
            self.feature_cols[key] = feat_cols
            self.output_dims[key] = n_out
            self.feature_defaults[key] = {c: float(v) for c, v in XY[feat_cols].median().items()}

    def predict_ratios(self, stroke: str, distance: int, total_time_sec: float, extra_features: Optional[Dict[str,float]] = None) -> List[float]:
        """Predict the share of `total_time_sec` spent on each 50.

        Args:
            stroke: Stroke label (normalized internally).
            distance: Race distance; distance // 50 ratios are returned.
            total_time_sec: Total time the ratios are predicted for.
            extra_features: Optional values for other trained feature columns;
                unknown keys and non-finite values are ignored.

        Returns:
            Non-negative ratios summing to 1 (a numpy array).
        """
        key = (normalize_stroke(stroke), int(distance))
        n_out = n50s_for_distance(distance)
        feat_cols = self.feature_cols.get(key, ["time_total_sec"])
        defaults = self.feature_defaults.get(key, {})
        feats = {c: np.nan for c in feat_cols}
        feats["time_total_sec"] = float(total_time_sec)
        if extra_features:
            for k,v in extra_features.items():
                if k in feats and v is not None and np.isfinite(v):
                    feats[k] = float(v)
        # Fill unknown features with the training median rather than 0.0: a
        # zero is usually far outside the range the model was fitted on.
        Xrow = np.array([[feats[c] if np.isfinite(feats[c]) else defaults.get(c, 0.0) for c in feat_cols]])

        if key in self.models:
            pred = self.models[key].predict(Xrow)[0]
            ratios = softmax_like(pred)
            if len(ratios) != n_out:
                ratios = _resize_ratios(ratios, n_out)
            return ratios

        if key in self.leaderboard:
            ratios = np.array(self.leaderboard[key], dtype=float)
            return softmax_like(_resize_ratios(ratios, n_out))

        # Heuristic fallback: linearly varying weights, normalized to sum to 1.
        if distance <= 200:
            base = np.linspace(1.05, 0.95, n_out)
        else:
            base = np.linspace(0.98, 1.02, n_out)
        return softmax_like(base)

def _resize_ratios(r: np.ndarray, n_out: int) -> np.ndarray:
    r = np.array(r, dtype=float)
    if n_out <= 0: return np.array([])
    if len(r) == n_out: return r
    if len(r) <= 0: return np.ones(n_out)/n_out
    # simple even redistribution
    return np.ones(n_out)/n_out

# ----------------------------
# Data ingestion from datasets
# ----------------------------

def first_existing(*paths) -> Optional[str]:
    """Return the first non-empty path that exists on disk, or None."""
    return next((candidate for candidate in paths if candidate and os.path.exists(candidate)), None)

def load_all_datasets() -> pd.DataFrame:
    """Load and standardize every per-stroke dataset that can be found.

    For each stroke the first existing candidate path is used; /mnt/data is
    preferred over ./datasets. Strokes with no file, an unreadable file or an
    empty table are skipped.

    Returns:
        The standardized frames concatenated with a fresh index, or an empty
        DataFrame when nothing was loaded.
    """
    candidates_by_stroke = [
        ("freestyle", ("/mnt/data/freestyle_dataset.csv", "./datasets/freestyle.csv", "./datasets/freestyle_dataset.csv")),
        ("backstroke", ("/mnt/data/backstroke_dataset.csv", "./datasets/backstroke.csv", "./datasets/backstroke_dataset.csv")),
        ("breaststroke", ("/mnt/data/breaststroke_dataset.csv", "./datasets/breaststroke.csv", "./datasets/breaststroke_dataset.csv")),
        ("butterfly", ("/mnt/data/butterfly_dataset.csv", "./datasets/butterfly.csv", "./datasets/butterfly_dataset.csv")),
        ("im", ("/mnt/data/im_dataset.csv", "./datasets/im.csv", "./datasets/im_dataset.csv")),
    ]

    standardized = []
    for stroke_name, candidates in candidates_by_stroke:
        location = first_existing(*candidates)
        if not location:
            continue
        raw = load_csv_if_exists(location)
        if raw is None or raw.empty:
            continue
        standardized.append(standardize_event_rows(raw, default_stroke=stroke_name))

    if not standardized:
        return pd.DataFrame()
    return pd.concat(standardized, ignore_index=True)

def load_leaderboard_and_hints():
    """Load (leaderboard, model_hints) from the first leaderboard file found.

    Returns two empty dicts when none of the candidate files exists.
    """
    found = first_existing(
        "/mnt/data/best_models_by_stroke.csv",
        "./datasets/best_models_by_stroke.csv",
        "./datasets/leaderboards.csv",
    )
    if not found:
        return ({}, {})
    return load_leaderboard_and_models(found)

# ----------------------------
# CLI Helpers (Notebook-safe)
# ----------------------------

def ask(prompt: str, cast=str, allow_blank=False):
    """Prompt until the user gives an answer that `cast` accepts.

    Args:
        prompt: Text shown to the user.
        cast: Callable applied to the stripped answer; any exception it raises
            counts as invalid input and triggers a re-prompt.
        allow_blank: When True, a blank answer returns None.

    Returns:
        `cast(answer)`, or None for a blank answer when `allow_blank` is True.
    """
    while True:
        val = input(prompt).strip()
        if not val:
            if allow_blank:
                return None
            # A blank answer is never valid for a required prompt. Without
            # this check cast=str would accept "" and pass it downstream.
            print("  Invalid input, try again.")
            continue
        try:
            return cast(val)
        except Exception:
            print("  Invalid input, try again.")

def parse_stroke_input(s: str) -> str:
    """Normalize a user-entered stroke; raise ValueError if it is not recognized."""
    canonical = normalize_stroke(s)
    if canonical:
        return canonical
    raise ValueError("Unknown stroke. Use Freestyle, Backstroke, Breaststroke, Butterfly, or IM.")

def parse_distance_input(s: str) -> int:
    """Parse a user-entered distance into an int.

    The text is read as a float and truncated. Raises ValueError unless the
    result is a positive multiple of 50.
    """
    meters = int(float(s))
    if meters > 0 and meters % 50 == 0:
        return meters
    raise ValueError("Distance must be a positive multiple of 50 (e.g., 50, 100, 200, 400).")

def pretty_print_splits(splits_sec: List[float]) -> str:
    """Format each split with seconds_to_time_str and join them with ';'."""
    formatted = [seconds_to_time_str(value) for value in splits_sec]
    return ";".join(formatted)

def figure_compare_splits(given: List[float], ideal: List[float], title: str = "Splits Comparison"):
    """Plot given and ideal 50m splits on one chart.

    A series with no data is left off the chart. A non-empty series shorter
    than the other is first resized with _resize_to_len.

    Args:
        given: Swum splits in seconds; may be empty (e.g. pre-race).
        ideal: Model splits in seconds; may be empty.
        title: Chart title.
    """
    n = max(len(given), len(ideal))
    if n == 0:
        print("[info] Nothing to plot.")
        return
    xs = np.arange(1, n+1)
    plt.figure(figsize=(8,4.5))
    for series, label in ((given, "Given splits (sec)"), (ideal, "Ideal splits (sec)")):
        if len(series) == 0:
            # Padding an absent series would draw a flat line at 0 s, which
            # looks like real data and squashes the y-axis.
            continue
        if len(series) != n:
            series = _resize_to_len(series, n)
        plt.plot(xs, series, marker='o', label=label)
    plt.xlabel("50m split #")
    plt.ylabel("Time (s)")
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()

def _resize_to_len(v: List[float], n: int) -> List[float]:
    if len(v) == n: return v
    if len(v) == 0: return [0.0]*n
    total = sum(v)
    if total <= 0: return [0.0]*n
    return (np.ones(n) * (total/n)).tolist()

# -----------------------------------
# Core functions (Pre-race/Post-race)
# -----------------------------------

def ideal_splits_from_target(stroke: str, distance: int, target_time_str: str, personal_best_str: Optional[str] = None) -> List[float]:
    """Compute per-50 splits (seconds) that add up to a target time.

    The ratios come from MODEL_STORE.predict_ratios. A parseable, positive
    personal best is passed along as the extra feature "pb_sec".

    Raises:
        ValueError: if the target time cannot be parsed or is not positive.
    """
    target_sec = parse_time_to_seconds(target_time_str)
    if target_sec is None or target_sec <= 0:
        raise ValueError("Invalid target time.")

    pb_sec = parse_time_to_seconds(personal_best_str) if personal_best_str else None
    extra = {}
    if pb_sec and np.isfinite(pb_sec) and pb_sec > 0:
        extra["pb_sec"] = pb_sec

    ratios = MODEL_STORE.predict_ratios(stroke, distance, total_time_sec=target_sec, extra_features=extra)
    return (np.array(ratios) * target_sec).tolist()

def analyze_post_race(stroke: str, distance: int, given_splits_str: str, personal_best_str: Optional[str] = None):
    """Compare swum splits against model-ideal splits for the same event.

    The ideal splits are scaled to the personal best when one is given and
    parses to a positive value; otherwise to the total of the given splits.
    If the number of given splits differs from distance // 50 they are
    replaced by equal splits with the same total (see resample_splits).

    Returns:
        A dict with totals, per-split times, per-split deltas
        (given minus ideal) and cumulative times, all in seconds.

    Raises:
        ValueError: if no split can be parsed or the distance has no 50s.
    """
    given = parse_splits_from_string(given_splits_str)
    if not given:
        raise ValueError("Could not parse given splits. Use format like '32.33;33.11;...'.")

    n_expected = n50s_for_distance(distance)
    if n_expected <= 0:
        raise ValueError("Distance must be a positive multiple of 50.")
    if len(given) != n_expected:
        given = resample_splits(given, n_expected)
    total_given = sum(given)

    # Ideal total: prefer PB if provided; else match given total
    pb = parse_time_to_seconds(personal_best_str) if personal_best_str else None
    target_sec = pb if (pb and pb > 0) else total_given

    ratios = MODEL_STORE.predict_ratios(stroke, distance, total_time_sec=target_sec, extra_features={"pb_sec": target_sec})
    ideal = (np.array(ratios) * target_sec).tolist()

    split_deltas = (np.array(given) - np.array(ideal)).tolist()
    total_ideal = sum(ideal)

    return {
        "stroke": stroke,
        "distance": distance,
        "total_given_sec": total_given,
        "target_sec_used": target_sec,
        "total_ideal_sec": float(total_ideal),
        "total_delta_sec": float(total_given - total_ideal),
        "given_splits_sec": [float(x) for x in given],
        "ideal_splits_sec": [float(x) for x in ideal],
        "split_delta_sec": [float(x) for x in split_deltas],
        "cumulative_given_sec": [float(x) for x in np.cumsum(given)],
        "cumulative_ideal_sec": [float(x) for x in np.cumsum(ideal)],
    }

# --------------------------
# Interactive CLI (Notebook)
# --------------------------

# Menu text printed by cli_loop() before every prompt; the option numbers here
# must match the choices cli_loop() handles ("1", "2", "3").
MENU = textwrap.dedent("""
    -------------------------
    Competitive Pacing - CLI
    -------------------------
    Choose mode:
      1) Pre-race (PB + Target -> ideal 50s splits)
      2) Post-race (Given splits + PB -> analysis & chart)
      3) Exit
""")

def _cli_pre_race():
    """Prompt for an event and target time, then print and plot ideal splits."""
    stroke = parse_stroke_input(ask("Stroke (Freestyle/Backstroke/Breaststroke/Butterfly/IM): ", cast=str))
    distance = parse_distance_input(ask("Distance (e.g., 50/100/200/400): ", cast=str))
    pb = ask("Personal Best time (e.g., 1:45.23 or 65.23) [optional]: ", cast=str, allow_blank=True)
    target = ask("Target time (e.g., 1:40.00 or 100.0): ", cast=str)
    splits = ideal_splits_from_target(stroke, distance, target_time_str=target, personal_best_str=pb)
    print("\nIdeal 50m splits:")
    print(pretty_print_splits(splits))
    print("(semicolon-separated; times shown as M:SS.xx or SS.xx)")
    figure_compare_splits([], splits, title=f"Ideal Splits — {stroke.title()} {distance}")


def _cli_post_race():
    """Prompt for an event and swum splits, then print and plot the analysis."""
    stroke = parse_stroke_input(ask("Stroke (Freestyle/Backstroke/Breaststroke/Butterfly/IM): ", cast=str))
    distance = parse_distance_input(ask("Distance (e.g., 50/100/200/400): ", cast=str))
    given = ask("Given splits (semicolon-separated, e.g., 32.33;33.11;...): ", cast=str)
    pb = ask("Personal Best time (e.g., 1:45.23 or 65.23) [optional]: ", cast=str, allow_blank=True)
    report = analyze_post_race(stroke, distance, given_splits_str=given, personal_best_str=pb)

    print("\n--- Post-race Analysis ---")
    print(f"Event: {stroke.title()} {distance}m")
    print(f"Given total: {seconds_to_time_str(report['total_given_sec'])}")
    print(f"Ideal total (from model): {seconds_to_time_str(report['total_ideal_sec'])}")
    print(f"Delta (Given - Ideal): {report['total_delta_sec']:+.2f} s")
    print("\nSplit-by-split (Given vs Ideal | Δ):")
    rows = zip(report["given_splits_sec"], report["ideal_splits_sec"], report["split_delta_sec"])
    for i, (g, idl, dlt) in enumerate(rows, start=1):
        print(f"  50#{i:>2}: {g:6.2f}  |  {idl:6.2f}  |  {dlt:+6.2f}")
    figure_compare_splits(
        report["given_splits_sec"],
        report["ideal_splits_sec"],
        title=f"Given vs Ideal — {stroke.title()} {distance}",
    )


def cli_loop():
    """Run the interactive menu until the user chooses Exit.

    Any exception raised while handling a mode is printed as "[error] ..."
    and the menu is shown again.
    """
    handlers = {"1": _cli_pre_race, "2": _cli_post_race}
    while True:
        print(MENU)
        choice = ask("Enter choice (1/2/3): ", cast=str)
        if choice == "3":
            print("Bye!")
            return
        handler = handlers.get(choice)
        if handler is None:
            print("Invalid choice.")
            continue
        try:
            handler()
        except Exception as e:
            print(f"[error] {e}")

# --------------------------
# Train from your datasets
# --------------------------

# Runs when the cell is executed: load whatever datasets and leaderboard file
# can be found, then build the global model store used by the CLI functions.
ALL_DATA = load_all_datasets()
LEADERBOARD, MODEL_HINTS = load_leaderboard_and_hints()

# Hints must be set before training: train_from_dataframe picks each stroke's
# estimator from them.
MODEL_STORE = SplitModelStore()
MODEL_STORE.set_leaderboard(LEADERBOARD)
MODEL_STORE.set_model_hints(MODEL_HINTS)
MODEL_STORE.train_from_dataframe(ALL_DATA)

# Startup summary of what was loaded and trained.
print("=== Competitive Pacing (updated) ===")
print(f"Loaded rows: {len(ALL_DATA)}")
if MODEL_HINTS:
    print("Model hints by stroke:", MODEL_HINTS)
print(f"Models trained for events: {sorted(list(MODEL_STORE.models.keys()))}")
if LEADERBOARD:
    print(f"Leaderboard baselines available for: {sorted(list(LEADERBOARD.keys()))}")
else:
    print("No leaderboard ratios found; using trained models or heuristics.")

def list_available_events():
    """Print every (stroke, distance) that has a trained model or a leaderboard baseline."""
    known = set(MODEL_STORE.models.keys()).union(MODEL_STORE.leaderboard.keys())
    if known:
        print("Available (stroke, distance) events:")
        for event in sorted(known):
            print("  ", event)
        return
    print("No events available yet. Add CSVs and re-run this cell.")

# Closing usage hints, printed once when the cell finishes running.
print("\nTip: call list_available_events() to see which (stroke, distance) are model-ready.")
print("Run cli_loop() to start the interactive menu.\n")

# Example:
# list_available_events()
# cli_loop()


=== Competitive Pacing (updated) ===
Loaded rows: 0
Models trained for events: []
No leaderboard ratios found; using trained models or heuristics.

Tip: call list_available_events() to see which (stroke, distance) are model-ready.
Run cli_loop() to start the interactive menu.

