In [None]:

import os, json, ast, re, warnings, math, sys, subprocess
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score

# Estimators
from sklearn.ensemble import RandomForestRegressor as RandomForest
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.svm import SVR

# Project layout: every path below is anchored at the directory the notebook runs from.
ROOT = Path(".").resolve()
DATASETS_DIR = ROOT / "datasets"
MODELS_DIR = ROOT / "models"
# Side effect when this cell runs: make sure the output directory for trained models exists.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Combined leaderboard file (columns used later: stroke, target, model, bestparams).
LEADERBOARD_COMBINED = ROOT / "best_models_by_stroke.csv"

# Individual leaderboards (optional; used to rebuild combined if present)
LB_FILES = {
    "Freestyle": ROOT / "Freestyle_leaderboard.csv",
    "Backstroke": ROOT / "Backstroke_leaderboard.csv",
    "Breaststroke": ROOT / "Breaststroke_leaderboard.csv",
    "Butterfly": ROOT / "Butterfly_leaderboard.csv",
    "IM": ROOT / "IM_leaderboard.csv",
}

# Stroke name -> dataset file name, looked up inside DATASETS_DIR.
STROKE_TO_FILE = {
    "Freestyle": "freestyle_dataset.csv",
    "Backstroke": "backstroke_dataset.csv",
    "Breaststroke": "breaststroke_dataset.csv",
    "Butterfly": "butterfly_dataset.csv",
    "IM": "im_dataset.csv",
}

# Leaderboard `model` name -> estimator class; make_pipeline instantiates the class.
MODEL_MAP = {
    "RandomForest": RandomForest,
    "GBR": GBR,
    "Ridge": Ridge,
    "Lasso": Lasso,
    "ElasticNet": ElasticNet,
    "LinearRegression": LinearRegression,
    "SVR": SVR,
}

# Models whose numeric inputs are standardised (StandardScaler) in make_pipeline;
# every other model gets the imputed values passed through unscaled.
SCALER_MODELS = {"Ridge","Lasso","ElasticNet","LinearRegression","SVR"}

# Flexible target detection: `frac_<n>` or `frac<n>`, in any letter case.
TARGET_PATTERNS = [
    re.compile(r"^frac_?\d+$", flags=re.IGNORECASE),  # frac_1, frac1, FRAC_1, ...
]


def _target_key(name) -> str:
    """Return a spelling-insensitive key for a target name (lower case, no underscores)."""
    return str(name).lower().replace("_", "")


def detect_target_cols(columns, expected_targets=None):
    """Return the columns that look like fraction targets, in their original order.

    Parameters
    ----------
    columns : iterable
        Column labels to inspect (non-string labels are compared via ``str``).
    expected_targets : iterable of str, optional
        When given (and non-empty), only columns naming one of these targets are
        kept. The comparison ignores letter case and underscores on BOTH sides,
        so ``frac1`` matches ``frac_1`` regardless of which side uses which form.

    Returns
    -------
    list
        The matching column labels, unchanged.
    """
    cols = [c for c in columns if any(p.match(str(c)) for p in TARGET_PATTERNS)]
    if expected_targets:
        wanted = {_target_key(t) for t in expected_targets}
        cols = [c for c in cols if _target_key(c) in wanted]
    return cols

def normalize_target_name(name: str):
    """Rewrite the compact spelling ``frac<n>`` (any case) as ``frac_<n>``.

    Any other value, including names that already contain the underscore,
    is returned as ``str(name)`` without further changes.
    """
    compact = re.compile(r"^frac(\d+)$", re.IGNORECASE)
    text = str(name)
    if compact.match(text) is None:
        return text
    # Lower-case first so the rewritten name always starts with "frac_".
    return compact.sub(r"frac_\1", text.lower())


# Silence sklearn SimpleImputer all-NaN warnings (we already handle features safely)
# `_warn` is the same standard-library module already imported as `warnings` at the top.
import warnings as _warn
# Registers a process-wide "ignore" filter for UserWarnings matching the given message text.
_warn.filterwarnings('ignore', message='Skipping features without any observed values', category=UserWarning)

In [None]:
# Names of every CSV found directly inside the datasets directory (empty if it is absent).
available = [csv_file.name for csv_file in DATASETS_DIR.glob("*.csv")] if DATASETS_DIR.exists() else []

print("Found dataset files:", available)

# Expected per-stroke files that were not found on disk.
missing = [fname for fname in STROKE_TO_FILE.values() if fname not in available]
if not missing:
    print("[OK] All expected datasets present.")
else:
    print("[WARN] Missing expected dataset files:", missing)


In [None]:
# Diagnostics: for each dataset that exists, print its column names and show the first rows.
for stroke, fname in STROKE_TO_FILE.items():
    p = DATASETS_DIR / fname
    if not p.exists():
        continue  # nothing to preview for this stroke
    dfh = pd.read_csv(p, nrows=3)  # only the head is needed, so read three rows
    column_names = list(dfh.columns)
    print(f"\n[{stroke}] {fname} → columns: {column_names}")
    display(dfh.head(3))


In [None]:

def pick_best(df):
    """Return the single best leaderboard row for each target.

    Column names are stripped and lower-cased first. Rows are ranked by ``r2``
    (highest first) and then ``mse`` (lowest first); the top-ranked row of each
    target is returned whole.

    The previous implementation used ``groupby(...).first()``, which takes the
    first *non-null value per column*. When the best row had a missing cell
    (for example an empty ``bestparams``), the value was silently borrowed from
    a worse row. Selecting complete rows avoids that mixing.

    Parameters
    ----------
    df : pandas.DataFrame
        Leaderboard with at least ``target``, ``r2`` and ``mse`` columns
        (any letter case, surrounding whitespace allowed).

    Returns
    -------
    pandas.DataFrame
        One row per target, sorted by target, with ``target`` as the first
        column and a fresh 0..n-1 index. Rows without a target are dropped.
    """
    ranked = df.copy()
    ranked.columns = [str(c).strip().lower() for c in ranked.columns]
    ranked = ranked.dropna(subset=["target"]).sort_values(
        by=["r2", "mse"], ascending=[False, True]
    )
    best = ranked.drop_duplicates(subset="target", keep="first").sort_values(
        "target", kind="stable"
    )
    # Keep the historical column layout: `target` first, the rest in input order.
    ordered = ["target"] + [c for c in best.columns if c != "target"]
    return best[ordered].reset_index(drop=True)

# Build the combined leaderboard file when it is not already on disk.
# `frames` collects one "best rows" frame per stroke whose leaderboard could be read.
frames = []
if not LEADERBOARD_COMBINED.exists():
    print("[INFO] Combined leaderboard not found. Recomputing from individual files if available...")
    for stroke, path in LB_FILES.items():
        if path.exists():
            try:
                df = pd.read_csv(path)
                best = pick_best(df)
                # Tag every row with its stroke so the frames can be concatenated.
                best.insert(0, "stroke", stroke)
                frames.append(best)
            except Exception as e:
                # Best effort: an unreadable or malformed leaderboard is reported and skipped.
                print(f"[WARN] Failed to read {path}: {e}")
    if frames:
        combined = pd.concat(frames, ignore_index=True)
        combined.to_csv(LEADERBOARD_COMBINED, index=False)
        print("[OK] Wrote combined leaderboard to", LEADERBOARD_COMBINED)
    else:
        print("[WARN] Per-stroke leaderboards not found. Will try local 'best_models_by_stroke.csv' or fallback next.")
else:
    # NOTE: `combined` is NOT assigned on this path; the next cell reads it from disk.
    print("[OK] Found combined leaderboard at", LEADERBOARD_COMBINED)

# If still missing, try local sibling or embedded fallback
# NOTE(review): ROOT is the resolved working directory, so `candidate` appears to be the
# same file as LEADERBOARD_COMBINED unless the working directory changed after ROOT was
# computed; in that case the inner branch cannot run. Confirm before relying on it.
if not LEADERBOARD_COMBINED.exists():
    candidate = Path("best_models_by_stroke.csv")
    if candidate.exists():
        combined = pd.read_csv(candidate)
        combined.to_csv(LEADERBOARD_COMBINED, index=False)
        print("[OK] Loaded local best_models_by_stroke.csv into", LEADERBOARD_COMBINED)
    else:
        # Will be handled by next cell that may embed fallback if provided.
        pass

In [None]:

# Embedded fallback leaderboard (written to disk only when no combined leaderboard file exists)
_fallback_csv_payload = r"""stroke,target,model,bestparams,r2,mse
Freestyle,frac_1,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.999870243378946,3.446300985404293e-06
Freestyle,frac_10,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9998943409633224,2.269611027931617e-08
Freestyle,frac_11,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9998884575181034,2.403124123045788e-08
Freestyle,frac_12,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9998824672075748,2.5413726915240372e-08
Freestyle,frac_13,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.999880564246292,2.5981500273515697e-08
Freestyle,frac_14,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9998710225436998,2.8192505027017152e-08
Freestyle,frac_15,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9998765681982152,2.7097569500937284e-08
Freestyle,frac_16,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9964365193390609,7.678266339955384e-07
Freestyle,frac_17,Ridge,{'model__alpha': 0.1},0.0,2.988990637699851e-08
Freestyle,frac_18,Ridge,{'model__alpha': 0.1},0.0,2.5349672450572248e-08
Freestyle,frac_19,Ridge,{'model__alpha': 0.1},0.0,2.2408701089148216e-08
Freestyle,frac_2,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9998867955175011,3.57427629763549e-06
Freestyle,frac_20,Ridge,{'model__alpha': 0.1},0.0,1.5852440136225758e-08
Freestyle,frac_21,Ridge,{'model__alpha': 0.1},0.0,1.2104395266390137e-08
Freestyle,frac_22,Ridge,{'model__alpha': 0.1},0.0,1.202243271136594e-08
Freestyle,frac_23,Ridge,{'model__alpha': 0.1},0.0,1.1919455483054742e-08
Freestyle,frac_24,Ridge,{'model__alpha': 0.1},0.0,1.7149887415871123e-08
Freestyle,frac_25,Ridge,{'model__alpha': 0.1},0.0,3.0218453715648795e-08
Freestyle,frac_26,Ridge,{'model__alpha': 0.1},0.0,1.4024195867334521e-07
Freestyle,frac_27,Ridge,{'model__alpha': 0.1},0.0,6.314455592110409e-07
Freestyle,frac_28,Ridge,{'model__alpha': 0.1},0.0,8.880265920459282e-07
Freestyle,frac_29,Ridge,{'model__alpha': 0.1},0.0,8.688721906725003e-07
Freestyle,frac_3,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9998709600381422,9.556862590077076e-07
Freestyle,frac_30,Ridge,{'model__alpha': 0.1},0.0,8.309361340034765e-07
Freestyle,frac_4,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9999064518848968,6.696272014015025e-07
Freestyle,frac_5,RandomForest,"{'model__max_depth': 3, 'model__n_estimators': 100}",0.9999709211876188,4.2780859101891336e-08
Freestyle,frac_6,RandomForest,"{'model__max_depth': 3, 'model__n_estimators': 100}",0.9999532784999926,6.87564897870968e-08
Freestyle,frac_7,RandomForest,"{'model__max_depth': 3, 'model__n_estimators': 100}",0.9999334164741176,9.807550310729845e-08
Freestyle,frac_8,RandomForest,"{'model__max_depth': 3, 'model__n_estimators': 50}",0.9992697709056444,1.1442479334816152e-06
Freestyle,frac_9,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9998878635463874,2.3983271145407182e-08
Backstroke,frac_1,RandomForest,"{'model__max_depth': 3, 'model__n_estimators': 50}",0.9994268075799432,8.867917637670797e-06
Backstroke,frac_2,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9995320930324364,7.929250320901907e-06
Backstroke,frac_3,Ridge,{'model__alpha': 0.1},0.0,8.36422372652802e-06
Backstroke,frac_4,Ridge,{'model__alpha': 0.1},0.0,1.756630052177816e-05
Breaststroke,frac_1,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9988548437938534,1.6389929209266215e-05
Breaststroke,frac_2,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9990713463228648,1.8044181130454614e-05
Breaststroke,frac_3,Ridge,{'model__alpha': 0.1},0.0,3.1925173522318018e-06
Breaststroke,frac_4,Ridge,{'model__alpha': 0.1},0.0,1.0488003264994874e-05
Butterfly,frac_1,RandomForest,"{'model__max_depth': 3, 'model__n_estimators': 50}",0.9987736496840494,1.8391454265397885e-05
Butterfly,frac_2,RandomForest,"{'model__max_depth': 3, 'model__n_estimators': 100}",0.9991241698770432,1.6971678700986156e-05
Butterfly,frac_3,Ridge,{'model__alpha': 0.1},0.0,6.019565155644881e-06
Butterfly,frac_4,Ridge,{'model__alpha': 0.1},0.0,2.3645284938661704e-05
IM,frac_1,RandomForest,"{'model__max_depth': 3, 'model__n_estimators': 50}",0.9977444968117418,6.883277693457198e-06
IM,frac_2,RandomForest,"{'model__max_depth': 3, 'model__n_estimators': 100}",0.9974358107136986,1.1070725017522065e-05
IM,frac_3,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9965213315108744,2.2576616555748427e-05
IM,frac_4,GBR,"{'model__learning_rate': 0.1, 'model__n_estimators': 100}",0.9977884821132098,7.467268845815134e-06
IM,frac_5,Ridge,{'model__alpha': 0.1},0.0,7.963029513171503e-06
IM,frac_6,Ridge,{'model__alpha': 0.1},0.0,1.4448839049173334e-05
IM,frac_7,Ridge,{'model__alpha': 0.1},0.0,9.62360429367415e-06
IM,frac_8,Ridge,{'model__alpha': 0.1},0.0,9.857673151697714e-06
"""

# Make sure a combined leaderboard exists on disk, then load it into `combined`.
# The embedded payload is written straight to LEADERBOARD_COMBINED (the path every other
# cell uses) instead of to a working-directory-relative file that is re-read and re-saved.
if not LEADERBOARD_COMBINED.exists():
    if not _fallback_csv_payload.strip():
        raise FileNotFoundError("No leaderboards available. Place 'best_models_by_stroke.csv' next to this notebook.")
    LEADERBOARD_COMBINED.write_text(_fallback_csv_payload, encoding="utf-8")
    print("[OK] Wrote embedded fallback to", LEADERBOARD_COMBINED)

# Single load path: whatever is on disk now is the leaderboard used for training.
combined = pd.read_csv(LEADERBOARD_COMBINED)

display(combined.head())


In [None]:
def parse_bestparams(s: str) -> dict:
    """Parse a leaderboard ``bestparams`` cell into a parameter dictionary.

    Accepted inputs:

    * a ``dict`` (returned as a shallow copy);
    * missing values (``None``, NaN, ``pd.NA``) or blank text -> ``{}``;
    * a Python dict literal such as ``"{'model__alpha': 0.1}"``;
    * a loosely formatted ``key: value, key: value`` string, parsed leniently.
      Values are literal-evaluated when possible and kept as text otherwise.

    The result is always a ``dict``. Text that evaluates to a non-dict literal
    (``"42"``, ``"[1, 2]"``) used to be returned as-is and crashed callers that
    iterate ``.items()``; it now goes through the lenient parser instead.
    """
    literal_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    if isinstance(s, dict):
        return dict(s)
    # Only scalars can be "missing"; pd.isna on a list would return an array.
    if s is None or (pd.api.types.is_scalar(s) and pd.isna(s)):
        return {}

    text = str(s).strip()
    if not text:
        return {}

    try:
        parsed = ast.literal_eval(text)
    except literal_errors:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Lenient fallback: split "k: v" pairs on commas (values containing commas are not supported).
    result = {}
    for part in text.strip("{}").split(","):
        if ":" not in part:
            continue
        raw_key, raw_value = part.split(":", 1)
        key = raw_key.strip().strip("'").strip('"')
        value_text = raw_value.strip()
        try:
            result[key] = ast.literal_eval(value_text)
        except literal_errors:
            result[key] = value_text
    return result

def make_pipeline(model_name: str, params: dict, feature_cols, random_state=42):
    """Build the preprocessing + estimator pipeline for one leaderboard entry.

    Parameters
    ----------
    model_name : str
        Key into ``MODEL_MAP``.
    params : dict or None
        Hyper-parameters for the estimator. Keys may be given with or without
        the ``model__`` prefix; the prefix is added where missing.
    feature_cols : iterable
        Column names fed to the numeric preprocessing branch.
    random_state : int or None, default 42
        Seed applied to estimators that expose a ``random_state`` parameter so
        repeated runs produce the same model. An explicit ``random_state`` in
        ``params`` still wins because it is applied afterwards. Pass ``None``
        to leave the estimator unseeded.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``pre`` (median imputation, plus standard scaling for models listed in
        ``SCALER_MODELS``) followed by ``model``.

    Raises
    ------
    KeyError
        If ``model_name`` is not in ``MODEL_MAP``.
    """
    if model_name not in MODEL_MAP:
        raise KeyError(f"Unknown model '{model_name}'. Expected one of: {sorted(MODEL_MAP)}")
    model = MODEL_MAP[model_name]()

    # Seed stochastic estimators; estimators without the parameter are left alone.
    if random_state is not None and "random_state" in model.get_params():
        model.set_params(random_state=random_state)

    # Route every hyper-parameter to the "model" step of the pipeline.
    prefixed = {}
    for key, value in (params or {}).items():
        name = str(key)
        prefixed[name if name.startswith("model__") else f"model__{name}"] = value

    scaler = StandardScaler() if model_name in SCALER_MODELS else "passthrough"
    num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", scaler)])
    pre = ColumnTransformer([("num", num_pipe, list(feature_cols))], remainder="drop")
    pipe = Pipeline([("pre", pre), ("model", model)])
    if prefixed:
        pipe.set_params(**prefixed)
    return pipe

def kfold_metrics(model, X, y, n_splits=5):
    """Cross-validate ``model`` and summarise R² and MSE across folds.

    Both metrics are computed from the same fitted models in a single
    ``cross_validate`` run. Previously two separate ``cross_val_score`` runs
    were used, which doubled the training cost and, for unseeded stochastic
    estimators, reported R² and MSE from different fits.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator or pipeline.
    X, y : array-like
        Features and target.
    n_splits : int, default 5
        Requested number of folds; reduced to ``max(2, len(y) // 2)`` when the
        dataset is small.

    Returns
    -------
    dict
        ``r2_mean``, ``r2_std``, ``mse_mean`` and ``mse_std`` as Python floats.
    """
    # Local import: the notebook's import cell only brings in KFold and cross_val_score.
    from sklearn.model_selection import cross_validate

    n_splits = min(n_splits, max(2, len(y)//2))  # shrink folds if dataset is small
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        scores = cross_validate(model, X, y, cv=cv, scoring=("r2", "neg_mean_squared_error"))
    r2 = scores["test_r2"]
    mse = -scores["test_neg_mean_squared_error"]  # scorer is negated MSE; flip the sign back
    return {
        "r2_mean": float(np.mean(r2)), "r2_std": float(np.std(r2)),
        "mse_mean": float(np.mean(mse)), "mse_std": float(np.std(mse)),
    }


In [None]:
def parse_splits_to_list(s):
    """Extract every number found in ``s`` as a list of floats.

    Returns ``None`` when ``s`` is missing or contains no number at all.
    """
    if pd.isna(s):
        return None
    values = [float(token) for token in re.findall(r"[-+]?\d*\.?\d+", str(s))]
    # An empty result is reported as None, not as an empty list.
    return values or None

def make_frac_targets_from_splits(df_in: pd.DataFrame, expected_targets: list):
    """Return a copy of ``df_in`` with ``frac_<i>`` columns derived from a splits column.

    The first splits column found (the long survey question text, ``splits`` or
    ``Splits``) is parsed per row; split ``i`` divided by the row's split total
    is stored in ``frac_<i>``. Columns ``frac_1`` .. ``frac_<max>`` are reset to
    NaN first, where ``<max>`` is the largest index named in ``expected_targets``
    (at least 1). When no splits column exists the copy is returned unchanged.
    """
    out = df_in.copy()

    candidate_names = (
        'add splits per 50 with (;) between them\n (eg. "33.46; 35.67; 36.88; 33.84")\n do not put words in this question',
        'splits',
        'Splits',
    )
    split_col = next((name for name in candidate_names if name in out.columns), None)
    if split_col is None:
        return out

    parsed = out[split_col].apply(parse_splits_to_list)
    totals = parsed.apply(lambda vals: np.sum(vals) if isinstance(vals, list) and len(vals)>0 else np.nan)

    # Largest fraction index requested by the leaderboard targets.
    requested = [
        int(found.group(1))
        for found in (re.match(r"(?i)^frac[_]?(\d+)$", str(t)) for t in expected_targets or [])
        if found
    ]
    highest = max(requested, default=0)

    for k in range(1, max(1, highest) + 1):
        out[f"frac_{k}"] = np.nan

    for row_label, vals in parsed.items():
        if not (isinstance(vals, list) and len(vals) > 0):
            continue
        if pd.isna(totals.loc[row_label]):
            continue
        denom = totals.loc[row_label]
        if not denom:
            continue  # a zero total would divide by zero
        for k, split_time in enumerate(vals, start=1):
            col = f"frac_{k}"
            # Splits beyond the prepared columns are ignored unless the column already exists.
            if col in out.columns:
                out.at[row_label, col] = split_time / denom
    return out

def engineer_numeric_features(df_in: pd.DataFrame):
    """Return a copy of ``df_in`` with numeric model features added or coerced.

    * ``Distance``, ``athlete age``: coerced to numbers (unparsable -> NaN).
    * ``pool_len``: 50 / 25 derived from ``pool`` (only when that column exists).
    * ``strategy_code``: -1 positive, 0 even, 1 negative, 2 all-out sprint;
      unknown or missing strategies fall back to 0.
    * ``final_time_sec``: numeric copy of ``Final time in seconds``.

    Category labels are matched ignoring letter case and extra whitespace, so
    ``"Negative"`` or ``" scm25 "`` are recognised. Previously such values fell
    through the exact-match lookup and were silently coded as 0 / NaN.
    """
    df = df_in.copy()
    df["Distance"] = pd.to_numeric(df.get("Distance"), errors="coerce")

    # Pool labels are compared upper-cased with all whitespace removed ("LCM 50" == "lcm50").
    pool_map = {"LCM50": 50, "SCM25": 25}
    if "pool" in df.columns:
        pool_key = df["pool"].astype(str).str.upper().str.replace(r"\s+", "", regex=True)
        df["pool_len"] = pool_key.map(pool_map)

    df["athlete age"] = pd.to_numeric(df.get("athlete age"), errors="coerce")

    strat_map = {"positive": -1, "even": 0, "negative": 1, "all-out sprint": 2}
    if "Strategy" in df.columns:
        strat_key = (
            df["Strategy"].astype(str).str.strip().str.lower()
            .str.replace(r"\s+", " ", regex=True)
        )
        df["strategy_code"] = strat_key.map(strat_map).fillna(0)
    else:
        df["strategy_code"] = 0

    df["final_time_sec"] = pd.to_numeric(df.get("Final time in seconds"), errors="coerce")
    return df


In [None]:
import joblib

# One metadata dict per trained model, across all strokes; written out at the end.
build_report = []

# Train and persist one model per leaderboard row, grouped by stroke.
for stroke, sub in combined.groupby("stroke"):
    ds_name = STROKE_TO_FILE.get(stroke)
    ds_path = DATASETS_DIR / ds_name if ds_name else None

    # Unknown stroke name or missing file: nothing to train on.
    if not ds_path or not ds_path.exists():
        print(f"[WARN] Dataset for {stroke} not found at {ds_path}. Skipping this stroke.")
        continue

    df = pd.read_csv(ds_path)

    # Target names come from the leaderboard rows of this stroke.
    expected_targets = list(sub["target"].astype(str).unique())
    df = make_frac_targets_from_splits(df, expected_targets=expected_targets)
    df = engineer_numeric_features(df)
    # Drop columns that are entirely NaN (e.g., IM stroke labels)
    df = df.dropna(axis=1, how='all')

    # Detect targets now present
    target_cols = detect_target_cols(df.columns, expected_targets=expected_targets)

    # features: numeric not in targets
    # NOTE(review): only columns matching `expected_targets` are excluded here, so any other
    # numeric column (including a frac-like column the leaderboard does not list) is a feature.
    feature_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c not in target_cols]
    if not feature_cols:
        print(f"[ERROR] No numeric features found for {stroke}. Check dataset columns.")
        continue

    print(f"\n[INFO] {stroke}: Using {len(feature_cols)} features and {len(target_cols)} targets: {target_cols}")

    stroke_dir = MODELS_DIR / stroke
    stroke_dir.mkdir(parents=True, exist_ok=True)

    for _, row in sub.iterrows():
        target = row["target"]
        model_name = row["model"]
        params = parse_bestparams(row["bestparams"])

        # Normalize target name to 'frac_i'
        t_norm = normalize_target_name(target)

        if t_norm not in df.columns:
            print(f"[WARN] Target '{t_norm}' missing after split parsing for {stroke}; skipping.")
            continue

        X = df[feature_cols].copy()
        y = df[t_norm].astype(float).values

        # Drop rows where target is NaN or not finite
        # (`_np` is numpy again; the alias is re-imported on every iteration.)
        import numpy as _np
        mask = _np.isfinite(y)
        if mask.sum() < len(y):
            X = X.loc[mask]
            y = y[mask]
        # If still too few samples or no variation, skip
        if len(y) < 5 or (len(_np.unique(y)) <= 1):
            print(f"[WARN] Not enough valid samples for {stroke}/{t_norm} (n={len(y)}). Skipping.")
            continue

        # NOTE(review): make_pipeline indexes MODEL_MAP directly, so an unknown model name
        # raises here and stops the whole loop.
        pipe = make_pipeline(model_name, params, feature_cols)

        # Cross-validation is informational only: a failure is logged and recorded as None.
        try:
            cvm = kfold_metrics(pipe, X, y, n_splits=5)
        except Exception as e:
            print(f"[WARN] CV failed for {stroke}/{t_norm} with {model_name}: {e}")
            cvm = None

        # Final fit on all valid rows. NOTE(review): this call is outside the try block above,
        # so a fit error propagates and ends the run before this stroke's index.json is written.
        pipe.fit(X, y)

        model_path = stroke_dir / f"{t_norm}_{model_name}.joblib"
        joblib.dump(pipe, model_path)

        # Everything needed to reload the model later; `model_path` is stored as given by
        # `str(model_path)`, which is rooted at MODELS_DIR.
        meta = {
            "stroke": stroke,
            "target": t_norm,
            "model": model_name,
            "bestparams": params,
            "features": feature_cols,
            "dataset": str(ds_path),
            "model_path": str(model_path),
            "cv_metrics": cvm,
        }
        build_report.append(meta)
        print(f"[OK] Saved {stroke}/{t_norm} -> {model_path.name}")

    # Per-stroke index: the subset of the global report that belongs to this stroke.
    with open(stroke_dir / "index.json", "w") as f:
        json.dump([m for m in build_report if m["stroke"] == stroke], f, indent=2)

# global report
with open(MODELS_DIR / "build_report.json", "w") as f:
    json.dump(build_report, f, indent=2)

print("\n[DONE] Training complete.")


In [None]:
# Tabulate the build report; show and save it only when at least one model was trained.
report_df = pd.DataFrame(build_report)
if not report_df.empty:
    display(report_df.head(30))
    report_csv = MODELS_DIR / "build_report.csv"
    report_df.to_csv(report_csv, index=False)
    print("Saved:", report_csv)
else:
    print("[INFO] No models were trained. See warnings above about missing or unparsable targets.")


In [None]:
import json, joblib

def list_trained(stroke: str):
    """Return the parsed contents of ``models/<stroke>/index.json``.

    An empty list is returned when the index file does not exist.
    """
    idx_path = MODELS_DIR / stroke / "index.json"
    if idx_path.exists():
        with open(idx_path, "r") as handle:
            return json.load(handle)
    return []

def load_model(stroke: str, target: str):
    """Load the trained pipeline for ``stroke``/``target`` together with its metadata.

    The target name is normalised with ``normalize_target_name`` on both sides
    of the comparison. When the path recorded in the index no longer exists
    (for example after the project folder was moved), the file of the same
    name inside ``MODELS_DIR/<stroke>`` is tried, matching what
    ``load_frac_models`` does.

    Returns
    -------
    tuple
        ``(pipeline, metadata_dict)`` for the first matching index entry.

    Raises
    ------
    FileNotFoundError
        If no index entry matches or the model file cannot be found.
    """
    target_norm = normalize_target_name(target)
    match = next(
        (m for m in list_trained(stroke) if normalize_target_name(m["target"]) == target_norm),
        None,
    )
    if match is None:
        raise FileNotFoundError(f"No trained model for {stroke}/{target_norm}")

    path = Path(match["model_path"])
    if not path.exists():
        relocated = MODELS_DIR / stroke / path.name
        if relocated.exists():
            path = relocated
    if not path.exists():
        raise FileNotFoundError(f"Model file missing at {path}")
    # NOTE: joblib files are pickle-based; only load model files you produced yourself.
    return joblib.load(path), match


In [None]:
# Example: Freestyle frac_1 (adjust as needed)
example_stroke = "Freestyle"
example_target = "frac_1"

# Load dataset to get feature columns
ds_path = DATASETS_DIR / STROKE_TO_FILE[example_stroke]
if ds_path.exists():
    # Same preparation steps as the training cell, restricted to the example target.
    df_demo = pd.read_csv(ds_path)
    df_demo = make_frac_targets_from_splits(df_demo, expected_targets=[example_target])
    df_demo = engineer_numeric_features(df_demo)

    # Candidate features: every numeric column that is not the example target.
    # NOTE(review): unlike training, all-NaN columns are not dropped here, and only the
    # example target is excluded, so this list can differ from the model's stored
    # `features` list; confirm the saved pipeline tolerates the extra columns.
    targets_demo = detect_target_cols(df_demo.columns, expected_targets=[example_target])
    feats_demo = [c for c in df_demo.select_dtypes(include=[np.number]).columns if c not in targets_demo]

    if feats_demo:
        try:
            pipe, meta = load_model(example_stroke, example_target)
            X_sample = df_demo[feats_demo].head(5)
            y_pred = pipe.predict(X_sample)
            print("Loaded model:", meta["model"], "for", example_stroke, example_target)
            print("Predictions:", y_pred)
        except Exception as e:
            # The demo is optional: any failure (no model, column mismatch, ...) is only reported.
            print("[INFO] Demo couldn't run:", e)
    else:
        print("[INFO] No numeric features to demo.")
else:
    print("[INFO] Demo dataset not found:", ds_path)


In [None]:
# --- Competitive Pacing: Simple CLI (Pre-race & Post-race) ---
# - Uses your existing per-stroke frac_k models from models/<Stroke>/index.json
# - No changes to your training code are needed
# - Adds an option to save the comparison/ideal chart as a JPEG

import re, json
from pathlib import Path
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from IPython.display import display

# ----------------------------
# Basic config / constants
# ----------------------------
# ROOT and MODELS_DIR repeat the definitions from the first cell with the same values
# (presumably so this cell can be run on its own; confirm that is intended).
ROOT = Path(".").resolve()
MODELS_DIR = ROOT / "models"
# Strokes offered in the CLI menus; each is used as a sub-folder name under MODELS_DIR.
STROKES = ["Freestyle", "Backstroke", "Breaststroke", "Butterfly", "IM"]

# ----------------------------
# Small helpers
# ----------------------------
def parse_time_to_seconds(s: str) -> float:
    """Convert a time such as ``"59.80"``, ``"1:05.23"`` or ``"1:02:03"`` to seconds.

    Colon-separated fields are combined base 60, left to right. ``NaN`` is
    returned for blank input, for text that is not numeric, and for any
    negative field (a time component cannot be negative).
    """
    text = str(s).strip()
    if not text:
        return np.nan
    try:
        fields = [float(piece) for piece in text.split(":")]
    except ValueError:
        # Only conversion errors mean "not a time"; anything else should surface.
        return np.nan
    if any(field < 0 for field in fields):
        return np.nan
    secs = 0.0
    for field in fields:
        secs = secs * 60.0 + field
    return secs

def seconds_to_time_str(x: float) -> str:
    """Format seconds as ``m:ss.ss``; return ``"NA"`` for missing or non-finite input.

    The value is rounded to hundredths *before* it is split into minutes and
    seconds, so 59.999 becomes ``1:00.00`` rather than ``0:60.00``. Negative
    values are shown with a leading minus sign.
    """
    if x is None:
        return "NA"
    try:
        value = float(x)
    except (TypeError, ValueError):
        return "NA"
    if not np.isfinite(value):
        return "NA"
    centis = int(round(abs(value) * 100))
    minutes, rem = divmod(centis, 6000)
    sign = "-" if value < 0 and centis else ""
    return f"{sign}{minutes}:{rem / 100:05.2f}"

def parse_splits_string(s: str) -> List[float]:
    """Return every number found in ``s`` as a float; an empty list when there is none."""
    number = re.compile(r"[-+]?\d*\.?\d+")
    return [float(token) for token in number.findall(str(s))]

def expected_num_splits(distance: int) -> int:
    """Return the number of whole 50 m segments in ``distance`` (0 for missing or non-positive)."""
    if distance and distance > 0:
        return distance // 50
    return 0

def frac_name(i: int) -> str:
    """Return the target name for split number ``i``, e.g. ``frac_3``."""
    return "frac_{}".format(i)

def print_splits(splits: List[float]) -> str:
    """Return the splits as text with two decimals each, joined by ``"; "`` (nothing is printed)."""
    return "; ".join(map("{:.2f}".format, splits))

# ----------------------------
# Model loading / inference
# ----------------------------
def load_index(stroke: str) -> List[Dict]:
    """Read ``models/<stroke>/index.json`` and return its frac entries in split order.

    Entries whose ``target`` is not ``frac<n>`` / ``frac_<n>`` (any case) are
    dropped. Kept entries get their ``target`` rewritten to lower-case
    ``frac_<n>`` and are sorted by ``<n>``.

    Raises
    ------
    FileNotFoundError
        If the index file does not exist.
    """
    idx = MODELS_DIR / stroke / "index.json"
    if not idx.exists():
        raise FileNotFoundError(f"Missing index for {stroke}: {idx}")

    entries = []
    for entry in json.loads(idx.read_text()):
        if not re.match(r"(?i)^frac(_)?\d+$", str(entry.get("target", ""))):
            continue
        name = str(entry["target"]).lower()
        if re.match(r"^frac\d+$", name):
            # Compact spelling: insert the underscore.
            name = re.sub(r"^frac(\d+)$", r"frac_\1", name)
        entry["target"] = name
        entries.append(entry)

    entries.sort(key=lambda item: int(re.findall(r"\d+", item["target"])[0]))
    return entries

def load_frac_models(stroke: str) -> Tuple[Dict[str, object], Dict[str, Dict]]:
    """Load every frac model listed in the stroke's index.

    Returns two dictionaries keyed by target name: the loaded pipelines and
    their index metadata. When the recorded ``model_path`` does not exist, the
    file of the same name under ``MODELS_DIR/<stroke>`` is used if present.

    Raises
    ------
    FileNotFoundError
        If a model file cannot be found at either location.
    """
    models: Dict[str, object] = {}
    meta_map: Dict[str, Dict] = {}
    for entry in load_index(stroke):
        location = Path(entry["model_path"])
        if not location.exists():
            sibling = MODELS_DIR / stroke / location.name
            if sibling.exists():
                location = sibling
        if not location.exists():
            raise FileNotFoundError(f"Model file missing: {location}")
        key = entry["target"]
        models[key] = joblib.load(location)
        meta_map[key] = entry
    return models, meta_map

def build_feature_row(required_feats: List[str], distance_m: int, total_time_s: float) -> pd.DataFrame:
    """Build a one-row frame holding the features a model was trained on.

    Every requested feature starts as NaN. ``Distance`` is filled from
    ``distance_m``; ``final_time_sec`` and ``Final time in seconds`` are filled
    from ``total_time_s``. Each is set only when it is among the requested features.
    """
    values = dict.fromkeys(required_feats, np.nan)
    known_inputs = (
        ("Distance", distance_m),
        ("final_time_sec", total_time_s),
        ("Final time in seconds", total_time_s),
    )
    for feature, raw in known_inputs:
        if feature in values:
            values[feature] = float(raw)
    return pd.DataFrame([values])

def predict_fractions(stroke: str, distance_m: int, total_time_s: float) -> pd.Series:
    """Predict the share of the total time spent on each 50 m split.

    One ``frac_<i>`` model is evaluated per split. Negative predictions are
    clipped to zero and the result is rescaled to sum to 1; equal shares are
    used when every prediction is zero.

    Parameters
    ----------
    stroke : str
        Stroke whose models are loaded from ``MODELS_DIR``.
    distance_m : int
        Race distance; must be a positive multiple of 50.
    total_time_s : float
        Total race time in seconds, passed to the models as a feature.

    Returns
    -------
    pandas.Series
        Fractions indexed by ``frac_1`` .. ``frac_<k>``.

    Raises
    ------
    ValueError
        If ``distance_m`` is not a positive multiple of 50.
    RuntimeError
        If any of the ``k`` required models is missing. Earlier versions
        silently stopped at the first gap and spread the whole time over the
        shorter list, which produced wrong splits.
    """
    k = expected_num_splits(distance_m)
    if k <= 0 or distance_m % 50 != 0:
        raise ValueError("Distance must be a positive multiple of 50.")
    k = int(k)
    models, metas = load_frac_models(stroke)

    wanted = [frac_name(i) for i in range(1, k + 1)]
    missing = [t for t in wanted if t not in models]
    if missing:
        raise RuntimeError(
            f"No usable frac models for {stroke} at {distance_m}m "
            f"(need {k} splits, missing: {', '.join(missing)})."
        )

    preds = {}
    for t in wanted:
        feats = metas[t].get("features", [])
        X = build_feature_row(feats, distance_m, total_time_s)
        preds[t] = max(0.0, float(models[t].predict(X)[0]))  # a split share cannot be negative

    ser = pd.Series(preds)
    total = ser.sum()
    if total > 0:
        return ser / total
    return pd.Series(np.full(len(ser), 1 / len(ser)), index=ser.index)

def ideal_splits_seconds(stroke: str, distance_m: int, total_time_s: float) -> List[float]:
    """Return the predicted split times in seconds: each predicted fraction times the total time."""
    fractions = predict_fractions(stroke, distance_m, total_time_s)
    seconds = []
    for share in fractions.to_numpy():
        seconds.append(float(share * total_time_s))
    return seconds

# ----------------------------
# Analysis / plotting
# ----------------------------
def compare_df(given: List[float], ideal: List[float]) -> pd.DataFrame:
    """Tabulate given vs. ideal splits side by side.

    Only the first ``min(len(given), len(ideal))`` splits are compared. The
    ``Distance (m)`` column counts up in steps of 50, and the delta column is
    given minus ideal.
    """
    n = min(len(given), len(ideal))
    table = pd.DataFrame({
        "Distance (m)": [50 * lap for lap in range(1, n + 1)],
        "Given (s)": list(map(float, given[:n])),
        "Ideal (s)": list(map(float, ideal[:n])),
    })
    table["Delta (Given - Ideal)"] = table["Given (s)"].sub(table["Ideal (s)"])
    return table

def plot_lines(x, y_list, labels, title, xlabel, ylabel, save_jpeg_path: str = None):
    """Draw one marked line per series on a single chart and show it.

    ``y_list`` and ``labels`` are paired positionally. When ``save_jpeg_path``
    is given, the layout is tightened and the chart is also written there as
    a 200 dpi JPEG before being shown.
    """
    fig, ax = plt.subplots(figsize=(8,4))
    for series, series_label in zip(y_list, labels):
        ax.plot(x, series, marker="o", label=series_label)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)
    ax.legend()
    if save_jpeg_path:
        fig.tight_layout()
        fig.savefig(save_jpeg_path, dpi=200, format="jpeg")
        print(f"Saved chart: {save_jpeg_path}")
    plt.show()

# ----------------------------
# Tiny prompt helpers
# ----------------------------
def pick_from_list(prompt: str, options: List[str]) -> str:
    """Show a numbered menu and keep asking until the user picks an option.

    The answer may be the option's number or its text (case-insensitive).
    The option is returned exactly as it appears in ``options``.
    """
    print(prompt)
    for number, label in enumerate(options, start=1):
        print(f"  {number}. {label}")
    while True:
        answer = input("> ").strip()
        if answer.isdigit() and 1 <= int(answer) <= len(options):
            return options[int(answer) - 1]
        by_text = next((label for label in options if answer.lower() == label.lower()), None)
        if by_text is not None:
            return by_text
        print("Choose a valid option.")

def ask_int(prompt: str) -> int:
    """Prompt until the user enters a whole number and return it.

    Only conversion failures trigger a re-prompt. The former bare ``except:``
    would also have swallowed unrelated errors and interrupts.
    """
    while True:
        raw = input(prompt).strip()
        try:
            return int(raw)
        except ValueError:
            print("Please enter an integer.")

def ask_time_seconds(prompt: str) -> float:
    """Prompt until the user enters a usable time and return it in seconds.

    Accepts plain seconds (``59.80``) or colon-separated times (``1:05.23``).
    The value must be finite and greater than zero; zero or negative times
    were accepted before and led to meaningless split predictions.
    """
    while True:
        s = input(prompt).strip()
        v = parse_time_to_seconds(s)
        if np.isfinite(v) and v > 0:
            return v
        print("Please enter a valid time (e.g., 59.80 or 1:05.23).")

def ask_yes_no(prompt: str) -> bool:
    """Ask a yes/no question once; only ``y`` or ``yes`` (any case) counts as yes."""
    answer = input(prompt + " [y/n]: ")
    return answer.strip().lower() in ("y", "yes")

def ask_save_path(default_name: str) -> str:
    """Ask for a JPEG file name, falling back to ``default_name`` on an empty answer.

    ``.jpg`` is appended unless the name already ends in ``.jpg`` or ``.jpeg``
    (case-insensitive).
    """
    name = input(f"Enter JPEG filename (or press Enter for '{default_name}'): ").strip() or default_name
    if not name.lower().endswith((".jpg", ".jpeg")):
        name += ".jpg"
    return name

# ----------------------------
# CLI modes
# ----------------------------
def cli_pre_race():
    """Interactive pre-race flow: ask for stroke, distance and target time, then show ideal splits.

    Prints the predicted splits, optionally saves the chart as a JPEG, and
    plots them. A prediction failure is reported and ends the flow.
    """
    print("\n=== PRE-RACE: Ideal Splits ===")
    stroke = pick_from_list("Select stroke:", STROKES)
    distance = ask_int("Enter distance (e.g., 100, 200, 400): ")
    target_sec = ask_time_seconds("Enter TARGET time(in seconds): ")

    try:
        ideal = ideal_splits_seconds(stroke, distance, target_sec)
    except Exception as e:
        print(f"Prediction failed: {e}")
        return

    target_label = seconds_to_time_str(target_sec)
    print("\nIdeal splits (s per 50m):")
    print(print_splits(ideal))
    print(f"Target total: {target_label}")

    save_path = ask_save_path(f"ideal_{stroke}_{distance}m.jpg") if ask_yes_no("Save chart as JPEG?") else None
    plot_lines(
        [50 * lap for lap in range(1, len(ideal) + 1)],
        [ideal],
        ["Ideal"],
        f"Ideal Splits — {stroke} {distance}m (target {target_label})",
        "Distance (m)",
        "Split (s)",
        save_path,
    )

def cli_post_race():
    """Interactive post-race flow: compare the swimmer's splits with the model's ideal splits.

    Asks for stroke, distance, personal best and the race splits. The ideal
    splits are predicted for the *actual* total time (the sum of the given
    splits), then both are printed, tabulated and plotted.

    The number of splits must match the distance (one per 50 m). A mismatch
    used to be truncated silently, comparing against an ideal built from the
    wrong total; it is now reported and the flow ends.
    """
    print("\n=== POST-RACE: Given vs Ideal ===")
    stroke = pick_from_list("Select stroke:", STROKES)
    distance = ask_int("Enter distance (e.g., 100, 200, 400): ")
    pb = ask_time_seconds("Enter PERSONAL BEST: ")
    splits_str = input('Paste race splits per 50 separated by ";" (e.g., 32.33; 33.22; 34.10):\n> ')
    given = parse_splits_string(splits_str)
    if not given:
        print("No numeric splits detected.")
        return

    # Invalid distances (expected == 0) are left to the prediction step, which reports them.
    expected = expected_num_splits(distance)
    if expected and len(given) != expected:
        print(f"Expected {expected} splits for {distance}m but got {len(given)}. Please re-enter the race.")
        return

    total_actual = float(np.sum(given))
    try:
        ideal = ideal_splits_seconds(stroke, distance, total_actual)
    except Exception as e:
        print(f"Prediction failed: {e}")
        return

    print("\nGiven splits (s):")
    print(print_splits(given))
    print("\nIdeal splits (s):")
    print(print_splits(ideal))
    print(f"\nActual total: {seconds_to_time_str(total_actual)} | PB: {seconds_to_time_str(pb)}")

    df = compare_df(given, ideal)
    display(df)

    x = df["Distance (m)"].tolist()
    title = f"{stroke} {distance}m — Given vs Ideal (total {seconds_to_time_str(total_actual)})"
    save_path = None
    if ask_yes_no("Save comparison chart as JPEG?"):
        save_path = ask_save_path(f"compare_{stroke}_{distance}m.jpg")
    plot_lines(x, [df["Given (s)"].tolist(), df["Ideal (s)"].tolist()],
               ["Given", "Ideal"], title, "Distance (m)", "Split (s)", save_path)

# ----------------------------
# Main loop
# ----------------------------
def main():
    """Run the menu loop: ``1`` pre-race, ``2`` post-race, ``q``/``quit``/``exit`` to leave."""
    banner = (
        "Competitive Pacing — Simple CLI",
        "1) Pre-race (ideal splits)",
        "2) Post-race (compare given vs ideal)",
        "q) Quit",
    )
    for line in banner:
        print(line)
    while True:
        choice = input("\nChoose 1 / 2 / q: ").strip().lower()
        if choice in {"q","quit","exit"}:
            print("Bye.")
            return
        if choice == "1":
            cli_pre_race()
        elif choice == "2":
            cli_post_race()
        else:
            print("Please choose 1, 2, or q.")

# Run: start the interactive menu. This call waits on input() and returns only after
# the user answers q / quit / exit.
main()
