In [7]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

In [8]:
# Load the master dataset CSV; the path is resolved relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/master_dataset_2025.csv")

In [9]:
# Boolean target: True when the recorded race position is 3 or better.
# Rows with a missing RacePosition compare as False.
df["is_podium_finish"] = df["RacePosition"].le(3)

In [10]:
# Target vector: the boolean podium flag.
y = df["is_podium_finish"]

# Predictors: FP2 practice pace/volume plus qualifying lap time and position.
X = df[
    [
        "fp2_avg_lap",
        "fp2_best_lap",
        "fp2_total_laps",
        "FastestQualiLap",
        "QualiPosition",
    ]
]

# Hold out 20% of the rows; stratifying on y keeps the podium / non-podium
# ratio the same in both partitions, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=22,
    stratify=y,
    test_size=0.2,
)

In [11]:
# Learn the scaling statistics from the training rows only, then apply them
# to those same rows. X_test is not transformed in this cell.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# class_weight="balanced" compensates for podium finishes being the minority
# class; random_state pins any solver randomness.
log_model = LogisticRegression(random_state=22, class_weight="balanced")
log_model.fit(X_train_scaled, y_train)

In [13]:
# Persist the fitted classifier and the scaler it was trained with; both are
# needed together to score new rows.
joblib.dump(value=log_model, filename="../models/logreg_scaled_quali_pre_r18_singapore.pkl")
joblib.dump(value=scaler, filename="../models/scaler_pre_r18_singapore.pkl")

['../models/scaler_pre_r18_singapore.pkl']

Train a different model (a calibrated XGBoost classifier) to predict podium finishes for the US GP (Round 19).

In [8]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupKFold, cross_val_predict
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss
from sklearn.calibration import CalibratedClassifierCV

from xgboost import XGBClassifier

# Dataset location, relative to the notebook's working directory. A relative
# path keeps the notebook runnable on any checkout (the previous value pointed
# into one user's home directory) and matches the "../models" artifact path
# used when saving the trained model.
CSV_PATH = "../data/master_dataset_2025.csv"   # <- adjust path if needed

# Seed handed to the XGBoost classifier so training is reproducible.
RANDOM_STATE = 42


In [9]:
df = pd.read_csv(CSV_PATH)

# coerce numerics (unparseable values become NaN and are dropped below)
num_cols = ["fp2_avg_lap","fp2_best_lap","fp2_total_laps","FastestQualiLap","QualiPosition"]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# normalize DNF to a plain bool column. "1.0" covers a flag column that pandas
# parsed as float (e.g. because it contains blanks); strip() tolerates padded
# strings. Anything unrecognised, including missing values, counts as not-DNF.
df["DNF"] = (
    df["DNF"].astype(str).str.strip().str.lower().isin(["true","1","1.0","yes"])
)
df["RacePosition"] = pd.to_numeric(df["RacePosition"], errors="coerce").astype("Int64")

# create target: podium = finished <=3 AND not DNF.
# Comparing the nullable Int64 position yields a nullable boolean, so a row
# with an unknown position and DNF == False evaluates to <NA>. Keep the target
# nullable here (casting straight to int would raise on <NA>) so that the
# dropna below can actually discard those rows.
df["podium"] = ((df["RacePosition"] <= 3) & (~df["DNF"])).astype("Int64")

# drop missing essentials
df = df.dropna(subset=["Driver","grand_prix","podium"] + num_cols).reset_index(drop=True)

# no missing targets remain, so a plain int column is safe now
df["podium"] = df["podium"].astype(int)

# bucket Quali to avoid over-reliance
# (right-closed bins: 1 -> "P1", 2-3 -> "P2-3", ..., 16-100 -> "P16+";
#  positions outside (0, 100] get a missing bin)
df["QualiBin"] = pd.cut(
    df["QualiPosition"],
    bins=[0,1,3,6,10,15,100],
    labels=["P1","P2-3","P4-6","P7-10","P11-15","P16+"]
).astype("category")

FEATURES_NUM = ["fp2_avg_lap","fp2_best_lap","fp2_total_laps","FastestQualiLap"]
FEATURES_CAT = ["Driver","QualiBin"]
ALL_FEATURES = FEATURES_NUM + FEATURES_CAT

# model inputs, target vector, and the per-row grouping key (one group per
# grand prix) used for grouped cross-validation
X = df[ALL_FEATURES].copy()
y = df["podium"].values
groups = df["grand_prix"].values


In [10]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Gradient-boosted tree classifier: many shallow trees with a small learning
# rate, light row/column subsampling and L2 regularisation.
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",
    n_estimators=1500,
    learning_rate=0.025,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# Column-wise preprocessing: standardise the numeric features and one-hot
# encode the categorical ones. Categories not seen during fitting are ignored
# at transform time instead of raising; any other column is dropped.
pre = ColumnTransformer(
    [
        ("num", StandardScaler(), FEATURES_NUM),
        ("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), FEATURES_CAT),
    ],
    remainder="drop",
)

# Single estimator that preprocesses and then classifies, so the preprocessing
# is re-fitted inside every cross-validation fold.
pipe = Pipeline(steps=[("pre", pre), ("clf", xgb)])

In [11]:
from sklearn.model_selection import GroupKFold, cross_val_predict
from sklearn.calibration import CalibratedClassifierCV

# Grouped 6-fold CV: all rows that share a `groups` value land in the same fold.
gkf = GroupKFold(n_splits=6)

# Out-of-fold positive-class probabilities: every row is scored by a model
# that never saw that row's group during fitting.
oof_proba = cross_val_predict(
    pipe,
    X,
    y,
    cv=gkf,
    groups=groups,
    method="predict_proba",
    n_jobs=-1,
)[:, 1]

# Report ranking quality (ROC AUC, PR AUC) and probability accuracy (Brier),
# one line per metric, each rounded to 4 decimals.
print(
    *(
        label + " " + str(round(metric(y, oof_proba), 4))
        for label, metric in (
            ("OOF ROC AUC :", roc_auc_score),
            ("OOF PR  AUC :", average_precision_score),
            ("OOF Brier   :", brier_score_loss),
        )
    ),
    sep="\n",
)

# Calibrate probs using the SAME grouped folds (prevents leakage): the splits
# are materialised once and handed to the calibrator as explicit
# (train, test) index pairs.
folds = [
    (train_idx, test_idx)
    for train_idx, test_idx in gkf.split(X, y, groups=groups)
]
cal = CalibratedClassifierCV(pipe, cv=folds, method="isotonic").fit(X, y)

print("✅ XGBoost model trained + calibrated.")

OOF ROC AUC : 0.8855
OOF PR  AUC : 0.6307
OOF Brier   : 0.0851
✅ XGBoost model trained + calibrated.


In [12]:
def predict_podium_probs(
    calibrated_model,
    df_next,
    features=None,
    quali_bins=(0, 1, 3, 6, 10, 15, 100),
    quali_labels=("P1", "P2-3", "P4-6", "P7-10", "P11-15", "P16+"),
):
    """
    Score every driver in an upcoming-race frame and rank them by podium probability.

    Parameters
    ----------
    calibrated_model
        Fitted estimator exposing ``predict_proba``; column 1 of its output is
        taken as the podium probability.
    df_next : pandas.DataFrame
        Must contain at least:
          Driver, fp2_avg_lap, fp2_best_lap, fp2_total_laps, FastestQualiLap, QualiPosition
        The frame is copied, never modified.
    features : list of str, optional
        Columns (in order) handed to the model. Defaults to the notebook-level
        ``ALL_FEATURES`` used for training; pass it explicitly (e.g. rebuilt
        from the saved feature config) when that global is not defined.
    quali_bins, quali_labels : sequence, optional
        Bin edges and labels used to derive ``QualiBin`` from ``QualiPosition``.
        The defaults are the values used when the training frame was built.

    Returns
    -------
    pandas.DataFrame
        Columns ``Driver`` and ``podium_probability``, sorted by probability
        in descending order with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``df_next`` lacks any of the required columns.
    """
    numeric_cols = ["fp2_avg_lap","fp2_best_lap","fp2_total_laps","FastestQualiLap","QualiPosition"]

    # Fail early with the full list of absent columns rather than on the first one hit.
    missing = [c for c in ["Driver"] + numeric_cols if c not in df_next.columns]
    if missing:
        raise KeyError(f"df_next is missing required column(s): {missing}")

    if features is None:
        # Fall back to the feature list defined alongside the training data.
        features = ALL_FEATURES

    d = df_next.copy()

    # numerics (unparseable values become NaN)
    for c in numeric_cols:
        d[c] = pd.to_numeric(d[c], errors="coerce")

    # same QualiBin as training (right-closed bins; out-of-range positions get a missing bin)
    d["QualiBin"] = pd.cut(
        d["QualiPosition"],
        bins=list(quali_bins),
        labels=list(quali_labels),
    ).astype("category")

    Xn = d[list(features)]
    p = calibrated_model.predict_proba(Xn)[:, 1]

    out = d[["Driver"]].copy()
    out["podium_probability"] = p
    out = out.sort_values("podium_probability", ascending=False).reset_index(drop=True)
    return out


In [None]:
# Example:
# df_next = pd.read_csv("next_race_matrix.csv")
# preds = predict_podium_probs(cal, df_next)
# preds  # displays Driver + podium_probability

In [13]:
# --- Save trained model + feature config ---
import joblib, json, os

ARTIFACT_DIR = "../models"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Build each artifact path exactly once so the save calls and the confirmation
# message below cannot drift apart (the message previously named a file that
# was never written).
MODEL_PATH = f"{ARTIFACT_DIR}/podium_xgb_calibrated_pre_austin.joblib"
CONFIG_PATH = f"{ARTIFACT_DIR}/feature_config.json"

# 'cal' wraps the Pipeline (preprocessing + XGB) and calibration
joblib.dump(cal, MODEL_PATH)

# Feature lists are taken from the constants the model was trained with, so
# the saved config always describes the saved model. The bin edges/labels
# mirror the pd.cut call used to build QualiBin.
feature_cfg = {
    "FEATURES_NUM": list(FEATURES_NUM),
    "FEATURES_CAT": list(FEATURES_CAT),
    "QUALIBIN_BINS": [0,1,3,6,10,15,100],
    "QUALIBIN_LABELS": ["P1","P2-3","P4-6","P7-10","P11-15","P16+"],
}
with open(CONFIG_PATH, "w") as f:
    json.dump(feature_cfg, f)

print("Saved:", MODEL_PATH, "and", CONFIG_PATH)


Saved: ../models/podium_xgb_calibrated.joblib and ../models/feature_config.json
