In [1]:
import sys
from pathlib import Path
import pandas as pd

# Make the parent of the current working directory importable.
# The `config` import that follows presumably resolves through this entry — confirm
# that config.py lives in the parent folder.
# NOTE(review): Path.cwd() is evaluated at run time, so this depends on the kernel's
# working directory; re-running the cell appends the same path again.
parent_folder = Path.cwd().parent
sys.path.append(str(parent_folder))

# import config
from config import DATA_DIR

# Resolve the four raw input CSVs inside the configured data directory.
results_file, misc_file, stages_file, practice_file = (
    DATA_DIR / csv_name
    for csv_name in (
        "10-20-results.csv",
        "10-20-misc.csv",
        "10-20-stages.csv",
        "10-20-practice.csv",
    )
)

# Read each file into its own DataFrame using pandas' default parser settings.
results, misc, stages, practice = (
    pd.read_csv(csv_path)
    for csv_path in (results_file, misc_file, stages_file, practice_file)
)

In [2]:
# Collapse practice data to one row per (race_id, driver_id) by averaging each rank
# column across that pair's practice rows, which removes 'duplicate' observations.
# Per the original note, the Daytona 500 is the only race that consistently has
# multiple practices; averaging is applied everywhere for uniformity.
practice_rank_cols = [
    "BestLapRank",
    "OverAllAvgRank",
    "Con5LapRank",
    "Con10LapRank",
    "Con15LapRank",
    "Con20LapRank",
    "Con25LapRank",
    "Con30LapRank",
]

# The result keeps race_id / driver_id as index levels (no reset_index here).
practice_agg = practice.groupby(["race_id", "driver_id"]).agg(
    dict.fromkeys(practice_rank_cols, "mean")
)

In [3]:
# Reshape the stage data from long to wide so each (race_id, driver_id) pair has one
# row. Pivoting (rather than aggregating across stages) keeps stage 1 and stage 2 as
# separate columns instead of averaging them together.
# pivot_table's default aggregation (mean) still applies to any repeated
# (race_id, driver_id, stage_number) rows.
stage_pivot = stages.pivot_table(
    values=["position", "stage_points"],
    columns="stage_number",
    index=["race_id", "driver_id"],
)

# Flatten the (metric, stage_number) column pairs into "stage_<number>_<metric>".
stage_pivot.columns = [
    f"stage_{stage_no}_{metric}"
    for metric, stage_no in stage_pivot.columns.to_flat_index()
]

# Bring the keys back as ordinary columns.
stages_wide = stage_pivot.reset_index()

In [4]:
# Combine results, misc, stage and practice data on the (race_id, driver_id) key.
# Outer joins keep rows that are present in only some of the sources.
merge_keys = ["race_id", "driver_id"]

df = results.copy()

# Column names shared with `misc` keep the results name; the misc copy gets "_misc".
df = pd.merge(df, misc, on=merge_keys, how="outer", suffixes=("", "_misc"))

# The remaining tables are joined the same way, in this order.
# practice_agg carries the keys as index levels, which `on=` also accepts.
for extra_table in (stages_wide, practice_agg):
    df = pd.merge(df, extra_table, on=merge_keys, how="outer")

In [5]:
# Every (race_id, driver_id) pair must appear exactly once after merging.
# All rows involved in a duplicated key are kept in `dupes` so they can be inspected
# directly when the assertion fails, instead of re-enabling commented-out debug code.
key_cols = ["race_id", "driver_id"]
dupes = df[df.duplicated(subset=key_cols, keep=False)]

# ensure no duplicates
assert len(dupes) == 0, (
    f"{len(dupes)} rows share a (race_id, driver_id) key after merging; "
    "inspect `dupes.sort_values(['race_id', 'driver_id'])`"
)

In [6]:
import importlib, rolling_lagging as rolling_lagging
importlib.reload(rolling_lagging)
from rolling_lagging import lagging_rolling_generator, reconcile_driver_carteams

# Raw per-race columns that get lag / rolling-window versions.
lagroll_features = [
    "finishing_position", "starting_position", "points_position",
    "stage_1_position", "stage_2_position",
    "mid_ps", "closing_ps", "avg_ps",
    "BestLapRank", "OverAllAvgRank",
    "laps_completed", "laps_led", "points_earned",
    "fast_laps", "top15_laps", "rating",
]

# feature engineering, lagging & rolling averages
# Helper call order: df, features, sort_list, filter_list, windows_list, suffix, min_periods
# One row per call, as (sort_list, filter_list, windows_list, suffix). The rows run in
# the order listed, which is the order the calls were originally written in.
lagroll_jobs = [
    # directly recent races (momentum)
    (["driver_id"], ["driver_id"], [3, 5, 10], "general"),
    (["team_name"], ["team_name"], [3, 5, 10], "general_team"),
    (["team_name", "car_number"], ["team_name", "car_number"], [3, 5, 10], "general_carteam"),
    # most recent at track type
    (["driver_id"], ["driver_id", "track_type"], [3, 5, 10], "tracktype"),
    (["team_name"], ["team_name", "track_type"], [3, 5, 10], "tracktype_team"),
    (["team_name", "car_number"], ["team_name", "car_number", "track_type"], [3, 5, 10], "tracktype_carteam"),
    # most recent at specific track (shorter windows only)
    (["driver_id"], ["driver_id", "track_name"], [3, 5], "track"),
    (["team_name"], ["team_name", "track_name"], [3, 5], "track_team"),
    (["team_name", "car_number"], ["team_name", "car_number", "track_name"], [3, 5], "track_carteam"),
]

for sort_cols, filter_cols, window_sizes, feature_suffix in lagroll_jobs:
    # min_periods is 1 for every job
    df = lagging_rolling_generator(
        df, lagroll_features, sort_cols, filter_cols, window_sizes, feature_suffix, 1
    )

In [7]:
# Fill NaNs in the driver-level lag/rolling features from the carteam or team
# averages (as described by the original note; the helper lives in rolling_lagging).
# Each scope is reconciled with the same window sizes it was generated with.
reconcile_jobs = (
    ([3, 5, 10], "general"),
    ([3, 5, 10], "tracktype"),
    ([3, 5], "track"),
)

for window_sizes, scope_suffix in reconcile_jobs:
    df = reconcile_driver_carteams(df, lagroll_features, window_sizes, scope_suffix)

In [8]:
# dropping features irrelevant to benchmark model

# Identifier / context columns kept for visibility when reviewing df as csv.
# "finishing_position" is also the model target.
keep_cols = [
    "finishing_position", "race_id", "race_season", "race_name", "track_name", "race_date",
    "driver_fullname", "driver_id", "car_number", "team_name", "car_make",
    "crew_chief_fullname", "finishing_status"
]

# remove the actual in-race data per observation but keep P&Q
# These raw columns are excluded from the model inputs below; their lagged / rolling
# versions have different names and are therefore still selected.
in_race_leakage = [
    'finishing_position', 'laps_completed', 'laps_led', 'points_earned', 'playoff_points_earned', 
    'points_position', 'mid_ps', 'closing_ps', 'avg_ps', 'fast_laps', 'top15_laps', 'rating', 
    'stage_1_position', 'stage_2_position'
]

# keep engineered driver features for model, but not team / carteam features (those were only used to fill missing driver stats)
# NOTE: matching is by substring, so any column whose name merely contains a feature
# name is selected unless it is filtered out by one of the conditions below.
lagroll_cols = [
    col for col in df.columns
    if any(feat in col for feat in lagroll_features)
    and "_team" not in col
    and "_carteam" not in col
    and col not in in_race_leakage
]
print(lagroll_cols)

final_cols = keep_cols + lagroll_cols
# Take an explicit copy: without it `df` is a selection of the wider frame, and the
# column assignments made on it later raise pandas' SettingWithCopyWarning.
df = df[final_cols].copy()
# df.to_csv("training_ready_bench.csv", index=False)

['starting_position', 'BestLapRank', 'OverAllAvgRank', 'finishing_position_lag1_general', 'finishing_position_roll3_general', 'finishing_position_roll5_general', 'finishing_position_roll10_general', 'starting_position_lag1_general', 'starting_position_roll3_general', 'starting_position_roll5_general', 'starting_position_roll10_general', 'points_position_lag1_general', 'points_position_roll3_general', 'points_position_roll5_general', 'points_position_roll10_general', 'stage_1_position_lag1_general', 'stage_1_position_roll3_general', 'stage_1_position_roll5_general', 'stage_1_position_roll10_general', 'stage_2_position_lag1_general', 'stage_2_position_roll3_general', 'stage_2_position_roll5_general', 'stage_2_position_roll10_general', 'mid_ps_lag1_general', 'mid_ps_roll3_general', 'mid_ps_roll5_general', 'mid_ps_roll10_general', 'closing_ps_lag1_general', 'closing_ps_roll3_general', 'closing_ps_roll5_general', 'closing_ps_roll10_general', 'avg_ps_lag1_general', 'avg_ps_roll3_general', 'a

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from scipy.stats import spearmanr
import numpy as np

# Features and target
# Only rows with a known finishing position can be used for fitting.
df_train = df.dropna(subset=["finishing_position"]).copy()
X = df_train[lagroll_cols]
y = df_train["finishing_position"]
groups = df_train["race_id"]

# Define pipeline: impute -> scale -> model
pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("ridge", Ridge())
])

# Parameter grid for alpha
param_grid = {"ridge__alpha": [1000, 1400, 1600, 1800, 2000]}

# Grouped CV: all rows of one race stay in the same fold
cv = GroupKFold(n_splits=5)

# Grid search
grid = GridSearchCV(pipe, param_grid, scoring="neg_mean_squared_error", cv=cv)
grid.fit(X, y, groups=groups)

print("Best alpha:", grid.best_params_)
best_model = grid.best_estimator_

# Get coefficients (sorted by signed value, so only the largest positive ones are shown)
ridge_model = best_model.named_steps["ridge"]
coef = pd.Series(ridge_model.coef_, index=lagroll_cols)
print(coef.sort_values(ascending=False).head(20))

# Apply to full df
# Work on an explicit copy so the column assignments below never target a slice of
# another frame (which is what triggers pandas' SettingWithCopyWarning).
df = df.copy()
df["weighted_score_lr"] = best_model.predict(df[lagroll_cols])
df["pred_rank_lr"] = df.groupby("race_id")["weighted_score_lr"].rank(method="min", ascending=True)  
# ascending=True because lower predicted finish = better (P1)

def race_spearman(g):
    """Spearman correlation between predicted score and finishing position for one race.

    Returns NaN when every predicted score in the race is identical, since a rank
    correlation is undefined in that case.
    """
    if g["weighted_score_lr"].nunique() < 2:
        return np.nan
    return spearmanr(g["weighted_score_lr"], g["finishing_position"]).correlation

# NOTE: best_model was refit on all of df_train, so these correlations are computed on
# rows the model has already seen (in-sample); they are not an out-of-fold estimate.
# Only the two columns the helper reads are passed to apply; this keeps the grouping
# column out of the applied frame and avoids pandas' groupby.apply deprecation warning.
race_corrs = (
    df.dropna(subset=["finishing_position"])
      .groupby("race_id")[["weighted_score_lr", "finishing_position"]]
      .apply(race_spearman)
)

print("\nSpearman mean:", race_corrs.mean().round(3))
print("Spearman median:", race_corrs.median().round(3))
print("Number of races evaluated:", race_corrs.notna().sum())

df.to_csv("analysis_ready_bench.csv", index=False)

Best alpha: {'ridge__alpha': 1600}
starting_position                  1.093034
OverAllAvgRank                     0.473072
BestLapRank                        0.413505
mid_ps_roll10_general              0.348017
OverAllAvgRank_roll5_general       0.329482
laps_led_roll5_tracktype           0.263406
points_position_lag1_general       0.238216
mid_ps_lag1_general                0.212826
BestLapRank_roll5_track            0.208415
points_earned_roll3_general        0.196703
OverAllAvgRank_roll3_tracktype     0.195675
stage_1_position_roll10_general    0.192749
stage_1_position_lag1_tracktype    0.191542
points_position_roll10_general     0.190011
points_position_roll5_general      0.185861
avg_ps_roll5_track                 0.181618
OverAllAvgRank_roll5_track         0.178165
points_position_roll3_general      0.171581
laps_led_roll10_tracktype          0.171491
mid_ps_roll5_tracktype             0.164186
dtype: float64

Spearman mean: 0.437
Spearman median: 0.457
Number of races evaluated

  df["weighted_score_lr"] = best_model.predict(df[lagroll_cols])
  df["pred_rank_lr"] = df.groupby("race_id")["weighted_score_lr"].rank(method="min", ascending=True)
  .apply(race_spearman)
