In [1]:
import sys
from pathlib import Path
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

# make parent folder importable
parent_folder = Path.cwd().parent
sys.path.append(str(parent_folder))

# import config
from config import DATA_DIR

results_file = DATA_DIR / "10-25-arca-results.csv"
practice_file = DATA_DIR / "10-25-arca-practice.csv"
quali_file = DATA_DIR / "10-25-arca-quali.csv"

results = pd.read_csv(results_file)
practice = pd.read_csv(practice_file)
qualifying = pd.read_csv(quali_file)

In [2]:
df = results.copy()
df = pd.merge(df, practice, on=["race_id", "driver_id"], how="outer", suffixes=("", "practice"))
df = pd.merge(df, qualifying, on=["race_id", "driver_id"], how="outer", suffixes=("", "qualifying"))

In [3]:
# shows duplicates (same driver_id AND race_id in multiple rows)
dupes = df[df.duplicated(subset=["race_id", "driver_id"], keep=False)]
print(dupes.sort_values(["race_id", "driver_id"]))
# ensure no duplicates
assert df.duplicated(subset=["race_id", "driver_id"]).sum() == 0

Empty DataFrame
Columns: [race_id, race_season, race_name, track_name, race_date, finishing_position, starting_position, car_number, driver_fullname, sponsor, car_make, laps_completed, laps_led, finishing_status, points_earned, driver_id, team_name, track_type, track_length, division, Pos, No., Name, Sponsor, Best Tm, Best Speed, In Lap, Laps, Diff, BestLapRank, Number, FullName, Sponsorqualifying, BestLapTime, BestLapSpeed, InLap, Lapsqualifying, Diffqualifying]
Index: []

[0 rows x 38 columns]


In [4]:
import importlib, rolling_lagging as rolling_lagging
importlib.reload(rolling_lagging)
from rolling_lagging import lagging_rolling_generator, reconcile_driver_carteams

lagroll_features = [
    "finishing_position", 
    "starting_position",
    "laps_completed", 
    "laps_led", 
    "points_earned",
    "BestLapTime",
    "BestLapSpeed",
    "Diffqualifying",
    "Best Tm",
    "Best Speed",
    "Diff"
]

# feature engineering, lagging & rolling averages
# df, features, sort_list, filter_list, windows_list, suffix, min_periods = 1

# directly recent races (momentum):
df = lagging_rolling_generator(df, lagroll_features, ["driver_id"], ["driver_id"], [3, 5, 10], "general", 1)
df = lagging_rolling_generator(df, lagroll_features, ["team_name"], ["team_name"], [3, 5, 10], "general_team", 1)
df = lagging_rolling_generator(df, lagroll_features, ["team_name", "car_number"], ["team_name", "car_number"], [3, 5, 10], "general_carteam", 1)

# most recent at track type:
df = lagging_rolling_generator(df, lagroll_features, ["driver_id"], ["driver_id", "track_type"], [3, 5], "tracktype", 1)
df = lagging_rolling_generator(df, lagroll_features, ["team_name"], ["team_name", "track_type"], [3, 5], "tracktype_team", 1)
df = lagging_rolling_generator(df, lagroll_features, ["team_name", "car_number"], ["team_name", "car_number", "track_type"], [3, 5], "tracktype_carteam", 1)

# most recent at specific track:
df = lagging_rolling_generator(df, lagroll_features, ["driver_id"], ["driver_id", "track_name"], [3], "track", 1)
df = lagging_rolling_generator(df, lagroll_features, ["team_name"], ["team_name", "track_name"], [3], "track_team", 1)
df = lagging_rolling_generator(df, lagroll_features, ["team_name", "car_number"], ["team_name", "car_number", "track_name"], [3], "track_carteam", 1)

In [5]:
# fill NaNs in lagroll features with carteam or team averages
# df = reconcile_driver_carteams(df, lagroll_features, [3, 5, 10], "general")
df = reconcile_driver_carteams(df, lagroll_features, [3, 5, 10], "general")
df = reconcile_driver_carteams(df, lagroll_features, [3, 5], "tracktype")
df = reconcile_driver_carteams(df, lagroll_features, [3], "track")

In [6]:
# dropping features irrelevant to benchmark model

# keeping for visibility when reviewing df as csv
keep_cols = [
    # target attribute here (finishing position or points for now)
    "finishing_position", 
    # non-leaky identifiers
    "race_id", "race_season", "race_name", "track_name", "race_date",
    "driver_fullname", "driver_id", "car_number", "team_name", "car_make"
]

# remove the actual in-race data per observation but keep P&Q
in_race_leakage = [
    'finishing_position', 'laps_completed', 'laps_led', 'points_earned'
]

# keep engineered driver features for model, but not team / carteam features (those were only used to fill missing driver stats)
lagroll_cols = [
    col for col in df.columns
    if any(feat in col for feat in lagroll_features)
    and "_team" not in col
    and "_carteam" not in col
    and col not in in_race_leakage
]
print(lagroll_cols)

# create a dataframe for finishing position model training
finish_final_cols = keep_cols + lagroll_cols
finishing_df = df[finish_final_cols]

['starting_position', 'Best Tm', 'Best Speed', 'Diff', 'BestLapTime', 'BestLapSpeed', 'Diffqualifying', 'finishing_position_lag1_general', 'finishing_position_roll3_general', 'finishing_position_roll5_general', 'finishing_position_roll10_general', 'starting_position_lag1_general', 'starting_position_roll3_general', 'starting_position_roll5_general', 'starting_position_roll10_general', 'laps_completed_lag1_general', 'laps_completed_roll3_general', 'laps_completed_roll5_general', 'laps_completed_roll10_general', 'laps_led_lag1_general', 'laps_led_roll3_general', 'laps_led_roll5_general', 'laps_led_roll10_general', 'points_earned_lag1_general', 'points_earned_roll3_general', 'points_earned_roll5_general', 'points_earned_roll10_general', 'BestLapTime_lag1_general', 'BestLapTime_roll3_general', 'BestLapTime_roll5_general', 'BestLapTime_roll10_general', 'BestLapSpeed_lag1_general', 'BestLapSpeed_roll3_general', 'BestLapSpeed_roll5_general', 'BestLapSpeed_roll10_general', 'Diffqualifying_lag1

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from scipy.stats import spearmanr
import numpy as np

# Linear Regression Model Function

def lr_model(training_df, lagroll_cols, target_col):
    # Features and target
    df_train = training_df.dropna(subset=[target_col]).copy()
    X = df_train[lagroll_cols]
    y = df_train[target_col]
    groups = df_train["race_id"]

    # Define pipeline: impute -> scale -> model
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("ridge", Ridge())
    ])

    # Parameter grid for alpha
    if target_col == "finishing_position":
        param_grid = {"ridge__alpha": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
    else:
        raise ValueError("Invalid target column specified.")

    # Grouped CV
    cv = GroupKFold(n_splits=5)

    # Grid search
    grid = GridSearchCV(pipe, param_grid, scoring="neg_mean_squared_error", cv=cv)
    grid.fit(X, y, groups=groups)

    print("Best alpha:", grid.best_params_)
    best_model = grid.best_estimator_

    # Get coefficients
    ridge_model = best_model.named_steps["ridge"]
    coef = pd.Series(ridge_model.coef_, index=lagroll_cols)
    print(coef.sort_values(ascending=False).head(20))

    # Apply to full df
    training_df["weighted_score_lr"] = best_model.predict(training_df[lagroll_cols])
    training_df["pred_rank_lr"] = training_df.groupby("race_id")["weighted_score_lr"].rank(method="min", ascending=True)  

    def race_spearman(g):
        if g["weighted_score_lr"].nunique() < 2:
            return np.nan
        return spearmanr(g["weighted_score_lr"], g[target_col]).correlation

    race_corrs = (
        training_df.dropna(subset=[target_col])
        .groupby("race_id")
        .apply(race_spearman)
    )

    print("\nSpearman mean:", race_corrs.mean().round(3))
    print("Spearman median:", race_corrs.median().round(3))
    print("Number of races evaluated:", race_corrs.notna().sum())

    # remove lagroll columns before saving for comparison (cuts down csv file size significantly)
    training_df = training_df.drop(columns=lagroll_cols)

    if target_col == "finishing_position":
        training_df.to_csv("arca_lr_analysis_ready_finishing.csv", index=False)
    else: 
        raise ValueError("Invalid target column specified.")

    return None


# create the finishing position model
lr_model(finishing_df, lagroll_cols, "finishing_position")

Best alpha: {'ridge__alpha': 90}
starting_position                    4.430637
finishing_position_lag1_track        0.978595
laps_completed_roll3_track           0.637706
BestLapSpeed                         0.605975
BestLapTime                          0.593498
BestLapSpeed_lag1_general            0.586826
starting_position_roll5_tracktype    0.519929
Best Speed                           0.437428
starting_position_roll3_tracktype    0.430352
Diff                                 0.402776
laps_completed_lag1_tracktype        0.399187
starting_position_roll5_general      0.392216
finishing_position_roll3_track       0.382833
Diffqualifying_roll3_tracktype       0.380491
Diffqualifying_roll5_tracktype       0.369762
laps_led_roll3_tracktype             0.324416
Diff_roll5_general                   0.316379
laps_led_lag1_track                  0.313517
Best Speed_lag1_tracktype            0.301827
laps_led_roll5_general               0.277302
dtype: float64

Spearman mean: 0.693
Spearman m

  training_df["weighted_score_lr"] = best_model.predict(training_df[lagroll_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df["weighted_score_lr"] = best_model.predict(training_df[lagroll_cols])
  training_df["pred_rank_lr"] = training_df.groupby("race_id")["weighted_score_lr"].rank(method="min", ascending=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df["pred_rank_lr"] = training_df.groupby("race_id")["weighted_score_lr"].rank(method="min", ascending=True)
  .apply(race_spearman)
