In [1]:
import pandas as pd
import sys
from pathlib import Path

# Make the parent folder importable so that `config` (imported below) can be
# resolved. Presumably config.py lives one level above this notebook's working
# directory -- TODO confirm.
parent_folder = Path.cwd().parent
# Guard the append so re-running this cell does not pile up duplicate
# sys.path entries.
if str(parent_folder) not in sys.path:
    sys.path.append(str(parent_folder))

# Project-level configuration: DATA_DIR is the base folder for data files.
from config import DATA_DIR

# NOTE(review): this path is resolved against the current working directory,
# whereas the Racing Insights CSV later in the notebook is read from DATA_DIR.
# Confirm that the difference is intentional.
df = pd.read_csv("analysis_ready_bench.csv")

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Race IDs offered for each series.
# TODO: to be updated with a full race-name dictionary soon for better UX.
# Xfinity and Truck data are not implemented yet, but the structure is in
# place (their ID lists are placeholders).
SERIES_RACE_IDS = {
    "Cup": [5575, 5577, 5578, 5567, 5579, 5581, 5582, 5580],
    "Xfinity": [5580, 5581, 5582, 5583],
    "Truck": [5580, 5581, 5582, 5583],
}

series_dropdown = widgets.Dropdown(
    options=list(SERIES_RACE_IDS),
    description="Series:"
)

# Created once, seeded with the race IDs of the initially selected series.
race_id_dropdown = widgets.Dropdown(
    options=SERIES_RACE_IDS[series_dropdown.value],
    description="Race ID:"
)


def _sync_race_ids(change):
    """Swap the race-ID choices to match the newly selected series.

    `change` is the dict ipywidgets passes to observers; `change["new"]`
    holds the series name that was just picked.
    """
    race_id_dropdown.options = SERIES_RACE_IDS[change["new"]]


# A plain `if series_dropdown.value == ...` only runs once, when the cell is
# executed, so the race list would never follow a later series change.
# Observing the value keeps the two dropdowns in sync.
series_dropdown.observe(_sync_race_ids, names="value")

display(series_dropdown, race_id_dropdown)

Dropdown(description='Series:', options=('Cup', 'Xfinity', 'Truck'), value='Cup')

Dropdown(description='Race ID:', options=(5580, 5581, 5582, 5583), value=5580)

In [3]:
# Race to inspect, taken from the Race ID dropdown. For a reproducible
# "Run All", hard-code an ID instead, e.g.:
# race_id_to_check = 5580
race_id_to_check = race_id_dropdown.value

race_df = df[df["race_id"] == race_id_to_check].copy()

# Absolute error between predicted rank and actual finish.
race_df["abs_error_weighted"] = (race_df["pred_rank_lr"] - race_df["finishing_position"]).abs()

# Summary metrics (both are NaN when the selection matched no rows).
mean_error = race_df["abs_error_weighted"].mean()
median_error = race_df["abs_error_weighted"].median()

# Sort by predicted rank (best at the top).
race_df = race_df.sort_values("pred_rank_lr")

# Columns to display.
cols_to_show = [
    "race_id",
    "driver_fullname",
    # "team_name",
    # "weighted_score_lr_flipped",
    "pred_rank_lr",
    "finishing_position"  # actual result, for comparison
]

if race_df.empty:
    # Without this guard a race ID that is absent from df would print
    # "nan" metrics and an empty table with no explanation.
    print(f"No rows found for race {race_id_to_check}; nothing to evaluate.")
else:
    print(f"Weighted model for race {race_id_to_check}:")
    print(f"Average (mean) error: {mean_error:.2f}")
    print(f"Median error: {median_error:.2f}")
    print(race_df[cols_to_show].head(40))

# Optional export of the selected race:
# race_df.to_csv("race_db.csv", index=False)

Weighted model for race 5583:
Average (mean) error: 7.30
Median error: 5.00
      race_id      driver_fullname  pred_rank_lr  finishing_position
1290     5583        William Byron           1.0                12.0
303      5583        Tyler Reddick           2.0                 8.0
4678     5583          Ryan Blaney           3.0                36.0
1006     5583          Kyle Larson           4.0                 1.0
1895     5583     Christopher Bell           5.0                29.0
1432     5583          Alex Bowman           6.0                 2.0
1575     5583         Denny Hamlin           7.0                 5.0
1148     5583        Chase Elliott           8.0                18.0
1753     5583        Chase Briscoe           9.0                 4.0
2954     5583       Chris Buescher          10.0                 6.0
127      5583        Bubba Wallace          11.0                 3.0
2188     5583      AJ Allmendinger          12.0                 7.0
2001     5583             T

In [4]:
# Read the Racing Insights CSV from the configured data directory.
racing_insights_file = DATA_DIR / "10-20-racing-insights.csv"
racing_insights = pd.read_csv(racing_insights_file)

# Cast the join keys to the dtypes used in df so both sides of the merge
# compare like with like.
ri_merge_keys = ["race_id", "driver_id"]
racing_insights = racing_insights.astype(
    {key: df[key].dtype for key in ri_merge_keys}
)

# Outer join keeps rows that exist on only one side; column names that clash
# get the "_racing_insights" suffix on the right-hand side.
df = df.merge(
    racing_insights,
    how="outer",
    on=ri_merge_keys,
    suffixes=("", "_racing_insights"),
)

# Every (race_id, driver_id) pair must appear exactly once after the merge.
assert not df.duplicated(subset=ri_merge_keys).any()

# Restrict to the 2025 season; .copy() so later column assignments do not
# write into a view of df.
df_2025 = df.loc[df["race_season"] == 2025].copy()

print(df_2025.columns.tolist())

['finishing_position', 'race_id', 'race_season', 'race_name', 'track_name', 'race_date', 'driver_fullname', 'driver_id', 'car_number', 'team_name', 'car_make', 'crew_chief_fullname', 'finishing_status', 'starting_position', 'BestLapRank', 'OverAllAvgRank', 'finishing_position_lag1_general', 'finishing_position_roll3_general', 'finishing_position_roll5_general', 'finishing_position_roll10_general', 'starting_position_lag1_general', 'starting_position_roll3_general', 'starting_position_roll5_general', 'starting_position_roll10_general', 'points_position_lag1_general', 'points_position_roll3_general', 'points_position_roll5_general', 'points_position_roll10_general', 'stage_1_position_lag1_general', 'stage_1_position_roll3_general', 'stage_1_position_roll5_general', 'stage_1_position_roll10_general', 'stage_2_position_lag1_general', 'stage_2_position_roll3_general', 'stage_2_position_roll5_general', 'stage_2_position_roll10_general', 'mid_ps_lag1_general', 'mid_ps_roll3_general', 'mid_ps_

In [5]:
from scipy.stats import spearmanr
import numpy as np
import pandas as pd

def _spearman_or_nan(predicted, actual):
    """Return the Spearman correlation of two columns, or NaN if `predicted` is constant."""
    if predicted.nunique() > 1:
        return spearmanr(predicted, actual).correlation
    return np.nan


def race_spearman_corrs(g):
    """Spearman rank correlations between predictions and results for one race.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of a single race. Must contain the columns "pred_rank_lr",
        "pred_finish_ri" and "finishing_position".

    Returns
    -------
    pandas.Series
        Always indexed by "my_model_corr" and "ri_corr", so that a
        groupby/apply over races yields a frame with a stable schema.
        An entry is NaN when the correlation is undefined, i.e. when the
        finishing positions or the corresponding prediction column hold
        at most one distinct value.
    """
    actual = g["finishing_position"]

    # A rank correlation needs variation in the actual results.
    if actual.nunique() <= 1:
        return pd.Series({"my_model_corr": np.nan, "ri_corr": np.nan})

    return pd.Series({
        "my_model_corr": _spearman_or_nan(g["pred_rank_lr"], actual),
        "ri_corr": _spearman_or_nan(g["pred_finish_ri"], actual),
    })

# Columns the per-race scorer reads. Selecting them explicitly keeps the
# grouping key ("race_id") out of the frames handed to the function, which
# avoids the deprecation warning newer pandas versions emit when
# groupby.apply operates on the grouping columns.
corr_input_cols = ["pred_rank_lr", "pred_finish_ri", "finishing_position"]

# One row of correlations per race; rows without a recorded finishing
# position are dropped first.
race_corrs_2025 = (
    df_2025.dropna(subset=["finishing_position"])
    .groupby("race_id")[corr_input_cols]
    .apply(race_spearman_corrs)
)

  .apply(race_spearman_corrs)


In [6]:
# Per-race table first, rounded for readability.
print("Race-by-race Spearman Correlations (2025):")
print(race_corrs_2025.round(3))

# Season-wide averages of the per-race correlations.
my_model_avg_corr = race_corrs_2025["my_model_corr"].mean()
ri_avg_corr = race_corrs_2025["ri_corr"].mean()

print("\nAverage correlations across 2025:")
print("My model:", my_model_avg_corr.round(3))
print("Racing Insights:", ri_avg_corr.round(3))

Race-by-race Spearman Correlations (2025):
         my_model_corr  ri_corr
race_id                        
5546             0.324    0.208
5547             0.180    0.243
5548             0.286    0.359
5549             0.328    0.403
5550             0.663    0.721
5551             0.462    0.479
5552             0.452    0.441
5553             0.664    0.579
5554             0.017    0.042
5555             0.233    0.301
5556             0.610    0.555
5557             0.333    0.220
5558             0.500    0.427
5563             0.265    0.285
5564             0.734    0.712
5565             0.464    0.451
5566             0.710    0.633
5567             0.656    0.691
5568             0.667    0.665
5569             0.391    0.412
5570             0.530    0.518
5571             0.446    0.413
5572             0.339    0.309
5573             0.452    0.484
5574             0.101    0.099
5575             0.547    0.472
5576            -0.051   -0.185
5577             0.549    0.5

In [7]:
# Per-row absolute error of each prediction against the actual finish.
actual_finish = df_2025["finishing_position"]
df_2025["abs_error_my_model"] = df_2025["pred_rank_lr"].sub(actual_finish).abs()
df_2025["abs_error_ri"] = df_2025["pred_finish_ri"].sub(actual_finish).abs()

# NOTE(review): .mean() skips missing values, so if the two prediction
# columns are missing on different rows, the two averages below are taken
# over different sets of rows -- confirm coverage before comparing them.
print("\nMean Absolute Error (lower = better):")
print("My model:", df_2025["abs_error_my_model"].mean().round(2))
print("Racing Insights:", df_2025["abs_error_ri"].mean().round(2))


Mean Absolute Error (lower = better):
My model: 9.16
Racing Insights: 9.29
