In [18]:
import sys
from pathlib import Path
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

# make parent folder importable
parent_folder = Path.cwd().parent
sys.path.append(str(parent_folder))

# import config
from config import DATA_DIR

series_dropdown = widgets.Dropdown(
    options=["Cup", "Xfinity", "Trucks"],
    description="Series: "
)
display(series_dropdown)

Dropdown(description='Series: ', options=('Cup', 'Xfinity', 'Trucks'), value='Cup')

In [19]:
if series_dropdown.value == "Cup":
    results_file = DATA_DIR / "10-20-results.csv"
    misc_file = DATA_DIR / "10-20-misc.csv"
    stages_file = DATA_DIR / "10-20-stages.csv"
    practice_file = DATA_DIR / "10-20-practice.csv"

elif series_dropdown.value == "Xfinity":
    # print("No Xfinity data files available yet.")
    results_file = DATA_DIR / "10-20-xfinity-results.csv"
    misc_file = DATA_DIR / "10-20-xfinity-misc.csv"
    stages_file = DATA_DIR / "10-20-xfinity-stages.csv"
    practice_file = DATA_DIR / "10-20-xfinity-practice.csv"

elif series_dropdown.value == "Trucks":
    # print("No Truck data files available yet.")
    results_file = DATA_DIR / "10-20-trucks-results.csv"
    misc_file = DATA_DIR / "10-20-trucks-misc.csv"
    stages_file = DATA_DIR / "10-20-trucks-stages.csv"
    practice_file = DATA_DIR / "10-20-trucks-practice.csv"

else: 
    print("An error selecting series occurred.")

results = pd.read_csv(results_file)
misc = pd.read_csv(misc_file)
stages = pd.read_csv(stages_file)
practice = pd.read_csv(practice_file)

In [20]:
# aggregation of practice data to eliminate 'duplicate' observations 
# the only race consistently having multiple practices is the Daytona500, but I think this is best practice for now
practice_agg = (
    practice.groupby(["race_id", "driver_id"])
    .agg({
        "BestLapRank" : "mean",
        "OverAllAvgRank" : "mean",
        "Con5LapRank" : "mean",
        "Con10LapRank" : "mean",
        "Con15LapRank" : "mean",
        "Con20LapRank" : "mean",
        "Con25LapRank" : "mean",
        "Con30LapRank" : "mean"
    })
)

In [21]:
# pivoting the stage dataset to eliminate 'duplicate' observations
# doing this rather than aggregation since I want to preserve stage 1 and stage 2 as seperate parts of the race (not averaged together)
stages_wide = stages.pivot_table(
    index=["race_id", "driver_id"],
    columns="stage_number",
    values=["position", "stage_points"]
)

stages_wide.columns = [
    f"stage_{col[1]}_{col[0]}" for col in stages_wide.columns.to_flat_index()
]

stages_wide = stages_wide.reset_index()

In [22]:
# data merging
df = results.copy()
df = pd.merge(df, misc, on=["race_id", "driver_id"], how="outer", suffixes=("", "_misc"))
df = pd.merge(df, stages_wide, on=["race_id", "driver_id"], how="outer")
df = pd.merge(df, practice_agg, on=["race_id", "driver_id"], how="outer")

In [23]:
# shows duplicates (same driver_id AND race_id in multiple rows)
dupes = df[df.duplicated(subset=["race_id", "driver_id"], keep=False)]
print(dupes.sort_values(["race_id", "driver_id"]))
# ensure no duplicates
assert df.duplicated(subset=["race_id", "driver_id"]).sum() == 0

Empty DataFrame
Columns: [race_id, race_season, race_name, track_name, race_date, finishing_position, starting_position, car_number, driver_fullname, driver_id, team_name, car_make, sponsor, crew_chief_fullname, laps_completed, finishing_status, laps_led, times_led, points_earned, diff_laps, diff_time, playoff_points_earned, points_position, points_delta, disqualified, qualifying_order, qualifying_position, qualifying_speed, track_type, track_length, race_name_misc, sch_laps, act_laps, start_ps, mid_ps, ps, closing_ps, closing_laps_diff, best_ps, worst_ps, avg_ps, passes_gf, passing_diff, passed_gf, quality_passes, fast_laps, top15_laps, lead_laps, laps, rating, stage_1_position, stage_2_position, stage_1_stage_points, stage_2_stage_points, BestLapRank, OverAllAvgRank, Con5LapRank, Con10LapRank, Con15LapRank, Con20LapRank, Con25LapRank, Con30LapRank]
Index: []

[0 rows x 62 columns]


In [24]:
import importlib, rolling_lagging as rolling_lagging
importlib.reload(rolling_lagging)
from rolling_lagging import lagging_rolling_generator, reconcile_driver_carteams

lagroll_features = [
    "finishing_position", 
    "starting_position",
    "points_position", 
    "stage_1_position", 
    "stage_2_position", 
    "mid_ps", 
    "closing_ps", 
    "avg_ps", 
    "BestLapRank", 
    "OverAllAvgRank",
    "laps_completed", 
    "laps_led", 
    "points_earned", 
    "fast_laps", 
    "top15_laps", 
    "rating"
]

# feature engineering, lagging & rolling averages
# df, features, sort_list, filter_list, windows_list, suffix, min_periods = 1

# directly recent races (momentum):
df = lagging_rolling_generator(df, lagroll_features, ["driver_id"], ["driver_id"], [3, 5, 10], "general", 1)
df = lagging_rolling_generator(df, lagroll_features, ["team_name"], ["team_name"], [3, 5, 10], "general_team", 1)
df = lagging_rolling_generator(df, lagroll_features, ["team_name", "car_number"], ["team_name", "car_number"], [3, 5, 10], "general_carteam", 1)

# most recent at track type:
df = lagging_rolling_generator(df, lagroll_features, ["driver_id"], ["driver_id", "track_type"], [3, 5, 10], "tracktype", 1)
df = lagging_rolling_generator(df, lagroll_features, ["team_name"], ["team_name", "track_type"], [3, 5, 10], "tracktype_team", 1)
df = lagging_rolling_generator(df, lagroll_features, ["team_name", "car_number"], ["team_name", "car_number", "track_type"], [3, 5, 10], "tracktype_carteam", 1)

# most recent at specific track:
df = lagging_rolling_generator(df, lagroll_features, ["driver_id"], ["driver_id", "track_name"], [3, 5], "track", 1)
df = lagging_rolling_generator(df, lagroll_features, ["team_name"], ["team_name", "track_name"], [3, 5], "track_team", 1)
df = lagging_rolling_generator(df, lagroll_features, ["team_name", "car_number"], ["team_name", "car_number", "track_name"], [3, 5], "track_carteam", 1)

In [25]:
# fill NaNs in lagroll features with carteam or team averages
# df = reconcile_driver_carteams(df, lagroll_features, [3, 5, 10], "general")
df = reconcile_driver_carteams(df, lagroll_features, [3, 5, 10], "general")
df = reconcile_driver_carteams(df, lagroll_features, [3, 5, 10], "tracktype")
df = reconcile_driver_carteams(df, lagroll_features, [3, 5], "track")

In [26]:
# create points_earned rank feature for points model
df["points_earned_rank"] = df.groupby("race_id")["points_earned"].rank(method="min", ascending=False)  

  df["points_earned_rank"] = df.groupby("race_id")["points_earned"].rank(method="min", ascending=False)


In [27]:
# dropping features irrelevant to benchmark model

# keeping for visibility when reviewing df as csv
keep_cols = [
    # target attribute here (finishing position or points for now)
    "finishing_position", 
    # non-leaky identifiers
    "race_id", "race_season", "race_name", "track_name", "race_date",
    "driver_fullname", "driver_id", "car_number", "team_name", "car_make",
    "crew_chief_fullname"
]

# remove the actual in-race data per observation but keep P&Q
in_race_leakage = [
    'finishing_position', 'laps_completed', 'laps_led', 
    'points_earned', 'playoff_points_earned', 'points_earned_rank',
    'points_position', 'mid_ps', 'closing_ps', 'avg_ps', 'fast_laps', 'top15_laps', 'rating', 
    'stage_1_position', 'stage_2_position'
]

# keep engineered driver features for model, but not team / carteam features (those were only used to fill missing driver stats)
lagroll_cols = [
    col for col in df.columns
    if any(feat in col for feat in lagroll_features)
    and "_team" not in col
    and "_carteam" not in col
    and col not in in_race_leakage
]
print(lagroll_cols)

# create a dataframe for finishing position model training
finish_final_cols = keep_cols + lagroll_cols
finishing_df = df[finish_final_cols]

# edit for points earned model
keep_cols.remove("finishing_position")
keep_cols.insert(0, "points_earned_rank")

points_final_cols = keep_cols + lagroll_cols
points_df = df[points_final_cols]

['starting_position', 'BestLapRank', 'OverAllAvgRank', 'finishing_position_lag1_general', 'finishing_position_roll3_general', 'finishing_position_roll5_general', 'finishing_position_roll10_general', 'starting_position_lag1_general', 'starting_position_roll3_general', 'starting_position_roll5_general', 'starting_position_roll10_general', 'points_position_lag1_general', 'points_position_roll3_general', 'points_position_roll5_general', 'points_position_roll10_general', 'stage_1_position_lag1_general', 'stage_1_position_roll3_general', 'stage_1_position_roll5_general', 'stage_1_position_roll10_general', 'stage_2_position_lag1_general', 'stage_2_position_roll3_general', 'stage_2_position_roll5_general', 'stage_2_position_roll10_general', 'mid_ps_lag1_general', 'mid_ps_roll3_general', 'mid_ps_roll5_general', 'mid_ps_roll10_general', 'closing_ps_lag1_general', 'closing_ps_roll3_general', 'closing_ps_roll5_general', 'closing_ps_roll10_general', 'avg_ps_lag1_general', 'avg_ps_roll3_general', 'a

In [28]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from scipy.stats import spearmanr
import numpy as np

# Linear Regression Model Function

def lr_model(training_df, lagroll_cols, target_col):
    # Features and target
    df_train = training_df.dropna(subset=[target_col]).copy()
    X = df_train[lagroll_cols]
    y = df_train[target_col]
    groups = df_train["race_id"]

    # Define pipeline: impute -> scale -> model
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("ridge", Ridge())
    ])

    # Parameter grid for alpha
    if target_col == "finishing_position":
        param_grid = {"ridge__alpha": [800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200]}
    elif target_col == "points_earned_rank":
        param_grid = {"ridge__alpha": [100, 150, 200, 250, 300, 350, 400, 450, 500]}
    else:
        raise ValueError("Invalid target column specified.")

    # Grouped CV
    cv = GroupKFold(n_splits=5)

    # Grid search
    grid = GridSearchCV(pipe, param_grid, scoring="neg_mean_squared_error", cv=cv)
    grid.fit(X, y, groups=groups)

    print("Best alpha:", grid.best_params_)
    best_model = grid.best_estimator_

    # Get coefficients
    ridge_model = best_model.named_steps["ridge"]
    coef = pd.Series(ridge_model.coef_, index=lagroll_cols)
    print(coef.sort_values(ascending=False).head(20))

    # Apply to full df
    training_df["weighted_score_lr"] = best_model.predict(training_df[lagroll_cols])
    training_df["pred_rank_lr"] = training_df.groupby("race_id")["weighted_score_lr"].rank(method="min", ascending=True)  

    def race_spearman(g):
        if g["weighted_score_lr"].nunique() < 2:
            return np.nan
        return spearmanr(g["weighted_score_lr"], g[target_col]).correlation

    race_corrs = (
        training_df.dropna(subset=[target_col])
        .groupby("race_id")
        .apply(race_spearman)
    )

    print("\nSpearman mean:", race_corrs.mean().round(3))
    print("Spearman median:", race_corrs.median().round(3))
    print("Number of races evaluated:", race_corrs.notna().sum())

    # remove lagroll columns before saving for comparison (cuts down csv file size significantly)
    training_df = training_df.drop(columns=lagroll_cols)

    if target_col == "finishing_position":
        training_df.to_csv("lr_analysis_ready_finishing.csv", index=False)
    elif target_col == "points_earned_rank":
        training_df.to_csv("lr_analysis_ready_points.csv", index=False)
    else: 
        raise ValueError("Invalid target column specified.")

    return None


# create the finishing position model
lr_model(finishing_df, lagroll_cols, "finishing_position")

# create the points earned model
lr_model(points_df, lagroll_cols, "points_earned_rank")

Best alpha: {'ridge__alpha': 800}
starting_position                    1.554567
BestLapRank                          0.612438
stage_2_position_lag1_tracktype      0.553053
OverAllAvgRank                       0.422822
finishing_position_roll10_general    0.421577
top15_laps_lag1_general              0.323240
mid_ps_lag1_general                  0.316114
finishing_position_roll5_general     0.302046
stage_1_position_roll3_general       0.288527
points_position_lag1_general         0.276572
starting_position_roll5_general      0.261094
mid_ps_roll3_general                 0.232248
stage_1_position_roll10_general      0.229305
OverAllAvgRank_roll5_track           0.226007
points_position_lag1_tracktype       0.220126
stage_1_position_roll5_tracktype     0.211577
points_earned_roll5_general          0.210358
closing_ps_roll5_track               0.207059
OverAllAvgRank_lag1_tracktype        0.200477
laps_led_roll3_tracktype             0.200477
dtype: float64

Spearman mean: 0.563
Spearman 

  training_df["weighted_score_lr"] = best_model.predict(training_df[lagroll_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df["weighted_score_lr"] = best_model.predict(training_df[lagroll_cols])
  training_df["pred_rank_lr"] = training_df.groupby("race_id")["weighted_score_lr"].rank(method="min", ascending=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df["pred_rank_lr"] = training_df.groupby("race_id")["weighted_score_lr"].rank(method="min", ascending=True)
  .apply(race_spearman)


Best alpha: {'ridge__alpha': 300}
starting_position                    1.772639
points_position_lag1_general         1.582085
stage_2_position_lag1_tracktype      0.773615
points_position_roll3_general        0.552412
BestLapRank                          0.524049
top15_laps_lag1_general              0.471459
mid_ps_lag1_general                  0.446934
stage_1_position_roll3_general       0.382080
points_position_lag1_tracktype       0.364341
stage_1_position_roll5_tracktype     0.354610
starting_position_roll5_general      0.352878
stage_1_position_roll10_tracktype    0.336766
OverAllAvgRank                       0.330504
fast_laps_roll5_general              0.310462
mid_ps_roll3_general                 0.304740
BestLapRank_roll3_track              0.298073
avg_ps_roll10_tracktype              0.291333
laps_led_roll5_tracktype             0.278124
closing_ps_roll5_general             0.254533
top15_laps_roll3_tracktype           0.251430
dtype: float64

Spearman mean: 0.688
Spearman 

  training_df["weighted_score_lr"] = best_model.predict(training_df[lagroll_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df["weighted_score_lr"] = best_model.predict(training_df[lagroll_cols])
  training_df["pred_rank_lr"] = training_df.groupby("race_id")["weighted_score_lr"].rank(method="min", ascending=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df["pred_rank_lr"] = training_df.groupby("race_id")["weighted_score_lr"].rank(method="min", ascending=True)
  .apply(race_spearman)
