In [1]:
# load data
import pandas as pd
import numpy as np

results_file = "9-18-results.csv"
misc_file = "9-18-misc.csv"
stages_file = "9-18-stages.csv"
practice_file = "9-18-practice.csv"

results = pd.read_csv(results_file)
misc = pd.read_csv(misc_file)
stages = pd.read_csv(stages_file)
practice = pd.read_csv(practice_file)

In [2]:
# aggregation of practice data to eliminate 'duplicate' observations 
# the only race consistently having multiple practices is the Daytona500, but I think this is best practice for now
practice_agg = (
    practice.groupby(["race_id", "driver_id"])
    .agg({
        "BestLapRank" : "mean",
        "OverAllAvgRank" : "mean",
        "Con5LapRank" : "mean",
        "Con10LapRank" : "mean",
        "Con15LapRank" : "mean",
        "Con20LapRank" : "mean",
        "Con25LapRank" : "mean",
        "Con30LapRank" : "mean"
    })
)

In [3]:
# pivoting the stage dataset to eliminate 'duplicate' observations
# doing this rather than aggregation since I want to preserve stage 1 and stage 2 as seperate parts of the race (not averaged together)
stages_wide = stages.pivot_table(
    index=["race_id", "driver_id"],
    columns="stage_number",
    values=["position", "stage_points"]
)

stages_wide.columns = [
    f"stage_{col[1]}_{col[0]}" for col in stages_wide.columns.to_flat_index()
]

stages_wide = stages_wide.reset_index()

In [4]:
# data merging
df = results.copy()
df = pd.merge(df, misc, on=["race_id", "driver_id"], how="outer", suffixes=("", "_misc"))
df = pd.merge(df, stages_wide, on=["race_id", "driver_id"], how="outer")
df = pd.merge(df, practice_agg, on=["race_id", "driver_id"], how="outer")

In [5]:
# shows duplicates (same driver_id AND race_id in multiple rows)
# dupes = df[df.duplicated(subset=["race_id", "driver_id"], keep=False)]
# print(dupes.sort_values(["race_id", "driver_id"]))
# ensure no duplicates
assert df.duplicated(subset=["race_id", "driver_id"]).sum() == 0

In [None]:
# feature engineering - general classification features, improving features
# creating finish-related attributes for classification model
df["top20"] = (df["finishing_position"] <= 20).astype(int)
df["top10"] = (df["finishing_position"] <= 10).astype(int)
df["top5"] = (df["finishing_position"] <= 5).astype(int)
df["win"] = (df["finishing_position"] == 1).astype(int)
df["dnf"] = (df["finishing_status"] != "Running").astype(int)
df["crash"] = df["finishing_status"].isin(["Accident", "DVP"]).astype(int)

# stage-related attributes
df["stage_win"] = (df["stage_1_position"] == 1) | (df["stage_2_position"] == 1) | (df["stage_3_position"] == 1)
df["stage_win"] = df["stage_win"].astype(int)
df["got_stage_points"] = (df["stage_1_stage_points"] > 0) | (df["stage_2_stage_points"] > 0) | (df["stage_3_stage_points"] > 0)
df["got_stage_points"] = df["got_stage_points"].astype(int)

# changing laps_completed to percentage of total laps
df["laps_completed_pct"] = df["laps_completed"] / df["act_laps"]

# check whether driver has at least 3 starts with current team (flags new drivers or drivers switching teams)
df["has_experience_with_team"] = df.groupby(["driver_id", "team_name"]).cumcount() >= 3 
df["has_experience_with_team"] = df["has_experience_with_team"].astype(int)

In [7]:
import importlib, rolling_lagging
importlib.reload(rolling_lagging)
from rolling_lagging import lagging_rolling_features, lagging_rolling_track_type_features, lagging_rolling_track_features

lagroll_features = [
    "finishing_position", "laps_completed_pct", "laps_led", "times_led",
    "points_earned", "points_position", "stage_1_position", "stage_2_position",
    "stage_3_position", "stage_1_stage_points", "stage_2_stage_points", "stage_3_stage_points",
    "mid_ps", "closing_ps", "closing_laps_diff", "avg_ps", "passing_diff", "fast_laps",
    "top15_laps", "rating", "dnf", "crash", "top20", "top10", "top5", "win",
    "stage_win", "got_stage_points"
]

df = lagging_rolling_features(df, lagroll_features)
df = lagging_rolling_track_type_features(df, lagroll_features)
df = lagging_rolling_track_features(df, lagroll_features)

In [8]:
# commented out attributes are from Stages and Misc (already dropped due to aggregation/pivoting) but want to keep for reference
drop_cols_noise_duplicates = [
    "race_name", "car_number", "sponsor", "laps_completed", "points_delta", "disqualified", "qualifying_position",
    # "vehicle_number", "full_name", 
    "race_name_misc", "sch_laps", "start_ps", "ps", "best_ps", "worst_ps", "lead_laps", "laps", 
    # "Number", "FullName", "Manufacturer", "Sponsor", "BestLapTime", "OverAllAvg", "Con5Lap", 
    # "Con10Lap", "Con15Lap", "Con20Lap", "Con25Lap", "Con30Lap"
    ]

df = df.drop(columns=drop_cols_noise_duplicates)

In [9]:
from xgboost import XGBClassifier, XGBRegressor 
# train-test split 
import importlib, utils 
importlib.reload(utils) 
from utils import chronological_split 

target_col = "finishing_position" 

# call split function (ensures training data is always earlier than testing data)
X_train, X_test, y_train, y_test = chronological_split(df, target_col, 0.2)

In [24]:
# Classification model for categorical targets, regression model for continuous targets

if target_col in ["top20", "top10", "top5", "win", "dnf", "crash", "got_stage_points", "stage_win"]:
    pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    model = XGBClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, tree_method="hist",
        enable_categorical=True, scale_pos_weight=pos_weight, random_state=42
    )
    target_type = "classification"
    
else:
    for col in X_train.select_dtypes(include=["category", "object"]).columns:
        X_train[col] = X_train[col].astype("category")
        X_test[col] = X_test[col].astype("category")

    model = XGBRegressor(
        importance_type="weight", n_estimators=300, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, tree_method="hist",
        enable_categorical=True, random_state=42
    )
    target_type = "regression"

model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,True


In [25]:
# import optuna
# import numpy as np
# from xgboost import XGBRegressor
# from sklearn.model_selection import cross_val_score

# def get_feature_ranking(model, X, importance_type="gain"):
#     """
#     Return feature ranking based on chosen importance type.
#     """
#     booster = model.get_booster()
#     importance_dict = booster.get_score(importance_type=importance_type)
    
#     # Map back to full feature list
#     importances = [importance_dict.get(f, 0) for f in X.columns]
    
#     # Sort descending
#     return np.argsort(importances)[::-1]


# def objective(trial):
#     # Suggest importance type
#     importance_type = trial.suggest_categorical("importance_type", ["gain", "cover", "weight"])
    
#     # Compute ranking based on this importance type
#     feature_ranking = get_feature_ranking(model, X_train, importance_type=importance_type)
    
#     # Suggest feature keep ratio
#     keep_ratio = trial.suggest_float("keep_ratio", 0.1, 1.0)
    
#     # Reduce training set
#     n_features = int(len(feature_ranking) * keep_ratio)
#     selected_idx = feature_ranking[:n_features]
#     X_train_reduced = X_train.iloc[:, selected_idx]
    
#     # Hyperparameters
#     params = {
#         "n_estimators": trial.suggest_int("n_estimators", 200, 600),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "subsample": trial.suggest_float("subsample", 0.6, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
#         "tree_method": "hist",
#         "enable_categorical": True,
#         "random_state": 42
#     }
    
#     # Train + cross-validate
#     model_tmp = XGBRegressor(**params)
#     score = cross_val_score(
#         model_tmp, X_train_reduced, y_train,
#         scoring="neg_root_mean_squared_error",
#         cv=3, n_jobs=-1
#     ).mean()
    
#     return -score  # minimize RMSE


# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=50)

# print("Best trial:")
# print(study.best_trial.params)

In [None]:
# hard coding best params from optuna study to avoid re-running
best_params = {
    "importance_type": "weight",
    "keep_ratio": 0.41948613536905166,
    "n_estimators": 293,  
    "learning_rate": 0.017147992725196598,
    "max_depth": 4,
    "subsample": 0.9288572447223334,
    "colsample_bytree": 0.6037678774243146
}

def get_feature_ranking(model, X, importance_type="gain"):
    booster = model.get_booster()
    importance_dict = booster.get_score(importance_type=importance_type)
    importances = [importance_dict.get(f, 0) for f in X.columns]
    return np.argsort(importances)[::-1]  # indices sorted by importance

importance_type = best_params["importance_type"]
keep_ratio = best_params["keep_ratio"]

# Rank features
feature_ranking = get_feature_ranking(model, X_train, importance_type=importance_type)

# Select top N features
n_features = int(len(feature_ranking) * keep_ratio)
selected_idx = feature_ranking[:n_features]
selected_features = X_train.columns[selected_idx]

# Reduce train & test sets
X_train_reduced = X_train[selected_features]
X_test_reduced = X_test[selected_features]

In [47]:
final_model = XGBRegressor(
    n_estimators=best_params["n_estimators"],
    learning_rate=best_params["learning_rate"],
    max_depth=best_params["max_depth"],
    subsample=best_params["subsample"],
    colsample_bytree=best_params["colsample_bytree"],
    tree_method="hist",
    enable_categorical=True,
    random_state=42
)

final_model.fit(X_train_reduced, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.6037678774243146
,device,
,early_stopping_rounds,
,enable_categorical,True


In [48]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

y_pred = final_model.predict(X_test_reduced)

# print("NaNs per column:")
# print(X_test_reduced.isna().sum())
# print("Total NaNs in DataFrame:", X_test_reduced.isna().sum().sum())

mask = ~y_test.isna()
y_test_clean, y_pred_clean = y_test[mask], y_pred[mask]

metrics = {
    "RMSE": root_mean_squared_error(y_test_clean, y_pred_clean),
    "MAE": mean_absolute_error(y_test_clean, y_pred_clean),
    "R2": r2_score(y_test_clean, y_pred_clean)
}

print(metrics)

{'RMSE': 9.861906715403139, 'MAE': 8.167649358702187, 'R2': 0.19765606773698474}


In [49]:
# # Save reduced training features
# X_train_reduced.to_csv("X_train_reduced.csv", index=False)

# # Save reduced test features
# X_test_reduced.to_csv("X_test_reduced.csv", index=False)

In [50]:
# Replace with next week's race_id
next_race_id = 5579

# Filter the dataset for that race
X_next_race = df[df["race_id"] == next_race_id].copy()

X_next_race = X_next_race[selected_features]

for col in X_next_race.select_dtypes(include=["object"]).columns:
    X_next_race[col] = X_next_race[col].astype("category")

# Regression model predicts numeric finishing positions
y_next_race_pred = final_model.predict(X_next_race)

predictions_df = df[df["race_id"] == next_race_id][["driver_fullname"]].copy()
predictions_df["predicted_finish"] = y_next_race_pred

# Sort by predicted finishing position
predictions_df = predictions_df.sort_values("predicted_finish")
predictions_df.reset_index(drop=True, inplace=True)

print(predictions_df)



        driver_fullname  predicted_finish
0          Denny Hamlin         14.731231
1         Chase Elliott         15.579134
2            Kyle Busch         15.906544
3              Ty Gibbs         16.486979
4         Bubba Wallace         16.716114
5           Joey Logano         16.763393
6      Michael McDowell         17.761154
7            Josh Berry         18.004002
8        Carson Hocevar         18.122498
9         Tyler Reddick         18.382959
10     Christopher Bell         18.592329
11      Brad Keselowski         18.631733
12          Kyle Larson         18.708725
13          Alex Bowman         18.738304
14            Ty Dillon         19.143358
15        Ross Chastain         19.640045
16   Ricky Stenhouse Jr         19.857056
17             JJ Yeley         19.918329
18          Cole Custer         20.211735
19           Erik Jones         20.505308
20         Noah Gragson         20.511463
21      AJ Allmendinger         20.544743
22       Chris Buescher         20