# NASCAR Race Winner Prediction: Monte Carlo Simulation
## Notebook 1: Data Preprocessing

In [4]:
import pandas as pd

### Race 1: Pocono 06/22/2025

In [5]:
# Raw inputs are read directly from the project's GitHub repository.

# Per-driver statistics at Pocono.
pocono_stats = pd.read_csv(
    "https://raw.githubusercontent.com/jake-lukasik/NASCAR-Race-Predictions/refs/heads/main/Data/Pocono-2025/driver-stats-pocono.txt"
)

# Roster of 2025 Cup Series full-time drivers.
cup_roster = pd.read_csv(
    "https://raw.githubusercontent.com/jake-lukasik/NASCAR-Race-Predictions/refs/heads/main/Data/Master-Data/2025-cup-series-full-timers.txt"
)

# Per-driver season statistics.
season_stats = pd.read_csv(
    "https://raw.githubusercontent.com/jake-lukasik/NASCAR-Race-Predictions/refs/heads/main/Data/Master-Data/season-stats.txt"
)

In [6]:
# Drop the season columns that are not carried forward. errors="ignore" keeps
# this cell re-runnable: on a second execution the columns are already gone and
# a plain drop would raise KeyError.
season_stats = season_stats.drop(
    columns=["NASCAR Points", "Top 10's"],
    errors="ignore",
)

# Prefix every season metric with "szn_" so it stays distinguishable from the
# track-level column of the same name once both frames are merged on "Driver".
# Columns that already carry the prefix are skipped, so re-running the cell
# does not produce "szn_szn_..." names.
season_stats = season_stats.rename(
    columns={
        col: f"szn_{col}"
        for col in season_stats.columns
        if col != "Driver" and not str(col).startswith("szn_")
    }
)

season_stats

Unnamed: 0,Driver,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating
0,William Byron,10.4,16,1,7,769,106.7
1,Kyle Larson,12.8,16,3,9,851,96.0
2,Christopher Bell,10.6,16,3,7,153,94.2
3,Chase Elliott,11.2,16,0,4,95,85.5
4,Denny Hamlin,13.5,15,3,7,455,94.3
5,Tyler Reddick,14.5,16,0,3,94,90.6
6,Ryan Blaney,17.4,16,1,6,346,90.6
7,Ross Chastain,12.6,16,1,3,58,80.3
8,Joey Logano,16.9,16,1,2,302,84.5
9,Bubba Wallace,18.3,16,0,3,103,82.4


In [7]:
# Keep only the rows whose driver also appears in the Cup roster frame.
pocono_stats = pocono_stats.loc[
    lambda stats: stats['Driver'].isin(cup_roster['Driver'])
]
pocono_stats

Unnamed: 0,Rank,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Top 20's,Laps Led,Avg Start,Best Finish,Low Finish,DNF,Avg Rating
0,1,William Byron,9.4,11,0,3,6,11,130,13.6,3,18,0,96.7
1,2,Brad Keselowski,10.8,27,1,11,16,23,247,12.0,1,38,1,94.4
2,3,Denny Hamlin,11.3,35,7,16,23,28,858,7.5,1,43,3,108.7
3,4,Kyle Larson,11.6,17,0,5,9,14,158,12.9,2,33,0,95.8
5,6,Ryan Blaney,13.4,15,2,3,7,11,95,13.6,1,33,1,89.6
6,7,Chase Elliott,13.5,15,1,4,10,11,67,14.9,1,38,1,93.9
7,8,Erik Jones,13.5,13,0,5,8,9,56,18.6,2,38,1,89.3
8,9,Tyler Reddick,13.6,7,0,2,4,5,15,14.4,2,35,0,81.9
11,12,Ty Gibbs,16.0,3,0,1,1,2,25,7.3,5,27,1,78.6
12,13,Kyle Busch,16.3,37,4,11,18,22,585,11.8,1,43,7,95.6


#### Important note:

**Missing Cole Custer, Shane van Gisbergen (SVG), and Riley Herbst.** This Monte Carlo simulation depends mainly on track-specific statistics, and these three drivers are absent from the Pocono data, so they will not be included in the full simulation.

In [8]:
# Drop the track columns that are not carried forward. The result is assigned
# back instead of using inplace=True: pocono_stats comes from a boolean filter
# of the raw frame, and mutating such a selection in place can make pandas emit
# SettingWithCopyWarning. errors="ignore" keeps the cell re-runnable once the
# columns have already been removed.
pocono_stats = pocono_stats.drop(
    columns=["Rank", "Top 20's", "Avg Start", "Best Finish", "Low Finish"],
    errors="ignore",
)
pocono_stats

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating
0,William Byron,9.4,11,0,3,6,130,0,96.7
1,Brad Keselowski,10.8,27,1,11,16,247,1,94.4
2,Denny Hamlin,11.3,35,7,16,23,858,3,108.7
3,Kyle Larson,11.6,17,0,5,9,158,0,95.8
5,Ryan Blaney,13.4,15,2,3,7,95,1,89.6
6,Chase Elliott,13.5,15,1,4,10,67,1,93.9
7,Erik Jones,13.5,13,0,5,8,56,1,89.3
8,Tyler Reddick,13.6,7,0,2,4,15,0,81.9
11,Ty Gibbs,16.0,3,0,1,1,25,1,78.6
12,Kyle Busch,16.3,37,4,11,18,585,7,95.6


Now we can begin feature engineering to prepare the data for the Monte Carlo simulation:

In [9]:
# Helper functions, we will apply these later:

def estimate_std(avg_finish):
    """Estimate the standard deviation of a driver's finishing position.

    The standard deviation of finishes is not easily available, so it is
    approximated from the driver's average finish: a better (lower) average
    finish is taken as a sign of a more consistent driver and is given a
    smaller spread.

    Args:
        avg_finish: The driver's average finishing position.

    Returns:
        2.5 when avg_finish <= 12, 3.5 when avg_finish <= 18, otherwise 4.5.
    """
    # (upper bound on average finish, estimated spread), best tier first.
    tiers = ((12, 2.5), (18, 3.5))
    for ceiling, spread in tiers:
        if avg_finish <= ceiling:
            return spread
    # Anything that matched no tier gets the widest spread.
    return 4.5
    
def dnf_prob(dnf_count, race_count):
    """Estimate the probability that a driver fails to finish (DNF) a race.

    The raw rate ``dnf_count / race_count`` is adjusted by hand so the value
    stays realistic for small samples:

    * no races on record (``race_count <= 0``): 0.15, the same prior used for
      a clean record over fewer than 5 races, instead of dividing by zero;
    * zero DNFs over 5 or more races: 0.03;
    * zero DNFs over fewer than 5 races: 0.15;
    * a DNF in every race: 0.4;
    * anything else: the raw rate, capped at 0.35.

    Note that a rate of exactly 1 yields 0.4 while rates just below 1 are
    capped at 0.35; this keeps the original hand-tuned values.

    Args:
        dnf_count: Number of races the driver did not finish.
        race_count: Number of races the driver started.

    Returns:
        The adjusted DNF probability as a float.
    """
    # Without any starts there is no rate to compute; fall back to the
    # small-sample prior rather than raising ZeroDivisionError / returning NaN.
    if race_count <= 0:
        return 0.15

    dnf_probability = dnf_count / race_count

    if dnf_probability == 0:
        # A clean record is only trusted when it spans at least 5 races.
        return 0.03 if race_count >= 5 else 0.15
    if dnf_probability == 1:
        return 0.4  # cap max failure
    return min(dnf_probability, 0.35)  # avoid extreme DNF probabilities
    
def laps_led_per_race(laps_led, races):
    """Return the average number of laps led per race.

    Args:
        laps_led: Total laps led.
        races: Number of races over which those laps were led.

    Returns:
        ``laps_led`` divided by ``races`` (true division).
    """
    average_led = laps_led / races
    return average_led

In [10]:
# Inner join on "Driver": only drivers present in both the track frame and the
# season frame are kept.
pocono_stats_engineered = pocono_stats.merge(
    season_stats,
    how='inner',
    on='Driver',
)
pocono_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating
0,William Byron,9.4,11,0,3,6,130,0,96.7,10.4,16,1,7,769,106.7
1,Brad Keselowski,10.8,27,1,11,16,247,1,94.4,24.3,16,0,1,8,62.9
2,Denny Hamlin,11.3,35,7,16,23,858,3,108.7,13.5,15,3,7,455,94.3
3,Kyle Larson,11.6,17,0,5,9,158,0,95.8,12.8,16,3,9,851,96.0
4,Ryan Blaney,13.4,15,2,3,7,95,1,89.6,17.4,16,1,6,346,90.6


In [11]:
# Pair each driver's DNF count with their race count and convert the pair into
# an adjusted DNF probability.
pocono_stats_engineered['DNF_Prob'] = [
    dnf_prob(dnfs, starts)
    for dnfs, starts in zip(
        pocono_stats_engineered['DNF'],
        pocono_stats_engineered['Races'],
    )
]
pocono_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating,DNF_Prob
0,William Byron,9.4,11,0,3,6,130,0,96.7,10.4,16,1,7,769,106.7,0.03
1,Brad Keselowski,10.8,27,1,11,16,247,1,94.4,24.3,16,0,1,8,62.9,0.037037
2,Denny Hamlin,11.3,35,7,16,23,858,3,108.7,13.5,15,3,7,455,94.3,0.085714
3,Kyle Larson,11.6,17,0,5,9,158,0,95.8,12.8,16,3,9,851,96.0,0.03
4,Ryan Blaney,13.4,15,2,3,7,95,1,89.6,17.4,16,1,6,346,90.6,0.066667


In [12]:
# Build the track-level and season-level "laps led per race" features with the
# same helper; each entry is (new column, laps-led column, race-count column).
for rate_col, laps_col, races_col in (
    ('track_Laps Led Per Race', 'Laps Led', 'Races'),
    ('szn_Laps Led Per Race', 'szn_Laps Led', 'szn_Races'),
):
    pocono_stats_engineered[rate_col] = pocono_stats_engineered.apply(
        lambda row: laps_led_per_race(row[laps_col], row[races_col]),
        axis=1,
    )

In [13]:
# The estimated spread depends only on the track average finish, so map the
# helper over that single column.
track_avg_finish = pocono_stats_engineered['Avg Finish']
pocono_stats_engineered['Std Dev'] = track_avg_finish.map(estimate_std)
pocono_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating,DNF_Prob,track_Laps Led Per Race,szn_Laps Led Per Race,Std Dev
0,William Byron,9.4,11,0,3,6,130,0,96.7,10.4,16,1,7,769,106.7,0.03,11.818182,48.0625,2.5
1,Brad Keselowski,10.8,27,1,11,16,247,1,94.4,24.3,16,0,1,8,62.9,0.037037,9.148148,0.5,2.5
2,Denny Hamlin,11.3,35,7,16,23,858,3,108.7,13.5,15,3,7,455,94.3,0.085714,24.514286,30.333333,2.5
3,Kyle Larson,11.6,17,0,5,9,158,0,95.8,12.8,16,3,9,851,96.0,0.03,9.294118,53.1875,2.5
4,Ryan Blaney,13.4,15,2,3,7,95,1,89.6,17.4,16,1,6,346,90.6,0.066667,6.333333,21.625,3.5


In [14]:
# Drop the columns that are not carried forward; the race and laps-led counts
# have already been used to build DNF_Prob and the per-race lap rates.
# The result is assigned back rather than dropped with inplace=True, and
# errors="ignore" lets the cell be re-run after the columns are already gone.
pocono_stats_engineered = pocono_stats_engineered.drop(
    columns=["Races", "Top 10's", "Laps Led", "szn_Races", "szn_Laps Led"],
    errors="ignore",
)
pocono_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Wins,Top 5's,DNF,Avg Rating,szn_Avg Finish,szn_Wins,szn_Top 5's,szn_Avg Rating,DNF_Prob,track_Laps Led Per Race,szn_Laps Led Per Race,Std Dev
0,William Byron,9.4,0,3,0,96.7,10.4,1,7,106.7,0.03,11.818182,48.0625,2.5
1,Brad Keselowski,10.8,1,11,1,94.4,24.3,0,1,62.9,0.037037,9.148148,0.5,2.5
2,Denny Hamlin,11.3,7,16,3,108.7,13.5,3,7,94.3,0.085714,24.514286,30.333333,2.5
3,Kyle Larson,11.6,0,5,0,95.8,12.8,3,9,96.0,0.03,9.294118,53.1875,2.5
4,Ryan Blaney,13.4,2,3,1,89.6,17.4,1,6,90.6,0.066667,6.333333,21.625,3.5


In [15]:
# The input files can be read straight from GitHub, but writing this frame back would require GitHub API requests,
# so the export below is done locally instead. This is all behind the scenes anyway :)
# pocono_stats_engineered.to_csv("C:/Users/jakel/OneDrive/Desktop/GitHub/NASCAR-Race-Predictions/Data/Pocono-2025/pocono-sim-ready-data.csv")