# NASCAR Race Winner Prediction: Monte Carlo Simulation
## Notebook 1: Data Preprocessing

In [14]:
import sys

# Set this BEFORE importing the local data_pulling module: the flag is consulted
# at import time, so setting it afterwards does not stop that module's bytecode
# cache from being written.
sys.dont_write_bytecode = True

import pandas as pd

# NOTE(review): the star import hides which names this notebook takes from
# data_pulling (presumably pull_data_from_driver_averages and pull_season_stats,
# which the cells below call) — consider importing them explicitly.
from data_pulling import *

In this notebook, I use web scraping to extract the data.

In [15]:
# Load the 2025 Cup Series full-time driver list from the project's GitHub repo.
# Its 'Driver' column is used further down to filter the track stats.
cup_roster = pd.read_csv("https://raw.githubusercontent.com/jake-lukasik/NASCAR-Race-Predictions/refs/heads/main/Data/Master-Data/2025-cup-series-full-timers.txt")

### Race 3: Indianapolis (oval) 07/27/2025

Step 1: Pull the Indy stats from driveraverages.com. I left plenty of comments so that this is easy to follow.

In [16]:
# Pull the driveraverages.com track-average page (trk_id=9) into a DataFrame.
# NOTE(review): driver_in_first presumably tells the helper which of the tables
# found on the page to keep (the one whose first row is this driver) — confirm
# against data_pulling.
indy_stats = pull_data_from_driver_averages(url="https://www.driveraverages.com/nascar/track_avg.php?trk_id=9",
                                            driver_in_first="Tyler Reddick")
# Preview the first rows.
indy_stats.head()

Found 6 tables.


Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Top 20's,Laps Led,Avg Start,Best Finish,Low Finish,DNF,Avg Rating
0,Tyler Reddick,5.0,2,0,1,2,2,40,7.0,2,8,0,101.9
1,Cole Custer,5.0,1,0,1,1,1,0,30.0,5,5,0,92.3
2,Todd Gilliland,6.0,1,0,0,1,1,1,24.0,6,6,0,78.4
3,Austin Cindric,7.0,1,0,0,1,1,0,38.0,7,7,0,60.3
4,Christopher Bell,8.0,2,0,1,1,2,2,26.5,4,12,0,92.7


In [17]:
# Pull the season-level stats; the resulting columns carry an 'szn_' prefix.
# NOTE(review): driver_in_first presumably identifies the table to keep by its
# first-row driver — confirm against data_pulling.
season_stats = pull_season_stats(driver_in_first="Chase Elliott")
# Preview the first rows.
season_stats.head()

Unnamed: 0,Driver,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating
0,Chase Elliott,10.0,21,1,7,374,90.9
1,William Byron,14.7,21,1,7,769,98.0
2,Kyle Larson,13.3,21,3,10,854,92.6
3,Denny Hamlin,13.0,20,4,10,554,92.2
4,Tyler Reddick,13.8,21,0,5,112,91.0


In [18]:
# Keep only the rows whose driver appears in the full-time roster.
is_full_timer = indy_stats['Driver'].isin(cup_roster['Driver'])
indy_stats = indy_stats.loc[is_full_timer]
indy_stats.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Top 20's,Laps Led,Avg Start,Best Finish,Low Finish,DNF,Avg Rating
0,Tyler Reddick,5.0,2,0,1,2,2,40,7.0,2,8,0,101.9
1,Cole Custer,5.0,1,0,1,1,1,0,30.0,5,5,0,92.3
2,Todd Gilliland,6.0,1,0,0,1,1,1,24.0,6,6,0,78.4
3,Austin Cindric,7.0,1,0,0,1,1,0,38.0,7,7,0,60.3
4,Christopher Bell,8.0,2,0,1,1,2,2,26.5,4,12,0,92.7


In [19]:
# Remove the track columns that are not used by the later feature engineering.
# errors="ignore" keeps this cell re-runnable: indy_stats is rebound here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
# NOTE(review): this also means a renamed upstream column is silently left in place.
indy_stats = indy_stats.drop(
    columns=["Top 20's", "Avg Start", "Best Finish", "Low Finish"],
    errors="ignore",
)
indy_stats.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating
0,Tyler Reddick,5.0,2,0,1,2,40,0,101.9
1,Cole Custer,5.0,1,0,1,1,0,0,92.3
2,Todd Gilliland,6.0,1,0,0,1,1,0,78.4
3,Austin Cindric,7.0,1,0,0,1,0,0,60.3
4,Christopher Bell,8.0,2,0,1,1,2,0,92.7


Now we can begin feature engineering to prepare the data for the Monte Carlo simulation model:

In [20]:
# Helper functions, we will apply these later:

def estimate_std(avg_finish):
    """Estimate the standard deviation of a driver's finishing position.

    The real spread of finishes is not readily available, so it is approximated
    from the average finish: a better (lower) average finish is taken as a sign
    of a more consistent driver and is given a smaller spread.

    Parameters:
        avg_finish: the driver's average finishing position.

    Returns:
        2.5 when avg_finish <= 12, 3.5 when avg_finish <= 18, otherwise 4.5.
    """
    # (upper bound on average finish, estimated spread), checked in order.
    tiers = ((12, 2.5), (18, 3.5))
    for upper_bound, spread in tiers:
        if avg_finish <= upper_bound:
            return spread
    # Anything worse than the last tier gets the widest spread.
    return 4.5
    
def dnf_prob(dnf_count, race_count):
    """Estimate the probability that a driver fails to finish (DNF).

    The raw DNF rate (dnf_count / race_count) is adjusted by hand so that small
    samples do not produce unrealistic probabilities of exactly 0 or 1.

    Parameters:
        dnf_count: number of DNFs recorded for the driver.
        race_count: number of races those DNFs were recorded over.

    Returns:
        0.15 when race_count is not positive (no history to divide by),
        0.03 when there are no DNFs over at least 5 races,
        0.15 when there are no DNFs over fewer than 5 races,
        0.4 when every race ended in a DNF,
        otherwise the raw DNF rate capped at 0.35.
    """
    # No races means no rate can be computed; fall back to the same default
    # used for a clean record over a small sample instead of dividing by zero.
    if race_count <= 0:
        return 0.15

    dnf_rate = dnf_count / race_count

    if dnf_rate == 0:
        # A clean record is trusted more when it spans more races.
        return 0.03 if race_count >= 5 else 0.15
    if dnf_rate == 1:
        return 0.4  # cap the maximum failure probability
    return min(dnf_rate, 0.35)  # avoid extreme DNF probabilities
    
def laps_led_per_race(laps_led, races):
    """Return the average number of laps led per race.

    Parameters:
        laps_led: total laps led.
        races: number of races the total was accumulated over.

    Returns:
        laps_led / races, or 0.0 when races is 0 (nothing to average over).
    """
    # Guard the division so a driver with no races does not raise or yield NaN/inf.
    if races == 0:
        return 0.0
    return laps_led / races

In [21]:
# Join the track stats with the season stats on driver name. The inner join keeps
# only the drivers that appear in both tables.
indy_stats_engineered = indy_stats.merge(
    season_stats,
    how='inner',
    on='Driver',
)
indy_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating
0,Tyler Reddick,5.0,2,0,1,2,40,0,101.9,13.8,21,0,5,112,91.0
1,Cole Custer,5.0,1,0,1,1,0,0,92.3,24.6,21,0,0,2,47.8
2,Todd Gilliland,6.0,1,0,0,1,1,0,78.4,22.0,21,0,0,11,55.1
3,Austin Cindric,7.0,1,0,0,1,0,0,60.3,20.3,21,1,1,251,77.6
4,Christopher Bell,8.0,2,0,1,1,2,0,92.7,12.6,21,3,8,221,89.8


In [22]:
# Row by row, turn the track DNF count and race count into a DNF probability.
dnf_prob_by_driver = indy_stats_engineered.apply(
    lambda driver_row: dnf_prob(driver_row['DNF'], driver_row['Races']),
    axis=1,
)
indy_stats_engineered['DNF_Prob'] = dnf_prob_by_driver
indy_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating,DNF_Prob
0,Tyler Reddick,5.0,2,0,1,2,40,0,101.9,13.8,21,0,5,112,91.0,0.15
1,Cole Custer,5.0,1,0,1,1,0,0,92.3,24.6,21,0,0,2,47.8,0.15
2,Todd Gilliland,6.0,1,0,0,1,1,0,78.4,22.0,21,0,0,11,55.1,0.15
3,Austin Cindric,7.0,1,0,0,1,0,0,60.3,20.3,21,1,1,251,77.6,0.15
4,Christopher Bell,8.0,2,0,1,1,2,0,92.7,12.6,21,3,8,221,89.8,0.15


In [23]:
# Laps led per race, computed row by row: first from the track columns, then
# from the season ('szn_') columns.
track_laps_led_rate = indy_stats_engineered.apply(
    lambda driver_row: laps_led_per_race(driver_row['Laps Led'], driver_row['Races']),
    axis=1,
)
indy_stats_engineered['track_Laps Led Per Race'] = track_laps_led_rate

season_laps_led_rate = indy_stats_engineered.apply(
    lambda driver_row: laps_led_per_race(driver_row['szn_Laps Led'], driver_row['szn_Races']),
    axis=1,
)
indy_stats_engineered['szn_Laps Led Per Race'] = season_laps_led_rate

In [24]:
# Estimate each driver's finish spread from their track average finish.
std_dev_by_driver = indy_stats_engineered.apply(
    lambda driver_row: estimate_std(driver_row['Avg Finish']),
    axis=1,
)
indy_stats_engineered['Std Dev'] = std_dev_by_driver
indy_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating,DNF_Prob,track_Laps Led Per Race,szn_Laps Led Per Race,Std Dev
0,Tyler Reddick,5.0,2,0,1,2,40,0,101.9,13.8,21,0,5,112,91.0,0.15,20.0,5.333333,2.5
1,Cole Custer,5.0,1,0,1,1,0,0,92.3,24.6,21,0,0,2,47.8,0.15,0.0,0.095238,2.5
2,Todd Gilliland,6.0,1,0,0,1,1,0,78.4,22.0,21,0,0,11,55.1,0.15,1.0,0.52381,2.5
3,Austin Cindric,7.0,1,0,0,1,0,0,60.3,20.3,21,1,1,251,77.6,0.15,0.0,11.952381,2.5
4,Christopher Bell,8.0,2,0,1,1,2,0,92.7,12.6,21,3,8,221,89.8,0.15,1.0,10.52381,2.5


In [25]:
# Remove the raw count columns now that the per-race and probability features
# derived from them exist.
# errors="ignore" keeps this cell re-runnable: indy_stats_engineered is rebound
# here, so on a second run the columns are already gone and a plain drop would
# raise KeyError.
indy_stats_engineered = indy_stats_engineered.drop(
    columns=["Races", "Top 10's", "Laps Led", "szn_Races", "szn_Laps Led"],
    errors="ignore",
)
indy_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Wins,Top 5's,DNF,Avg Rating,szn_Avg Finish,szn_Wins,szn_Top 5's,szn_Avg Rating,DNF_Prob,track_Laps Led Per Race,szn_Laps Led Per Race,Std Dev
0,Tyler Reddick,5.0,0,1,0,101.9,13.8,0,5,91.0,0.15,20.0,5.333333,2.5
1,Cole Custer,5.0,0,1,0,92.3,24.6,0,0,47.8,0.15,0.0,0.095238,2.5
2,Todd Gilliland,6.0,0,0,0,78.4,22.0,0,0,55.1,0.15,1.0,0.52381,2.5
3,Austin Cindric,7.0,0,0,0,60.3,20.3,1,1,77.6,0.15,0.0,11.952381,2.5
4,Christopher Bell,8.0,0,1,0,92.7,12.6,3,8,89.8,0.15,1.0,10.52381,2.5


In [None]:
# The input files can be read from GitHub. Writing this output back through API requests
# is not implemented here; this export step happens behind the scenes anyway :)
#indy_stats_engineered.to_csv("C:/Users/jakel/OneDrive/Desktop/GitHub/NASCAR-Race-Predictions/Data/Indy-2025/indy-sim-ready-data.csv")