# NASCAR Race Winner Prediction: Monte Carlo Simulation
## Notebook 1: Data Preprocessing

In [1]:
import os
import sys

# Must be set before the local import below, otherwise importing
# data_pulling has already happened by the time the flag takes effect.
sys.dont_write_bytecode = True

import pandas as pd
from data_pulling import *

In this notebook, I use web scraping to extract the data.

In [2]:
# Roster file hosted in the project's GitHub repo; later cells use its
# 'Driver' column to filter the track stats.
roster_url = "https://raw.githubusercontent.com/jake-lukasik/NASCAR-Race-Predictions/refs/heads/main/Data/Master-Data/2025-cup-series-full-timers.txt"
cup_roster = pd.read_csv(roster_url)

### Race 4: Iowa (oval) 08/03/2025

Step 1: Pull the Iowa stats from driveraverages.com. I left plenty of comments so the code is easy to follow.

In [3]:
# Per-driver track averages for the Iowa page (trk_id=55) on driveraverages.com.
# driver_in_first is presumably the driver expected in the first row of the
# wanted table -- confirm against data_pulling.
iowa_track_url = "https://www.driveraverages.com/nascar/track_avg.php?trk_id=55"
iowa_stats = pull_data_from_driver_averages(
    url=iowa_track_url,
    driver_in_first="Ryan Blaney",
)
iowa_stats.head()

Found 6 tables.


Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Top 20's,Laps Led,Avg Start,Best Finish,Low Finish,DNF,Avg Rating
0,Ryan Blaney,1.0,1,1,1,1,1,201,2.0,1,1,0,146.0
1,William Byron,2.0,1,0,1,1,1,0,4.0,2,2,0,120.0
2,Chase Elliott,3.0,1,0,1,1,1,1,9.0,3,3,0,116.6
3,Christopher Bell,4.0,1,0,1,1,1,7,10.0,4,4,0,97.2
4,Ricky Stenhouse Jr.,5.0,1,0,1,1,1,4,35.0,5,5,0,79.6


In [4]:
# Season-level stats; the helper returns columns prefixed with "szn_".
# driver_in_first is presumably the driver expected in the first row of the
# wanted table -- confirm against data_pulling.
first_listed_driver = "Chase Elliott"
season_stats = pull_season_stats(driver_in_first=first_listed_driver)
season_stats.head()

Unnamed: 0,Driver,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating
0,Chase Elliott,10.1,22,1,7,374,89.8
1,William Byron,14.8,22,1,7,769,98.6
2,Kyle Larson,12.8,22,3,11,873,93.6
3,Denny Hamlin,12.5,21,4,11,555,92.2
4,Christopher Bell,12.4,22,3,8,221,89.7


In [5]:
# Keep only the drivers whose name appears in the roster's 'Driver' column.
is_on_roster = iowa_stats['Driver'].isin(cup_roster['Driver'])
iowa_stats = iowa_stats.loc[is_on_roster]
iowa_stats.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Top 20's,Laps Led,Avg Start,Best Finish,Low Finish,DNF,Avg Rating
0,Ryan Blaney,1.0,1,1,1,1,1,201,2.0,1,1,0,146.0
1,William Byron,2.0,1,0,1,1,1,0,4.0,2,2,0,120.0
2,Chase Elliott,3.0,1,0,1,1,1,1,9.0,3,3,0,116.6
3,Christopher Bell,4.0,1,0,1,1,1,7,10.0,4,4,0,97.2
4,Ricky Stenhouse Jr.,5.0,1,0,1,1,1,4,35.0,5,5,0,79.6


In [6]:
# Track columns that are not used by any later cell in this notebook.
unused_track_columns = ["Top 20's", "Avg Start", "Best Finish", "Low Finish"]
iowa_stats = iowa_stats.drop(columns=unused_track_columns)
iowa_stats.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating
0,Ryan Blaney,1.0,1,1,1,1,201,0,146.0
1,William Byron,2.0,1,0,1,1,0,0,120.0
2,Chase Elliott,3.0,1,0,1,1,1,0,116.6
3,Christopher Bell,4.0,1,0,1,1,7,0,97.2
4,Ricky Stenhouse Jr.,5.0,1,0,1,1,4,0,79.6


Now we can begin feature engineering to prepare the data for the Monte Carlo simulation:

In [7]:
# Helper functions, we will apply these later:

def estimate_std(avg_finish):
    """Estimate the standard deviation of a driver's finishing position.

    The real spread of finishes is not easily available, so it is
    approximated from the average finish: a better (lower) average finish is
    taken as a sign of a more consistent driver and gets a smaller value.

    Args:
        avg_finish: The driver's average finishing position.

    Returns:
        2.5 for an average finish of 12 or better, 3.5 for an average finish
        up to 18, and 4.5 otherwise.
    """
    # (inclusive upper bound on average finish, estimated std dev), checked in order
    tiers = ((12, 2.5), (18, 3.5))
    for upper_bound, std_dev in tiers:
        if avg_finish <= upper_bound:
            return std_dev
    return 4.5
    
def dnf_prob(dnf_count, race_count):
    """Estimate the probability that a driver fails to finish (DNF) a race.

    Starts from the raw rate ``dnf_count / race_count`` and applies manual
    adjustments so that small samples do not produce unrealistic values:

    * no recorded races      -> 0.15 (no information; same prior as a short
      clean record)
    * no DNFs, 5+ races      -> 0.03
    * no DNFs, fewer than 5  -> 0.15
    * a DNF in every race    -> 0.4
    * anything else          -> the raw rate, capped at 0.35

    Args:
        dnf_count: Number of DNFs recorded for the driver.
        race_count: Number of races those DNFs were recorded over.

    Returns:
        The adjusted DNF probability as a float.
    """
    # Without this guard a zero race count divides by zero: ZeroDivisionError
    # for Python ints, a silent NaN for NumPy integers.
    if race_count <= 0:
        return 0.15

    dnf_probability = dnf_count / race_count

    if dnf_probability == 0:
        if race_count >= 5:
            return 0.03
        else:
            return 0.15
    elif dnf_probability == 1:
        return 0.4  # cap max failure
    else:
        return min(dnf_probability, 0.35)  # avoid extreme DNF probabilities
    
def laps_led_per_race(laps_led, races):
    """Return the average number of laps led per race.

    Args:
        laps_led: Total laps led.
        races: Number of races those laps were led over.

    Returns:
        ``laps_led / races``, or 0.0 when ``races`` is 0 so that a driver
        with no recorded races does not trigger a division by zero.
    """
    if races == 0:
        return 0.0
    return laps_led / races

In [8]:
# Attach the season ("szn_") columns to each driver's Iowa row. The inner
# join keeps only drivers present in both tables.
iowa_stats_engineered = iowa_stats.merge(season_stats, how='inner', on='Driver')
iowa_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating
0,Ryan Blaney,1.0,1,1,1,1,201,0,146.0,17.5,22,1,7,366,87.8
1,William Byron,2.0,1,0,1,1,0,0,120.0,14.8,22,1,7,769,98.6
2,Chase Elliott,3.0,1,0,1,1,1,0,116.6,10.1,22,1,7,374,89.8
3,Christopher Bell,4.0,1,0,1,1,7,0,97.2,12.4,22,3,8,221,89.7
4,Ricky Stenhouse Jr.,5.0,1,0,1,1,4,0,79.6,21.0,22,0,1,3,55.4


In [9]:
# One dnf_prob value per driver, computed from the track-level 'DNF' and
# 'Races' columns.
iowa_stats_engineered['DNF_Prob'] = [
    dnf_prob(dnf_total, race_total)
    for dnf_total, race_total in zip(iowa_stats_engineered['DNF'], iowa_stats_engineered['Races'])
]
iowa_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating,DNF_Prob
0,Ryan Blaney,1.0,1,1,1,1,201,0,146.0,17.5,22,1,7,366,87.8,0.15
1,William Byron,2.0,1,0,1,1,0,0,120.0,14.8,22,1,7,769,98.6,0.15
2,Chase Elliott,3.0,1,0,1,1,1,0,116.6,10.1,22,1,7,374,89.8,0.15
3,Christopher Bell,4.0,1,0,1,1,7,0,97.2,12.4,22,3,8,221,89.7,0.15
4,Ricky Stenhouse Jr.,5.0,1,0,1,1,4,0,79.6,21.0,22,0,1,3,55.4,0.15


In [10]:
# Laps led per race, once from the track-level columns and once from the
# season-level ("szn_") columns.
for rate_col, laps_col, races_col in (
    ('track_Laps Led Per Race', 'Laps Led', 'Races'),
    ('szn_Laps Led Per Race', 'szn_Laps Led', 'szn_Races'),
):
    iowa_stats_engineered[rate_col] = [
        laps_led_per_race(laps, race_total)
        for laps, race_total in zip(iowa_stats_engineered[laps_col], iowa_stats_engineered[races_col])
    ]

In [11]:
# Estimated spread of finishes, derived from the track-level average finish.
iowa_stats_engineered['Std Dev'] = iowa_stats_engineered['Avg Finish'].map(estimate_std)
iowa_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating,DNF_Prob,track_Laps Led Per Race,szn_Laps Led Per Race,Std Dev
0,Ryan Blaney,1.0,1,1,1,1,201,0,146.0,17.5,22,1,7,366,87.8,0.15,201.0,16.636364,2.5
1,William Byron,2.0,1,0,1,1,0,0,120.0,14.8,22,1,7,769,98.6,0.15,0.0,34.954545,2.5
2,Chase Elliott,3.0,1,0,1,1,1,0,116.6,10.1,22,1,7,374,89.8,0.15,1.0,17.0,2.5
3,Christopher Bell,4.0,1,0,1,1,7,0,97.2,12.4,22,3,8,221,89.7,0.15,7.0,10.045455,2.5
4,Ricky Stenhouse Jr.,5.0,1,0,1,1,4,0,79.6,21.0,22,0,1,3,55.4,0.15,4.0,0.136364,2.5


In [12]:
# Raw counts are dropped now that the per-race rates and DNF probability
# derived from them are in place.
raw_count_columns = ["Races", "Top 10's", "Laps Led", "szn_Races", "szn_Laps Led"]
iowa_stats_engineered = iowa_stats_engineered.drop(columns=raw_count_columns)
iowa_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Wins,Top 5's,DNF,Avg Rating,szn_Avg Finish,szn_Wins,szn_Top 5's,szn_Avg Rating,DNF_Prob,track_Laps Led Per Race,szn_Laps Led Per Race,Std Dev
0,Ryan Blaney,1.0,1,1,0,146.0,17.5,1,7,87.8,0.15,201.0,16.636364,2.5
1,William Byron,2.0,0,1,0,120.0,14.8,1,7,98.6,0.15,0.0,34.954545,2.5
2,Chase Elliott,3.0,0,1,0,116.6,10.1,1,7,89.8,0.15,1.0,17.0,2.5
3,Christopher Bell,4.0,0,1,0,97.2,12.4,3,8,89.7,0.15,7.0,10.045455,2.5
4,Ricky Stenhouse Jr.,5.0,0,1,0,79.6,21.0,0,1,55.4,0.15,4.0,0.136364,2.5


In [13]:
# Write the simulation-ready table to the local clone of the repo. The input
# files are read straight from GitHub, but the output is written locally
# rather than pushed through the GitHub API.
#
# The data directory defaults to the original author's local path; set the
# NASCAR_DATA_DIR environment variable to write somewhere else without
# editing this cell.
data_dir = os.environ.get(
    "NASCAR_DATA_DIR",
    "/Users/jakelukasik/Desktop/GitHub/NASCAR-Race-Predictions/Data",
)
iowa_stats_engineered.to_csv(os.path.join(data_dir, "Iowa-2025", "iowa-sim-ready-data.csv"))