# NASCAR Race Winner Prediction: Monte Carlo Simulation
## Notebook 1: Data Preprocessing

In [136]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

For this notebook, I will be using web scraping to extract the data.

In [137]:
# Roster of 2025 Cup Series full-timers (per the file name), read straight from the project's GitHub repo.
roster_url = "https://raw.githubusercontent.com/jake-lukasik/NASCAR-Race-Predictions/refs/heads/main/Data/Master-Data/2025-cup-series-full-timers.txt"
cup_roster = pd.read_csv(roster_url)

### Race 2: Dover 07/20/2025

Step 1: Pull the Dover stats from driveraverages.com. I left a ton of comments so this is easy to follow; this is my first time using web scraping concepts :)

In [138]:
# driveraverages.com track-averages page used for the Dover stats
# (trk_id=7 presumably identifies Dover on that site; confirm before reusing for another track)
url = "https://www.driveraverages.com/nascar/track_avg.php?trk_id=7"

In [139]:
# send the HTTP request
# a browser-style User-Agent is sent along (presumably the site rejects the default one; confirm)
headers = {'User-Agent': 'Mozilla/5.0'}
# requests never times out by default, so set a limit to keep this cell from hanging forever
response = requests.get(url, headers=headers, timeout=30)
# stop on a 4xx/5xx reply instead of parsing an error page as if it were the stats page
response.raise_for_status()

In [140]:
# turn the raw response bytes into a searchable tree, using the 'html.parser' backend
soup = BeautifulSoup(response.content, features='html.parser')

In [141]:
# find all <table> html objects
tables = soup.find_all("table")

# check how many tables there are (how many we'll have to iterate through in the next cell)
# only one of them is the stats table; the next cell picks it out by its contents
print(f"Found {len(tables)} tables.")

Found 6 tables.


In [142]:
# loop through found tables and save the correct one as a pandas df
## change this line to the best driver as of recent in the table (whoever is listed in its first row)
track_first_row_driver = "Kyle Larson"

# reset first, so a failed search can't silently reuse a dover_stats left over from an earlier run
dover_stats = None
for i, table in enumerate(tables):
    try:
        temp = pd.read_html(StringIO(str(table)))[0]
        # the stats table is the one with a "Driver" column whose first row is the expected driver
        if "Driver" in temp.columns and temp.iloc[0]["Driver"] == track_first_row_driver:
            dover_stats = temp
            break
    except Exception as e:
        # best effort: a table that can't be read is reported and skipped
        print(f"Skipping table {i}: {e}")

# fail loudly here rather than with a NameError further down
if dover_stats is None:
    raise ValueError(
        f"No table with a 'Driver' column whose first row is {track_first_row_driver!r} was found; "
        "update track_first_row_driver to the driver currently listed first."
    )

# replace double spaces in the headers with single spaces and trim them
dover_stats.columns = dover_stats.columns.str.replace("  ", " ", regex=False).str.strip()
# drop every leftover "Unnamed" column in one pass
dover_stats = dover_stats.drop(columns=[col for col in dover_stats.columns if "Unnamed" in col])
dover_stats.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Top 20's,Laps Led,Avg Start,Best Finish,Low Finish,DNF,Avg Rating
0,Kyle Larson,8.2,16,1,8,12,14,938,8.7,1,32,0,103.8
1,Chase Elliott,9.5,14,2,10,10,12,394,9.9,1,39,2,96.1
2,Jimmie Johnson,10.1,39,11,18,27,33,3113,11.3,1,41,2,109.7
3,Martin Truex Jr.,11.2,34,4,11,20,28,1138,11.0,1,38,2,101.7
4,Ty Gibbs,11.5,2,0,0,1,2,0,21.5,10,13,0,86.2


In [143]:
# for fun, we can now use the same exact code to extract season stats, I won't break it up into as many individual code cells this time:
szn_stats_url = "https://www.driveraverages.com/nascar/nascar-stats.php"

# send the HTTP request
headers = {'User-Agent': 'Mozilla/5.0'}
# requests never times out by default, so set a limit to keep this cell from hanging forever
response = requests.get(szn_stats_url, headers=headers, timeout=30)
# stop on a 4xx/5xx reply instead of parsing an error page as if it were the stats page
response.raise_for_status()

# parse the HTML of the webpage
soup = BeautifulSoup(response.content, 'html.parser')

# find all <table> html objects
tables = soup.find_all("table")

# check how many tables there are (how many we'll have to iterate through in the next cell)
print(f"Found {len(tables)} tables.")

Found 6 tables.


In [144]:
# loop through found tables and save the correct one as a pandas df
## change this line to the best driver as of recent in the season points (whoever is listed in the table's first row)
season_first_row_driver = "William Byron"

# reset first, so a failed search can't silently reuse a season_stats left over from an earlier run
season_stats = None
for i, table in enumerate(tables):
    try:
        temp = pd.read_html(StringIO(str(table)))[0]
        # the stats table is the one with a "Driver" column whose first row is the expected driver
        if "Driver" in temp.columns and temp.iloc[0]["Driver"] == season_first_row_driver:
            season_stats = temp
            break
    except Exception as e:
        # best effort: a table that can't be read is reported and skipped
        print(f"Skipping table {i}: {e}")

# fail loudly here rather than with a NameError further down
if season_stats is None:
    raise ValueError(
        f"No table with a 'Driver' column whose first row is {season_first_row_driver!r} was found; "
        "update season_first_row_driver to the driver currently listed first."
    )

# replace double spaces in the headers with single spaces and trim them
season_stats.columns = season_stats.columns.str.replace("  ", " ", regex=False).str.strip()
season_stats = season_stats.drop(columns=["NASCAR Points", "Top 10's"])
# drop every leftover "Unnamed" column in one pass
season_stats = season_stats.drop(columns=[col for col in season_stats.columns if "Unnamed" in col])

# prefix every stat column with "szn_"; "Driver" keeps its name
season_stats = season_stats.rename(
    columns={col: f"szn_{col}" for col in season_stats.columns if col != "Driver"}
)

season_stats.head()

Unnamed: 0,Driver,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating
0,William Byron,13.9,20,1,7,769,98.3
1,Chase Elliott,10.2,20,1,7,136,88.7
2,Kyle Larson,13.8,20,3,9,854,92.2
3,Tyler Reddick,13.9,20,0,5,112,91.2
4,Denny Hamlin,13.6,19,3,9,487,90.4


In [145]:
# keep only the rows whose driver also appears in the roster's Driver column
on_roster = dover_stats['Driver'].isin(cup_roster['Driver'])
dover_stats = dover_stats[on_roster]
dover_stats.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Top 20's,Laps Led,Avg Start,Best Finish,Low Finish,DNF,Avg Rating
0,Kyle Larson,8.2,16,1,8,12,14,938,8.7,1,32,0,103.8
1,Chase Elliott,9.5,14,2,10,10,12,394,9.9,1,39,2,96.1
4,Ty Gibbs,11.5,2,0,0,1,2,0,21.5,10,13,0,86.2
5,Brad Keselowski,13.5,26,1,5,11,22,411,11.7,1,38,1,90.3
6,Erik Jones,13.8,11,0,1,3,9,0,13.5,4,22,1,79.1


#### Important note:

**Still missing Cole Custer, SVG (Shane van Gisbergen), and Riley Herbst.** I am still figuring out ways to include them as I code this.

In [146]:
# track columns that are not carried forward into the engineered data
dropped_track_cols = ["Top 20's", "Avg Start", "Best Finish", "Low Finish"]
dover_stats = dover_stats.drop(columns=dropped_track_cols)
dover_stats.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating
0,Kyle Larson,8.2,16,1,8,12,938,0,103.8
1,Chase Elliott,9.5,14,2,10,10,394,2,96.1
4,Ty Gibbs,11.5,2,0,0,1,0,0,86.2
5,Brad Keselowski,13.5,26,1,5,11,411,1,90.3
6,Erik Jones,13.8,11,0,1,3,0,1,79.1


Now we can begin feature engineering to prepare the data for a Monte Carlo simulated model:

In [147]:
# Helper functions, we will apply these later:

# estimate_std: the standard deviation of finishes is not easily available, so
# we estimate it from the driver's average finish instead (a better average
# finish suggests that the driver is more consistent)
def estimate_std(avg_finish):
    """Return a rough standard deviation of finishing position for a driver.

    The estimate is bucketed on the driver's average finish:
    an average of 12 or better gives 2.5, 18 or better gives 3.5,
    and anything else gives 4.5.
    """
    # (upper bound on average finish, estimated standard deviation), best bucket first
    buckets = ((12, 2.5), (18, 3.5))
    for upper_bound, std in buckets:
        if avg_finish <= upper_bound:
            return std
    return 4.5
    
# calculate the probability of a dnf, using DNF count along with race count, includes
# some manual tweaks to ensure this is a valuable and realistic measure
def dnf_prob(dnf_count, race_count):
    """Estimate the probability that a driver fails to finish (DNF) a race.

    Starts from the raw rate ``dnf_count / race_count`` and applies manual
    adjustments so small samples do not produce 0% or 100% estimates.

    Args:
        dnf_count: number of DNFs recorded for the driver.
        race_count: number of races those DNFs were recorded over.

    Returns:
        float: 0.15 when there are no races to judge from; 0.03 for a clean
        record over 5 or more races, 0.15 for a clean record over fewer;
        0.4 when every race was a DNF; otherwise the raw rate capped at 0.35.
    """
    # no races on record: nothing to divide by, so use the same value as a
    # clean record over a small sample instead of raising ZeroDivisionError
    if race_count <= 0:
        return 0.15

    dnf_probability = dnf_count / race_count

    if dnf_probability == 0:
        # a clean record only earns the low value once it covers at least 5 races
        return 0.03 if race_count >= 5 else 0.15
    if dnf_probability == 1:
        return 0.4  # cap max failure
    return min(dnf_probability, 0.35)  # avoid extreme dnf probabilities
    
# simple ratio of laps led per race
def laps_led_per_race(laps_led, races):
    """Return the average number of laps led per race.

    Args:
        laps_led: total laps led.
        races: number of races those laps were led over.

    Returns:
        ``laps_led / races``, or 0.0 when the division raises
        ZeroDivisionError (a plain-Python race count of zero).
    """
    try:
        return laps_led / races
    except ZeroDivisionError:
        # no races on record means no laps led per race
        return 0.0

In [148]:
# inner join on Driver: only drivers present in both the track table and the season table are kept
dover_stats_engineered = pd.merge(
    left=dover_stats,
    right=season_stats,
    how='inner',
    on='Driver',
)
dover_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating
0,Kyle Larson,8.2,16,1,8,12,938,0,103.8,13.8,20,3,9,854,92.2
1,Chase Elliott,9.5,14,2,10,10,394,2,96.1,10.2,20,1,7,136,88.7
2,Ty Gibbs,11.5,2,0,0,1,0,0,86.2,17.7,20,0,3,97,73.7
3,Brad Keselowski,13.5,26,1,5,11,411,1,90.3,22.4,20,0,2,81,66.5
4,Erik Jones,13.8,11,0,1,3,0,1,79.1,19.3,20,0,2,1,65.3


In [149]:
# one DNF probability per driver, computed from that row's DNF and Races values
dnf_prob_by_driver = dover_stats_engineered.apply(
    lambda driver: dnf_prob(driver['DNF'], driver['Races']),
    axis=1,
)
dover_stats_engineered['DNF_Prob'] = dnf_prob_by_driver
dover_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating,DNF_Prob
0,Kyle Larson,8.2,16,1,8,12,938,0,103.8,13.8,20,3,9,854,92.2,0.03
1,Chase Elliott,9.5,14,2,10,10,394,2,96.1,10.2,20,1,7,136,88.7,0.142857
2,Ty Gibbs,11.5,2,0,0,1,0,0,86.2,17.7,20,0,3,97,73.7,0.15
3,Brad Keselowski,13.5,26,1,5,11,411,1,90.3,22.4,20,0,2,81,66.5,0.038462
4,Erik Jones,13.8,11,0,1,3,0,1,79.1,19.3,20,0,2,1,65.3,0.090909


In [150]:
# (new column, laps-led column, race-count column): first the track table's stats, then the season's
laps_led_specs = [
    ('track_Laps Led Per Race', 'Laps Led', 'Races'),
    ('szn_Laps Led Per Race', 'szn_Laps Led', 'szn_Races'),
]
for new_col, laps_col, races_col in laps_led_specs:
    # the lambda runs inside this iteration, so it always sees the current column names
    dover_stats_engineered[new_col] = dover_stats_engineered.apply(
        lambda row: laps_led_per_race(row[laps_col], row[races_col]), axis=1
    )

In [151]:
# estimate_std only needs the one column, so map it over that column directly
# instead of building a full row for every driver
dover_stats_engineered['Std Dev'] = dover_stats_engineered['Avg Finish'].map(estimate_std)
dover_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating,DNF_Prob,track_Laps Led Per Race,szn_Laps Led Per Race,Std Dev
0,Kyle Larson,8.2,16,1,8,12,938,0,103.8,13.8,20,3,9,854,92.2,0.03,58.625,42.7,2.5
1,Chase Elliott,9.5,14,2,10,10,394,2,96.1,10.2,20,1,7,136,88.7,0.142857,28.142857,6.8,2.5
2,Ty Gibbs,11.5,2,0,0,1,0,0,86.2,17.7,20,0,3,97,73.7,0.15,0.0,4.85,2.5
3,Brad Keselowski,13.5,26,1,5,11,411,1,90.3,22.4,20,0,2,81,66.5,0.038462,15.807692,4.05,3.5
4,Erik Jones,13.8,11,0,1,3,0,1,79.1,19.3,20,0,2,1,65.3,0.090909,0.0,0.05,3.5


In [152]:
# remove the listed count columns; the per-race ratios above were computed from Races, Laps Led, szn_Races and szn_Laps Led
dropped_engineered_cols = ["Races", "Top 10's", "Laps Led", "szn_Races", "szn_Laps Led"]
dover_stats_engineered = dover_stats_engineered.drop(columns=dropped_engineered_cols)
dover_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Wins,Top 5's,DNF,Avg Rating,szn_Avg Finish,szn_Wins,szn_Top 5's,szn_Avg Rating,DNF_Prob,track_Laps Led Per Race,szn_Laps Led Per Race,Std Dev
0,Kyle Larson,8.2,1,8,0,103.8,13.8,3,9,92.2,0.03,58.625,42.7,2.5
1,Chase Elliott,9.5,2,10,2,96.1,10.2,1,7,88.7,0.142857,28.142857,6.8,2.5
2,Ty Gibbs,11.5,0,0,0,86.2,17.7,0,3,73.7,0.15,0.0,4.85,2.5
3,Brad Keselowski,13.5,1,5,1,90.3,22.4,0,2,66.5,0.038462,15.807692,4.05,3.5
4,Erik Jones,13.8,0,1,1,79.1,19.3,0,2,65.3,0.090909,0.0,0.05,3.5


In [None]:
# Input files can be read from GitHub. I didn't feel like adding the API request code to write this file out as well,
# so the export below is a manual local save (left commented out). This is all behind the scenes anyway :)
#dover_stats_engineered.to_csv("C:/Users/jakel/OneDrive/Desktop/GitHub/NASCAR-Race-Predictions/Data/Dover-2025/dover-sim-ready-data.csv")