# NASCAR Race Winner Prediction: Monte Carlo Simulation
## Notebook 1: Data Preprocessing

In [1]:
import sys

# This flag only affects modules imported AFTER it is set, so it has to come
# before the local `data_pulling` import; otherwise Python has already written
# the module's bytecode cache by the time the flag is flipped.
sys.dont_write_bytecode = True

import pandas as pd
from data_pulling import *
from sklearn.preprocessing import MinMaxScaler

In this notebook, I use web scraping to extract the data.

In [2]:
# Roster file of 2025 Cup Series full-time drivers, read directly from the
# project's GitHub repository; it is used later to filter the scraped tables.
cup_roster = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/jake-lukasik/NASCAR-Race-Predictions/refs/heads/main/Data/Master-Data/2025-cup-series-full-timers.txt"
)

### Race 6: Richmond Raceway 08/16/2025

Step 1: Pulling Richmond stats from driveraverages.com

In [3]:
# Pull the Richmond track-average table from driveraverages.com (trk_id=19 in the URL).
# NOTE(review): `driver_in_first` presumably names the driver expected in the first row
# of the wanted table, so the helper can pick it out of the tables found on the page --
# confirm against the helper's definition.
richmond_stats = pull_data_from_driver_averages(url="https://www.driveraverages.com/nascar/track_avg.php?trk_id=19",
                                            driver_in_first="Christopher Bell")
richmond_stats.head()

Found 6 tables.


Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Top 20's,Laps Led,Avg Start,Best Finish,Low Finish,DNF,Avg Rating
0,Christopher Bell,7.3,9,0,4,7,9,230,17.6,2,20,0,98.0
1,Kyle Busch,7.6,38,6,19,28,37,1530,11.7,1,24,0,106.0
2,Denny Hamlin,8.1,36,5,20,24,31,2367,9.0,1,24,0,109.8
3,Josh Berry,9.0,3,0,1,1,3,12,21.0,2,14,0,88.2
4,Joey Logano,10.0,31,2,14,19,27,653,10.9,1,35,1,95.8


In [4]:
# Pull season-level driver stats; the returned columns carry an `szn_` prefix
# (see the preview below), which keeps them distinct from the track-level columns.
# NOTE(review): `driver_in_first` presumably identifies the first row of the target
# table -- confirm against the helper's definition.
season_stats = pull_season_stats(driver_in_first="William Byron")
season_stats.head()

Unnamed: 0,Driver,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating
0,William Byron,13.8,24,2,9,910,101.0
1,Chase Elliott,11.0,24,1,7,385,88.5
2,Denny Hamlin,13.6,23,4,11,555,89.3
3,Christopher Bell,12.1,24,3,9,221,90.1
4,Kyle Larson,14.5,24,3,11,873,90.6


In [5]:
# Keep only the drivers that appear in the full-time Cup roster.
richmond_stats = richmond_stats.loc[
    lambda stats: stats['Driver'].isin(cup_roster['Driver'])
]
richmond_stats.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Top 20's,Laps Led,Avg Start,Best Finish,Low Finish,DNF,Avg Rating
0,Christopher Bell,7.3,9,0,4,7,9,230,17.6,2,20,0,98.0
1,Kyle Busch,7.6,38,6,19,28,37,1530,11.7,1,24,0,106.0
2,Denny Hamlin,8.1,36,5,20,24,31,2367,9.0,1,24,0,109.8
3,Josh Berry,9.0,3,0,1,1,3,12,21.0,2,14,0,88.2
4,Joey Logano,10.0,31,2,14,19,27,653,10.9,1,35,1,95.8


In [6]:
# Discard the track-level columns that are not used in the rest of this notebook.
richmond_stats = richmond_stats.drop(
    ["Top 20's", "Avg Start", "Best Finish", "Low Finish"],
    axis="columns",
)
richmond_stats.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating
0,Christopher Bell,7.3,9,0,4,7,230,0,98.0
1,Kyle Busch,7.6,38,6,19,28,1530,0,106.0
2,Denny Hamlin,8.1,36,5,20,24,2367,0,109.8
3,Josh Berry,9.0,3,0,1,1,12,0,88.2
4,Joey Logano,10.0,31,2,14,19,653,1,95.8


In [7]:
# Helper functions, we will apply these later:

# estimate_std: the standard deviation of a driver's finishes is not readily
# available, so we estimate it from the driver's average finish instead (a better
# average finish suggests that the driver is more consistent)
def estimate_std(avg_finish):
    """Map a driver's average finish to an assumed finishing-position spread.

    Args:
        avg_finish: The driver's average finishing position.

    Returns:
        2.5 when ``avg_finish`` is at most 12, 3.5 when it is at most 18,
        and 4.5 for anything else.
    """
    # (upper bound on average finish, spread assigned to that tier), in ascending order
    tiers = ((12, 2.5), (18, 3.5))
    for ceiling, spread in tiers:
        if avg_finish <= ceiling:
            return spread
    # Worse than every tier's ceiling: widest spread.
    return 4.5
    
# calculate the probability of a dnf, using DNF count along with race count, includes
# some manual tweaks to ensure this is a valuable and realistic measure
def dnf_prob(dnf_count, race_count):
    """Estimate the probability that a driver fails to finish (DNF) a race.

    The raw ratio ``dnf_count / race_count`` is adjusted so that it never
    collapses to 0 or blows up to 1:

    * no usable race history (``race_count`` not greater than 0) -> 0.15
    * no DNFs over at least 5 races -> 0.03
    * no DNFs over fewer than 5 races -> 0.15
    * a DNF in every race -> 0.4
    * anything else -> the raw ratio, capped at 0.35

    Args:
        dnf_count: Number of races the driver did not finish.
        race_count: Number of races the driver started.

    Returns:
        The adjusted DNF probability as a float.
    """
    # Without at least one race the ratio is undefined: plain numbers would raise
    # ZeroDivisionError and numpy scalars would silently produce NaN/inf. Fall back
    # to the same value used for a small sample with no DNFs.
    if not race_count > 0:
        return 0.15

    dnf_probability = dnf_count / race_count

    if dnf_probability == 0:
        if race_count >= 5:
            return 0.03
        else:
            return 0.15
    elif dnf_probability == 1:
        return 0.4  # cap max failure
    else:
        return min(dnf_probability, 0.35)  # avoid extreme dnf probabilities
    
# simple ratio of laps led per race
def laps_led_per_race(laps_led, races):
    """Return the average number of laps led per race.

    Args:
        laps_led: Total laps led.
        races: Number of races the total was accumulated over.

    Returns:
        ``laps_led / races``, or 0.0 when ``races`` is 0 (no races means no
        laps led, and avoids a ZeroDivisionError / NaN in the feature column).
    """
    if races == 0:
        return 0.0
    return laps_led / races

In [8]:
# Combine track-level and season-level stats; the inner join keeps only drivers
# that are present in both tables.
richmond_stats_engineered = richmond_stats.merge(
    season_stats,
    how='inner',
    on='Driver',
)
richmond_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating
0,Christopher Bell,7.3,9,0,4,7,230,0,98.0,12.1,24,3,9,221,90.1
1,Kyle Busch,7.6,38,6,19,28,1530,0,106.0,17.6,24,0,2,62,71.2
2,Denny Hamlin,8.1,36,5,20,24,2367,0,109.8,13.6,23,4,11,555,89.3
3,Josh Berry,9.0,3,0,1,1,12,0,88.2,21.8,24,1,2,169,72.9
4,Joey Logano,10.0,31,2,14,19,653,1,95.8,17.2,24,1,2,358,84.0


In [9]:
# Per-driver DNF probability derived from the track-level DNF and race counts.
richmond_stats_engineered = richmond_stats_engineered.assign(
    DNF_Prob=richmond_stats_engineered.apply(
        lambda driver: dnf_prob(driver['DNF'], driver['Races']),
        axis=1,
    )
)
richmond_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating,DNF_Prob
0,Christopher Bell,7.3,9,0,4,7,230,0,98.0,12.1,24,3,9,221,90.1,0.03
1,Kyle Busch,7.6,38,6,19,28,1530,0,106.0,17.6,24,0,2,62,71.2,0.03
2,Denny Hamlin,8.1,36,5,20,24,2367,0,109.8,13.6,23,4,11,555,89.3,0.03
3,Josh Berry,9.0,3,0,1,1,12,0,88.2,21.8,24,1,2,169,72.9,0.15
4,Joey Logano,10.0,31,2,14,19,653,1,95.8,17.2,24,1,2,358,84.0,0.032258


In [10]:
# Laps led per race, once from the track-level totals and once from the season totals.
richmond_stats_engineered = richmond_stats_engineered.assign(**{
    'track_Laps Led Per Race': richmond_stats_engineered.apply(
        lambda driver: laps_led_per_race(driver['Laps Led'], driver['Races']),
        axis=1,
    ),
    'szn_Laps Led Per Race': richmond_stats_engineered.apply(
        lambda driver: laps_led_per_race(driver['szn_Laps Led'], driver['szn_Races']),
        axis=1,
    ),
})

In [11]:
# Estimated spread of finishing positions, based on the track-level average finish.
richmond_stats_engineered = richmond_stats_engineered.assign(**{
    'Std Dev': richmond_stats_engineered.apply(
        lambda driver: estimate_std(driver['Avg Finish']),
        axis=1,
    ),
})
richmond_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating,DNF_Prob,track_Laps Led Per Race,szn_Laps Led Per Race,Std Dev
0,Christopher Bell,7.3,9,0,4,7,230,0,98.0,12.1,24,3,9,221,90.1,0.03,25.555556,9.208333,2.5
1,Kyle Busch,7.6,38,6,19,28,1530,0,106.0,17.6,24,0,2,62,71.2,0.03,40.263158,2.583333,2.5
2,Denny Hamlin,8.1,36,5,20,24,2367,0,109.8,13.6,23,4,11,555,89.3,0.03,65.75,24.130435,2.5
3,Josh Berry,9.0,3,0,1,1,12,0,88.2,21.8,24,1,2,169,72.9,0.15,4.0,7.041667,2.5
4,Joey Logano,10.0,31,2,14,19,653,1,95.8,17.2,24,1,2,358,84.0,0.032258,21.064516,14.916667,2.5


In [12]:
# Remove the raw count columns now that the per-race features have been derived.
richmond_stats_engineered = richmond_stats_engineered.drop(
    ["Races", "Top 10's", "Laps Led", "szn_Races", "szn_Laps Led"],
    axis=1,
)
richmond_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Wins,Top 5's,DNF,Avg Rating,szn_Avg Finish,szn_Wins,szn_Top 5's,szn_Avg Rating,DNF_Prob,track_Laps Led Per Race,szn_Laps Led Per Race,Std Dev
0,Christopher Bell,7.3,0,4,0,98.0,12.1,3,9,90.1,0.03,25.555556,9.208333,2.5
1,Kyle Busch,7.6,6,19,0,106.0,17.6,0,2,71.2,0.03,40.263158,2.583333,2.5
2,Denny Hamlin,8.1,5,20,0,109.8,13.6,4,11,89.3,0.03,65.75,24.130435,2.5
3,Josh Berry,9.0,0,1,0,88.2,21.8,1,2,72.9,0.15,4.0,7.041667,2.5
4,Joey Logano,10.0,2,14,1,95.8,17.2,1,2,84.0,0.032258,21.064516,14.916667,2.5


In [None]:
# The input files can be read from GitHub. Writing this output back to GitHub would
# require API requests, which are not implemented here since this step is
# behind the scenes anyway :)
#richmond_stats_engineered.to_csv("C:/Users/jakel/OneDrive/Desktop/GitHub/NASCAR-Race-Predictions/Data/Richmond-2025/richmond-sim-ready-data.csv")