# NASCAR Race Winner Prediction: Monte Carlo Simulation
## Notebook 1: Data Preprocessing

In [85]:
import sys

# This flag only affects imports that happen AFTER it is set, so it has to come
# before the project-local `data_pulling` import below; otherwise Python may
# already have written that module's bytecode cache by the time it is flipped.
sys.dont_write_bytecode = True

import pandas as pd
# Project-local scraping helpers (pull_data_from_driver_averages, pull_season_stats,
# pull_road_course_stats are the names this notebook uses from it).
from data_pulling import *
from sklearn.preprocessing import MinMaxScaler

In this notebook, I use web scraping to pull the data.

In [86]:
# Load the driver roster from the project's GitHub repository. Its 'Driver'
# column is used further down to filter the scraped tables.
# NOTE(review): the file name suggests these are the 2025 full-time Cup Series
# drivers — confirm against the data file.
cup_roster = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/jake-lukasik/NASCAR-Race-Predictions/refs/heads/main/Data/Master-Data/2025-cup-series-full-timers.txt"
)

### Race 4: Watkins Glen (RC) 08/11/2025

Step 1: Pull the Watkins Glen stats from driveraverages.com. I left plenty of comments so this is easy to follow.

In [87]:
# Track-level averages page on driveraverages.com (trk_id=22).
watkins_glen_track_url = "https://www.driveraverages.com/nascar/track_avg.php?trk_id=22"

# NOTE(review): driver_in_first / driver_in_fourth are presumably anchors the
# helper uses to pick the right table on the page — confirm in data_pulling.
watkins_glen_stats = pull_data_from_driver_averages(
    url=watkins_glen_track_url,
    driver_in_first="Shane van Gisbergen",
    driver_in_fourth="Christopher Bell",
)
watkins_glen_stats.head(5)

Found 6 tables.


Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Top 20's,Laps Led,Avg Start,Best Finish,Low Finish,DNF,Avg Rating
0,Shane van Gisbergen,2.0,1,0,1,1,1,1,3.0,2,2,0,123.9
1,Carson Hocevar,3.0,1,0,1,1,1,1,29.0,3,3,0,103.6
2,Zane Smith,5.0,1,0,1,1,1,0,19.0,5,5,0,93.7
3,Christopher Bell,8.0,4,0,1,3,4,0,17.3,3,14,0,92.9
4,Chase Elliott,10.6,8,2,4,4,7,170,8.1,1,32,0,107.6


In [88]:
# Current-season stats; the resulting columns carry an `szn_` prefix.
# NOTE(review): driver_in_first is presumably the driver listed first in the
# scraped table — confirm in data_pulling.
season_table_first_driver = "William Byron"
season_stats = pull_season_stats(driver_in_first=season_table_first_driver)
season_stats.head(5)

Unnamed: 0,Driver,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating
0,William Byron,14.2,23,2,8,910,100.5
1,Chase Elliott,10.3,23,1,7,385,89.9
2,Kyle Larson,13.5,23,3,11,873,92.9
3,Denny Hamlin,13.0,22,4,11,555,91.3
4,Christopher Bell,12.6,23,3,8,221,89.1


In [89]:
# Career road-course averages (track type id 4 on driveraverages.com).
rd_course_stats = pull_road_course_stats(
    url="https://www.driveraverages.com/nascar/tracktype_avg.php?trktype_id=4",
    driver_in_first="Chase Elliott",
)

# Keep only drivers on the Cup roster, then renumber the rows from 0 so later
# positional/index-based steps line up.
is_rostered_rd_course = rd_course_stats['Driver'].isin(cup_roster['Driver'])
rd_course_stats = rd_course_stats.loc[is_rostered_rd_course].reset_index(drop=True)
display(rd_course_stats)

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Top 20's,Laps Led,Avg Start,Best Finish,Low Finish,DNF
0,Chase Elliott,8.9,39,7,22,26,34,489,9.4,1,37,1
1,Shane van Gisbergen,8.9,10,4,5,8,9,247,4.1,1,40,1
2,Tyler Reddick,12.0,30,3,10,19,24,217,7.6,1,38,1
3,Chris Buescher,13.1,41,1,6,17,37,62,18.3,1,37,0
4,Christopher Bell,13.7,30,3,10,16,21,125,13.4,1,38,4
5,Joey Logano,14.7,54,1,12,25,37,147,12.1,1,37,2
6,Kyle Busch,14.7,62,4,22,34,43,472,11.9,1,40,4
7,Alex Bowman,14.9,39,1,8,17,30,19,16.4,1,38,1
8,Ty Gibbs,15.3,18,0,5,8,12,46,10.9,2,37,1
9,Ryan Blaney,15.5,40,1,5,16,29,74,16.2,1,38,2


In [90]:
# Restrict the Watkins Glen table to drivers that appear on the Cup roster.
is_rostered_at_track = watkins_glen_stats['Driver'].isin(cup_roster['Driver'])
watkins_glen_stats = watkins_glen_stats.loc[is_rostered_at_track]
watkins_glen_stats.head(5)

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Top 20's,Laps Led,Avg Start,Best Finish,Low Finish,DNF,Avg Rating
0,Shane van Gisbergen,2.0,1,0,1,1,1,1,3.0,2,2,0,123.9
1,Carson Hocevar,3.0,1,0,1,1,1,1,29.0,3,3,0,103.6
2,Zane Smith,5.0,1,0,1,1,1,0,19.0,5,5,0,93.7
3,Christopher Bell,8.0,4,0,1,3,4,0,17.3,3,14,0,92.9
4,Chase Elliott,10.6,8,2,4,4,7,170,8.1,1,32,0,107.6


In [91]:
# Drop track columns that are not used as simulation inputs.
# errors="ignore" keeps this cell safe to re-run: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
watkins_glen_stats = watkins_glen_stats.drop(
    columns=["Top 20's", "Avg Start", "Best Finish", "Low Finish"],
    errors="ignore",
)
watkins_glen_stats.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating
0,Shane van Gisbergen,2.0,1,0,1,1,1,0,123.9
1,Carson Hocevar,3.0,1,0,1,1,1,0,103.6
2,Zane Smith,5.0,1,0,1,1,0,0,93.7
3,Christopher Bell,8.0,4,0,1,3,0,0,92.9
4,Chase Elliott,10.6,8,2,4,4,170,0,107.6


## New for Road Courses: Build the 'Road Course Weight' Feature (`RoadCourseMultiplier`)

In [92]:
# Road-course metrics that get min-max normalized for the composite score.
performance_metrics = rd_course_stats[['Avg Finish', 'Wins', "Top 5's", "Top 10's", 'Laps Led', 'DNF']].copy()
# reverse 'Avg Finish' and 'DNF' since lower is better
performance_metrics['Avg Finish'] = -performance_metrics['Avg Finish']
performance_metrics['DNF'] = -performance_metrics['DNF']

# normalize all columns between 0 and 1
scaler = MinMaxScaler()
normalized = scaler.fit_transform(performance_metrics)
# Carry the source index through explicitly. Without it the frame gets a fresh
# 0..n-1 index, and the label-aligned 'Driver' assignment below would only be
# correct when rd_course_stats happens to have that same index.
normalized_df = pd.DataFrame(
    normalized,
    columns=performance_metrics.columns,
    index=performance_metrics.index,
)

# Add driver names back (aligned on the shared index)
normalized_df['Driver'] = rd_course_stats['Driver']
normalized_df.head()

Unnamed: 0,Avg Finish,Wins,Top 5's,Top 10's,Laps Led,DNF,Driver
0,1.0,1.0,1.0,0.764706,1.0,0.888889,Chase Elliott
1,1.0,0.571429,0.227273,0.235294,0.505112,0.888889,Shane van Gisbergen
2,0.862832,0.428571,0.454545,0.558824,0.443763,0.888889,Tyler Reddick
3,0.814159,0.142857,0.272727,0.5,0.126789,1.0,Chris Buescher
4,0.787611,0.428571,0.454545,0.470588,0.255624,0.555556,Christopher Bell


In [93]:
# Relative importance of each normalized road-course metric.
# "Top 10's" is normalized above but has no entry here, so it does not
# contribute to the score.
weights = dict([
    ('Avg Finish', 0.4),
    ('Wins', 0.4),
    ("Top 5's", 0.05),
    ('Laps Led', 0.05),
    ('DNF', 0.1),
])

# weighted score: accumulate weight * normalized metric, one column at a time
road_course_score = 0
for metric_name, metric_weight in weights.items():
    road_course_score = road_course_score + normalized_df[metric_name] * metric_weight
normalized_df['RoadCourseMultiplier'] = road_course_score

normalized_df.head()

Unnamed: 0,Avg Finish,Wins,Top 5's,Top 10's,Laps Led,DNF,Driver,RoadCourseMultiplier
0,1.0,1.0,1.0,0.764706,1.0,0.888889,Chase Elliott,0.988889
1,1.0,0.571429,0.227273,0.235294,0.505112,0.888889,Shane van Gisbergen,0.75408
2,0.862832,0.428571,0.454545,0.558824,0.443763,0.888889,Tyler Reddick,0.650366
3,0.814159,0.142857,0.272727,0.5,0.126789,1.0,Chris Buescher,0.502782
4,0.787611,0.428571,0.454545,0.470588,0.255624,0.555556,Christopher Bell,0.577537


Now we can begin feature engineering to prepare the data for the Monte Carlo simulation:

In [94]:
# Helper functions, we will apply these later:

def estimate_std(avg_finish):
    """Estimate a driver's finishing-position standard deviation.

    The standard deviation of finishes is not easily available, so it is
    approximated from the average finish: a better (lower) average finish is
    taken as a sign of a more consistent driver.

    Args:
        avg_finish: The driver's average finishing position.

    Returns:
        2.5 when avg_finish <= 12, 3.5 when avg_finish <= 18, otherwise 4.5.
    """
    # (upper bound on average finish, estimated std dev), best tier first
    consistency_tiers = ((12, 2.5), (18, 3.5))
    for finish_ceiling, spread in consistency_tiers:
        if avg_finish <= finish_ceiling:
            return spread
    # Anything past the last tier gets the widest spread.
    return 4.5
    
def dnf_prob(dnf_count, race_count):
    """Estimate the probability that a driver fails to finish (DNF).

    Starts from the raw ratio dnf_count / race_count and applies manual
    adjustments so the value stays realistic for small samples.

    Args:
        dnf_count: Number of DNFs recorded for the driver.
        race_count: Number of races those DNFs were recorded over.

    Returns:
        - 0.15 when race_count is zero or negative (no history to divide by;
          same prior as the short, DNF-free history case below).
        - 0.03 when there are no DNFs over at least 5 races.
        - 0.15 when there are no DNFs over fewer than 5 races.
        - 0.4 when every race ended in a DNF.
        - Otherwise the raw ratio, capped at 0.35.
    """
    # Guard the division: with no races the ratio is undefined (the unguarded
    # division raises ZeroDivisionError or yields NaN depending on input type).
    if race_count <= 0:
        return 0.15

    dnf_probability = dnf_count / race_count

    if dnf_probability == 0:
        if race_count >= 5:
            return 0.03
        else:
            return 0.15
    elif dnf_probability == 1:
        return 0.4  # cap max failure
    else:
        return min(dnf_probability, 0.35)  # avoid extreme dnf probabilities
    
def laps_led_per_race(laps_led, races):
    """Return the simple ratio of laps led per race.

    Args:
        laps_led: Total laps led.
        races: Number of races those laps were led over.

    Returns:
        laps_led / races, or 0.0 when races is zero or negative (no races
        means nothing to average, and the division would otherwise fail or
        produce inf/NaN).
    """
    if races <= 0:
        return 0.0
    return laps_led / races

In [95]:
# Join track history with current-season stats. The inner join keeps only
# drivers present in both tables.
watkins_glen_stats_engineered = watkins_glen_stats.merge(
    season_stats,
    how='inner',
    on='Driver',
)
watkins_glen_stats_engineered.head(5)

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating
0,Shane van Gisbergen,2.0,1,0,1,1,1,0,123.9,22.5,23,3,3,208,60.5
1,Carson Hocevar,3.0,1,0,1,1,1,0,103.6,22.3,23,0,2,80,74.1
2,Zane Smith,5.0,1,0,1,1,0,0,93.7,21.0,23,0,0,15,63.5
3,Christopher Bell,8.0,4,0,1,3,0,0,92.9,12.6,23,3,8,221,89.1
4,Chase Elliott,10.6,8,2,4,4,170,0,107.6,10.3,23,1,7,385,89.9


In [96]:
def _row_dnf_prob(row):
    """Apply dnf_prob to one driver row using its track DNF and race counts."""
    return dnf_prob(row['DNF'], row['Races'])


# Per-driver DNF probability based on Watkins Glen history.
watkins_glen_stats_engineered['DNF_Prob'] = watkins_glen_stats_engineered.apply(
    _row_dnf_prob,
    axis=1,
)
watkins_glen_stats_engineered.head(5)

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating,DNF_Prob
0,Shane van Gisbergen,2.0,1,0,1,1,1,0,123.9,22.5,23,3,3,208,60.5,0.15
1,Carson Hocevar,3.0,1,0,1,1,1,0,103.6,22.3,23,0,2,80,74.1,0.15
2,Zane Smith,5.0,1,0,1,1,0,0,93.7,21.0,23,0,0,15,63.5,0.15
3,Christopher Bell,8.0,4,0,1,3,0,0,92.9,12.6,23,3,8,221,89.1,0.15
4,Chase Elliott,10.6,8,2,4,4,170,0,107.6,10.3,23,1,7,385,89.9,0.03


In [97]:
# (new column, laps-led column, race-count column): track history first,
# then current season.
laps_led_rate_specs = (
    ('track_Laps Led Per Race', 'Laps Led', 'Races'),
    ('szn_Laps Led Per Race', 'szn_Laps Led', 'szn_Races'),
)
for rate_col, laps_col, races_col in laps_led_rate_specs:
    # Default arguments bind the current column names into the lambda.
    watkins_glen_stats_engineered[rate_col] = watkins_glen_stats_engineered.apply(
        lambda row, laps_col=laps_col, races_col=races_col: laps_led_per_race(row[laps_col], row[races_col]),
        axis=1,
    )

In [98]:
# Estimated finishing-position spread, derived from each driver's average
# finish at this track. Only one column is needed, so map over that column
# directly instead of building a full row object per driver with apply(axis=1).
watkins_glen_stats_engineered['Std Dev'] = watkins_glen_stats_engineered['Avg Finish'].map(estimate_std)
watkins_glen_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Races,Wins,Top 5's,Top 10's,Laps Led,DNF,Avg Rating,szn_Avg Finish,szn_Races,szn_Wins,szn_Top 5's,szn_Laps Led,szn_Avg Rating,DNF_Prob,track_Laps Led Per Race,szn_Laps Led Per Race,Std Dev
0,Shane van Gisbergen,2.0,1,0,1,1,1,0,123.9,22.5,23,3,3,208,60.5,0.15,1.0,9.043478,2.5
1,Carson Hocevar,3.0,1,0,1,1,1,0,103.6,22.3,23,0,2,80,74.1,0.15,1.0,3.478261,2.5
2,Zane Smith,5.0,1,0,1,1,0,0,93.7,21.0,23,0,0,15,63.5,0.15,0.0,0.652174,2.5
3,Christopher Bell,8.0,4,0,1,3,0,0,92.9,12.6,23,3,8,221,89.1,0.15,0.0,9.608696,2.5
4,Chase Elliott,10.6,8,2,4,4,170,0,107.6,10.3,23,1,7,385,89.9,0.03,21.25,16.73913,2.5


In [99]:
# Drop the raw counts now that the per-race features have been derived from them.
# errors="ignore" keeps this cell safe to re-run: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
watkins_glen_stats_engineered = watkins_glen_stats_engineered.drop(
    columns=["Races", "Top 10's", "Laps Led", "szn_Races", "szn_Laps Led"],
    errors="ignore",
)
watkins_glen_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Wins,Top 5's,DNF,Avg Rating,szn_Avg Finish,szn_Wins,szn_Top 5's,szn_Avg Rating,DNF_Prob,track_Laps Led Per Race,szn_Laps Led Per Race,Std Dev
0,Shane van Gisbergen,2.0,0,1,0,123.9,22.5,3,3,60.5,0.15,1.0,9.043478,2.5
1,Carson Hocevar,3.0,0,1,0,103.6,22.3,0,2,74.1,0.15,1.0,3.478261,2.5
2,Zane Smith,5.0,0,1,0,93.7,21.0,0,0,63.5,0.15,0.0,0.652174,2.5
3,Christopher Bell,8.0,0,1,0,92.9,12.6,3,8,89.1,0.15,0.0,9.608696,2.5
4,Chase Elliott,10.6,2,4,0,107.6,10.3,1,7,89.9,0.03,21.25,16.73913,2.5


In [100]:
# Attach the road-course score to each driver. The left join keeps every driver
# already in the frame; drivers without a road-course row get NaN.
# Dropping any existing 'RoadCourseMultiplier' first keeps the cell safe to
# re-run: merging again would otherwise create 'RoadCourseMultiplier_x' /
# 'RoadCourseMultiplier_y' duplicates.
watkins_glen_stats_engineered = (
    watkins_glen_stats_engineered
    .drop(columns=['RoadCourseMultiplier'], errors='ignore')
    .merge(normalized_df[['Driver', 'RoadCourseMultiplier']], on='Driver', how='left')
)
watkins_glen_stats_engineered.head()

Unnamed: 0,Driver,Avg Finish,Wins,Top 5's,DNF,Avg Rating,szn_Avg Finish,szn_Wins,szn_Top 5's,szn_Avg Rating,DNF_Prob,track_Laps Led Per Race,szn_Laps Led Per Race,Std Dev,RoadCourseMultiplier
0,Shane van Gisbergen,2.0,0,1,0,123.9,22.5,3,3,60.5,0.15,1.0,9.043478,2.5,0.75408
1,Carson Hocevar,3.0,0,1,0,103.6,22.3,0,2,74.1,0.15,1.0,3.478261,2.5,0.271795
2,Zane Smith,5.0,0,1,0,93.7,21.0,0,0,63.5,0.15,0.0,0.652174,2.5,0.2543
3,Christopher Bell,8.0,0,1,0,92.9,12.6,3,8,89.1,0.15,0.0,9.608696,2.5,0.577537
4,Chase Elliott,10.6,2,4,0,107.6,10.3,1,7,89.9,0.03,21.25,16.73913,2.5,0.988889


In [None]:
# The simulation input files are read from GitHub. Writing this file there through the
# GitHub API is not set up, so the export below is a manual, local step (left commented out).
# This is all behind the scenes anyway :)
#watkins_glen_stats_engineered.to_csv("C:/Users/jakel/OneDrive/Desktop/GitHub/NASCAR-Race-Predictions/Data/Watkins-Glen-2025/watkins-glen-sim-ready-data.csv")