# 1. Exploratory Data Analysis

In [1]:
# Import libraries for data analysis
import fastf1 as f1
import pandas as pd
import seaborn as sns
import scipy.stats as stats

## 1-1. Observe difference in race times for the same track between different years

We will compare the lap times set by the same driver at the same racetrack across different years to determine how to define our dataset. We have chosen 2022 and 2023, as 2022 was the last time the FIA introduced a major rule change. 
We have chosen Verstappen and Perez as their cars have shown consistent results throughout the two years (2022 and 2023).

In [None]:
def process_event(event, drivers):
    """Collect the clean racing laps of the given drivers for one race.

    Loads the race session of ``event`` (lap timing only) and, for every
    driver, keeps the laps that have a recorded lap time and are neither a
    pit-out nor a pit-in lap.

    Parameters
    ----------
    event
        One row of the FastF1 event schedule; ``year``, ``EventName`` and
        ``Country`` are read from it.
    drivers : iterable of str
        Driver identifiers handed to ``Laps.pick_drivers`` (e.g. ``'VER'``).

    Returns
    -------
    list of dict
        One record per kept lap with the keys ``'Driver'``, ``'Year'``,
        ``'Location'`` and ``'Laptime'`` (the lap's ``LapTime`` value).
    """
    session = f1.get_session(event.year, event.EventName, 'R')
    # Only lap timing is read below, so skip the slower optional data sets.
    session.load(laps=True, telemetry=False, weather=False, messages=False)

    results = []
    for driver in drivers:
        # `pick_driver` is deprecated since FastF1 3.1; `pick_drivers` also
        # accepts a single identifier.
        driver_laps = session.laps.pick_drivers(driver).reset_index()

        has_lap_time = driver_laps['LapTime'].notna()
        # A lap counts as a pit lap when either pit timestamp is set.
        no_pit_stop = driver_laps[['PitOutTime', 'PitInTime']].isna().all(axis=1)
        valid_laps = driver_laps[has_lap_time & no_pit_stop]

        # NOTE(review): 'Location' is filled from the event's Country, which
        # may not identify a single circuit if a country hosts several
        # rounds in one season - confirm against the schedule.
        results.extend({
            'Driver': driver,
            'Year': event.year,
            'Location': event.Country,
            'Laptime': lap_time
        } for lap_time in valid_laps['LapTime'])

    return results

def main(years=(2022,), cache_dir='path/to/cache'):
    """Build the lap-time table for Verstappen and Perez.

    Parameters
    ----------
    years : iterable of int, optional
        Seasons to collect. Defaults to 2022 only, as before.
    cache_dir : str, optional
        Directory used for the FastF1 cache. The default is a placeholder;
        point it at a real cache location. It is created when missing
        because ``Cache.enable_cache`` refuses a non-existent directory.

    Returns
    -------
    pandas.DataFrame
        Columns ``Driver``, ``Year``, ``Location`` and ``Laptime``; one row
        per valid lap returned by ``process_event``.
    """
    import os

    os.makedirs(cache_dir, exist_ok=True)
    f1.Cache.enable_cache(cache_dir)  # Enable caching

    drivers = ['VER', 'PER']

    all_results = []
    for year in years:
        events = f1.get_event_schedule(year)
        # Keep only rows whose fifth session is the race.
        race_events = events[events['Session5'] == 'Race']
        for _, event in race_events.iterrows():
            all_results.extend(process_event(event, drivers))

    # Passing the columns explicitly keeps the schema even with no records.
    race_hist = pd.DataFrame(
        all_results, columns=['Driver', 'Year', 'Location', 'Laptime']
    )
    return race_hist

# Build the lap-time table and print it. Inside a notebook kernel `__name__`
# is '__main__', so this runs every time the cell is executed.
if __name__ == '__main__':
    race_hist = main()
    print(race_hist)

In [8]:
# Show the working directory of the kernel; relative paths are resolved
# against it.
import os
os.getcwd()

'/Users/junghoonkim/Desktop/Formula1-Strategy'

In [13]:
# Collect the 2022 race lap times for Verstappen and Perez.
drivers = ['VER', 'PER']
# One dict per valid lap; the DataFrame is built once at the end instead of
# being grown with pd.concat inside the loop.
lap_records = []
Events = f1.get_event_schedule(2022)
for _, Event in Events[Events['Session5'] == 'Race'].iterrows():
    session = f1.core.Session(Event, session_name = 'Race', f1_api_support  = True)
    # Only lap timing is read below, so skip telemetry, weather and messages.
    session.load(laps = True, telemetry = False, weather = False, messages = False)

    for driver in drivers:
        # `pick_driver` is deprecated since FastF1 3.1 in favour of `pick_drivers`.
        sesh_driver = session.laps.pick_drivers(driver).reset_index()
        # Keep laps that have a lap time and are neither pit-out nor pit-in laps.
        is_valid = (
            sesh_driver['LapTime'].notna()
            & sesh_driver[['PitOutTime', 'PitInTime']].isna().all(axis=1)
        )

        lap_records.extend({
            'Driver': driver,
            'Year': 2022,
            'Location': Event['Country'],
            'Laptime': lap_time
        } for lap_time in sesh_driver.loc[is_valid, 'LapTime'])

race_hist = pd.DataFrame(lap_records, columns = ['Driver', 'Year', 'Location', 'Laptime'])

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.3.5]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...


KeyError: 'DriverNumber'

In [3]:
# Create a dataset and collect all the laptimes for Verstappen and Perez
Years = [2022, 2023]
drivers = ['VER', 'PER']
# One dict per valid lap; the DataFrame is built once at the end instead of
# being grown with pd.concat inside the loop.
lap_records = []
for year in Years:
    Events = f1.get_event_schedule(year)
    for _, Event in Events[Events['Session5'] == 'Race'].iterrows():
        session = f1.core.Session(Event, session_name = 'Race', f1_api_support  = True)
        # Only lap timing is read below, so skip telemetry, weather and messages.
        session.load(laps = True, telemetry = False, weather = False, messages = False)

        for driver in drivers:
            # `pick_driver` is deprecated since FastF1 3.1 in favour of `pick_drivers`.
            sesh_driver = session.laps.pick_drivers(driver).reset_index()
            # Keep laps that have a lap time and are neither pit-out nor pit-in laps.
            is_valid = (
                sesh_driver['LapTime'].notna()
                & sesh_driver[['PitOutTime', 'PitInTime']].isna().all(axis=1)
            )

            lap_records.extend({
                'Driver': driver,
                'Year': year,
                'Location': Event['Country'],
                'Laptime': lap_time
            } for lap_time in sesh_driver.loc[is_valid, 'LapTime'])

race_hist = pd.DataFrame(lap_records, columns = ['Driver', 'Year', 'Location', 'Laptime'])

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.3.5]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
Request for URL https://ergast.com/api/f1/2022/1/results.json failed; using cached response
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/requests_cache/session.py", line 290, in _resend
    response.raise_for_status()
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/requests/models.py", line 1021, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 503 Server Error: Backend fetch failed for url: https://ergast.com/api/f1/2022/1/results.json
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cach

KeyboardInterrupt: 

In [None]:
# Preview the first rows of the collected lap-time table.
race_hist.head()

In [None]:
# Convert timedelta objects to total seconds 
# Vectorised and assigned to the column directly: the previous element-wise
# `race_hist['Laptime'].loc[i] = ...` was chained-indexing assignment.
# The guard makes the cell safe to re-run once the column is numeric.
if not pd.api.types.is_numeric_dtype(race_hist['Laptime']):
    race_hist['Laptime'] = pd.to_timedelta(race_hist['Laptime']).dt.total_seconds()

In [None]:
# Conduct Kolmogorov-Smirnov to determine normality
# NOTE: the normal distribution's mean and standard deviation are estimated
# from the same sample that is tested, so treat the p-values as approximate.
locations = race_hist['Location'].unique()
drivers = race_hist['Driver'].unique()
norm = []
not_norm = []
for drv in drivers:
    for loc in locations:
        for year in [2022, 2023]:
            in_group = (
                (race_hist['Driver'] == drv)
                & (race_hist['Location'] == loc)
                & (race_hist['Year'] == year)
            )
            # Keep sub-second resolution; truncating to whole seconds would
            # create artificial ties in the sample.
            dat = race_hist.loc[in_group, 'Laptime'].astype(float)
            # A normal fit needs at least two laps and a non-zero spread;
            # otherwise the p-value is NaN and the group would be counted
            # as "normal" by falling through to the else branch.
            if len(dat) < 2:
                continue
            mean_val = dat.mean()
            sd_val = dat.std()
            if not sd_val > 0:
                continue
            ks_result = stats.kstest(rvs = dat, cdf = 'norm', args = (mean_val, sd_val))
            if ks_result.pvalue < 0.05:
                not_norm.append((drv, loc, year, ks_result.statistic))
            else:
                norm.append((drv, loc, year))
len(norm) == 0

The Kolmogorov-Smirnov test shows that the datasets are not normally distributed. This can be easily observed by plotting an instance of the dataset as shown below. We will plot the distribution of the lap times Verstappen set during the race in Bahrain in 2022.

In [None]:
# Lap-time distribution for Verstappen, Bahrain 2022.
ver_bahrain_2022 = race_hist.loc[
    (race_hist['Driver'] == 'VER') &
    (race_hist['Location'] == 'Bahrain') &
    (race_hist['Year'] == 2022),
    'Laptime'
].astype(float)  # keep sub-second resolution instead of truncating to int
ax = sns.histplot(data = ver_bahrain_2022)
ax.set(title = 'VER - Bahrain 2022 lap times', xlabel = 'Lap time (s)', ylabel = 'Number of laps');

In [None]:
# Lap-time distribution for Verstappen, Bahrain 2023.
ver_bahrain_2023 = race_hist.loc[
    (race_hist['Driver'] == 'VER') &
    (race_hist['Location'] == 'Bahrain') &
    (race_hist['Year'] == 2023),
    'Laptime'
].astype(float)  # keep sub-second resolution instead of truncating to int
ax = sns.histplot(data = ver_bahrain_2023)
ax.set(title = 'VER - Bahrain 2023 lap times', xlabel = 'Lap time (s)', ylabel = 'Number of laps');

In [None]:
# Lap-time distribution for Perez, Bahrain 2022.
per_bahrain_2022 = race_hist.loc[
    (race_hist['Driver'] == 'PER') &
    (race_hist['Location'] == 'Bahrain') &
    (race_hist['Year'] == 2022),
    'Laptime'
].astype(float)  # keep sub-second resolution instead of truncating to int
ax = sns.histplot(data = per_bahrain_2022)
ax.set(title = 'PER - Bahrain 2022 lap times', xlabel = 'Lap time (s)', ylabel = 'Number of laps');

In [None]:
# Lap-time distribution for Perez, Bahrain 2023.
per_bahrain_2023 = race_hist.loc[
    (race_hist['Driver'] == 'PER') &
    (race_hist['Location'] == 'Bahrain') &
    (race_hist['Year'] == 2023),
    'Laptime'
].astype(float)  # keep sub-second resolution instead of truncating to int
ax = sns.histplot(data = per_bahrain_2023)
ax.set(title = 'PER - Bahrain 2023 lap times', xlabel = 'Lap time (s)', ylabel = 'Number of laps');

It is visibly obvious that the dataset is highly skewed with outliers.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Visualize how the time it took to complete a lap differs 
laptime = race_hist.loc[
    (race_hist['Driver'] == 'VER') &
    (race_hist['Location'] == 'Bahrain') &
    (race_hist['Year'] == 2022),
    'Laptime'
].astype(float).to_numpy()  # keep sub-second resolution instead of truncating to int
# The x axis is the position within the kept laps, not the official lap
# number: race_hist stores no lap number and pit laps were filtered out.
x = np.arange(len(laptime))
fig, ax = plt.subplots()
ax.plot(x, laptime)
ax.set(title = 'VER - Bahrain 2022 lap times in race order',
       xlabel = 'Valid lap index (pit laps excluded)',
       ylabel = 'Lap time (s)');

Plotting the lap times by lap number visually shows how the lap times are highly skewed.

Since the dataset is clearly not normally distributed, we will use a Kruskal-Wallis test to see whether the median lap times of the same driver in the same race are equal across the two years.

In [None]:
# Kruskal-Wallis test per driver and location: 2022 lap times vs 2023 lap times.
# `drivers` and `locations` come from the normality-test cell above.
null_hypo = []  # pairs where equality of the two years is not rejected
alt_hypo = []   # pairs where the two years differ at the 5% level
for drv in drivers:
    for loc in locations:
        dat = race_hist[(race_hist['Driver'] == drv )&
                      (race_hist['Location'] == loc )]
        # Keep sub-second resolution; truncating to whole seconds would
        # create artificial ties between laps.
        dat_2022 = dat.loc[dat['Year'] == 2022, 'Laptime'].astype(float)
        dat_2023 = dat.loc[dat['Year'] == 2023, 'Laptime'].astype(float)
        # Only compare races the driver has laps for in both seasons.
        if dat_2022.empty or dat_2023.empty:
            continue
        if stats.kruskal(dat_2022, dat_2023).pvalue < 0.05:
            alt_hypo.append((drv, loc))
        else:
            null_hypo.append((drv, loc))
# A bare comparison in the middle of a cell is never displayed, so print the
# summary explicitly and leave the list as the cell's output.
print(f'{len(alt_hypo)} of {len(alt_hypo) + len(null_hypo)} driver/location pairs differ between the two years (p < 0.05)')
alt_hypo

The results show that for the vast majority of cases the two distributions are not equal.

# Conclusion

The Kruskal-Wallis test confirms that the two distributions for a driver at the same racetrack across the two years are not equal. In fact they seem to be faster on average. 

Additionally, in 2022, due to the changes in regulations, the Mercedes team suffered heavily from porpoising. This led to the team underperforming compared to usual. This suggests that combining the results from 2022 and 2023 to create a large dataset would not be appropriate. 
Moreover, teams regularly repair and change vehicles between races due to DNFs or performance issues etc. Therefore fitting a model using data where a single time point is defined as a lap seems inappropriate. Instead, our model should use telemetry data to predict what would happen within a single lap. 

The data creation file uses telemetry data to predict the changes in positions within a race. The results would then be used to assist in determining whether a tyre change would be advisable or not. 