In [5]:
from matplotlib import pyplot as plt
import fastf1
import fastf1.plotting
import pandas as pd
import tqdm
import os
import numpy as np
import time
import logging
pd.set_option('display.max_columns', None)

In [3]:
# Getting a list of the names of the sessions that are actually races

races = []

# list of years
years = [2018, 2019, 2020, 2021, 2022, 2023]

# for each year, get a schedule
for year in years:
    schedule = fastf1.get_event_schedule(year)
    
    # for each row, save a tuple of the year and the official event name
    for _, event in schedule.iterrows():
        # if eventformat is not conventional, skip
        if event['EventFormat'] == 'conventional':
            races.append((year, event['OfficialEventName']))


In [26]:
# Saving these races into year folders - doesn't need to be run again

# event_names = []

# # tqdm iterate over races
# for i in tqdm.tqdm(range(len(races))):
#     R = races[i]
#     year, event_name = R
#     try:
#         session = fastf1.get_session(year, event_name, 'R')
#         session.load()
#     except:
#         event_names.append(event_name)
    
#     # create directory if it doesn't exist
#     if not os.path.exists(f'Data/{year}'):
#         os.makedirs(f'Data/{year}')
    
#     # save session data to csv
#     session.laps.to_csv(f'Data/{year}/{event_name}.csv', index=False)
# for name in event_names:
#     print("%s\n", name)
# # load session data from csv

# Trying a single session to debug

In [18]:
# Joining the lap and weather data for each session

session = fastf1.get_session(2023, 'Australian Grand Prix', 'R')
session.load()
laps = session.laps.copy()
# get the weather data for each lap
weather_data = session.laps.get_weather_data()
laps = laps.reset_index(drop=True)
weather_data = weather_data.reset_index(drop=True)
max_lap = laps['LapNumber'].max()

# exclude the 'Time' column from weather data when joining
joined = pd.concat([laps, weather_data.loc[:, ~(weather_data.columns == 'Time')]], axis=1)
test = joined.copy()
laps_nopit = test[(test['PitOutTime'].isna()) & (test['PitInTime'].isna())]
laps_norainfall = laps_nopit[laps_nopit['Rainfall'] == False]
laps_norainfall_update = laps_norainfall[(laps_norainfall['LapNumber'] > 3) & (laps_norainfall['LapNumber'] < max_lap)]
laps_norainfall_update

core           INFO 	Loading data for Australian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info


req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '44', '14', '18', '11', '4', '27', '81', '24', '22', '77', '55', '10', '31', '21', '2', '20', '63', '23', '16']


Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,Sector2SessionTime,Sector3SessionTime,SpeedI1,SpeedI2,SpeedFL,SpeedST,IsPersonalBest,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed
3,0 days 01:09:44.824000,VER,1,0 days 00:01:23.391000,4.0,1.0,NaT,NaT,0 days 00:00:28.900000,0 days 00:00:18.326000,0 days 00:00:36.165000,0 days 01:08:50.394000,0 days 01:09:08.720000,0 days 01:09:44.885000,280.0,296.0,288.0,303.0,True,MEDIUM,4.0,True,Red Bull Racing,0 days 01:08:21.433000,2023-04-02 05:09:23.255,1,3.0,False,,False,True,17.8,52.0,1018.5,False,34.4,0,0.8
4,0 days 01:11:07.928000,VER,1,0 days 00:01:23.104000,5.0,1.0,NaT,NaT,0 days 00:00:28.935000,0 days 00:00:18.347000,0 days 00:00:35.822000,0 days 01:10:13.820000,0 days 01:10:32.167000,0 days 01:11:07.989000,273.0,294.0,290.0,286.0,True,MEDIUM,5.0,True,Red Bull Racing,0 days 01:09:44.824000,2023-04-02 05:10:46.646,1,3.0,False,,False,True,17.8,52.0,1018.5,False,34.0,163,1.2
5,0 days 01:12:30.771000,VER,1,0 days 00:01:22.843000,6.0,1.0,NaT,NaT,0 days 00:00:28.986000,0 days 00:00:17.951000,0 days 00:00:35.906000,0 days 01:11:36.975000,0 days 01:11:54.926000,0 days 01:12:30.832000,276.0,322.0,304.0,296.0,True,MEDIUM,6.0,True,Red Bull Racing,0 days 01:11:07.928000,2023-04-02 05:12:09.750,1,3.0,False,,False,True,17.7,52.0,1018.5,False,33.7,157,1.0
6,0 days 01:14:05.841000,VER,1,0 days 00:01:35.070000,7.0,1.0,NaT,NaT,0 days 00:00:28.798000,0 days 00:00:18.011000,0 days 00:00:48.261000,0 days 01:12:59.630000,0 days 01:13:17.641000,0 days 01:14:05.902000,274.0,319.0,165.0,306.0,False,MEDIUM,7.0,True,Red Bull Racing,0 days 01:12:30.771000,2023-04-02 05:13:32.593,124,2.0,False,,False,False,17.8,51.0,1018.5,False,33.7,140,1.1
9,0 days 01:37:16.003000,VER,1,0 days 00:01:27.827000,10.0,2.0,NaT,NaT,0 days 00:00:33.323000,0 days 00:00:18.633000,0 days 00:00:35.871000,0 days 01:36:21.560000,0 days 01:36:40.193000,0 days 01:37:16.064000,276.0,288.0,288.0,242.0,False,HARD,2.0,True,Red Bull Racing,0 days 01:35:48.176000,2023-04-02 05:36:49.998,1,2.0,False,,False,True,18.0,52.0,1018.4,False,32.9,161,1.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,0 days 02:31:53.573000,PIA,81,0 days 00:01:21.525000,49.0,2.0,NaT,NaT,0 days 00:00:28.394000,0 days 00:00:18.467000,0 days 00:00:34.664000,0 days 02:31:00.492000,0 days 02:31:18.959000,0 days 02:31:53.623000,274.0,284.0,287.0,288.0,True,HARD,41.0,True,McLaren,0 days 02:30:32.048000,2023-04-02 06:31:33.870,1,11.0,False,,False,True,17.5,57.0,1018.2,False,29.2,133,1.2
994,0 days 02:33:15.325000,PIA,81,0 days 00:01:21.752000,50.0,2.0,NaT,NaT,0 days 00:00:28.522000,0 days 00:00:18.478000,0 days 00:00:34.752000,0 days 02:32:22.145000,0 days 02:32:40.623000,0 days 02:33:15.375000,276.0,288.0,286.0,287.0,False,HARD,42.0,True,McLaren,0 days 02:31:53.573000,2023-04-02 06:32:55.395,1,11.0,False,,False,True,17.5,56.0,1018.2,False,29.1,106,0.6
995,0 days 02:34:37.062000,PIA,81,0 days 00:01:21.737000,51.0,2.0,NaT,NaT,0 days 00:00:28.411000,0 days 00:00:18.485000,0 days 00:00:34.841000,0 days 02:33:43.786000,0 days 02:34:02.271000,0 days 02:34:37.112000,275.0,284.0,287.0,292.0,False,HARD,43.0,True,McLaren,0 days 02:33:15.325000,2023-04-02 06:34:17.147,1,11.0,False,,False,True,17.5,56.0,1018.2,False,29.0,147,0.8
996,0 days 02:35:58.745000,PIA,81,0 days 00:01:21.683000,52.0,2.0,NaT,NaT,0 days 00:00:28.577000,0 days 00:00:18.388000,0 days 00:00:34.718000,0 days 02:35:05.689000,0 days 02:35:24.077000,0 days 02:35:58.795000,273.0,286.0,287.0,289.0,False,HARD,44.0,True,McLaren,0 days 02:34:37.062000,2023-04-02 06:35:38.884,1,11.0,False,,False,True,17.5,56.0,1018.2,False,28.7,0,0.6


# Features of cleaned laps:

# - Qualifying and practice laps removed
# - The first 3 and last lap was removed 
# - Laps involving pit stops removed
# - Laps with rainfall separated from those without

In [6]:
# New clean function to get dry weather data

def save_session_laps_clean_dry(year, event_name):
    "Function loads the session data, concatenates the lap and weather data, and removes the laps with pit stops and rainfall. Saves the cleaned data into a new folder called Data_cleaned_new_dry"
    try:
        session = fastf1.get_session(year, event_name, 'R')
        session.load()
        laps = session.laps.copy()
        weather_data = session.laps.get_weather_data()
        laps = laps.reset_index(drop=True)
        weather_data = weather_data.reset_index(drop=True)
        max_lap = laps['LapNumber'].max()

        # exclude the 'Time' column from weather data when joining
        
        joined = pd.concat([laps, weather_data.loc[:, ~(weather_data.columns == 'Time')]], axis=1)
        clean_laps = joined.copy()

        # remove laps where pitouttime or pitintime is not NaT (keep undefined laps) & laps where there is rain & laps where the lap number is less than 3 or greater than the total number of laps
        laps_nopit = clean_laps[(clean_laps['PitOutTime'].isna()) & (clean_laps['PitInTime'].isna())]
        laps_norainfall = laps_nopit[laps_nopit['Rainfall'] == False]
        laps_norainfall_update = laps_norainfall[(laps_norainfall['LapNumber'] > 3) & (laps_norainfall['LapNumber'] < max_lap)]

        # create directory if it doesn't exist
        if not os.path.exists(f'Data_cleaned_new_dry/{year}'):
            os.makedirs(f'Data_cleaned_new_dry/{year}', exist_ok=True)

        # save cleaned laps
        laps_norainfall_update .to_csv(f'Data_cleaned_new_dry/{year}/{event_name}.csv', index=False)
    except:
        time.sleep(10)
        print(f"Error loading session for {year} {event_name}. Retrying...")


In [7]:
# New clean function to get wet weather data

def save_session_laps_clean_wet(year, event_name):
    "Function loads the session data, concatenates the lap and weather data, and removes the laps with pit stops and no rainfall. Saves the cleaned data into a new folder called Data_cleaned"
    try:
        session = fastf1.get_session(year, event_name, 'R')
        session.load()
        laps = session.laps.copy()
        weather_data = session.laps.get_weather_data()
        laps = laps.reset_index(drop=True)
        weather_data = weather_data.reset_index(drop=True)
        max_lap = laps['LapNumber'].max()

        # exclude the 'Time' column from weather data when joining
        
        joined = pd.concat([laps, weather_data.loc[:, ~(weather_data.columns == 'Time')]], axis=1)
        clean_laps = joined.copy()

        # remove laps where pitouttime or pitintime is not NaT (keep undefined laps) & laps where there is rain & laps where the lap number is less than 3 or greater than the total number of laps
        laps_nopit = clean_laps[(clean_laps['PitOutTime'].isna()) & (clean_laps['PitInTime'].isna())]
        laps_rainfall = laps_nopit[laps_nopit['Rainfall'] == True]
        laps_rainfall_update = laps_rainfall[(laps_norainfall['LapNumber'] > 3) & (laps_norainfall['LapNumber'] < max_lap)]

        # create directory if it doesn't exist
        if not os.path.exists(f'Data_cleaned_new_wet/{year}'):
            os.makedirs(f'Data_cleaned_new_wet/{year}', exist_ok=True)

        # save cleaned laps
        laps_rainfall_update.to_csv(f'Data_cleaned_new_wet/{year}/{event_name}.csv', index=False)
    except:
        time.sleep(10)
        print(f"Error loading session for {year} {event_name}. Retrying...")

In [8]:
# Implementing the new function to get dry weather data

logging.getLogger('fastf1').setLevel(logging.CRITICAL) # Removes the annoying lines

races = []
years = [2018, 2019, 2020, 2021, 2022, 2023]


for year in years:
    # get the list of files in the Data folder for the year
    files = os.listdir(f'Data/{year}')
    # for each file, load the session and join the lap and weather data
    for file in files:
        event_name = file.split('.')[0]
        races.append((year, event_name))

for r in tqdm.tqdm(races[87:]):
    save_session_laps_clean_dry(r[0], r[1]) 

100%|██████████| 26/26 [11:29<00:00, 26.51s/it]


In [14]:
## fastf1.req.RateLimitExceededError: any API: 500 calls/h

In [10]:
# Implementing the new function to get wet weather data

logging.getLogger('fastf1').setLevel(logging.CRITICAL) # Removes the annoying lines

races = []
years = [2018, 2019, 2020, 2021, 2022, 2023]


for year in years:
    # get the list of files in the Data folder for the year
    files = os.listdir(f'Data/{year}')
    # for each file, load the session and join the lap and weather data
    for file in files:
        event_name = file.split('.')[0]
        races.append((year, event_name))

for r in tqdm.tqdm(races):
    save_session_laps_clean_wet(r[0], r[1]) 

  1%|          | 1/113 [00:15<29:08, 15.62s/it]

Error loading session for 2018 FORMULA 1 GRAN PREMIO HEINEKEN D'ITALIA 2018. Retrying...


  2%|▏         | 2/113 [00:30<27:47, 15.02s/it]

Error loading session for 2018 FORMULA 1 2018 GULF AIR BAHRAIN GRAND PRIX. Retrying...


  3%|▎         | 3/113 [00:45<27:50, 15.18s/it]

Error loading session for 2018 FORMULA 1 ROLEX MAGYAR NAGYDÍJ 2018. Retrying...


  3%|▎         | 3/113 [00:57<35:01, 19.10s/it]


KeyboardInterrupt: 

In [3]:
races

NameError: name 'races' is not defined

In [2]:
# To find the index of the race in case it breaks and I don't want to start from the start

# find the index of '2021 FORMULA 1 GRAND PRIX DE MONACO 2021'
for i,r in enumerate(races):
    if r[1] == 'FORMULA 1 ROLEX BELGIAN GRAND PRIX 2021':
        print(i)
        break

NameError: name 'races' is not defined