In [1]:
import pandas as pd
import numpy as np
import fastf1
import datetime
fastf1.Cache.enable_cache('cache') 

In [2]:
def find_fastest_laps(laps, cols, index_cols, lap_num):
    
    laps = laps.sort_values('LapTime')
    fastest = laps.groupby('DriverNumber').nth(0).reset_index()

    fastest_time = {}
    added_cols = []
    
    
    for col in cols:
        fastest_time[col] = laps[col].min()
        
        added_cols.append(col[:-4] + 'Percent')
        added_cols.append(col[:-4] + 'Position')
        # fastest[col[:-4] + 'Percent'] = (fastest[col] - fastest_time) / fastest_time
        # total_cols.append(col[:-4] + 'Percent')
    
    # dummies = pd.get_dummies(fastest['Compound'])
    # #dummies.columns = dummies.columns + '_1'
    # fastest = pd.concat([fastest, dummies], axis = 1)
    
    fastest = fastest[index_cols]
    

    # top_cols = ['Sector1Time', 'Sector2Time', 'Sector3Time']
    # fastest = fastest.merge(laps.groupby('DriverNumber').min()[top_cols].reset_index(), on=['DriverNumber'], how = 'outer')
    for i in range(0,lap_num):
        laps_i = laps.groupby('DriverNumber').nth(i).reset_index()

        for col in ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']:
            laps_i[col[:-4] + 'Percent'] = (laps_i[col] - fastest_time[col]) / fastest_time[col]
            laps_i[col[:-4] + 'Position'] = laps_i.sort_values(col).index
        # laps_i['LapPercent'] = (laps_i['LapTime'] - fastest_lap) / fastest_lap
        fastest = fastest.merge(laps_i[index_cols + cols + added_cols], on=index_cols, how = 'outer', suffixes=('', '_' + str(i)))
    return fastest

In [3]:
def load_practice(event, type, year):
    
    practice1 = fastf1.get_session(year, event, 'Practice 1')
    practice1.load(laps=True)
    try:
        laps1 = practice1.laps
        cols = ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']
        index_cols = ['DriverNumber', 'Team', 'Driver']
        practice1 = find_fastest_laps(laps1, cols, index_cols, 2)
        practice2 = practice1
        if type == 'conventional':
            practice2 = fastf1.get_session(year, event, 'Practice 2')
            practice2.load(laps=True)
            laps2 = practice2.laps
            practice2 = find_fastest_laps(laps2, cols, index_cols, 2)

        practice_data = practice1.merge(practice2, on= index_cols, how = 'outer', suffixes=('_1', '_2'))
        return practice_data
    except:
         return pd.DataFrame()
   
def load_qualifying(event, year):
        qualifying = fastf1.get_session(year, event, 'Q')
        qualifying.load()
        try:
            results = qualifying.results

            results['FastestLap'] = results[['Q1', 'Q2', 'Q3']].apply(lambda x: min(x), axis = 1)

            fastest_lap_q = results['FastestLap'].min()
            results['LapPercent'] = (results['FastestLap'] - fastest_lap_q)/ fastest_lap_q
            results['Team'] = results['TeamName']
            return results[['DriverNumber', 'Position', 'LapPercent', 'Team']]
        except:
             return pd.DataFrame()


In [4]:
years = [2018, 2019,2020, 2021, 2022, 2023]
# years = [2020, 2021, 2022]

data = pd.DataFrame()
#prev_qualifying_data = pd.DataFrame()
for year in years:
    schedule = fastf1.get_event_schedule(year)
    for index, row in schedule.iterrows():
        if row['EventFormat'] in ['conventional']:
            practice_data = load_practice(row['EventName'], row['EventFormat'], year)
            qualifying_data = load_qualifying(row['EventName'], year)
            if 'DriverNumber' in practice_data.columns and 'DriverNumber' in qualifying_data.columns:
                #if 'DriverNumber' in prev_qualifying_data.columns:
                    # qualifying_data_full = qualifying_data.merge(prev_qualifying_data, on = 'DriverNumber', suffixes=('_curr', '_prev'))
                full_data = practice_data.merge(qualifying_data, on=['DriverNumber', 'Team'])
                for col in ['RoundNumber', 'Country', 'Location', 'OfficialEventName', 'EventDate','EventName', 'EventFormat']:
                    full_data[col] = row[col]
                data = pd.concat([data, full_data], axis = 0, ignore_index=True)
                # prev_qualifying_data = qualifying_data

core           INFO 	Loading data for Australian Grand Prix - Practice 1 [v2.3.1]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
api            INFO 	Using cached data for car_data
api       

In [18]:
data = pd.DataFrame(data)
prev_range = 5

In [19]:
team_cols = []
individual_cols = []


for i in range(prev_range):
    data['PrevLapPercent' + str(i+1)] = data.groupby('DriverNumber')['LapPercent'].shift(i+1)
    individual_cols.append('PrevLapPercent' + str(i+1))
    # data['PrevPosition' + str(i+1)] = data.groupby('DriverNumber')['Position'].shift(i+1)
    prev = data.groupby(['Team', 'OfficialEventName', 'EventDate'])[['Position', 'LapPercent']].mean().reset_index().sort_values('EventDate').groupby('Team').shift(1)
    prev.index = data.groupby(['Team', 'OfficialEventName', 'EventDate'])[['Position', 'LapPercent']].mean().reset_index(2).sort_values('EventDate').index
    prev = prev[['Position', 'LapPercent']].reset_index()
    data = data.merge(prev, on = ['OfficialEventName', 'Team'], suffixes=('', 'PrevTeam' + str(i+1)))
    team_cols.append('LapPercentPrevTeam' + str(i+1))
    #team_cols.append('PositionPrevTeam' + str(i+1))

In [20]:
def cap_outliers(row):
    median = np.median(row)
    return np.clip(row, median-0.02, median+0.02)

In [21]:
data[team_cols] = data[team_cols].apply(cap_outliers, axis=1)
data[individual_cols] = data[individual_cols].apply(cap_outliers, axis=1)

In [26]:
data

Unnamed: 0,DriverNumber,Team,Driver,LapTime_1,Sector1Time_1,Sector2Time_1,Sector3Time_1,LapPercent_1,LapPosition_1,Sector1Percent_1,...,PositionPrevTeam1,LapPercentPrevTeam1,PositionPrevTeam2,LapPercentPrevTeam2,PositionPrevTeam3,LapPercentPrevTeam3,PositionPrevTeam4,LapPercentPrevTeam4,PositionPrevTeam5,LapPercentPrevTeam5
0,10,Toro Rosso,GAS,0 days 00:01:26.494000,0 days 00:00:28.639000,0 days 00:00:23.343000,0 days 00:00:34.512000,0.029372,13.0,0.034683,...,,,,,,,,,,
1,28,Toro Rosso,HAR,0 days 00:01:27.745000,0 days 00:00:28.951000,0 days 00:00:23.484000,0 days 00:00:35.310000,0.044260,15.0,0.045955,...,,,,,,,,,,
2,11,Force India,PER,0 days 00:01:26.767000,0 days 00:00:28.731000,0 days 00:00:23.505000,0 days 00:00:34.531000,0.032621,17.0,0.038007,...,,,,,,,,,,
3,31,Force India,OCO,0 days 00:01:26.605000,0 days 00:00:28.672000,0 days 00:00:23.527000,0 days 00:00:34.406000,0.030693,0.0,0.035876,...,,,,,,,,,,
4,14,McLaren,ALO,0 days 00:01:25.896000,0 days 00:00:28.485000,0 days 00:00:23.312000,0 days 00:00:34.099000,0.022255,11.0,0.029120,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2069,77,Alfa Romeo,BOT,0 days 00:01:41.032000,0 days 00:00:34.430000,0 days 00:00:38.090000,0 days 00:00:28.512000,0.022643,2.0,0.195237,...,16.5,0.037367,16.5,0.037367,16.5,0.037367,16.5,0.037367,16.5,0.037367
2070,4,McLaren,NOR,0 days 00:01:40.277000,0 days 00:00:34.105000,0 days 00:00:37.892000,0 days 00:00:28.280000,0.015001,12.0,0.183955,...,2.5,0.003534,2.5,0.003534,2.5,0.003534,2.5,0.003534,2.5,0.003534
2071,81,McLaren,PIA,0 days 00:01:39.154000,0 days 00:00:33.445000,0 days 00:00:37.459000,0 days 00:00:28.250000,0.003634,15.0,0.161043,...,2.5,0.003534,2.5,0.003534,2.5,0.003534,2.5,0.003534,2.5,0.003534
2072,44,Mercedes,HAM,NaT,NaT,0 days 00:00:40.339000,NaT,,13.0,,...,6.5,0.005339,6.5,0.005339,6.5,0.005339,6.5,0.005339,6.5,0.005339


In [28]:
data.to_pickle('train_data.pkl')

In [23]:
sched_2023 = fastf1.get_event_schedule(2023)
round_number = 10

In [14]:
pred_event = sched_2023[sched_2023['RoundNumber'] == round_number]

In [15]:
pred_data = load_practice(pred_event['EventName'].iloc[0], pred_event['EventFormat'].iloc[0], 2023)
for col in ['RoundNumber', 'Country', 'Location', 'OfficialEventName', 'EventDate','EventName', 'EventFormat']:
    pred_data[col] = pred_event[col].iloc[0]
i = 0
while i < prev_range:
    prev_event = sched_2023[sched_2023['RoundNumber'] == round_number - i - 1]
    prev_data = load_qualifying(prev_event['EventName'].iloc[0], 2023)
    if prev_data.shape[0] != 0:
        prev_data['PrevLapPercent' + str(i+1)] = prev_data['LapPercent']
        prev_data['PrevPosition' + str(i+1)] = prev_data['Position']
        prev_data = prev_data.merge(prev_data.groupby('Team')['LapPercent', 'Position'].mean().reset_index(), on=['Team'], suffixes=('', 'PrevTeam' + str(i+1)))

        pred_data = pred_data.merge(prev_data[['DriverNumber', 'PrevLapPercent' + str(i+1),'PrevPosition' + str(i+1), 'LapPercentPrevTeam' + str(i+1), 'PositionPrevTeam' + str(i+1)]], on = 'DriverNumber')
        i += 1

core           INFO 	Loading data for British Grand Prix - Practice 1 [v2.3.1]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=Fals

In [16]:
pred_data

Unnamed: 0,DriverNumber,Team,Driver,LapTime_1,Sector1Time_1,Sector2Time_1,Sector3Time_1,LapPercent_1,LapPosition_1,Sector1Percent_1,...,LapPercentPrevTeam3,PositionPrevTeam3,PrevLapPercent4,PrevPosition4,LapPercentPrevTeam4,PositionPrevTeam4,PrevLapPercent5,PrevPosition5,LapPercentPrevTeam5,PositionPrevTeam5
0,1,Red Bull Racing,VER,0 days 00:01:28.600000,0 days 00:00:28.147000,0 days 00:00:35.994000,0 days 00:00:24.459000,0.0,0,0.003208,...,0.007347,6.0,0.0,1.0,0.01741,10.5,0.0,9.0,0.000156,5.0
1,10,Alpine,GAS,0 days 00:01:29.828000,0 days 00:00:28.380000,0 days 00:00:36.317000,0 days 00:00:25.131000,0.01386,2,0.011512,...,0.008807,5.5,0.007959,7.0,0.005297,5.5,0.009192,5.0,0.008224,6.5
2,11,Red Bull Racing,PER,0 days 00:01:29.048000,0 days 00:00:28.150000,0 days 00:00:36.257000,0 days 00:00:24.641000,0.005056,10,0.003315,...,0.007347,6.0,0.034821,20.0,0.01741,10.5,0.000311,1.0,0.000156,5.0
3,14,Aston Martin,ALO,0 days 00:01:29.268000,0 days 00:00:28.199000,0 days 00:00:36.282000,0 days 00:00:24.787000,0.00754,3,0.005061,...,0.01071,7.5,0.001177,2.0,0.009402,8.0,0.00326,2.0,0.011202,10.0
4,16,Ferrari,LEC,0 days 00:01:29.280000,0 days 00:00:28.255000,0 days 00:00:36.240000,0 days 00:00:24.785000,0.007675,4,0.007057,...,0.015698,10.5,0.001485,3.0,0.002599,4.0,0.001728,7.0,0.002788,5.0
5,18,Aston Martin,STR,0 days 00:01:29.471000,0 days 00:00:28.183000,0 days 00:00:36.240000,0 days 00:00:25.048000,0.009831,13,0.004491,...,0.01071,7.5,0.017628,14.0,0.009402,8.0,0.019144,18.0,0.011202,10.0
6,2,Williams,SAR,0 days 00:01:30.124000,0 days 00:00:28.337000,0 days 00:00:36.738000,0 days 00:00:25.049000,0.017201,16,0.00998,...,0.029181,19.0,0.024494,16.0,0.020388,14.5,0.020308,20.0,0.015804,15.5
7,20,Haas F1 Team,MAG,0 days 00:01:30.385000,0 days 00:00:28.525000,0 days 00:00:36.837000,0 days 00:00:25.023000,0.020147,14,0.01668,...,0.018866,12.5,0.026694,17.0,0.026757,17.5,0.009895,4.0,0.011219,8.0
8,21,AlphaTauri,DEV,0 days 00:01:29.691000,0 days 00:00:28.239000,0 days 00:00:36.518000,0 days 00:00:24.934000,0.012314,5,0.006487,...,0.020056,14.5,0.014895,12.0,0.012471,10.5,0.017405,15.0,0.018004,16.0
9,22,AlphaTauri,TSU,0 days 00:01:30.092000,0 days 00:00:28.432000,0 days 00:00:36.609000,0 days 00:00:25.051000,0.01684,19,0.013366,...,0.020056,14.5,0.010047,9.0,0.012471,10.5,0.018603,17.0,0.018004,16.0


In [17]:
pred_data.to_pickle('pred_data.pkl')