In [1]:
import datetime
import os
import warnings

import fastf1
import numpy as np
import pandas as pd
# FastF1 expects the cache directory to exist already, so create it first;
# this keeps "Restart Kernel -> Run All" working on a fresh checkout.
os.makedirs('cache', exist_ok=True)
fastf1.Cache.enable_cache('cache')

In [2]:
def find_fastest_laps(laps, cols, index_cols, lap_num):
    """Collect each driver's ``lap_num`` quickest laps in one wide frame.

    Laps are ordered by ``LapTime`` and, per ``DriverNumber``, the rows at
    positions 0 .. ``lap_num - 1`` of that ordering are picked.  Every picked
    lap gets a ``LapPercent`` column: its gap to the smallest rank-0
    ``LapTime`` across all drivers, expressed as a fraction of that time.

    Parameters
    ----------
    laps : pandas.DataFrame
        Lap table containing ``DriverNumber``, ``LapTime`` and every name
        listed in ``cols``/``index_cols`` other than ``LapPercent``.
    cols : list of str
        Per-lap columns to keep; has to include ``DriverNumber`` (the merge
        key) and may include ``LapPercent``.
    index_cols : list of str
        Extra columns that are taken from the quickest lap only.
    lap_num : int
        Number of laps per driver to include.

    Returns
    -------
    pandas.DataFrame
        One row per driver.  The quickest lap uses the plain column names;
        the lap at rank ``i`` (``i >= 1``) contributes its overlapping
        columns with an ``_i`` suffix and is missing (NaN/NaT) for drivers
        that have no lap at that rank.
    """
    ordered = laps.sort_values('LapTime')

    def laps_at_rank(rank):
        # Row number `rank` of every driver's time-ordered laps.
        return ordered.groupby('DriverNumber').nth(rank).reset_index()

    best = laps_at_rank(0)
    reference_time = best['LapTime'].min()
    best['LapPercent'] = (best['LapTime'] - reference_time) / reference_time
    wide = best[cols + index_cols]

    rank = 1
    while rank < lap_num:
        nth_laps = laps_at_rank(rank)
        nth_laps['LapPercent'] = (nth_laps['LapTime'] - reference_time) / reference_time
        wide = wide.merge(
            nth_laps[cols],
            on=['DriverNumber'],
            how='outer',
            suffixes=('', '_' + str(rank)),
        )
        rank += 1
    return wide

In [3]:
def load_practice(event, type, year):
    """Build per-driver practice features for one event.

    Practice 1 is always loaded.  Practice 2 is loaded only when ``type`` is
    ``'conventional'``; for any other format the Practice 1 table is reused
    as the second table so the output has the same columns either way.

    Parameters
    ----------
    event : str
        Event name passed to ``fastf1.get_session``.
    type : str
        Event format (e.g. ``'conventional'``).  The name shadows the
        ``type`` builtin but is kept so existing callers keep working.
    year : int
        Season passed to ``fastf1.get_session``.

    Returns
    -------
    pandas.DataFrame
        The two fastest-lap tables merged on ``DriverNumber``, ``Team`` and
        ``Driver``; the remaining columns carry ``_1`` (first table) and
        ``_2`` (second table) suffixes.  An empty DataFrame is returned, and
        a warning is emitted, when the session data cannot be processed.
    """
    cols = ['DriverNumber', 'Time', 'TyreLife', 'LapPercent', 'Compound', 'LapTime']
    index_cols = ['Team', 'Driver']

    # Loading Practice 1 happens outside the guarded section: a failure here
    # propagates to the caller.
    first_session = fastf1.get_session(year, event, 'Practice 1')
    first_session.load(laps=True)
    try:
        first_laps = find_fastest_laps(first_session.laps, cols, index_cols, 2)
        if type == 'conventional':
            second_session = fastf1.get_session(year, event, 'Practice 2')
            second_session.load(laps=True)
            second_laps = find_fastest_laps(second_session.laps, cols, index_cols, 2)
        else:
            # No second practice is loaded for other formats: duplicate the
            # first table so the merged schema stays identical.
            second_laps = first_laps

        return first_laps.merge(
            second_laps,
            on=['DriverNumber'] + index_cols,
            how='outer',
            suffixes=('_1', '_2'),
        )
    except Exception as exc:
        # Best effort: callers detect the empty frame and skip the event.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop a long download loop.
        warnings.warn(
            f"load_practice: could not build practice features for {event!r} ({year}): {exc!r}",
            stacklevel=2,
        )
        return pd.DataFrame()
   
def load_qualifying(event, year):
    """Return each driver's qualifying position and gap to the fastest time.

    Parameters
    ----------
    event : str
        Event name passed to ``fastf1.get_session``.
    year : int
        Season passed to ``fastf1.get_session``.

    Returns
    -------
    pandas.DataFrame
        Columns ``DriverNumber``, ``Position`` and ``LapPercent``, where
        ``LapPercent`` is the driver's best time over ``Q1``/``Q2``/``Q3``
        minus the overall best, divided by the overall best.  An empty
        DataFrame is returned, and a warning is emitted, when the results
        cannot be processed.
    """
    qualifying = fastf1.get_session(year, event, 'Q')
    # Loading happens outside the guarded section: a failure here propagates.
    qualifying.load()
    try:
        # Work on a copy so the session's own results table is not modified.
        results = qualifying.results.copy()

        # Row-wise minimum that skips missing segments.  Python's min() is
        # order-dependent with NaT/NaN: a missing Q1 would win every
        # comparison and hide valid Q2/Q3 times.
        results['FastestLap'] = results[['Q1', 'Q2', 'Q3']].min(axis=1)

        fastest_lap_q = results['FastestLap'].min()
        results['LapPercent'] = (results['FastestLap'] - fastest_lap_q) / fastest_lap_q
        return results[['DriverNumber', 'Position', 'LapPercent']]
    except Exception as exc:
        # Best effort: callers detect the empty frame and skip the event.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit through.
        warnings.warn(
            f"load_qualifying: could not build qualifying features for {event!r} ({year}): {exc!r}",
            stacklevel=2,
        )
        return pd.DataFrame()


In [18]:
years = [2020, 2021, 2022, 2023]
event_columns = ['RoundNumber', 'Country', 'Location', 'OfficialEventName',
                 'EventDate', 'EventName', 'EventFormat']

# Per-event frames are collected here and concatenated once at the end.
# Growing `data` with pd.concat inside the loop re-copies every earlier row on
# each iteration and starts from an empty, column-less frame.
event_frames = []

# NOTE(review): this is initialised once, before the season loop, so the first
# usable event of a season is paired with the last usable qualifying of the
# previous season. Events whose format is skipped below neither add rows nor
# update it.
prev_qualifying_data = pd.DataFrame()

for year in years:
    schedule = fastf1.get_event_schedule(year)
    for _index, row in schedule.iterrows():
        if row['EventFormat'] not in ['conventional', 'sprint_shootout']:
            continue

        practice_data = load_practice(row['EventName'], row['EventFormat'], year)
        qualifying_data = load_qualifying(row['EventName'], year)

        # The loaders return an empty frame when a session could not be
        # processed; such events are skipped and do not become the "previous"
        # qualifying either.
        if 'DriverNumber' not in practice_data.columns or 'DriverNumber' not in qualifying_data.columns:
            continue

        if 'DriverNumber' in prev_qualifying_data.columns:
            qualifying_data_full = qualifying_data.merge(
                prev_qualifying_data, on='DriverNumber', suffixes=('_curr', '_prev'))
            full_data = practice_data.merge(qualifying_data_full, on='DriverNumber')
            for col in event_columns:
                full_data[col] = row[col]
            event_frames.append(full_data)
        prev_qualifying_data = qualifying_data

if event_frames:
    data = pd.concat(event_frames, axis=0, ignore_index=True)
else:
    data = pd.DataFrame()

core           INFO 	Loading data for Austrian Grand Prix - Practice 1 [v2.3.1]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
  for key, value in row.iteritems():
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  

In [19]:
# Inspect the column names of the assembled training frame.
data.columns

Index(['DriverNumber', 'Time_1', 'TyreLife_1', 'LapPercent_1', 'Compound_1',
       'LapTime_1', 'Team', 'Driver', 'Time_1_1', 'TyreLife_1_1',
       'LapPercent_1_1', 'Compound_1_1', 'LapTime_1_1', 'Time_2', 'TyreLife_2',
       'LapPercent_2', 'Compound_2', 'LapTime_2', 'Time_1_2', 'TyreLife_1_2',
       'LapPercent_1_2', 'Compound_1_2', 'LapTime_1_2', 'Position_curr',
       'LapPercent_curr', 'Position_prev', 'LapPercent_prev', 'RoundNumber',
       'Country', 'Location', 'OfficialEventName', 'EventDate', 'EventName',
       'EventFormat'],
      dtype='object')

In [20]:

one_hot = ['Compound']

# Choose the source columns before `data` is modified. Numeric/boolean columns
# are skipped: on a re-run of this cell those are the indicator columns created
# earlier (their names also contain "Compound"), and encoding them again would
# append duplicate and nonsensical columns.
encode_cols = [
    col for col in data.columns
    if any(name in col for name in one_hot)
    and not pd.api.types.is_numeric_dtype(data[col])
]

# prefix=col yields "<col>_<value>" names, e.g. "Compound_1_SOFT".
dummy_frames = [pd.get_dummies(data[col], prefix=col) for col in encode_cols]
if dummy_frames:
    dummies = pd.concat(dummy_frames, axis=1)
    # Drop indicators left over from a previous run so the cell is idempotent.
    data = pd.concat([data.drop(columns=dummies.columns, errors='ignore'), dummies], axis=1)

In [24]:
# Spot-check the qualifying gap carried over from the previous event.
data['LapPercent_prev']

0       0.021704
1       0.014633
2       0.015634
3       0.016143
4       0.035352
          ...   
1191    0.010889
1192    0.007527
1193    0.010087
1194    0.007555
1195    0.014584
Name: LapPercent_prev, Length: 1196, dtype: float64

In [25]:
# Persist the assembled frame (with the one-hot columns) to train_data.pkl.
data.to_pickle('train_data.pkl')

In [36]:
# 2023 event schedule; the cells below select events from it by RoundNumber.
sched_2023 = fastf1.get_event_schedule(2023)

In [37]:
# Round 3 is the event to build prediction features for; round 2 is the event
# before it, whose qualifying supplies the "previous qualifying" features.
prev_event = sched_2023[sched_2023['RoundNumber'] == 2]
pred_event = sched_2023[sched_2023['RoundNumber'] == 3]

In [45]:
pred_data = load_practice(pred_event['EventName'].iloc[0], pred_event['EventFormat'].iloc[0], 2023)
prev_data = load_qualifying(prev_event['EventName'].iloc[0], 2023)

# Both loaders return an empty frame when a session could not be processed.
# Fail with a clear message here instead of an opaque KeyError further down.
if 'DriverNumber' not in pred_data.columns or 'DriverNumber' not in prev_data.columns:
    raise ValueError('Practice or previous-qualifying data is unavailable; cannot build pred_data')

# assign() returns a new frame, so no columns are set on the slice returned by
# load_qualifying.
prev_data = prev_data.assign(
    LapPercent_prev=prev_data['LapPercent'],
    Position_prev=prev_data['Position'],
)

# Inner join: drivers without a previous-qualifying row are dropped.
pred_data = pred_data.merge(
    prev_data[['DriverNumber', 'LapPercent_prev', 'Position_prev']],
    on='DriverNumber',
)

# Attach the event metadata of the event being predicted to every row.
for col in ['RoundNumber', 'Country', 'Location', 'OfficialEventName', 'EventDate', 'EventName', 'EventFormat']:
    pred_data[col] = pred_event[col].iloc[0]


core           INFO 	Loading data for Australian Grand Prix - Practice 1 [v2.3.1]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df =

In [46]:
# Inspect the prediction-frame columns before one-hot encoding.
pred_data.columns

Index(['DriverNumber', 'Time_1', 'TyreLife_1', 'LapPercent_1', 'Compound_1',
       'LapTime_1', 'Team', 'Driver', 'Time_1_1', 'TyreLife_1_1',
       'LapPercent_1_1', 'Compound_1_1', 'LapTime_1_1', 'Time_2', 'TyreLife_2',
       'LapPercent_2', 'Compound_2', 'LapTime_2', 'Time_1_2', 'TyreLife_1_2',
       'LapPercent_1_2', 'Compound_1_2', 'LapTime_1_2', 'LapPercent_prev',
       'Position_prev', 'RoundNumber', 'Country', 'Location',
       'OfficialEventName', 'EventDate', 'EventName', 'EventFormat'],
      dtype='object')

In [47]:
one_hot = ['Compound']

# Choose the source columns before `pred_data` is modified. Numeric/boolean
# columns are skipped: on a re-run of this cell those are the indicator columns
# created earlier (their names also contain "Compound"), and encoding them
# again would append duplicate and nonsensical columns.
encode_cols = [
    col for col in pred_data.columns
    if any(name in col for name in one_hot)
    and not pd.api.types.is_numeric_dtype(pred_data[col])
]

# NOTE(review): get_dummies only creates indicators for the values present in
# this frame, so the resulting columns can differ from those of another frame
# encoded the same way; align the columns before feeding both to one model.
dummy_frames = [pd.get_dummies(pred_data[col], prefix=col) for col in encode_cols]
if dummy_frames:
    dummies = pd.concat(dummy_frames, axis=1)
    # Drop indicators left over from a previous run so the cell is idempotent.
    pred_data = pd.concat([pred_data.drop(columns=dummies.columns, errors='ignore'), dummies], axis=1)

In [48]:
# Inspect the prediction-frame columns after one-hot encoding.
pred_data.columns

Index(['DriverNumber', 'Time_1', 'TyreLife_1', 'LapPercent_1', 'Compound_1',
       'LapTime_1', 'Team', 'Driver', 'Time_1_1', 'TyreLife_1_1',
       'LapPercent_1_1', 'Compound_1_1', 'LapTime_1_1', 'Time_2', 'TyreLife_2',
       'LapPercent_2', 'Compound_2', 'LapTime_2', 'Time_1_2', 'TyreLife_1_2',
       'LapPercent_1_2', 'Compound_1_2', 'LapTime_1_2', 'LapPercent_prev',
       'Position_prev', 'RoundNumber', 'Country', 'Location',
       'OfficialEventName', 'EventDate', 'EventName', 'EventFormat',
       'Compound_1_MEDIUM', 'Compound_1_SOFT', 'Compound_1_1_HARD',
       'Compound_1_1_MEDIUM', 'Compound_1_1_SOFT', 'Compound_2_HARD',
       'Compound_2_MEDIUM', 'Compound_2_SOFT', 'Compound_1_2_HARD',
       'Compound_1_2_INTERMEDIATE', 'Compound_1_2_MEDIUM',
       'Compound_1_2_SOFT'],
      dtype='object')

In [49]:
# Persist the prediction frame (with the one-hot columns) to pred_data.pkl.
pred_data.to_pickle('pred_data.pkl')