In [1]:
import datetime
import os

import fastf1
import numpy as np
import pandas as pd
# FastF1 expects the cache directory to exist before caching is enabled, so
# create it first; otherwise a fresh checkout fails on "Restart & Run All".
os.makedirs('cache', exist_ok=True)
fastf1.Cache.enable_cache('cache')

In [2]:
def find_fastest_laps(laps, cols, index_cols, lap_num):
    """Put each driver's `lap_num` quickest laps side by side in one frame.

    Laps are ordered by 'LapTime' and grouped by 'DriverNumber'; the lap at
    position 0 of each group is the driver's quickest, position 1 the next
    quickest, and so on. Every selected lap gets a 'LapPercent' column: its
    'LapTime' gap to the smallest per-driver best time, divided by that time.

    Parameters
    ----------
    laps : DataFrame with at least 'DriverNumber', 'LapTime' and every name
        in `cols` / `index_cols` (except 'LapPercent', which is computed here).
    cols : list of column names kept for every selected lap. Must contain
        'DriverNumber', because the per-rank tables are merged on it.
    index_cols : list of extra column names kept only from the quickest lap.
    lap_num : number of laps per driver to include (1 = quickest lap only).

    Returns
    -------
    DataFrame with `cols + index_cols` for the quickest lap, followed by the
    `cols` of each further lap, whose overlapping names carry the suffix
    '_<rank>'. Drivers with fewer laps get missing values (outer merge).
    """
    ordered = laps.sort_values('LapTime')

    def lap_at_rank(rank):
        # Row at position `rank` inside every driver's time-ordered group.
        return ordered.groupby('DriverNumber').nth(rank).reset_index()

    def with_gap(frame, reference):
        # Relative gap of each lap to the reference (overall quickest) time.
        frame['LapPercent'] = (frame['LapTime'] - reference) / reference
        return frame

    quickest = lap_at_rank(0)
    reference_time = quickest['LapTime'].min()
    combined = with_gap(quickest, reference_time)[cols + index_cols]

    for rank in range(1, lap_num):
        slower = with_gap(lap_at_rank(rank), reference_time)
        combined = combined.merge(
            slower[cols],
            on=['DriverNumber'],
            how='outer',
            suffixes=('', f'_{rank}'),
        )
    return combined

In [3]:
def load_practice(event, type, year):
    """Build per-driver practice features (two quickest laps per session).

    Parameters
    ----------
    event : event identifier passed to ``fastf1.get_session``.
    type : event format string. For 'conventional' the 'Practice 2' session
        is loaded as well; for any other value the Practice 1 table is reused
        for the '_2' columns so the output keeps the same column layout.
        (The name shadows the builtin but is kept for caller compatibility.)
    year : season passed to ``fastf1.get_session``.

    Returns
    -------
    DataFrame merged on 'DriverNumber', 'Team' and 'Driver', with Practice 1
    columns suffixed '_1' and second-session columns suffixed '_2'. An empty
    DataFrame is returned (with a warning) when building the features fails.

    Notes
    -----
    Fetching and loading Practice 1 happens outside the ``try`` block, so
    errors raised there propagate to the caller.
    """
    import warnings

    session1 = fastf1.get_session(year, event, 'Practice 1')
    session1.load(laps=True)
    try:
        cols = ['DriverNumber', 'Time', 'TyreLife', 'LapPercent', 'Compound', 'LapTime']
        index_cols = ['Team', 'Driver']
        practice1 = find_fastest_laps(session1.laps, cols, index_cols, 2)

        # Default: mirror Practice 1 so the '_2' columns always exist.
        practice2 = practice1
        if type == 'conventional':
            session2 = fastf1.get_session(year, event, 'Practice 2')
            session2.load(laps=True)
            practice2 = find_fastest_laps(session2.laps, cols, index_cols, 2)

        return practice1.merge(
            practice2,
            on=['DriverNumber'] + index_cols,
            how='outer',
            suffixes=('_1', '_2'),
        )
    except Exception as exc:
        # Best effort: skip events whose data cannot be processed, but do not
        # swallow KeyboardInterrupt/SystemExit and do not hide the reason.
        warnings.warn(
            f"load_practice: could not build practice features for {event} {year}: {exc!r}"
        )
        return pd.DataFrame()
   
def load_qualifying(event, year):
    """Return each driver's qualifying position and relative gap to the best time.

    Parameters
    ----------
    event : event identifier passed to ``fastf1.get_session``.
    year : season passed to ``fastf1.get_session``.

    Returns
    -------
    Table with the columns 'DriverNumber', 'Position' and 'LapPercent', where
    'LapPercent' is the gap between the driver's best Q1/Q2/Q3 time and the
    quickest such time of the session, divided by that quickest time. An
    empty DataFrame is returned (with a warning) when the results cannot be
    processed.

    Notes
    -----
    Fetching and loading the session happens outside the ``try`` block, so
    errors raised there propagate to the caller.
    """
    import warnings

    qualifying = fastf1.get_session(year, event, 'Q')
    qualifying.load()
    try:
        results = qualifying.results

        # Row-wise minimum that skips missing segment times. Python's min()
        # compares against NaT as False, so it returned NaT whenever Q1 was
        # missing even if Q2/Q3 held a valid time.
        best_time = results[['Q1', 'Q2', 'Q3']].min(axis=1)
        session_best = best_time.min()

        # Work on a copy so the session's own results table is not modified.
        summary = results[['DriverNumber', 'Position']].copy()
        summary['LapPercent'] = (best_time - session_best) / session_best
        return summary
    except Exception as exc:
        # Best effort: skip events whose results cannot be processed, but do
        # not swallow KeyboardInterrupt/SystemExit and do not hide the reason.
        warnings.warn(
            f"load_qualifying: could not build qualifying features for {event} {year}: {exc!r}"
        )
        return pd.DataFrame()


In [4]:
years = [2020, 2021, 2022, 2023]

# Event-level fields copied from the schedule row onto every driver row.
event_columns = ['RoundNumber', 'Country', 'Location', 'OfficialEventName',
                 'EventDate', 'EventName', 'EventFormat']

# Collect one frame per event and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
event_frames = []
for year in years:
    schedule = fastf1.get_event_schedule(year)
    for index, row in schedule.iterrows():
        # Only these two event formats are used for the training set.
        if row['EventFormat'] not in ['conventional', 'sprint_shootout']:
            continue

        practice_data = load_practice(row['EventName'], row['EventFormat'], year)
        qualifying_data = load_qualifying(row['EventName'], year)

        # Both loaders return an empty, column-less frame when they fail;
        # such events are skipped.
        if 'DriverNumber' not in practice_data.columns or 'DriverNumber' not in qualifying_data.columns:
            continue

        full_data = practice_data.merge(qualifying_data, on='DriverNumber')
        for col in event_columns:
            full_data[col] = row[col]
        event_frames.append(full_data)

# pd.concat raises on an empty list, so fall back to an empty frame.
data = pd.concat(event_frames, axis=0, ignore_index=True) if event_frames else pd.DataFrame()

core           INFO 	Loading data for Austrian Grand Prix - Practice 1 [v2.3.1]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
  for key, value in row.iteritems():
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  

In [5]:
# Display the combined practice/qualifying frame assembled in the previous cell.
data

Unnamed: 0,DriverNumber,Time_1,TyreLife_1,LapPercent_1,Compound_1,LapTime_1,Team,Driver,Time_1_1,TyreLife_1_1,...,LapTime_1_2,Position,LapPercent,RoundNumber,Country,Location,OfficialEventName,EventDate,EventName,EventFormat
0,10,0 days 01:37:09.350000,7.0,0.024500,MEDIUM,0 days 00:01:06.404000,AlphaTauri,GAS,0 days 01:34:35.161000,5.0,...,0 days 00:01:06.449000,12.0,0.021704,1,Austria,Spielberg,FORMULA 1 ROLEX GROSSER PREIS VON ÖSTERREICH 2020,2020-07-05 17:10:00,Austrian Grand Prix,conventional
1,11,0 days 01:08:04.652000,14.0,0.010738,SOFT,0 days 00:01:05.512000,Racing Point,PER,0 days 01:05:34.812000,12.0,...,0 days 00:01:05.192000,6.0,0.014633,1,Austria,Spielberg,FORMULA 1 ROLEX GROSSER PREIS VON ÖSTERREICH 2020,2020-07-05 17:10:00,Austrian Grand Prix,conventional
2,16,0 days 01:17:25.633000,7.0,0.017095,MEDIUM,0 days 00:01:05.924000,Ferrari,LEC,0 days 01:11:10.147000,2.0,...,0 days 00:01:06.075000,7.0,0.015634,1,Austria,Spielberg,FORMULA 1 ROLEX GROSSER PREIS VON ÖSTERREICH 2020,2020-07-05 17:10:00,Austrian Grand Prix,conventional
3,18,0 days 01:02:01.036000,13.0,0.019409,SOFT,0 days 00:01:06.074000,Racing Point,STR,0 days 00:58:17.264000,10.0,...,0 days 00:01:05.308000,9.0,0.016143,1,Austria,Spielberg,FORMULA 1 ROLEX GROSSER PREIS VON ÖSTERREICH 2020,2020-07-05 17:10:00,Austrian Grand Prix,conventional
4,20,0 days 01:23:37.572000,7.0,0.016832,SOFT,0 days 00:01:05.907000,Haas F1 Team,MAG,0 days 01:19:25.446000,4.0,...,0 days 00:01:05.738000,16.0,0.035352,1,Austria,Spielberg,FORMULA 1 ROLEX GROSSER PREIS VON ÖSTERREICH 2020,2020-07-05 17:10:00,Austrian Grand Prix,conventional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1232,81,0 days 00:18:42.463000,2.0,0.035159,MEDIUM,0 days 00:01:21.496000,McLaren,PIA,0 days 00:17:20.967000,1.0,...,0 days 00:01:15.131000,9.0,0.011864,8,Canada,Montréal,FORMULA 1 PIRELLI GRAND PRIX DU CANADA 2023,2023-06-18 16:00:00,Canadian Grand Prix,conventional
1233,10,NaT,,,,NaT,Alpine,GAS,NaT,,...,0 days 00:01:15.064000,17.0,0.052855,8,Canada,Montréal,FORMULA 1 PIRELLI GRAND PRIX DU CANADA 2023,2023-06-18 16:00:00,Canadian Grand Prix,conventional
1234,31,NaT,,,,NaT,Alpine,OCO,NaT,,...,0 days 00:01:15.716000,6.0,0.021353,8,Canada,Montréal,FORMULA 1 PIRELLI GRAND PRIX DU CANADA 2023,2023-06-18 16:00:00,Canadian Grand Prix,conventional
1235,44,NaT,,,,NaT,Mercedes,HAM,NaT,,...,0 days 00:01:14.356000,4.0,0.021607,8,Canada,Montréal,FORMULA 1 PIRELLI GRAND PRIX DU CANADA 2023,2023-06-18 16:00:00,Canadian Grand Prix,conventional


In [6]:

# Name fragments that mark a column for one-hot encoding.
one_hot = ['Compound']

# Pick the columns to encode from the frame as it is right now, in column
# order (a column matching several fragments is encoded once per match).
# NOTE: the generated dummy columns also contain the fragment in their names,
# so running this cell a second time on the same `data` would encode them too.
targets = [col for col in data.columns for name in one_hot if name in col]

encoded = []
for col in targets:
    dummies = pd.get_dummies(data[col])
    # Prefix every category with its source column, e.g. 'Compound_1_SOFT'.
    dummies.columns = col + '_' + dummies.columns
    encoded.append(dummies)

data = pd.concat([data] + encoded, axis = 1)

In [7]:
# List the column names after one-hot encoding to check the added dummy columns.
data.columns

Index(['DriverNumber', 'Time_1', 'TyreLife_1', 'LapPercent_1', 'Compound_1',
       'LapTime_1', 'Team', 'Driver', 'Time_1_1', 'TyreLife_1_1',
       'LapPercent_1_1', 'Compound_1_1', 'LapTime_1_1', 'Time_2', 'TyreLife_2',
       'LapPercent_2', 'Compound_2', 'LapTime_2', 'Time_1_2', 'TyreLife_1_2',
       'LapPercent_1_2', 'Compound_1_2', 'LapTime_1_2', 'Position',
       'LapPercent', 'RoundNumber', 'Country', 'Location', 'OfficialEventName',
       'EventDate', 'EventName', 'EventFormat', 'Compound_1_',
       'Compound_1_HARD', 'Compound_1_INTERMEDIATE', 'Compound_1_MEDIUM',
       'Compound_1_SOFT', 'Compound_1_1_', 'Compound_1_1_HARD',
       'Compound_1_1_INTERMEDIATE', 'Compound_1_1_MEDIUM', 'Compound_1_1_SOFT',
       'Compound_1_1_TEST_UNKNOWN', 'Compound_1_1_WET', 'Compound_2_',
       'Compound_2_HARD', 'Compound_2_INTERMEDIATE', 'Compound_2_MEDIUM',
       'Compound_2_SOFT', 'Compound_2_TEST_UNKNOWN', 'Compound_2_WET',
       'Compound_1_2_', 'Compound_1_2_HARD', 'Compound

In [8]:
# Display the training frame again, now including the one-hot encoded columns.
data

Unnamed: 0,DriverNumber,Time_1,TyreLife_1,LapPercent_1,Compound_1,LapTime_1,Team,Driver,Time_1_1,TyreLife_1_1,...,Compound_2_SOFT,Compound_2_TEST_UNKNOWN,Compound_2_WET,Compound_1_2_,Compound_1_2_HARD,Compound_1_2_INTERMEDIATE,Compound_1_2_MEDIUM,Compound_1_2_SOFT,Compound_1_2_TEST_UNKNOWN,Compound_1_2_WET
0,10,0 days 01:37:09.350000,7.0,0.024500,MEDIUM,0 days 00:01:06.404000,AlphaTauri,GAS,0 days 01:34:35.161000,5.0,...,0,0,0,0,0,0,0,1,0,0
1,11,0 days 01:08:04.652000,14.0,0.010738,SOFT,0 days 00:01:05.512000,Racing Point,PER,0 days 01:05:34.812000,12.0,...,1,0,0,0,0,0,0,1,0,0
2,16,0 days 01:17:25.633000,7.0,0.017095,MEDIUM,0 days 00:01:05.924000,Ferrari,LEC,0 days 01:11:10.147000,2.0,...,1,0,0,0,1,0,0,0,0,0
3,18,0 days 01:02:01.036000,13.0,0.019409,SOFT,0 days 00:01:06.074000,Racing Point,STR,0 days 00:58:17.264000,10.0,...,1,0,0,0,0,0,0,1,0,0
4,20,0 days 01:23:37.572000,7.0,0.016832,SOFT,0 days 00:01:05.907000,Haas F1 Team,MAG,0 days 01:19:25.446000,4.0,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1232,81,0 days 00:18:42.463000,2.0,0.035159,MEDIUM,0 days 00:01:21.496000,McLaren,PIA,0 days 00:17:20.967000,1.0,...,1,0,0,0,0,0,0,1,0,0
1233,10,NaT,,,,NaT,Alpine,GAS,NaT,,...,1,0,0,0,0,0,1,0,0,0
1234,31,NaT,,,,NaT,Alpine,OCO,NaT,,...,1,0,0,0,0,0,1,0,0,0
1235,44,NaT,,,,NaT,Mercedes,HAM,NaT,,...,1,0,0,0,0,0,0,1,0,0


In [9]:
# Persist the encoded training frame to 'train_data.pkl' (relative to the
# current working directory).
data.to_pickle('train_data.pkl')

In [28]:
# Fetch the 2023 event schedule; the prediction event is selected from it below.
sched_2023 = fastf1.get_event_schedule(2023)

In [29]:
# Keep only the schedule row(s) for round 3; the result is still a DataFrame,
# which is why later cells read values with `.iloc[0]`.
pred_event = sched_2023[sched_2023['RoundNumber'] == 3]

In [30]:
# Practice features for the selected event (first matching schedule row).
event_name = pred_event['EventName'].iloc[0]
event_format = pred_event['EventFormat'].iloc[0]
pred_data = load_practice(event_name, event_format, 2023)

# Attach the same event-level fields that the training frame carries.
event_fields = {
    col: pred_event[col].iloc[0]
    for col in ['RoundNumber', 'Country', 'Location', 'OfficialEventName', 'EventDate','EventName', 'EventFormat']
}
pred_data = pred_data.assign(**event_fields)

core           INFO 	Loading data for Australian Grand Prix - Practice 1 [v2.3.1]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df = pd.concat([df, result], sort=False)
  df = pd.concat([df, result], sort=False)
  for key, value in row.iteritems():
  df =

In [31]:
# Name fragments that mark a column for one-hot encoding.
one_hot = ['Compound']

# Decide which columns to encode before the frame is extended (a column that
# matches several fragments is encoded once per match).
# NOTE: only the categories present in `pred_data` produce dummy columns, and
# re-running this cell would also encode the generated dummy columns because
# their names contain the fragment.
to_encode = [col for col in pred_data.columns for name in one_hot if name in col]

dummy_frames = []
for col in to_encode:
    dummies = pd.get_dummies(pred_data[col])
    # Prefix every category with its source column, e.g. 'Compound_1_SOFT'.
    dummies.columns = col + '_' + dummies.columns
    dummy_frames.append(dummies)

pred_data = pd.concat([pred_data] + dummy_frames, axis = 1)

In [32]:
# Persist the encoded prediction frame to 'pred_data.pkl' (relative to the
# current working directory).
pred_data.to_pickle('pred_data.pkl')