In [66]:
# Import necessary libraries for the data creation step of the project
import fastf1 as f1
import pandas as pd
import seaborn as sns
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F        

# 1. Data creation and engineering

## 1-1. Data Creation

The primary data source is from the FastF1 library (https://github.com/theOehrly/Fast-F1). We will be using the telemetry data from the 2023 season to predict change of positions for all drivers. The justification for the use of data from 2023 has been specified in the data-analysis.ipynb file. 

In [10]:
# Custom Turn information used for data creation 
Bah_Turn = pd.Series(['Low', 'Med-High', 'High', 'Med-Low', 'High', 'Med-High', 'High',
                             'Low', 'High', 'Low', 'Med-High', 'High', 'Med-Low', 'Med-Low', 'High'])

Sau_Turn = pd.Series(['Med-Low', 'Low', 'High', 'Med-Low', 'Low', 'Med-High', 'Med-High',
                             'Med-High', 'Med-High', 'Med-High', 'High', 'High', 
                             'Low', 'High', 'High', 'Med-High', 'Med-High', 'High', 
                             'High', 'High', 'High', 'Med-High', 'High', 'Med-High', 'Med-High',
                             'High', 'Low'])

In [67]:
# create our inital data 
from my_modules.data_create import data_importer
dc = data_importer(2023, 1)
total_tele = dc.data_creator(2023, 1, Bah_Turn)

req            INFO 	Using cached data for season_schedule
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.3.5]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '14', '55', '44', '18', '63', '77', '10', '23', '22', '2', '20', '21', '27', '24', '4', '31', '16', '81']
Yo

The data is consisted of the following columns for each driver 

| Feature | Type | Description |
| --- | --- | --- |
| Date | TimeDelta | The timestamp of when the data was collected |
| SessionDate | TimeDelta | The relative timestamp of the session |
| RPM | int | The RPM of the vehicle |
| Speed | int | The speed of the vehicle |
| nGear | int | The gear status of the vehicle |
| Throttle | int | The % of throttle pressure |
| Brake | Bool | The brake status |
| DRS | Bool | The DRS status |
| X | int | X position (1/10 m) |
| Y | int | Y position (1/10 m) |
| Z | int | Z position (1/10 m) |
| Status | Cat (str) | Current status of the driver (DNF, Finished etc) |
| TrackStatus | Cat (str) | Flag (Yellow flag, Safety Car, Red Flag, Virtual Safety Car) | 
| Compound | Cat (str)|The Tyre Compound (Soft, Medium, Hard, Intermediate, Wet) |
| PitIn | Bool | Driver pit in status |
| PitOut | Bool | Driver pit out status |
| Distance | int | The total distance driven for the lap |
| Corner | int| The distance to the nearest turn |
| Angle | Cat (str) | The severity of the turn divided into 4 classes (Low (0-45), Med-Low (45-90), Med-High (90-120), High (120-180)) |

Here the categorical and boolean values will seperately be encoded as Dummy Variables. 

In [68]:
# Find the Date and Session Time column with the most input amongst drivers and use that
# SessionTime is kept to merge weather data 
def mx_len(df, col_name):
    mx_len = 0

    for col in df.columns:
        if col_name in col:
            if total_tele[col].count() > mx_len:

                if mx_len != 0:
                    df.drop(columns = col_name, inplace = True)
                mx_len = max(mx_len, df[col].count())
                df.rename(columns = {col: col_name}, inplace = True)
            else:
                df.drop(columns = col, inplace = True)
    df = df[[col_name] + [col for col in df.columns if col != col_name]]
    return df

total_tele = mx_len(total_tele, 'Date')
total_tele = mx_len(total_tele, 'SessionTime')

We will also add the weather data of the specified timestamp from the FastF1 library. The weather_data is a telemetry data with specified descriptions as show below, all of which will be included into our data.

| Feature | Type | Description |
| --- | --- | --- |
| AirTemp | Int | Temperature |
| Humidity | Int | Humidity |
| Pressure | Int | Air pressure|
| RainFall | bool | Show if there is rainfall |
| TrackTemp | Int | Temperature of the track |
| WindDirection | Int | Direction of the wind |
| WindSpeed | Int | Speed of the wind | 

In [69]:
total_tele = dc.weather_creator(total_tele)

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.3.5]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '14', '55', '44', '18', '63', '77', '10', '23', '22', '2', '20', '21', '27', '24', '4', '31', '16', '81']


## 1-2. Feature Engineering

Here, our existing data will undergo feature engineering for the model (Transformer) of our choice. 

### 1-2-1. Distance 

Re-scale the values of the 'Distance' columns such that they are all within the range (0, 1)

In [70]:
drivers = ['VER','GAS','PER','ALO','LEC','STR','SAR','MAG','DEV','TSU','ALB','ZHO','HUL','OCO','NOR','HAM','SAI','RUS','BOT','PIA']

In [71]:
# Calculate the accumulate distance for each datapoint
max_val = 5412

# Apply the full length of the racetrack to the existing distance value to attain the accumulated sum 
for i in drivers:
    total_tele[i+'_Acc_Distance'] = total_tele[i + '_Distance'] + max_val*(total_tele[i+'_Lap']-1)

In [72]:
# Calculate the distance to the nearest corner and replace the column
for i in drivers:
    total_tele[i + '_Corner'] = ((total_tele[i + '_Corner']) - (total_tele[i + '_Distance']))

In [73]:
# Scale the distance columns 
for i in drivers:
    total_tele[i + '_Distance'] /= max_val
    total_tele[i + '_Distance'] = total_tele[i + '_Distance'] % 1

for i in drivers:
    total_tele[i + '_Acc_Distance'] /= max_val * 57

### 1-2-2. TrackStatus

In [74]:
# Feature Engineer TrackStatus to attain just the crucial part of the track
# Define function to return the largest integer value within the track status input
def return_max(col):
    if isinstance(col, float): return col
    else: 
        col = max(list(col))
    return col

for i in total_tele.columns:
    if 'TrackStatus' in i:
        total_tele[i] = total_tele[i].apply(return_max)

### 1-2-3. Imputation

In [75]:
Events = f1.get_event_schedule(2023)
Events_Race = Events[Events['Session5'] == 'Race']
Event = Events_Race.loc[1, :]
session = f1.core.Session(Event, session_name = 'Race', f1_api_support = True)
session.load(laps = True, telemetry = True, weather = True, messages = True)
sesh_r = session.results
sesh_l = session.laps

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.3.5]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '14', '55', '44', '18', '63', '77', '10', '23', '22', '2', '20', '21', '27', '24', '4', '31', '16', '81']


In [76]:
# Imputation for continuous features
cts = ['RPM', 'Speed', 'nGear', 'Brake', 'Throttle', 'DRS', 'X', 'Y', 'Z', 'TyreLife', 'Distance', 'Corner']
for cols in cts:
    for drv in drivers:
        total_tele[drv + '_' + cols].fillna(0, inplace = True)

# Imputation for the compound column
for drv in drivers:
    total_tele[drv + '_' + 'Compound'].fillna('Done', inplace = True)

# Imputation for the status column
for drv in drivers:
    i = drv + '_' + 'Status'
    final_rec = list(total_tele[total_tele[i].isna() == False][i])[-1]
    total_tele[i].fillna(final_rec, inplace = True)

# Imputing the angle column
for drv in drivers:
    total_tele[drv + '_Angle'].fillna('Done', inplace = True)

# Imputing the Lap and TrackStatus columns
def impute(col, val):
    for drv in drivers:
        if sesh_r[sesh_r['Abbreviation'] == drv]['Status'].values[0] == 'Finished':
            i = drv + '_' + col
            total_tele[i].fillna(val, inplace = True)
            # total_tele[i].fillna(-1, inplace = True)
        else:
            i = drv + '_' + col
            final_rec = list(total_tele[total_tele[i].isna() == False][i])[-1]
            total_tele[i].fillna(final_rec, inplace = True)

# imputing Acc_Distance
def impute_acc_dist():
    for drv in drivers:
        i = drv + '_' + 'Acc_Distance'
        final_rec = list(total_tele[total_tele[i].isna() == False][i])[-1]
        total_tele[i].fillna(final_rec, inplace = True)
            
impute('Lap', 57)
impute('TrackStatus', '9')
impute_acc_dist()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  total_tele[drv + '_' + cols].fillna(0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  total_tele[drv + '_' + 'Compound'].fillna('Done', inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on 

### 1-2-4. Dummy Variables

In [77]:
# Set the categorical columns as dummy variables
cols = []
for drv in drivers:
    col_1 = drv +'_Status'
    col_2 = drv + '_TrackStatus'
    col_3 = drv + '_Compound'
    col_4 = drv + '_nGear'
    col_5 = drv + '_Angle'
    cols.append(col_1)
    cols.append(col_2)
    cols.append(col_3)
    cols.append(col_4)
    cols.append(col_5)
    total_tele[col_1] = pd.Categorical(total_tele[col_1], categories = ['Finished', 'DNF', 'Lapped'])
    total_tele[col_2] = pd.Categorical(total_tele[col_2], categories = ['1', '2', '4', '5', '6', '7', '8', '0'])
    total_tele[col_3] = pd.Categorical(total_tele[col_3], categories = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET', 'DONE'])
    ## the '9' represents the chequered flag for the driver
    total_tele[col_4] = pd.Categorical(total_tele[col_4], categories = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])
    total_tele[col_5] = pd.Categorical(total_tele[col_5], categories = ['Low', 'Med-Low', 'Med-High', 'High', 'Done'])
temp = pd.get_dummies(total_tele[cols], dtype = int)

total_tele.drop(columns = cols, inplace = True)
total_tele = pd.concat([total_tele, temp], axis = 1)

### 1-2-5. Pit_in & Pit_out

In [78]:
for drv in drivers:
    sesh = sesh_l.pick_drivers(drv)
    sesh['PitInTime'] = pd.to_timedelta(sesh['PitInTime'])
    sesh['PitOutTime'] = pd.to_timedelta(sesh['PitOutTime'])
    In_df = sesh[sesh['PitInTime'].isna() == False]['PitInTime'].to_frame()
    Out_df = sesh[sesh['PitOutTime'].isna() == False]['PitOutTime'].to_frame()

    temp = pd.to_timedelta(total_tele['SessionTime'])

    in_temp = pd.merge_asof(In_df, temp, left_on = 'PitInTime', right_on = 'SessionTime', direction = 'nearest').SessionTime
    out_temp = pd.merge_asof(Out_df, temp, left_on = 'PitOutTime', right_on = 'SessionTime', direction = 'nearest').SessionTime

    total_tele[drv + '_Pit'] = 0

    if len(in_temp) == len(out_temp):
        for pit_in, pit_out in zip(in_temp, out_temp):
            total_tele[drv + '_Pit'].loc[(pit_in <= total_tele['SessionTime'])&(total_tele['SessionTime'] <= pit_out)] = 1
    else:
        for i in range(len(out_temp)):
            total_tele[drv + '_Pit'].loc[(in_temp.loc[i]  <= total_tele['SessionTime'])&(total_tele['SessionTime'] <= out_temp.loc[i])] = 1
        total_tele[drv + '_Pit'].loc[(in_temp.loc[len(in_temp)-1] == total_tele['SessionTime'])] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sesh['PitInTime'] = pd.to_timedelta(sesh['PitInTime'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sesh['PitOutTime'] = pd.to_timedelta(sesh['PitOutTime'])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you ar

### 1-2-6. Scaling the Integer Columns

In [79]:
# Attain the max value for the columns: RPM, Speed, Throttle, Distance
def id_max(col_name):
    max_val = int()
    for i in drivers:
        max_val = max(max_val, max(total_tele[i + '_'+ col_name]))
    return max_val 

info = dict()
for i in ['RPM', 'Speed', 'Throttle']:
    info[i] = id_max(i)

In [91]:
info

{'RPM': 13144.0, 'Speed': 335.0, 'Throttle': 104.0}

In [80]:
for drv in drivers:
    for cols in info.keys():
        total_tele[drv + '_' + cols] /= info[cols]

## 1-3. Wrap Up

In [81]:
# Drop SessionTime and Date column as it is no longer needed
total_tele.drop(columns = ['SessionTime', 'Date'], inplace = True)

In [82]:
for i in drivers:
    total_tele.drop(columns = [i + '_RelativeDistance'], inplace = True)

In [83]:
total_tele[(total_tele['VER_Distance'] != 0)]['VER_Distance']

5        0.000005
6        0.000010
7        0.000090
8        0.000243
9        0.000421
           ...   
42489    0.984469
42490    0.984928
42491    0.988708
42492    0.988903
42493    0.988925
Name: VER_Distance, Length: 42433, dtype: float64

In [88]:
total_tele.to_csv('../../Desktop/tot_dat_bah.csv')