In [None]:
# Import necessary libraries for the data creation step of the project
import fastf1 as f1
import pandas as pd
import seaborn as sns
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F        

# 1. Data creation and engineering

## 1-1. Data Creation

The primary data source is the FastF1 library (https://github.com/theOehrly/Fast-F1). We use telemetry data from the 2023 season to predict position changes for all drivers. The justification for using 2023 data is given in the data-analysis.ipynb file. 

In [None]:
# Build one wide telemetry table for a single 2023 race: every driver's
# per-lap telemetry is stacked vertically, its columns are prefixed with the
# driver abbreviation, and the drivers are then joined side by side.

# Select all race events in 2023
Events = f1.get_event_schedule(2023)
Events_Race = Events[Events['Session5'] == 'Race']
total_tele = pd.DataFrame()
# Label-based lookup: takes the schedule row whose index label is 1
Event = Events_Race.loc[1, :]

# Load session object
session = f1.core.Session(Event, session_name='Race', f1_api_support=True)
session.load(laps=True, telemetry=True, weather=True, messages=True)

# Load laps and results data
sesh_l = session.laps
sesh_r = session.results

# Attain all the drivers from the lap
drivers = list(sesh_l['Driver'].unique())

for drv in drivers:
    # Filter this driver's laps once instead of re-filtering on every access
    drv_laps = sesh_l.pick_driver(drv)

    # Total number of laps the driver had
    # NOTE(review): this uses the last LapNumber as the row count, so it
    # assumes the driver's laps are numbered 1..N without gaps - confirm.
    total_laps = int(drv_laps.LapNumber.iloc[-1])

    # Collect the per-lap frames and concatenate once after the loop; growing
    # a DataFrame with concat inside the loop copies all earlier rows each time
    lap_frames = []

    for j in range(total_laps):
        lap = drv_laps.iloc[j]

        temp_tele = lap.get_telemetry().add_distance()
        temp_tele['Brake'] = temp_tele['Brake'].astype(int)

        # Adding data from session.Laps
        # Lap: which lap the driver is in
        # Compound: Which compound it's in
        # TyreLife: How long the Tyre has been used
        # TrackStatus: What the track status is
        temp_tele['Lap'] = j + 1
        temp_tele['Compound'] = lap['Compound']
        temp_tele['TyreLife'] = lap['TyreLife']
        temp_tele['TrackStatus'] = lap['TrackStatus']

        lap_frames.append(temp_tele.reset_index(drop=True))

    # Combining the dataset
    total_drv = pd.concat(lap_frames, axis=0)

    # Drop columns we don't need
    total_drv = total_drv.drop(
        columns=['Time', 'Source', 'DriverAhead', 'DistanceToDriverAhead']
    )

    # Add a status column for each telemetry input: every sample is
    # 'Finished' except the driver's very last sample, which is 'Lapped' when
    # the classification status contains '+' and 'DNF' for any other
    # non-'Finished' status
    outcome = sesh_r.loc[sesh_r['Abbreviation'] == drv, 'Status'].values[0]
    total_drv['Status'] = 'Finished'

    if outcome != 'Finished':
        final_status = 'Lapped' if '+' in outcome else 'DNF'
        # Single positional .iloc write; the chained form
        # total_drv['Status'].iloc[-1] = ... may assign to a temporary copy
        # and leave total_drv unchanged
        total_drv.iloc[-1, total_drv.columns.get_loc('Status')] = final_status

    # Set consistency of data for the RB team
    #if drv in ['DEV', 'LAW']:
    #    drv = 'RIC'

    # Prefix every column with the driver abbreviation, e.g. 'VER_Speed'
    total_drv = total_drv.add_prefix(drv + '_')

    # Concatenate all the data from a single race together
    total_tele = pd.concat([total_tele, total_drv.reset_index(drop=True)], axis=1)
    # Re-assigned on every iteration so rows added by a longer driver frame
    # are filled as well; kept inside the loop to preserve the column order
    total_tele['Location'] = Event['RoundNumber']


The data consists of the following columns for each driver:

| Feature | Type | Description |
| --- | --- | --- |
| Date | TimeDelta | The timestamp of when the data was collected |
| SessionTime | TimeDelta | The relative timestamp of the session |
| RPM | int | The RPM of the vehicle |
| Speed | int | The speed of the vehicle |
| nGear | int | The gear status of the vehicle |
| Throttle | int | The % of throttle pressure |
| Brake | Bool | The brake status |
| DRS | Bool | The DRS status |
| RelativeDistance | int | Distance driven since first sample |
| X | int | X position (1/10 m) |
| Y | int | Y position (1/10 m) |
| Z | int | Z position (1/10 m) |
| Status | Cat (str) | Current status of the driver (DNF, Finished etc) |
| TrackStatus | Cat (str) | Flag (Yellow flag, Safety Car, Red Flag, Virtual Safety Car) | 
| Compound | Cat (str)|The Tyre Compound (Soft, Medium, Hard, Intermediate, Wet) |
| PitIn | Bool | Driver pit in status |
| PitOut | Bool | Driver pit out status |
| Distance | int | The total distance driven for the lap |

Here, the categorical and boolean values will be encoded separately as dummy variables. 

In [None]:
# Find the Date and Session Time column with the most input amongst drivers and use that
# SessionTime is kept to merge weather data 
def mx_len(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Collapse all columns whose name contains ``col_name`` into one column.

    Among the columns of ``df`` whose name contains ``col_name`` as a
    substring, the one with the most non-null values is kept and renamed to
    ``col_name``; on a tie the left-most column wins. All other matching
    columns are dropped.

    Args:
        df: Frame holding the candidate columns. It is modified in place
            (columns dropped / renamed).
        col_name: Substring used to find the candidate columns, and the name
            given to the surviving column.

    Returns:
        A frame with ``col_name`` as the first column, followed by the
        remaining columns in their existing order.

    Raises:
        KeyError: If no column containing ``col_name`` has any non-null value.
    """
    candidates = [col for col in df.columns if col_name in col]

    # Count on the frame that was passed in rather than on a module-level
    # global, so the function works for any DataFrame
    counts = df[candidates].count()

    if counts.empty or counts.max() == 0:
        raise KeyError(
            f"no column containing {col_name!r} with non-null values"
        )

    # idxmax returns the first label holding the maximum, i.e. the left-most
    # column wins a tie
    best = counts.idxmax()

    df.drop(columns=[col for col in candidates if col != best], inplace=True)
    df.rename(columns={best: col_name}, inplace=True)

    # Move the surviving column to the front
    df = df[[col_name] + [col for col in df.columns if col != col_name]]
    return df

# Reduce the per-driver timestamp columns to one shared column each,
# first for 'Date' and then for 'SessionTime'
for shared_col in ('Date', 'SessionTime'):
    total_tele = mx_len(total_tele, shared_col)

We will also add the weather data for each timestamp from the FastF1 library. `weather_data` is a time series with the fields described below, all of which will be included in our data.

| Feature | Type | Description |
| --- | --- | --- |
| AirTemp | Int | Temperature |
| Humidity | Int | Humidity |
| Pressure | Int | Air pressure|
| Rainfall | bool | Shows whether there is rainfall |
| TrackTemp | Int | Temperature of the track |
| WindDirection | Int | Direction of the wind |
| WindSpeed | Int | Speed of the wind | 

In [None]:
# Add weather data
# Work on a copy so the conversions below do not alter the frame held by the
# session object
weather_data = session.weather_data.copy()
weather_data['Time'] = pd.to_timedelta(weather_data['Time'])
weather_data['Rainfall'] = weather_data['Rainfall'].astype(int)

# Attach to every telemetry row the weather sample whose 'Time' is nearest to
# that row's 'SessionTime'
# NOTE(review): merge_asof needs both key columns sorted and free of nulls -
# confirm this holds for total_tele['SessionTime'].
total_tele = pd.merge_asof(
    total_tele,
    weather_data,
    left_on='SessionTime',
    right_on='Time',
    direction='nearest',
)

# Drop the weather 'Time' key brought in by the merge; 'SessionTime' is kept
total_tele = total_tele.drop(columns=['Time'])
