In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import fastf1
from fastf1.core import *

In [3]:
''' 
Extracting the Session Data from a Race Weekend (Race and Quali)
'''
def get_session_data(year, track, event_type):
    session_data=fastf1.get_session(year, track, event_type)
    session_data.load(laps=True, telemetry=True, weather=True)
    return session_data

In [4]:
brit_22_R=get_session_data(2022, 'Silverstone', 'R')
spain_22_R=get_session_data(2022, 'Spain', 'R')
monza_22_R=get_session_data(2022, 'Monza', 'R')

core           INFO 	Loading data for British Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['55', '11', '44', '16', '14', '4', '1', '47', '5', '20', '18', '6', '3', '22', '31', '10', '77', '63', '24', '23']
core           INFO 	Loading data for Spanish Grand Prix - Race

In [5]:
def format_Time(td):
    total_sec = td.total_seconds()
    minutes = int(total_sec // 60)
    seconds = int(total_sec % 60)
    milliseconds = int((td.microseconds) / 1000)
    return f"{minutes:02}:{seconds:02}.{milliseconds:03}"

In [6]:
total_sec=0
def format_raceTime(td):
    global total_sec
    time_diff = td.total_seconds()
    if (time_diff == 0):
        return 0
    total_sec+=time_diff
    minutes = int(total_sec // 60)
    seconds = int(total_sec % 60)
    milliseconds = int((td.microseconds) / 1000)
    return f"{minutes:02}:{seconds:02}.{milliseconds:03}"

In [7]:
def preprocess_race_results(race_session):
    race_results = race_session.results
    race_results['Time']=race_results['Time'].fillna(pd.Timedelta(seconds=0))
    race_results['RaceTime'] = race_results['Time'].apply(format_raceTime)
    race_results.drop(columns=['Q1', 'Q2', 'Q3', 'HeadshotUrl', 'FirstName', 'LastName', 'TeamId', 'BroadcastName', 'TeamId', 'Time'], inplace=True)
    race_results.reset_index(drop=True)
    return race_results

In [8]:
brit22_race_results=preprocess_race_results(brit_22_R)
spain22_race_results=preprocess_race_results(spain_22_R)
monza22_race_results=preprocess_race_results(monza_22_R)

In [9]:
brit22_race_results.head()

Unnamed: 0,DriverNumber,Abbreviation,DriverId,TeamName,TeamColor,FullName,CountryCode,Position,ClassifiedPosition,GridPosition,Status,Points,RaceTime
55,55,SAI,sainz,Ferrari,ed1c24,Carlos Sainz,ESP,1.0,1,1.0,Finished,25.0,137:50.311
11,11,PER,perez,Red Bull Racing,1e5bc6,Sergio Perez,MEX,2.0,2,4.0,Finished,18.0,137:54.779
44,44,HAM,hamilton,Mercedes,6cd3bf,Lewis Hamilton,GBR,3.0,3,5.0,Finished,16.0,138:00.225
16,16,LEC,leclerc,Ferrari,ed1c24,Charles Leclerc,MON,4.0,4,3.0,Finished,12.0,138:08.546
14,14,ALO,alonso,Alpine,2293d1,Fernando Alonso,ESP,5.0,5,7.0,Finished,10.0,138:18.571


In [10]:
spain22_race_results.head()

Unnamed: 0,DriverNumber,Abbreviation,DriverId,TeamName,TeamColor,FullName,CountryCode,Position,ClassifiedPosition,GridPosition,Status,Points,RaceTime
1,1,VER,max_verstappen,Red Bull Racing,1e5bc6,Max Verstappen,NED,1.0,1,2.0,Finished,25.0,239:27.475
11,11,PER,perez,Red Bull Racing,1e5bc6,Sergio Perez,MEX,2.0,2,5.0,Finished,19.0,239:41.072
63,63,RUS,russell,Mercedes,6cd3bf,George Russell,GBR,3.0,3,4.0,Finished,15.0,240:13.927
55,55,SAI,sainz,Ferrari,ed1c24,Carlos Sainz,ESP,4.0,4,3.0,Finished,12.0,240:59.208
44,44,HAM,hamilton,Mercedes,6cd3bf,Lewis Hamilton,GBR,5.0,5,6.0,Finished,10.0,241:53.534


In [11]:
monza22_race_results.head()

Unnamed: 0,DriverNumber,Abbreviation,DriverId,TeamName,TeamColor,FullName,CountryCode,Position,ClassifiedPosition,GridPosition,Status,Points,RaceTime
1,1,VER,max_verstappen,Red Bull Racing,1e5bc6,Max Verstappen,NED,1.0,1,7.0,Finished,25.0,325:59.511
16,16,LEC,leclerc,Ferrari,ed1c24,Charles Leclerc,MON,2.0,2,1.0,Finished,18.0,326:02.446
63,63,RUS,russell,Mercedes,6cd3bf,George Russell,GBR,3.0,3,2.0,Finished,15.0,326:05.405
55,55,SAI,sainz,Ferrari,ed1c24,Carlos Sainz,ESP,4.0,4,18.0,Finished,12.0,326:10.061
44,44,HAM,hamilton,Mercedes,6cd3bf,Lewis Hamilton,GBR,5.0,5,19.0,Finished,10.0,326:16.380


In [12]:
def generate_laps_summary(race_session):
    laps_summary_df=race_session.laps
    laps_summary_df = laps_summary_df[(~laps_summary_df['Deleted']) & (laps_summary_df['IsAccurate']) & (laps_summary_df['LapTime'].notnull())]
    summaries = []
    pitstop_map = laps_summary_df.groupby('Driver')['Compound'].nunique().apply(lambda x: max(x - 1, 0)).to_dict()

    for driver in laps_summary_df['Driver'].unique():
        driver_laps = laps_summary_df[laps_summary_df['Driver'] == driver]

        fastest_lap = driver_laps.loc[driver_laps['LapTime'].idxmin()]
        fastest_lap_time = format_Time(fastest_lap['LapTime'])

        total_laps = driver_laps['LapNumber'].nunique()
        pitstops = pitstop_map.get(driver, 0)

        compound_groups = driver_laps.groupby('Compound')

        for compound, group in compound_groups:
            compound_summary = {
                'Driver': driver,
                'Compound': compound,
                'LapsOnCompound': len(group),
                'FastestLapOnCompound': format_Time(group['LapTime'].min()),
                'FastestSector1': format_Time(group['Sector1Time'].min()),
                'FastestSector2': format_Time(group['Sector2Time'].min()),
                'FastestSector3': format_Time(group['Sector3Time'].min()),
                'FastestLapOverall': fastest_lap_time,
                'TotalLaps': total_laps,
                'PitStops': pitstops
            }
            summaries.append(compound_summary)

    return pd.DataFrame(summaries)

In [13]:
brit22_race_laps= generate_laps_summary(brit_22_R)
spain22_race_laps=generate_laps_summary(spain_22_R)
monza22_race_laps=generate_laps_summary(monza_22_R)

In [14]:
brit22_race_laps.head()

Unnamed: 0,Driver,Compound,LapsOnCompound,FastestLapOnCompound,FastestSector1,FastestSector2,FastestSector3,FastestLapOverall,TotalLaps,PitStops
0,VER,HARD,13,01:34.023,00:29.685,00:38.292,00:25.605,01:32.354,39,2
1,VER,MEDIUM,17,01:33.451,00:29.619,00:38.014,00:25.397,01:32.354,39,2
2,VER,SOFT,9,01:32.354,00:29.354,00:37.558,00:25.261,01:32.354,39,2
3,GAS,HARD,8,01:34.614,00:30.022,00:38.633,00:25.863,01:34.614,20,1
4,GAS,SOFT,12,01:35.333,00:30.129,00:38.801,00:26.039,01:34.614,20,1


In [15]:
spain22_race_laps.head()

Unnamed: 0,Driver,Compound,LapsOnCompound,FastestLapOnCompound,FastestSector1,FastestSector2,FastestSector3,FastestLapOverall,TotalLaps,PitStops
0,VER,MEDIUM,33,01:25.456,00:23.900,00:32.120,00:29.325,01:25.456,58,1
1,VER,SOFT,25,01:25.732,00:23.877,00:32.294,00:29.533,01:25.456,58,1
2,PER,MEDIUM,31,01:26.649,00:23.923,00:32.702,00:29.823,01:24.108,58,1
3,PER,SOFT,27,01:24.108,00:23.486,00:31.649,00:28.973,01:24.108,58,1
4,RUS,MEDIUM,34,01:26.839,00:24.008,00:32.783,00:29.767,01:24.636,59,1


In [16]:
monza22_race_laps.head()

Unnamed: 0,Driver,Compound,LapsOnCompound,FastestLapOnCompound,FastestSector1,FastestSector2,FastestSector3,FastestLapOverall,TotalLaps,PitStops
0,VER,MEDIUM,21,01:24.745,00:27.789,00:28.872,00:27.863,01:24.745,41,1
1,VER,SOFT,20,01:25.297,00:27.904,00:29.065,00:28.144,01:24.745,41,1
2,GAS,HARD,27,01:26.718,00:28.053,00:29.663,00:28.734,01:26.718,41,1
3,GAS,MEDIUM,14,01:27.232,00:28.085,00:29.886,00:29.081,01:26.718,41,1
4,PER,HARD,30,01:25.570,00:28.060,00:29.234,00:28.140,01:24.030,38,2


In [17]:
monza22_race_results.head()

Unnamed: 0,DriverNumber,Abbreviation,DriverId,TeamName,TeamColor,FullName,CountryCode,Position,ClassifiedPosition,GridPosition,Status,Points,RaceTime
1,1,VER,max_verstappen,Red Bull Racing,1e5bc6,Max Verstappen,NED,1.0,1,7.0,Finished,25.0,325:59.511
16,16,LEC,leclerc,Ferrari,ed1c24,Charles Leclerc,MON,2.0,2,1.0,Finished,18.0,326:02.446
63,63,RUS,russell,Mercedes,6cd3bf,George Russell,GBR,3.0,3,2.0,Finished,15.0,326:05.405
55,55,SAI,sainz,Ferrari,ed1c24,Carlos Sainz,ESP,4.0,4,18.0,Finished,12.0,326:10.061
44,44,HAM,hamilton,Mercedes,6cd3bf,Lewis Hamilton,GBR,5.0,5,19.0,Finished,10.0,326:16.380


In [28]:
def get_raceContext_json(race_results, race_name, year):
    context = get_race_results_context(race_results)
    return {
        "instruction": f"Summarize the race results of {race_name} {year}",
        "context": context,
        "response": ""
    }

In [31]:
def get_race_results_context(race_results):
    context = []
    for _, row in race_results.iterrows():
        context.append({
            "driver": row["FullName"],
            "abbreviation": row["Abbreviation"],
            "team": row["TeamName"],
            "race_position": row["ClassifiedPosition"],
            "starting_position": int(row["GridPosition"]),
            "points_secured": float(row["Points"])
        })
    return context

In [37]:
monza_results_context=get_raceContext_json(monza22_race_results, 'Italian Grand Prix', 2022)
brit_results_context=get_raceContext_json(brit22_race_results, 'British Grand Prix', 2022)
spain_results_context=get_raceContext_json(spain22_race_results, 'Spanish Grand Prix', 2022)

In [35]:
import os
import json

In [45]:
def save_all_races_to_jsonl(race_data_dict, folder_path="jsonl", filename="f1_race_results.jsonl"):
    os.makedirs(folder_path, exist_ok=True)
    output_path = os.path.join(folder_path, filename)
    with open(output_path, "w", encoding="utf-8") as f:
        for (race_name, year), df in race_data_dict.items():
            entry = get_raceContext_json(df, race_name, year)
            f.write(json.dumps(entry) + "\n")

    print(f"Saved {len(race_data_dict)} races to {output_path}")

In [51]:
race_result_dict = {
    ("Italian Grand Prix", 2022): monza22_race_results,
    ("British Grand Prix", 2022): brit22_race_results,
    ("Spanish Grand Prix", 2022): spain22_race_results,
}

In [50]:
save_all_races_to_jsonl(race_result_dict)

Saved 3 races to jsonl/f1_race_results.jsonl
