<a href="https://colab.research.google.com/github/kennethajensen/FormulaOne/blob/main/F1_Download_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
from urllib.request import urlopen
import json
import pandas as pd
import numpy as np
import time
import os
from google.colab import drive

# Mount Google Drive inside the Colab filesystem so that files below
# /content/drive can be read and written by the cells that follow.
drive.mount('/content/drive')
# Root of the OpenF1 REST API; get_data() appends the endpoint and query to it.
base_url = 'https://api.openf1.org/v1/'
# Drive folder that every CSV produced by this notebook is written to.
data_path = '/content/drive/MyDrive/Data Science/[02] Articles - Formula 1 [Work-in-progress]/Data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
from urllib.error import HTTPError

def get_data(endpoint, filter='', max_retries=5, initial_retry_delay=2):
  """Fetch one API endpoint and return its JSON payload as a DataFrame.

  The request URL is the module-level ``base_url`` followed by ``endpoint``
  and, when ``filter`` is non-empty, ``'?' + filter``. HTTP 429 (rate limit)
  responses are retried with exponential back-off.

  Args:
    endpoint: Path appended to ``base_url`` (for example ``'laps'``).
    filter: Optional query string, used verbatim after ``'?'`` (for example
      ``'session_key=9689'``). The name shadows the built-in ``filter`` but
      is kept so existing keyword callers continue to work.
    max_retries: Number of retries after the first attempt, so at most
      ``max_retries + 1`` requests are sent.
    initial_retry_delay: Seconds to wait before the first retry; the wait
      doubles after every further 429 response.

  Returns:
    A ``pandas.DataFrame`` built from the decoded JSON body.

  Raises:
    HTTPError: Immediately, for any HTTP status other than 429.
    RuntimeError: When the endpoint is still rate limited after the final
      attempt. ``RuntimeError`` is a subclass of ``Exception``, so callers
      that catch ``Exception`` are unaffected.
  """
  url = base_url + endpoint
  # Label used in messages; avoids a dangling '?' when there is no filter.
  target = endpoint
  if filter:
    url += '?' + filter
    target = f"{endpoint}?{filter}"

  retry_delay = initial_retry_delay
  for attempt in range(max_retries + 1):
    try:
      # The context manager closes the HTTP response even if decoding fails.
      with urlopen(url) as response:
        data = json.loads(response.read().decode('utf-8'))
      return pd.DataFrame(data)
    except HTTPError as e:
      if e.code != 429:
        raise # Re-raise other HTTP errors immediately
      if attempt == max_retries:
        # Out of retries: do not announce or sleep for a retry that will
        # never be made.
        break
      print(f"Rate limit hit for {target}. Retrying in {retry_delay} seconds (Retry {attempt+1}/{max_retries})...")
      time.sleep(retry_delay)
      retry_delay *= 2 # Exponential backoff
    except Exception as e:
      print(f"An unexpected error occurred: {e}")
      raise # Re-raise other unexpected errors

  raise RuntimeError(f"Failed to retrieve data for {target} after {max_retries} retries.")

First, get the complete data set of all **meetings**. Each meeting is either a testing or a racing event and covers multiple sessions and days.

In [5]:
# Download the full meetings table and store it alongside the other CSV files.
meetings = get_data('meetings')
meetings_csv_path = os.path.join(data_path, 'meetings.csv')
meetings.to_csv(meetings_csv_path, index=False)

Second, get all of the **qualifying sessions** for the 2025 season. This includes both the sprint qualifying and the qualifying for the feature race.



In [14]:
# Request the 2025 qualifying sessions and keep a copy on disk.
qualifying_filter = 'session_type=Qualifying&year=2025'
sessions = get_data('sessions', qualifying_filter)
qualifying_csv_path = os.path.join(data_path, 'qualifying_sessions.csv')
sessions.to_csv(qualifying_csv_path, index=False)

# session_key is expected to be unique per row already; unique() makes sure
# each session is only listed once.
all_session_keys = sessions['session_key'].unique()
session_count = len(all_session_keys)
print(f"Total number of sessions: {session_count}")


Total number of sessions: 30


In [38]:
laps_path = os.path.join(data_path, 'qualifying_laps.csv')

# Remember whether the CSV existed before this run: the save step uses this
# flag to choose between appending (no header) and creating (with header).
laps_csv_file_exists = os.path.exists(laps_path)

if laps_csv_file_exists:
    # Sessions already present in the CSV do not have to be requested again.
    laps = pd.read_csv(laps_path)
    retrieved_session_keys = laps['session_key'].unique()
else:
    laps = []
    # Zero-length array: len() reports 0 and the set difference below keeps
    # every session key. numpy is only imported as `np`, so the function has
    # to be referenced through that name.
    retrieved_session_keys = np.empty(0, dtype=int)

# Sorted so the sessions are always fetched (and later written) in the same
# order; a bare set has no defined ordering.
unretrieved_session_keys = sorted(set(all_session_keys) - set(retrieved_session_keys))


print(f"Number of sessions already retrieved: {len(retrieved_session_keys)}")
print(f"Number of sessions to be retrieved: {len(unretrieved_session_keys)}")
# The lap data from the qualifying in Baku in 2025 is missing from OpenF1.org

Number of sessions already retrieved: 0
Number of sessions to be retrieved: 30


In [39]:
# Reset the accumulator; one DataFrame per session is appended below.
laps = []

for session_key in unretrieved_session_keys:
    laps.append(get_data('laps', f'session_key={session_key}'))
    # The API is rate limited, so wait a moment before the next request.
    time.sleep(1)

print(f"Fetched lap data for {len(laps)} sessions.")

Fetched lap data for 30 sessions.


In [40]:
if isinstance(laps, list) and laps:
    # `laps` is a list holding one DataFrame per session; collapse it into a
    # single DataFrame.
    laps = pd.concat(laps, ignore_index=True)

    if laps_csv_file_exists:
        # Append without header if file exists
        laps.to_csv(laps_path, mode='a', header=False, index=False)
        print(f"Appended {len(laps)} new laps to {laps_path}")
    else:
        # Write with header if file does not exist
        laps.to_csv(laps_path, mode='w', header=True, index=False)
        print(f"Created and wrote {len(laps)} laps to new file {laps_path}")
elif isinstance(laps, pd.DataFrame):
    # `laps` is already a DataFrame, e.g. because this cell was run before.
    # Testing a DataFrame with a bare `if` raises ValueError, and writing it
    # again would duplicate rows in the CSV, so nothing is written.
    print("Lap data is already a single DataFrame; nothing new was written.")
else:
    # Nothing was fetched. Leave `laps` as an empty DataFrame so that later
    # cells can rely on DataFrame attributes such as `.empty`.
    laps = pd.DataFrame()
    print("No new lap data to append or save.")

Created and wrote 8498 laps to new file /content/drive/MyDrive/Data Science/[02] Articles - Formula 1 [Work-in-progress]/Data/qualifying_laps.csv


## Get Fastest Lap per Driver per Session

To find the fastest lap for each driver in each session, we need to:
1. Ensure the `laps` DataFrame contains data. The list of per-session DataFrames must already have been concatenated into a single DataFrame by the save step above.
2. Group the DataFrame by `session_key` and `driver_number`.
3. For each group, find the row with the minimum `lap_duration`, ignoring pit-out laps and laps without a valid duration.


In [47]:
# `laps` may still be a plain list when no new data was fetched; a list has no
# `.empty` attribute, so check the type before using it.
if isinstance(laps, pd.DataFrame) and not laps.empty:
    # Every step of the chain returns a new DataFrame, so no column is ever
    # assigned on a filtered slice (which would trigger SettingWithCopyWarning).
    laps = (
        laps
        # Convert is_pit_out_lap to boolean and drop the pit-out laps
        .assign(is_pit_out_lap=lambda df: df['is_pit_out_lap'].astype(bool))
        .loc[lambda df: ~df['is_pit_out_lap']]
        # Convert lap_duration to numeric, coercing errors to NaN
        .assign(lap_duration=lambda df: pd.to_numeric(df['lap_duration'], errors='coerce'))
        # Drop rows where lap_duration is NaN (couldn't be converted)
        .dropna(subset=['lap_duration'])
    )

    # Find the fastest lap for each driver in each session: idxmin() returns
    # the index label of the shortest lap per (session, driver) group.
    fastest_lap_index = laps.groupby(['session_key', 'driver_number'])['lap_duration'].idxmin()
    fastest_laps = laps.loc[fastest_lap_index]

    print(f"Found {len(fastest_laps)} fastest laps.")
    display(fastest_laps.head())
else:
    print("No lap data available to process.")

Found 576 fastest laps.


Unnamed: 0,meeting_key,session_key,driver_number,lap_number,date_start,duration_sector_1,duration_sector_2,duration_sector_3,i1_speed,i2_speed,is_pit_out_lap,lap_duration,segments_sector_1,segments_sector_2,segments_sector_3,st_speed
3222,1254,9689,1,16,2025-03-15T05:58:21.833000+00:00,26.066,16.915,32.5,290.0,330.0,False,75.481,"[2048, 2048, 2049, 2049, 2048, 2048, 2048, 204...","[2048, 2048, 2049, 2051, 2051]","[2049, 2048, 2048, 2048, 2049, 2049, 2048, 2048]",331.0
3220,1254,9689,4,19,2025-03-15T05:58:00.264000+00:00,25.961,16.997,32.138,288.0,325.0,False,75.096,"[2049, 2049, 2051, 2048, 2049, 2051, 2049, 204...","[2048, 2049, 2049, 2049, 2049]","[2049, 2049, 2049, 2048, 2049, 2051, 2048]",326.0
3080,1254,9689,5,7,2025-03-15T05:17:46.514000+00:00,26.495,17.22,32.801,289.0,327.0,False,76.516,"[2048, 2048, 2049, 2048, 2048, 2049, 2049, 204...","[2049, 2049, 2049, 2049, 2049]","[2051, 2048, 2048, 2048, 2049, 2049, 2049, 2048]",325.0
3159,1254,9689,6,11,2025-03-15T05:38:56.443000+00:00,26.345,17.148,32.682,289.0,328.0,False,76.175,"[2049, 2049, 2049, 2049, 2049, 2048, 2049, 204...","[2048, 2049, 2049, 2049, 2049]","[2049, 2048, 2048, 2049, 2048, 2049, 2048, 2048]",329.0
3067,1254,9689,7,8,2025-03-15T05:16:32.512000+00:00,26.381,17.15,32.784,289.0,326.0,False,76.315,"[2048, 2049, 2049, 2049, 2048, 2049, 2049, 204...","[2049, 2048, 2049, 2048, 2049]","[2049, 2049, 2048, 2049, 2049, 2048, 2048, 2048]",325.0
