Install any packages needed to run the code below.

In [None]:
%pip install -r requirements.txt

1. Set up the import path for the API helper script and test the helper
2. Download the session data for 2024
3. Extract the unique session keys (IDs) for all 2024 sessions

In [None]:
import sys
import time
from pathlib import Path
import pandas as pd

# ── Make the local 'utils' folder importable ─────────────────────────────
# sys.path holds plain strings, so the Path is converted once and that
# string is used for both the membership check and the append.
utils_path = Path.cwd() / "utils"
utils_entry = str(utils_path)
if utils_entry not in sys.path:
    sys.path.append(utils_entry)

from utils.api_handler import fetch_openf1

# ── Step 1: Smoke-test the API helper with a small 2023 request ──────────
# Any failure (in the request itself or while reading the result) is
# reported and then re-raised so the notebook stops here.
try:
    test_sessions = fetch_openf1("sessions", {"year": 2023}, csv=False)
    record_count = len(test_sessions)
    print(f"✅ Test passed. Fetched {record_count} records for 2023.")
except Exception as probe_error:
    print("❌ Test failed. Cannot connect to OpenF1 sessions endpoint.")
    raise probe_error

# ── Step 2: Try full metadata pull, fallback to 2024-only ────────────────
# First attempt: fetch every session without a filter. The result is
# rejected if it is empty or lacks the "year" column that Step 3 filters on.
# Any failure (request error or the validation above) triggers a second,
# narrower request restricted to year=2024.
try:
    print("📥 Attempting full session metadata fetch...")
    sessions = fetch_openf1("sessions", csv=False)
    if sessions.empty or "year" not in sessions.columns:
        raise ValueError("⚠️ Session data malformed or empty.")
    print(f"✅ Retrieved {len(sessions)} total sessions.")
except Exception as e:
    print("❌ Full fetch failed. Trying year=2024 fallback.")
    # Surface the underlying cause; without it the reason for falling back
    # is lost as soon as this handler finishes.
    print(f"   Reason: {type(e).__name__}: {e}")
    try:
        sessions = fetch_openf1("sessions", {"year": 2024}, csv=False)
        print(f"✅ Fallback success: {len(sessions)} sessions for 2024.")
    except Exception:
        print("❌ Fallback fetch failed.")
        # Bare raise keeps the original traceback (and the chained context
        # of the first failure) intact.
        raise

# ── Step 3: Keep 2024 rows that have a session key, then save ─────────────
# Builds two boolean masks (year match first, then non-null key), combines
# them, and writes the surviving rows to data/raw/sessions_2024.csv.
try:
    in_2024 = sessions["year"] == 2024
    has_key = sessions["session_key"].notna()
    sessions_2024 = sessions[in_2024 & has_key]

    output_csv = Path("data/raw/sessions_2024.csv")
    # Create data/raw (and any missing parents) before writing.
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    sessions_2024.to_csv(output_csv, index=False)

    saved_count = len(sessions_2024)
    print(f"✅ Saved {saved_count} 2024 sessions to {output_csv}")
except Exception as filter_error:
    print("❌ Error during 2024 session filtering or saving.")
    raise filter_error

# ── Step 4: Extract session keys from 2024 data ──────────────────────────
# Parses date_start as UTC timestamps, keeps sessions that start within
# calendar year 2024, and writes their unique session keys (one per line,
# no header) to data/raw/session_keys_2024.csv.
try:
    # sessions_2024 is a filtered view of `sessions`; assigning a column onto
    # it in place triggers pandas' SettingWithCopyWarning. .assign() returns a
    # new DataFrame instead, with date_start replaced at its existing position.
    sessions_2024 = sessions_2024.assign(
        date_start=pd.to_datetime(
            sessions_2024["date_start"], errors="coerce", utc=True
        )
    )

    # Half-open interval [2024-01-01, 2025-01-01): an upper bound of
    # "2024-12-31" means midnight and would drop anything starting later that
    # day. Unparseable dates were coerced to NaT and fail both comparisons.
    starts = sessions_2024["date_start"]
    filtered = sessions_2024[(starts >= "2024-01-01") & (starts < "2025-01-01")]

    session_keys_2024 = filtered["session_key"].dropna().unique()

    key_output_path = Path("data/raw/session_keys_2024.csv")
    # Do not rely on an earlier step having created the directory.
    key_output_path.parent.mkdir(parents=True, exist_ok=True)
    pd.Series(session_keys_2024).to_csv(key_output_path, index=False, header=False)
    print(f"✅ Extracted and saved {len(session_keys_2024)} unique session keys to {key_output_path}")
except Exception:
    print("❌ Error during session key extraction or saving.")
    raise


1. Download Data for Top 5 Drivers in 2024
2. Sort them into their respective folders /data/raw/variable_name

In [None]:
# ── Setup output directory ──────────────────────────────────────────────
BASE_DIR = Path("data/raw")
BASE_DIR.mkdir(parents=True, exist_ok=True)

# ── Sleep time between requests ─────────────────────────────────────────
RATE_LIMIT_SLEEP = 2  # seconds between all requests

# ── Define 2024 Drivers and Variables ───────────────────────────────────
# Driver numbers are kept as strings; they are used both as request
# parameters and inside output file names.
drivers = {
    "1": "Verstappen",   # 🠐 Champion number used in 2024
    "4": "Norris",
    "16": "Leclerc",
    "81": "Piastri",
    "55": "Sainz",
}

# Endpoints to download per driver. Kept as an explicit mapping so the list
# can be customised per driver.
variables_by_driver = {
    "1": ["laps", "position", "stints", "weather"],
    "4": ["laps", "position", "stints", "weather"],
    "16": ["laps", "position", "stints", "weather"],
    "81": ["laps", "position", "stints", "weather"],
    "55": ["laps", "position", "stints", "weather"],
}

# ── Load and filter sessions for 2024 only ──────────────────────────────
session_keys = sessions_2024["session_key"].dropna().unique()

# Weather is requested without a driver_number and saved under a
# driver-independent file name, so one successful request per session is
# enough. Sessions already handled are remembered here so the remaining
# drivers do not re-download (and overwrite) the same file.
weather_done = set()

# ── Download loop using fetch_openf1 ────────────────────────────────────
for driver_number, vars_list in variables_by_driver.items():
    for session_key in session_keys:
        for var in vars_list:
            params = {"session_key": session_key}
            if var == "weather":
                if session_key in weather_done:
                    continue
            else:
                params["driver_number"] = driver_number

            try:
                df = fetch_openf1(var, params=params, csv=False, sleep_time=RATE_LIMIT_SLEEP)
                if var == "weather":
                    # Only marked after a request that did not raise, so a
                    # failed weather fetch is retried for the next driver.
                    weather_done.add(session_key)
                if df.empty:
                    continue

                # One sub-folder per endpoint: data/raw/<var>/
                output_dir = BASE_DIR / var
                output_dir.mkdir(parents=True, exist_ok=True)
                if var == "weather":
                    file_path = output_dir / f"{var}_{session_key}.csv"
                else:
                    file_path = output_dir / f"{var}_{session_key}_driver{driver_number}.csv"

                df.to_csv(file_path, index=False)
                print(f"✅ Saved {len(df)} rows to {file_path.name}")
                time.sleep(RATE_LIMIT_SLEEP)

            except Exception as e:
                # Best effort: report the failure and carry on with the next
                # variable/session instead of aborting the whole download.
                print(f"❌ Error: {var}, session={session_key}, driver={driver_number}, error={e}")


Download Car Data for Verstappen (Note: This data requires special API request latency adjustment)

In [None]:
# ── Setup ────────────────────────────────────────────────────────────────
# NOTE: BASE_DIR and RATE_LIMIT_SLEEP are notebook-global names; running this
# cell rebinds them for any cell executed afterwards.
BASE_DIR = Path("data/raw/car_data")
BASE_DIR.mkdir(parents=True, exist_ok=True)

RATE_LIMIT_SLEEP = 2.0
driver_number = "1"  # Verstappen

# ── Download car_data for each session ───────────────────────────────────
# One request per 2024 session key. Every outcome (saved / empty / error) is
# appended to `log`, which is written out as a CSV at the end.
log = []

for session_key in session_keys_2024:
    print(f"⏳ Querying session={session_key}, driver={driver_number}")

    try:
        df = fetch_openf1("car_data", {"session_key": session_key, "driver_number": driver_number}, csv=False)

        if df.empty:
            print(f"ℹ️ No data returned: session={session_key}")
            log.append({"session_key": session_key, "status": "empty"})
        else:
            file_path = BASE_DIR / f"car_data_{session_key}_driver{driver_number}.csv"
            df.to_csv(file_path, index=False)
            print(f"✅ Saved {len(df)} rows to {file_path.name}")
            log.append({"session_key": session_key, "status": "saved", "rows": len(df)})

    except Exception as e:
        err_msg = str(e)
        # The helper's error type is not visible here, so HTTP 422 is
        # recognised by looking for the status code in the message text.
        if "422" in err_msg:
            print(f"⚠️ Unprocessable: session={session_key}, likely no data for driver={driver_number}")
            log.append({"session_key": session_key, "status": "unprocessable_entity"})
        else:
            print(f"❌ Exception: session={session_key}, error={err_msg}")
            log.append({"session_key": session_key, "status": "exception", "error": err_msg})

    # Throttle after EVERY request, not only after a successful save: empty
    # responses and errors previously skipped the pause, so a run of failing
    # sessions sent requests back-to-back.
    time.sleep(RATE_LIMIT_SLEEP)

# ── Save log ─────────────────────────────────────────────────────────────
pd.DataFrame(log).to_csv(BASE_DIR / "car_data_download_log.csv", index=False)
print("📄 Log file saved: car_data_download_log.csv")


Download All Race Control Data for the Sessions

In [None]:
# ── Download race_control for all 2024 sessions using api_handler ────────────
from utils.api_handler import fetch_openf1
import pandas as pd
from pathlib import Path
import time

# Read the session keys back from disk: a headerless, single-column CSV,
# so column 0 holds the keys.
data_dir = Path("data/raw")
session_keys_path = data_dir / "session_keys_2024.csv"
key_table = pd.read_csv(session_keys_path, header=None)
session_keys = key_table[0].dropna().unique()

# All race_control files go into one folder
output_dir = data_dir / "race_control_all"
output_dir.mkdir(parents=True, exist_ok=True)

RATE_LIMIT_SLEEP = 1.5  # seconds between requests

# One request per session; a failure is reported and the loop moves on.
for session_key in session_keys:
    try:
        df = fetch_openf1(
            "race_control",
            params={"session_key": session_key},
            csv=False,
            sleep_time=RATE_LIMIT_SLEEP,
        )
        if not df.empty:
            file_path = output_dir / f"race_control_{session_key}.csv"
            df.to_csv(file_path, index=False)
            print(f"✅ Saved race_control for session {session_key} to {file_path.name}")
            time.sleep(RATE_LIMIT_SLEEP)
        else:
            print(f"No race_control data for session {session_key}")
    except Exception as err:
        print(f"❌ Error for session {session_key}: {err}")