In [None]:
import os
import time
import requests
import pandas as pd

# --- Download configuration -------------------------------------------------
# Seasons whose Race sessions are downloaded.
years = [2023, 2024, 2025]

# Retry settings handed to safe_get_json for the car_data requests.
max_retries = 5
base_delay = 5  # seconds; multiplied by the attempt number after each failure

# Folder that receives one CSV per driver and session; created when missing.
save_dir = "/content/drive/MyDrive/f1_data"
os.makedirs(save_dir, exist_ok=True)

def safe_get_json(url, max_retries=5, base_delay=2, timeout=30):
    """GET ``url`` and return its decoded JSON body, retrying on failure.

    An attempt counts as failed when the response status is 429, when the
    decoded body is a dict whose ``"error"`` equals ``"Too Many Requests"``,
    or when the request / JSON decoding raises an exception. After a failed
    attempt the function sleeps ``base_delay * attempt`` seconds and tries
    again, except after the final attempt, where waiting would only delay
    the ``None`` result.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts.
        base_delay: Base wait in seconds; scaled linearly by the attempt number.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises (and is retried) instead of hanging the notebook.

    Returns:
        The decoded JSON value on success, or ``None`` once every attempt
        has failed.
    """
    for attempt in range(1, max_retries + 1):
        try:
            res = requests.get(url, timeout=timeout)
            if res.status_code == 429:
                notice = "429 Too Many Requests"
            else:
                data = res.json()
                # The API may also report the rate limit inside a JSON body.
                if not (isinstance(data, dict) and data.get("error") == "Too Many Requests"):
                    return data  # success
                notice = "Rate limit JSON"
        except Exception as e:
            print(f"    Error fetching {url}: {e}")
            notice = None  # the error line above already explains the failure

        if attempt >= max_retries:
            break  # nothing left to retry, so do not sleep

        wait_time = base_delay * attempt
        if notice is None:
            print(f"Waiting {wait_time}s before retrying...")
        else:
            print(f"{notice} — wait {wait_time}s (attempt {attempt})")
        time.sleep(wait_time)

    print(f"Failed to fetch data after {max_retries} attempts.")
    return None

# Download car telemetry for every driver of every Race-type session in the
# configured seasons, writing one CSV per (session, driver) into save_dir.
# Files that already exist are skipped, so the cell can be re-run to resume.
for year in years:
    print(f"\n Fetching Race data for year {year}...\n")
    # Every request uses the configured retry settings, not only car_data.
    sessions = safe_get_json(
        f"https://api.openf1.org/v1/sessions?year={year}&session_type=Race",
        max_retries=max_retries,
        base_delay=base_delay,
    )

    if not isinstance(sessions, list):
        print(f"none valid Race sessions found for year {year}")
        continue

    for session in sessions:
        if not isinstance(session, dict):
            continue
        session_key = session.get("session_key")
        # `or "Unknown"` also covers keys that are present but null, which
        # would otherwise make .replace() fail on None.
        race_name = (session.get("circuit_short_name") or "Unknown").replace(" ", "_")
        session_name = session.get("session_type") or "Unknown"
        print(f"Race: {race_name} ({session_key}) [{session_name}]")

        if session_key is None:
            # Without a key the follow-up URLs would query "session_key=None".
            print(f"session has no session_key, skipping: {race_name}")
            continue

        drivers = safe_get_json(
            f"https://api.openf1.org/v1/drivers?session_key={session_key}",
            max_retries=max_retries,
            base_delay=base_delay,
        )

        if not isinstance(drivers, list):
            print(f"drivers response is not valid: {drivers}")
            continue

        for driver in drivers:
            if not isinstance(driver, dict):
                continue
            driver_number = driver.get("driver_number")
            driver_name = (driver.get("full_name") or "Unknown").replace(" ", "_")
            if not driver_number:
                continue

            filename = f"{year}_{race_name}_{session_name}_{session_key}_{driver_number}.csv"
            filepath = os.path.join(save_dir, filename)

            if os.path.exists(filepath):
                print(f"Skipping, already exists: {filename}")
                continue

            print(f"fetching car data for {driver_name} (#{driver_number})...")

            # Only samples with speed >= 100 are requested.
            url_car = (
                f"https://api.openf1.org/v1/car_data?"
                f"session_key={session_key}&driver_number={driver_number}&speed>=100"
            )

            car_data = safe_get_json(url_car, max_retries=max_retries, base_delay=base_delay)
            if not isinstance(car_data, list) or not car_data:
                print(f"no car data found for {driver_name}")
                continue

            df = pd.DataFrame(car_data)
            df["year"] = year
            df["race"] = race_name
            df["driver_number"] = driver_number
            df["driver_name"] = driver_name

            # Write to a temporary name and rename afterwards: an interrupted
            # write must not leave a truncated CSV that the "already exists"
            # check above would skip on the next run.
            partial_path = filepath + ".part"
            df.to_csv(partial_path, index=False)
            os.replace(partial_path, filepath)
            print(f"data {driver_name} saved ({len(df)} rows)")

            time.sleep(0.5)  # short pause between successful downloads

print(f"file saved to: {save_dir}")

In [None]:
# Read the sample CSV into a new DataFrame
df_sample_read = pd.read_csv("/content/drive/MyDrive/f1_data/2023_Sakhir_Race_7953_1.csv")

# Display the head of the new DataFrame
display(df_sample_read.head())

# Summarise the frame that was just read rather than whatever `df` an earlier
# cell happened to leave behind. info() prints its report and returns None,
# so it is called directly instead of being wrapped in display().
df_sample_read.info()

In [None]:
import pandas as pd
# One sessions URL per season; the three names are kept as separate globals.
url_2023, url_2024, url_2025 = (
    f"https://api.openf1.org/v1/sessions?year={season}&session_type=Race"
    for season in (2023, 2024, 2025)
)

# Fetch each season's session list straight into a DataFrame, in year order.
df_2023, df_2024, df_2025 = (
    pd.read_json(url) for url in (url_2023, url_2024, url_2025)
)

# Stack the seasons, persist the combined table, and print it.
df_all = pd.concat([df_2023, df_2024, df_2025], ignore_index=True)
df_all.to_csv("all_race_sessions_2023_2025.csv", index=False)
print(df_all)

In [None]:
import os
import glob
import pandas as pd

# Path setup (relative to the notebook's current working directory)
script_dir = os.getcwd()
raw_dir = os.path.join(script_dir, "f1_data")
race_info_path = os.path.join(script_dir, "data_scraping", "f1_races_2023_2025.csv")
output_dir = os.path.join(script_dir, "f1_cleaned_data")
os.makedirs(output_dir, exist_ok=True)

# load race info
race_info = pd.read_csv(race_info_path)
race_map = dict(zip(race_info["session_key"], race_info["session_name"]))  # map session_key → session_name

# cleaning rules: columns coerced to numbers and their inclusive valid ranges
numeric_columns = ["speed", "throttle", "rpm", "n_gear"]
bounds = {
    "speed": (0, 400),
    "throttle": (0, 100),
    "rpm": (0, 20000),
    "n_gear": (-1, 8)
}

# process all files
csv_files = glob.glob(os.path.join(raw_dir, "*.csv"))
print(f"{len(csv_files)} files found.")

for file_path in csv_files:
    file_name = os.path.basename(file_path)
    try:
        raw = pd.read_csv(file_path).drop_duplicates()

        # Coerce the telemetry columns that are present; unparseable values
        # become NaN.
        converted = {
            col: pd.to_numeric(raw[col], errors="coerce")
            for col in numeric_columns
            if col in raw.columns
        }

        # Build a single row mask. between() is inclusive and False for NaN,
        # so this removes out-of-range and missing values in one pass without
        # assigning into a filtered slice (the source of SettingWithCopyWarning).
        keep = pd.Series(True, index=raw.index)
        for col, values in converted.items():
            low, high = bounds[col]
            keep &= values.between(low, high)

        df = raw.assign(**converted).loc[keep]

        if df.empty:
            print(f"error {file_name}: no valid rows after cleaning")
            continue

        # identify session type and year; int() accepts integer and
        # float-typed values alike (e.g. 7953 or 7953.0)
        session_key = int(df["session_key"].iloc[0])
        session_name = str(race_map.get(session_key, "Unknown"))
        year = str(int(df["year"].iloc[0]))

        # A dedicated name keeps this cell from overwriting the download
        # cell's `save_dir`.
        cleaned_dir = os.path.join(output_dir, year, session_name.lower())
        os.makedirs(cleaned_dir, exist_ok=True)

        save_path = os.path.join(cleaned_dir, file_name)
        df.to_csv(save_path, index=False)
        print(f" done {file_name} → {session_name} ({len(df)} rows)")

    except Exception as e:
        # Best effort: report the file and continue with the next one.
        print(f"error {file_name}: {e}")

print(f"Output saved to: {output_dir}")

In [None]:
import os
import glob
import pandas as pd

# Merge the cleaned per-driver CSVs into one file per (year, session type).
script_dir = os.getcwd()
base_dir = os.path.join(script_dir, "f1_cleaned_data")
output_dir = os.path.join(script_dir, "f1_annual_data")
os.makedirs(output_dir, exist_ok=True)

for year in ("2023", "2024", "2025"):
    for session_name in ("race", "sprint"):
        pattern = os.path.join(base_dir, year, session_name, "*.csv")

        # Load every readable file; unreadable ones are reported and skipped.
        frames = []
        for csv_path in glob.glob(pattern):
            try:
                frames.append(pd.read_csv(csv_path))
            except Exception as e:
                print(f" failed {os.path.basename(csv_path)}: {e}")

        if not frames:
            print(f"No files found for {year} {session_name}")
            continue

        df_merged = pd.concat(frames, ignore_index=True)
        save_path = os.path.join(output_dir, f"f1_{year}_{session_name}.csv")
        df_merged.to_csv(save_path, index=False)
        print(f"{year} {session_name} done ({len(df_merged):,})")

print(f"Output saved to: {output_dir}")