In [1]:
import pandas as pd
import time

import pandas as pd
import requests

# Endpoint to get all sessions
SESSIONS_URL = "https://api.openf1.org/v1/sessions"

# Step 1: Get session data.
# An explicit timeout is passed because requests otherwise waits indefinitely
# on a stalled connection, which would hang the kernel.
response = requests.get(SESSIONS_URL, timeout=30)
response.raise_for_status()  # Raise error if API call fails

sessions = pd.DataFrame(response.json())

# Step 2: Filter to just race sessions (session_type == 'RACE', case-insensitive)
race_sessions = sessions[sessions['session_type'].str.upper() == 'RACE']

# Step 3: Keep only meeting_key and circuit_short_name, one row per distinct
# pair, ordered by meeting_key with a fresh 0..n-1 index.  Built as a single
# chained expression (no inplace mutation) so re-running the cell is idempotent.
race_info = (
    race_sessions[['meeting_key', 'circuit_short_name']]
    .drop_duplicates()
    .sort_values('meeting_key')
    .reset_index(drop=True)
)


In [2]:
# Maps a driver's full name (given name + upper-cased family name) to a
# three-letter country code.  Looked up by the `full_name` column of the
# drivers data, so keys must match that column's spelling exactly.
driver_country_codes = {
    "Max VERSTAPPEN": "NLD",
    "Sergio PEREZ": "MEX",
    "Charles LECLERC": "MCO",
    "Carlos SAINZ": "ESP",
    "George RUSSELL": "GBR",
    "Lewis HAMILTON": "GBR",
    "Lando NORRIS": "GBR",
    "Oscar PIASTRI": "AUS",
    "Fernando ALONSO": "ESP",
    "Lance STROLL": "CAN",
    "Kevin MAGNUSSEN": "DNK",
    # Correct spelling of the family name; the original key was a typo.
    "Nico HULKENBERG": "DEU",
    # Legacy misspelling retained so any existing lookup keeps working.
    "Nico HULKENBURG": "DEU",
    "Gabriel BORTOLETO": "BRA",
    "Liam LAWSON": "NZL",
    "Zhou GUANYU": "CHN",
    "Valtteri BOTTAS": "FIN",
    "Oliver BEARMAN": "GBR",
    "Franco COLAPINTO": "ARG",
    "Esteban OCON": "FRA",
    "Pierre GASLY": "FRA",
    "Yuki TSUNODA": "JPN",
    "Daniel RICCARDO": "AUS",
    "Logan SARGENT": "USA",
    "Alex ALBON": "THA",
    "Isack HADJAR": "FRA",
    "Kimi ANTONELLI": "ITA",
    # NOTE(review): alternate spellings the timing feed is believed to use for
    # these two drivers — confirm against the `full_name` values returned by
    # the drivers endpoint.  Harmless if they never match.
    "Alexander ALBON": "THA",
    "ZHOU Guanyu": "CHN",
}


In [3]:
# ---------- LAP PROCESSING ----------
def process_lap_data(laps_df: pd.DataFrame):
    """Collapse raw lap rows into one summary row per driver/session/meeting.

    Laps flagged as pit-out laps, and laps whose ``lap_duration`` is missing
    or not strictly positive, are discarded before aggregating.

    Args:
        laps_df: Lap rows containing ``driver_number``, ``session_key``,
            ``meeting_key``, ``is_pit_out_lap``, ``lap_duration``, the three
            ``duration_sector_*`` columns and the ``i1``/``i2``/``st`` speed
            columns.

    Returns:
        A DataFrame with the grouping keys plus best/average/std lap time,
        best sector times, top speeds and the count of valid laps.  An empty
        input yields an empty, column-less DataFrame.
    """
    if laps_df.empty:
        return pd.DataFrame()

    durations = laps_df['lap_duration']
    not_pit_out = laps_df['is_pit_out_lap'].eq(False)
    has_duration = durations.notna()
    is_positive = durations.gt(0)
    timed_laps = laps_df[not_pit_out & has_duration & is_positive]

    # Output column -> (source column, aggregation); dict order fixes column order.
    aggregations = {
        'best_lap_time': ('lap_duration', 'min'),
        'avg_lap_time': ('lap_duration', 'mean'),
        'lap_consistency': ('lap_duration', 'std'),
        'best_sector_1': ('duration_sector_1', 'min'),
        'best_sector_2': ('duration_sector_2', 'min'),
        'best_sector_3': ('duration_sector_3', 'min'),
        'best_i1_speed': ('i1_speed', 'max'),
        'best_i2_speed': ('i2_speed', 'max'),
        'best_st_speed': ('st_speed', 'max'),
        'total_valid_laps': ('lap_duration', 'count'),
    }
    per_driver_session = timed_laps.groupby(['driver_number', 'session_key', 'meeting_key'])
    return per_driver_session.agg(**aggregations).reset_index()

# ---------- DRIVER MERGE ----------
def merge_with_driver_info(lap_summary_df: pd.DataFrame, drivers_df: pd.DataFrame):
    """Attach driver name, team and country columns to a lap summary.

    Neither input frame is modified; both are copied before their
    ``driver_number`` column is cast to ``str`` for the join.

    Args:
        lap_summary_df: Per-driver lap summary with a ``driver_number`` column.
        drivers_df: Driver rows with ``driver_number``, ``broadcast_name``,
            ``full_name`` and ``team_name`` columns (column names are
            whitespace-stripped before use).

    Returns:
        ``lap_summary_df`` left-joined with one row per driver carrying
        ``broadcast_name``, ``full_name``, ``team_name`` and ``country_code``
        (looked up in the module-level ``driver_country_codes`` by full name;
        NaN when the name is not in that dict).  ``driver_number`` is a string
        in the result.  Returns an empty DataFrame if either input is empty.
    """
    if lap_summary_df.empty or drivers_df.empty:
        return pd.DataFrame()

    # Force consistent dtypes and clean columns on a private copy.
    drivers_df = drivers_df.copy()
    drivers_df.columns = drivers_df.columns.str.strip()
    drivers_df['driver_number'] = drivers_df['driver_number'].astype(str)
    drivers_df['country_code'] = drivers_df['full_name'].map(driver_country_codes)

    # assign() returns a new frame, so the caller's lap summary keeps its
    # original driver_number dtype instead of being silently rewritten.
    lap_summary = lap_summary_df.assign(
        driver_number=lap_summary_df['driver_number'].astype(str)
    )

    # The drivers frame can repeat a driver (e.g. once per session); keep the
    # first occurrence so the left join cannot multiply lap rows.
    drivers_trimmed = drivers_df.drop_duplicates(subset='driver_number')[
        ['driver_number', 'broadcast_name', 'full_name', 'team_name', 'country_code']
    ]
    return pd.merge(lap_summary, drivers_trimmed, on='driver_number', how='left')


# ---------- API FETCH ----------
def fetch_openf1(
    endpoint,
    meeting_key=None,
    session_type=None,
    session_key=None,
    enrich_laps=False,
    drivers_df=None,
    retries=3,
    wait=2
):
    """Download one OpenF1 endpoint into a DataFrame, with retries and tagging.

    The query is filtered by ``session_key`` when given; otherwise by
    ``meeting_key`` (optionally narrowed by ``session_type``).  Relies on the
    module-level ``race_info`` and ``session_key_map`` objects, and on
    ``process_lap_data`` / ``merge_with_driver_info`` when ``enrich_laps`` is
    set.

    Args:
        endpoint: Path segment appended to the API base URL (e.g. ``"laps"``).
        meeting_key: Meeting filter; when not ``None`` it is also written to
            the ``meeting_key`` column of the result.
        session_type: Extra filter, only used together with ``meeting_key``.
        session_key: Session filter; when truthy it takes precedence over
            ``meeting_key`` and is written to the ``session_key`` column.
        enrich_laps: If true and ``endpoint == 'laps'``, summarise the laps
            and, when ``drivers_df`` is non-empty, merge driver details in.
        drivers_df: Driver rows used for the optional merge.
        retries: Maximum number of attempts.
        wait: Seconds to pause after a success; failed attempt ``k``
            (1-based) pauses ``wait * k`` seconds before the next attempt.

    Returns:
        The fetched (and possibly enriched) DataFrame with a ``source`` column
        set to ``endpoint`` and, where ``meeting_key`` is present and
        ``circuit_short_name`` is not, a ``circuit`` column.  An empty
        response is returned as-is; if every attempt fails an empty DataFrame
        is returned.
    """
    from urllib.parse import quote, urlencode

    base = "https://api.openf1.org/v1/"

    params = {}
    if session_key:
        params['session_key'] = session_key
    elif meeting_key:
        params['meeting_key'] = meeting_key
        if session_type:
            params['session_type'] = session_type
    # Percent-encode values so names containing spaces produce a valid URL.
    url = f"{base}{endpoint}?{urlencode(params, quote_via=quote)}"

    for attempt in range(retries):
        try:
            df = pd.read_json(url)
            if df.empty:
                print(f"⚠️ No data for {endpoint} | session_key={session_key} | meeting_key={meeting_key}")
                return df

            # ✅ Always keep context keys — but never overwrite the API's own
            # meeting_key column with None when the caller did not supply one.
            if meeting_key is not None:
                df['meeting_key'] = meeting_key
            if session_key:
                df['session_key'] = session_key

            # ✅ Optional enrichment for lap data
            if enrich_laps and endpoint == 'laps':
                df = process_lap_data(df)
                if drivers_df is not None and not drivers_df.empty:
                    df = merge_with_driver_info(df, drivers_df)

            # ✅ Attach source label
            df['source'] = endpoint

            # ✅ Attach circuit name upstream if possible
            if 'meeting_key' in df.columns and 'circuit_short_name' not in df.columns:
                df['circuit'] = df['meeting_key'].map(
                    race_info.set_index('meeting_key')['circuit_short_name']
                )

            # ✅ Print using session_type if available
            session_info = session_key_map.get(session_key, session_type or "—")
            print(f"✅ {endpoint.upper()} | Rows: {len(df)} | session: {session_info} | meeting_key={meeting_key}")

            # Pause between successful calls to space out requests.
            time.sleep(wait)
            return df

        except Exception as e:
            # Best-effort: any failure in the fetch or post-processing is
            # reported and the whole attempt is retried.
            print(f"❌ Attempt {attempt+1} failed for {endpoint} — {e}")
            # Back off before the next attempt; after the last one there is
            # nothing left to wait for.
            if attempt + 1 < retries:
                time.sleep(wait * (attempt + 1))

    return pd.DataFrame()


# ----------- WRAPPER FOR SOURCE TAGGING ----------
def tagged_fetch(endpoint, **kwargs):
    """Fetch ``endpoint`` through ``fetch_openf1``, forwarding every keyword argument.

    Returns whatever ``fetch_openf1`` returns, unchanged.
    """
    return fetch_openf1(endpoint, **kwargs)

# ---------- SESSION METADATA ----------
print("🔄 Getting session metadata...")
sessions = pd.read_json("https://api.openf1.org/v1/sessions")

# session_key → session_name, used when printing driver cards
session_name_map = sessions.set_index('session_key')['session_name'].to_dict()

# Keep only the main race of each meeting: session_type and session_name
# must both read "race" (case-insensitively).
is_race_type = sessions['session_type'].str.lower() == 'race'
is_race_name = sessions['session_name'].str.upper() == 'RACE'
races = sessions[is_race_type & is_race_name]

# One row per meeting, newest first; the first three meeting_keys are processed.
unique_races = races.drop_duplicates(subset='meeting_key')
recent_races = unique_races.sort_values('date_start', ascending=False)
last_3_meeting_keys = recent_races['meeting_key'].head(3).tolist()
print("✅ Last 3 meeting_keys:", last_3_meeting_keys)

# session_key → session_type, read by fetch_openf1 for its progress line
session_key_map = sessions.set_index("session_key")["session_type"].to_dict()

# ---------- LOOP PER RACE ----------
all_data = []

for mk in last_3_meeting_keys:
    print(f"\n📦 Processing meeting_key={mk}...")

    # Sessions belonging to this meeting, as {session_key, session_name} records
    session_keys_df = (
        sessions.loc[sessions['meeting_key'] == mk, ['session_key', 'session_name']]
        .sort_values('session_key')
    )
    session_keys = session_keys_df.to_dict('records')

    # Drivers are fetched first because the lap enrichment below needs them
    drivers_df = tagged_fetch("drivers", meeting_key=mk)

    # Whole-weekend endpoints; drivers_df is slotted in between so the order
    # of frames in all_data is pit, stints, position, drivers, weather, race_control.
    weekend_frames = [tagged_fetch(name, meeting_key=mk) for name in ("pit", "stints", "position")]
    weekend_frames.append(drivers_df)
    weekend_frames += [tagged_fetch(name, meeting_key=mk) for name in ("weather", "race_control")]
    all_data.extend(weekend_frames)

    # Lap data is pulled per session, only for sessions whose name mentions
    # practice, qualifying or race.
    for sess in session_keys:
        skey = sess['session_key']
        sname = sess['session_name'].upper()
        if not any(x in sname for x in ["PRACTICE", "QUALIFYING", "RACE"]):
            continue
        laps_df = tagged_fetch("laps", session_key=skey, meeting_key=mk, enrich_laps=True, drivers_df=drivers_df)
        all_data.append(laps_df)

# ---------- DONE ----------
print(f"\n✅ Data collection complete. all_data contains {len(all_data)} tagged DataFrames.")


🔄 Getting session metadata...
✅ Last 3 meeting_keys: [1266, 1265, 1277]

📦 Processing meeting_key=1266...
✅ DRIVERS | Rows: 100 | session: — | meeting_key=1266
✅ PIT | Rows: 479 | session: — | meeting_key=1266
✅ STINTS | Rows: 456 | session: — | meeting_key=1266
✅ POSITION | Rows: 3942 | session: — | meeting_key=1266
✅ WEATHER | Rows: 481 | session: — | meeting_key=1266
✅ RACE_CONTROL | Rows: 173 | session: — | meeting_key=1266
✅ LAPS | Rows: 20 | session: Practice | meeting_key=1266
✅ LAPS | Rows: 20 | session: Practice | meeting_key=1266
✅ LAPS | Rows: 20 | session: Practice | meeting_key=1266
✅ LAPS | Rows: 20 | session: Qualifying | meeting_key=1266
✅ LAPS | Rows: 20 | session: Race | meeting_key=1266

📦 Processing meeting_key=1265...
✅ DRIVERS | Rows: 100 | session: — | meeting_key=1265
✅ PIT | Rows: 253 | session: — | meeting_key=1265
✅ STINTS | Rows: 308 | session: — | meeting_key=1265
✅ POSITION | Rows: 3314 | session: — | meeting_key=1265
✅ WEATHER | Rows: 539 | session: — | m

In [4]:
def _frames_tagged(source):
    """Return the non-empty frames in ``all_data`` whose first ``source`` value equals ``source``."""
    return [frame for frame in all_data if not frame.empty and frame['source'].iloc[0] == source]


# Bucket the collected frames by the endpoint they came from.
lap_dfs = _frames_tagged('laps')
pit_dfs = _frames_tagged('pit')
stint_dfs = _frames_tagged('stints')
driver_dfs = _frames_tagged('drivers')
position_dfs = _frames_tagged('position')
weather_dfs = _frames_tagged('weather')
rc_dfs = _frames_tagged('race_control')

In [30]:
def _circuit_name_for(circuit_map, meeting_key):
    """Return the circuit short name for ``meeting_key`` in ``circuit_map``, or "Unknown"."""
    matches = circuit_map[circuit_map['meeting_key'] == meeting_key]
    return matches['circuit_short_name'].values[0] if not matches.empty else "Unknown"


def _format_seconds(value):
    """Render ``value`` with three decimals, or "?" when it is missing or NaN."""
    return f"{value:.3f}" if pd.notna(value) else "?"


def _rows_for_driver(frames, driver_number, compare_as_text=False):
    """Concatenate ``frames`` and keep the rows belonging to ``driver_number``.

    Returns an empty DataFrame when ``frames`` is empty or the combined frame
    has no ``driver_number`` column.  With ``compare_as_text`` the column is
    cast to ``str`` before comparing, for frames whose dtype may be mixed.
    """
    if not frames:
        return pd.DataFrame()
    combined = pd.concat(frames)
    if 'driver_number' not in combined.columns:
        return pd.DataFrame()
    if compare_as_text:
        combined['driver_number'] = combined['driver_number'].astype(str)
    return combined[combined['driver_number'] == driver_number]


def build_driver_card(driver_number, driver_df, lap_dfs, pit_dfs, stint_dfs, position_df, weather_df, rc_df, circuit_map):
    """Print a text summary ("card") for one driver.

    Sections printed, in order: metadata, lap performance, pit stops, tire
    stints, session positions, weather summary and race-control events.
    Session labels come from the module-level ``session_name_map`` and the
    country from ``driver_country_codes``.

    Args:
        driver_number: Driver to report on; must match a row of ``driver_df``.
        driver_df: Driver rows with ``driver_number``, ``full_name``,
            ``broadcast_name`` and ``team_name``.
        lap_dfs: List of lap-summary frames (may be empty).
        pit_dfs: List of pit-stop frames (may be empty).
        stint_dfs: List of stint frames (may be empty).
        position_df: Position rows with ``meeting_key``, ``session_key``,
            ``circuit`` and ``position``.
        weather_df: Weather rows with ``meeting_key``, ``session_key``,
            ``circuit`` and ``rainfall``.  The weather section is not filtered
            by driver.
        rc_df: Race-control rows; filtered by ``driver_number`` when that
            column exists.
        circuit_map: Frame with ``meeting_key`` and ``circuit_short_name``.

    Returns:
        None.  Output goes to stdout.

    Raises:
        IndexError: If ``driver_number`` is not present in ``driver_df``.
    """
    # 📋 Metadata
    driver_meta = driver_df[driver_df['driver_number'] == driver_number].iloc[0]
    full_name = driver_meta['full_name']
    broadcast = driver_meta['broadcast_name']
    team = driver_meta['team_name']
    country = driver_country_codes.get(full_name, 'Unknown')

    print(f"\n🏎️  Driver: {full_name} ({broadcast})")
    print(f"🏁 Team: {team} | Country: {country}")

    # 📊 Lap Summary
    print("\n📊 Lap Performance:")

    # Lap frames may carry driver_number as int or str, so compare as text.
    driver_label = str(driver_number)
    driver_laps = _rows_for_driver(lap_dfs, driver_label, compare_as_text=True)
    print(f"🔍 Lap entries found for driver {driver_label}: {len(driver_laps)}")

    if not driver_laps.empty:
        for _, row in driver_laps.iterrows():
            circuit_name = row.get('circuit', 'Unknown')
            session_label = session_name_map.get(row['session_key'], row['session_key'])

            # Missing/NaN values (e.g. std of a single lap) print as "?"
            # instead of raising on the float format or showing "nan".
            print(f"  {circuit_name} - {session_label}: "
                f"Best Lap = {_format_seconds(row.get('best_lap_time'))}s | "
                f"Avg = {_format_seconds(row.get('avg_lap_time'))}s | "
                f"Consistency = {_format_seconds(row.get('lap_consistency'))}s")
    else:
        print("  No lap data available.")

    # The remaining frames are compared against the integer driver number.
    driver_id = int(driver_label)

    # ⛽ Pit Summary
    print("\n⛽ Pit Stops:")
    driver_pits = _rows_for_driver(pit_dfs, driver_id)

    if not driver_pits.empty:
        for _, row in driver_pits.iterrows():
            circuit_name = _circuit_name_for(circuit_map, row['meeting_key'])
            session_label = session_name_map.get(row['session_key'], row['session_key'])

            lap = int(row['lap_number']) if pd.notna(row['lap_number']) else "?"
            duration = round(row['pit_duration'], 3) if pd.notna(row['pit_duration']) else "?"
            print(f"  {circuit_name} - {session_label}: 🔁 Lap {lap} | Duration: {duration}s")
    else:
        print("  No pit stop data available.")

    # 🛞 Stint Summary
    print("\n🛞 Tire Stints:")
    driver_stints = _rows_for_driver(stint_dfs, driver_id)
    if not driver_stints.empty:
        for _, row in driver_stints.iterrows():
            circuit_name = _circuit_name_for(circuit_map, row['meeting_key'])
            session_label = session_name_map.get(row['session_key'], row['session_key'])

            print(f"  {circuit_name} - {session_label}: Stint {row['stint_number']}: Laps {row['lap_start']}–{row.get('lap_end', '?')} | Compound: {row['compound']}")
    else:
        print("  No stint data available.")

    # 📈 Position Changes
    print("\n📈 Session Positions:")
    positions = position_df[position_df['driver_number'] == driver_id]

    if not positions.empty:
        # First and last recorded position per session, in row order.
        grouped = (
            positions
            .groupby(['meeting_key', 'session_key', 'circuit'])['position']
            .agg(['first', 'last'])
            .reset_index()
        )

        for _, row in grouped.iterrows():
            circuit_name = row['circuit']
            session_label = session_name_map.get(row['session_key'], row['session_key'])
            print(f"  {circuit_name} - {session_label}: Grid → {int(row['first'])}, Finish → {int(row['last'])}")
    else:
        print("  No position data available.")

    # 🌦️ Weather Context
    print("\n🌦️ Weather Summary:")

    # A session counts as wet if its maximum rainfall reading is above zero.
    weather_summary = (
        weather_df
        .groupby(['meeting_key', 'session_key', 'circuit'])['rainfall']
        .max()
        .reset_index()
    )

    for _, row in weather_summary.iterrows():
        rain = float(row['rainfall'])
        condition = "Wet" if rain > 0 else "Dry"
        session_label = session_name_map.get(row['session_key'], row['session_key'])
        print(f"  {row['circuit']} - {session_label}: {condition} (Max Rainfall: {rain:.2f})")

    # 🚩 Incidents
    print("\n🚩 Race Control Events:")
    driver_incidents = rc_df[rc_df['driver_number'] == driver_id] if 'driver_number' in rc_df.columns else pd.DataFrame()
    if not driver_incidents.empty:
        for _, row in driver_incidents.iterrows():
            session_label = session_name_map.get(row['session_key'], row['session_key'])
            circuit_name = _circuit_name_for(circuit_map, row['meeting_key'])

            print(f" {circuit_name} - {session_label} [{row['flag']} @ {row['date']}] Reason: {row.get('message', 'N/A')}")
    else:
        print("  No incidents recorded.")

    print("\n" + "-"*60)


In [31]:
# Combine the per-meeting frames of each source into one DataFrame
driver_df = pd.concat(driver_dfs, ignore_index=True)
position_df = pd.concat(position_dfs, ignore_index=True)
weather_df = pd.concat(weather_dfs, ignore_index=True)
rc_df = pd.concat(rc_dfs, ignore_index=True)

unclean_dfs = [driver_df, position_df, weather_df, rc_df]

# meeting_key → circuit_short_name lookup.  Series.map needs a Series (or
# dict) keyed by the values being mapped; race_info itself is a two-column
# frame with a 0..n-1 index, so it cannot be used as the mapper directly.
circuit_by_meeting = (
    race_info
    .drop_duplicates(subset='meeting_key')  # map() requires a unique index
    .set_index('meeting_key')['circuit_short_name']
)

for df in unclean_dfs:
    # Fill in the circuit name only where it was not already attached upstream.
    if 'meeting_key' in df.columns and 'circuit' not in df.columns:
        df['circuit'] = df['meeting_key'].map(circuit_by_meeting)
    if df is driver_df:
        df['country_code'] = df['full_name'].map(driver_country_codes).fillna('Unknown')

# Now loop over drivers, printing one card per distinct driver number
for driver_number in driver_df['driver_number'].unique():
    build_driver_card(
        driver_number,
        driver_df,
        lap_dfs,
        pit_dfs,
        stint_dfs,
        position_df,
        weather_df,
        rc_df,
        race_info
    )



🏎️  Driver: Max VERSTAPPEN (M VERSTAPPEN)
🏁 Team: Red Bull Racing | Country: NLD

📊 Lap Performance:
🔍 Lap entries found for driver 1: 14
  Hungaroring - Practice 1: Best Lap = 76.940s | Avg = 95.612s | Consistency = 23.482s
  Hungaroring - Practice 2: Best Lap = 76.791s | Avg = 90.577s | Consistency = 18.096s
  Hungaroring - Practice 3: Best Lap = 76.162s | Avg = 95.833s | Consistency = 25.130s
  Hungaroring - Qualifying: Best Lap = 75.547s | Avg = 87.445s | Consistency = 16.111s
  Hungaroring - Race: Best Lap = 79.576s | Avg = 82.151s | Consistency = 1.281s
  Spa-Francorchamps - Practice 1: Best Lap = 102.426s | Avg = 122.445s | Consistency = 25.694s
  Spa-Francorchamps - Sprint Qualifying: Best Lap = 100.987s | Avg = 122.743s | Consistency = 23.372s
  Spa-Francorchamps - Qualifying: Best Lap = 100.903s | Avg = 120.817s | Consistency = 25.455s
  Spa-Francorchamps - Race: Best Lap = 106.096s | Avg = 114.697s | Consistency = 18.008s
  Silverstone - Practice 1: Best Lap = 87.432s | Avg