In [80]:
# --- STAGE 1: Data Loading and Initial Preparation ---

import pandas as pd
import numpy as np
import os # Import os for path checking

print("--- STAGE 1 START ---")

# --- 1a. Load CSV files ---
# <<< *** USER: Double-check this path is correct for your setup *** >>>
data_path = '../data/f1_world_champion/' # Example: Assumes CSVs are in the same directory as the notebook
print(f"Using data_path: '{data_path}'")

# List of required files
files_to_load = [
    'results.csv', 'races.csv', 'drivers.csv', 'constructors.csv',
    'qualifying.csv', 'circuits.csv', 'status.csv', 'constructor_results.csv',
    'lap_times.csv', 'pit_stops.csv', 'sprint_results.csv'
]

# Dictionary to hold loaded dataframes
dataframes = {}
all_loaded = True

for f_name in files_to_load:
    f_path = os.path.join(data_path, f_name)
    try:
        dataframes[f_name.split('.')[0]] = pd.read_csv(f_path)
        print(f"Successfully loaded: {f_name} (Shape: {dataframes[f_name.split('.')[0]].shape})")
    except FileNotFoundError:
        print(f"Error: File not found at '{f_path}'. Features requiring this file will be skipped.")
        dataframes[f_name.split('.')[0]] = None # Set to None if not found
        all_loaded = False
    except Exception as e:
        print(f"Error loading {f_name}: {e}")
        dataframes[f_name.split('.')[0]] = None
        all_loaded = False

# Assign to variables for convenience (optional, can use dataframes['results'] directly)
results = dataframes.get('results')
races = dataframes.get('races')
drivers = dataframes.get('drivers')
constructors = dataframes.get('constructors')
qualifying = dataframes.get('qualifying')
circuits = dataframes.get('circuits')
status = dataframes.get('status')
constructor_results = dataframes.get('constructor_results')
lap_times = dataframes.get('lap_times')
pit_stops = dataframes.get('pit_stops')
sprint_results = dataframes.get('sprint_results')

if not all_loaded:
    print("\nWarning: Not all required CSV files were loaded successfully.")
if results is None or races is None or status is None:
    raise ValueError("Critical DataFrames ('results', 'races', 'status') could not be loaded. Aborting.")

# --- 1b. Define Relevant Status IDs for Filtering ---
print("Defining relevant status IDs for analysis...")
# Categories 1(Acc/Inc), 2(Mech/Tech), 3(Drv/Team), 4(Procedural DNF - 2,62,92,96), 6(Finished/Lapped)
# Excludes Cat 5 (DNS/DNQ/Safety/...)
relevant_status_ids = [
     # Cat 1
     3, 4, 20, 104, 130, 137, 138,
     # Cat 2
     5, 6, 7, 8, 9, 10, 21, 22, 23, 24, 25, 26, 27, 29, 30, 32, 33, 34, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 51, 56, 59, 60, 61, 63, 64, 65, 66, 67, 69, 70, 71, 72, 74, 75, 76, 79, 80, 83, 84, 85, 86, 87, 91, 95, 98, 99, 101, 102, 103, 105, 106, 108, 109, 110, 121, 126, 129, 131, 132, 135, 140, 141,
     # Cat 3
     28, 31, 54, 68, 73, 82, 100, 107, 136, 139,
     # Cat 4 (Selected)
     2, 62, 92, 96,
     # Cat 6
     1, 11, 12, 13, 14, 15, 16, 17, 18, 19, 45, 50, 53, 55, 58, 88, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 127, 128, 133, 134
]
print(f"Defined {len(relevant_status_ids)} relevant status IDs.")

# --- 1c. Filter Initial Results Data ---
print(f"Shape before filtering results: {results.shape}")
results_filtered = results[results['statusId'].isin(relevant_status_ids)].copy()
print(f"Shape after filtering results by relevant status IDs: {results_filtered.shape}")


# --- 1d. Initial Merges (using results_filtered) ---
print("Starting initial merges...")
races_subset = races[['raceId', 'year', 'round', 'circuitId', 'date']]
df = pd.merge(results_filtered, races_subset, on='raceId', how='left')

# Merge other tables (checking if they were loaded)
if drivers is not None:
    drivers_subset = drivers[['driverId', 'driverRef', 'nationality', 'dob']]
    df = pd.merge(df, drivers_subset, on='driverId', how='left')
else: print("Skipping driver merge.")
if constructors is not None:
    constructors_subset = constructors[['constructorId', 'name', 'nationality']]
    constructors_subset = constructors_subset.rename(columns={'name': 'constructorName', 'nationality': 'constructorNationality'})
    df = pd.merge(df, constructors_subset, on='constructorId', how='left')
else: print("Skipping constructor merge.")

df = df.rename(columns={'grid': 'results_grid_start'}) # Rename original grid

if qualifying is not None:
    qualifying_subset = qualifying[['raceId', 'driverId', 'constructorId', 'position']].copy()
    qualifying_subset['position'] = pd.to_numeric(qualifying_subset['position'], errors='coerce')
    qualifying_subset = qualifying_subset.rename(columns={'position': 'grid'})
    df = pd.merge(df, qualifying_subset, on=['raceId', 'driverId', 'constructorId'], how='left')
    print("Merged qualifying info.")
else:
    print("Warning: Qualifying DataFrame not loaded. Grid position feature will be NaN.")
    df['grid'] = np.nan

if circuits is not None:
    circuits_subset = circuits[['circuitId', 'name', 'location', 'country']]
    circuits_subset = circuits_subset.rename(columns={'name': 'circuitName'})
    df = pd.merge(df, circuits_subset, on='circuitId', how='left')
else: print("Skipping circuit merge.")

if status is not None:
    status_subset = status[['statusId', 'status']]
    df = pd.merge(df, status_subset, on='statusId', how='left') # Status text added back
    print("Merged status info.")
else:
    df['status'] = 'Unknown'

print("Initial merges complete.")

# --- 1e. Data Cleaning ---
print("Starting cleaning...")
df_cleaned = df.copy() # Start with the merged, pre-filtered data

# Type Conversions
df_cleaned['date'] = pd.to_datetime(df_cleaned['date'], errors='coerce')
if drivers is not None and 'dob' in df_cleaned.columns:
    df_cleaned['dob'] = pd.to_datetime(df_cleaned['dob'], errors='coerce')

# Handle grid NaNs/Zeros (e.g. pit lane start) -> Use a high number like 25
if 'grid' in df_cleaned.columns:
     df_cleaned['grid'] = pd.to_numeric(df_cleaned['grid'], errors='coerce').fillna(25) # Fill NaNs
     df_cleaned.loc[df_cleaned['grid'] == 0, 'grid'] = 25 # Treat grid 0 as pit lane start (high number)
     print("Cleaned and filled NaN/0 grid positions with 25.")
else:
     print("Warning: 'grid' column missing.")


# --- 1f. Define RAW Target Variables ---
if 'positionOrder' in df_cleaned.columns:
    # Ensure positionOrder is numeric for target creation
    df_cleaned['positionOrder'] = pd.to_numeric(df_cleaned['positionOrder'], errors='coerce')
    df_cleaned['target_finishing_position'] = df_cleaned['positionOrder']
    # Handle potential NaNs in positionOrder when defining podium finish
    df_cleaned['target_podium_finish'] = df_cleaned['positionOrder'].apply(lambda x: 1 if pd.notna(x) and x <= 3 else 0)
    print("Created base target variables.")
else:
    print("Warning: 'positionOrder' column not found. Cannot create targets.")
    df_cleaned['target_finishing_position'] = np.nan
    df_cleaned['target_podium_finish'] = 0


# --- 1g. Sort DataFrame Chronologically ---
df_cleaned.dropna(subset=['date'], inplace=True) # Drop rows if date conversion failed
df_cleaned = df_cleaned.sort_values(by=['date', 'raceId']).reset_index(drop=True)
print("DataFrame sorted by date.")

# --- Final Check for Block 1 ---
print(f"--- STAGE 1 COMPLETE --- Shape after initial prep & filtering: {df_cleaned.shape} ---")
# Check counts of key IDs
print(f"Unique Race IDs: {df_cleaned['raceId'].nunique()}")
print(f"Unique Driver IDs: {df_cleaned['driverId'].nunique()}")
# Output of this block is df_cleaned

--- STAGE 1 START ---
Using data_path: '../data/f1_world_champion/'
Successfully loaded: results.csv (Shape: (26759, 18))
Successfully loaded: races.csv (Shape: (1125, 18))
Successfully loaded: drivers.csv (Shape: (861, 9))
Successfully loaded: constructors.csv (Shape: (212, 5))
Successfully loaded: qualifying.csv (Shape: (10494, 9))
Successfully loaded: circuits.csv (Shape: (77, 9))
Successfully loaded: status.csv (Shape: (139, 2))
Successfully loaded: constructor_results.csv (Shape: (12625, 5))
Successfully loaded: lap_times.csv (Shape: (589081, 6))
Successfully loaded: pit_stops.csv (Shape: (11371, 7))
Successfully loaded: sprint_results.csv (Shape: (360, 16))
Defining relevant status IDs for analysis...
Defined 130 relevant status IDs.
Shape before filtering results: (26759, 18)
Shape after filtering results by relevant status IDs: (25373, 18)
Starting initial merges...
Merged qualifying info.
Merged status info.
Initial merges complete.
Starting cleaning...
Cleaned and filled NaN/

In [81]:
# --- STAGE 2: Feature Engineering ---
# This block adds calculated features to 'df_cleaned'

import pandas as pd
import numpy as np

print("--- STAGE 2 START ---")
# Make sure df_cleaned exists from Block 1
if 'df_cleaned' not in locals() or df_cleaned is None:
     raise NameError("DataFrame 'df_cleaned' not found. Please run Block 1 first.")

# Also check required DataFrames for specific features are loaded
required_dfs = {'qualifying': qualifying, 'constructor_results': constructor_results,
                'lap_times': lap_times, 'pit_stops': pit_stops,
                'sprint_results': sprint_results, 'status': status, 'races': races}
for name, df_check in required_dfs.items():
    if df_check is None:
        print(f"Warning: DataFrame '{name}' is None, features requiring it will be skipped.")


# Create a working copy for feature engineering
df_featured = df_cleaned.copy()

# --- Define DNF Status IDs based on user categorization ---
# Categories 1 (Accident/Incident), 2 (Mechanical/Technical), 3 (Driver/Team),
# and selected from 4 (Disqualified, Not Classified, Excluded)
dnf_status_ids = [
     3, 4, 20, 104, 130, 137, 138, # Cat 1
     5, 6, 7, 8, 9, 10, 21, 22, 23, 24, 25, 26, 27, 29, 30, 32, 33, 34, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 51, 56, 59, 60, 61, 63, 64, 65, 66, 67, 69, 70, 71, 72, 74, 75, 76, 79, 80, 83, 84, 85, 86, 87, 91, 95, 98, 99, 101, 102, 103, 105, 106, 108, 109, 110, 121, 126, 129, 131, 132, 135, 140, 141, # Cat 2 (Removed Fuel rig 71?) Check 135 Brake duct, 140 Undertray, 141 Cooling system
     28, 31, 54, 68, 73, 82, 100, 107, 136, 139, # Cat 3
     2, 62, 92, 96 # Cat 4 (Selected: Disqualified, Not classified, Underweight, Excluded)
     # Excluded 35 (Refuelling) from Cat 4
 ]
print(f"Defined {len(dnf_status_ids)} status IDs as DNF indicators.")

# --- Drop potentially existing columns if re-running this cell ---
# Add all potential new feature column names here
cols_to_drop = [
    'avg_finish_pos_last_5', 'prev_finish_pos', 'avg_points_last_5',
    'prev_finish_at_circuit', 'avg_finish_at_circuit', 'quali_diff_from_pole_ms',
    'teammate_grid_diff', 'is_dnf', 'prev_dnf', 'dnf_rate_last_5', # Changed from dnf_last_5_races
    'constructor_dnf_rate_last_5', 'avg_lap_time_ms_last_5', 'prev_avg_lap_time',
    'avg_stops_last_5', 'prev_num_stops', 'sprint_pos', 'sprint_points', 'participated_in_sprint'
]
df_featured = df_featured.drop(columns=cols_to_drop, errors='ignore')
print("Dropped potentially existing feature columns for fresh calculation.")

# --- Feature Engineering Calculations ---

# Base features needed downstream
if 'statusId' in df_featured.columns:
    df_featured['is_dnf'] = df_featured['statusId'].apply(lambda x: 1 if x in dnf_status_ids else 0)
    print("Created 'is_dnf' column.")
else:
    df_featured['is_dnf'] = 0 # Assume not DNF if status unknown
    print("Warning: 'statusId' missing, cannot calculate 'is_dnf'. Set to 0.")

# 2a. Driver Historical Performance (Avg Finish Last 5)
if 'target_finishing_position' in df_featured.columns:
    df_featured['prev_finish_pos'] = df_featured.groupby('driverId')['target_finishing_position'].shift(1)
    df_featured['avg_finish_pos_last_5'] = df_featured.groupby('driverId')['prev_finish_pos'].transform(
        lambda x: x.rolling(window=5, min_periods=1).mean()
    )
    mean_overall_finish = df_featured['target_finishing_position'].mean()
    df_featured['avg_finish_pos_last_5'] = df_featured['avg_finish_pos_last_5'].fillna(mean_overall_finish)
    print("Engineered feature: 'avg_finish_pos_last_5' (driver).")
else: print("Skipping 'avg_finish_pos_last_5': target missing.")

# 2b. Constructor Historical Performance (Avg Points Last 5)
if constructor_results is not None and races is not None:
    try:
        races_minimal = races[['raceId', 'date']]
        constructor_results_merged = pd.merge(constructor_results.dropna(subset=['points']), races_minimal, on='raceId', how='left').dropna(subset=['date'])
        constructor_results_merged = constructor_results_merged.sort_values(by=['date', 'raceId'])
        constructor_results_merged['prev_points'] = constructor_results_merged.groupby('constructorId')['points'].shift(1)
        constructor_results_merged['avg_points_last_5'] = constructor_results_merged.groupby('constructorId')['prev_points'].transform(
            lambda x: x.rolling(window=5, min_periods=1).mean()
        ).fillna(0) # Fill NaNs within transform
        constructor_hist_points = constructor_results_merged[['raceId', 'constructorId', 'avg_points_last_5']].drop_duplicates(['raceId', 'constructorId'], keep='last') # Ensure unique race/constructor combo
        df_featured = pd.merge(df_featured, constructor_hist_points, on=['raceId', 'constructorId'], how='left')
        df_featured['avg_points_last_5'] = df_featured['avg_points_last_5'].fillna(0)
        print("Engineered feature: 'avg_points_last_5' (constructor).")
    except Exception as e:
        print(f"Error calculating constructor points history: {e}")
        df_featured['avg_points_last_5'] = 0 # Default value on error
else:
    print("Skipping constructor avg points: required DataFrames missing.")
    df_featured['avg_points_last_5'] = 0


# 2c. Driver Performance at Specific Circuit (Avg Finish Pos)
if 'target_finishing_position' in df_featured.columns:
    df_featured['prev_finish_at_circuit'] = df_featured.groupby(['driverId', 'circuitId'])['target_finishing_position'].shift(1)
    df_featured['avg_finish_at_circuit'] = df_featured.groupby(['driverId', 'circuitId'])['prev_finish_at_circuit'].transform(
        lambda x: x.expanding(min_periods=1).mean()
    )
    if 'mean_overall_finish' not in locals(): mean_overall_finish = df_featured['target_finishing_position'].mean()
    df_featured['avg_finish_at_circuit'] = df_featured['avg_finish_at_circuit'].fillna(mean_overall_finish)
    print("Engineered feature: 'avg_finish_at_circuit' (driver).")
else:
    print("Skipping driver avg finish at circuit: target missing.")
    df_featured['avg_finish_at_circuit'] = mean_overall_finish if 'mean_overall_finish' in locals() else 10 # Default


# --- 2d. Qualifying Details (Time Difference from Pole) ---

# Function to parse time string MM:SS.ms to milliseconds (keep definition here for clarity)
def time_to_millis(time_str):
    if pd.isna(time_str) or time_str == '\\N': return np.nan
    if isinstance(time_str, str) and not time_str.strip(): return np.nan
    try:
        parts = str(time_str).split(':')
        if len(parts) == 2: # MM:SS.ms
            minutes = int(parts[0]); seconds_parts = parts[1].split('.'); seconds = int(seconds_parts[0]); millis = int(seconds_parts[1])
            return (minutes * 60 + seconds) * 1000 + millis
        elif '.' in str(time_str): # SS.ms
             seconds_parts = str(time_str).split('.'); seconds = int(seconds_parts[0]); millis = int(seconds_parts[1])
             return seconds * 1000 + millis
        else: return int(time_str)*1000 # Assume whole seconds
    except Exception: return np.nan

# Check if qualifying DataFrame exists
if 'qualifying' in locals() and qualifying is not None:
    try:
        print("Calculating qualifying time difference from pole...")
        qualifying_copy = qualifying.copy() # Work on a copy

        # Apply time conversion
        qualifying_copy['q1_millis'] = qualifying_copy['q1'].apply(time_to_millis)
        qualifying_copy['q2_millis'] = qualifying_copy['q2'].apply(time_to_millis)
        qualifying_copy['q3_millis'] = qualifying_copy['q3'].apply(time_to_millis)
        qualifying_copy['best_quali_time_millis'] = qualifying_copy['q3_millis'].fillna(qualifying_copy['q2_millis']).fillna(qualifying_copy['q1_millis'])

        # Find indices of minimum times per race using idxmin()
        min_time_indices = qualifying_copy.groupby('raceId')['best_quali_time_millis'].idxmin()

        # Filter out any NaN indices that resulted from races with no valid times
        valid_indices = min_time_indices.dropna().astype(int) # Ensure integer indices

        # Use only the valid indices to select the rows corresponding to pole times
        if not valid_indices.empty:
            # Check if all valid indices exist in the qualifying_copy index
            valid_indices = valid_indices[valid_indices.isin(qualifying_copy.index)]
            if not valid_indices.empty:
                pole_times = qualifying_copy.loc[valid_indices, ['raceId', 'best_quali_time_millis']].rename(
                    columns={'best_quali_time_millis': 'pole_time_millis'}
                )
                # Merge pole times into the qualifying_copy DataFrame using a left merge
                qualifying_copy = pd.merge(qualifying_copy, pole_times, on='raceId', how='left')

                # Now calculate the difference; result is NaN if 'pole_time_millis' is NaN for that row
                qualifying_copy['quali_diff_from_pole_ms'] = qualifying_copy['best_quali_time_millis'] - qualifying_copy['pole_time_millis']
                print("Calculated 'quali_diff_from_pole_ms' for races with valid pole times.")
            else:
                 # If valid_indices become empty after checking against index (shouldn't normally happen with idxmin unless data is strange)
                 qualifying_copy['quali_diff_from_pole_ms'] = np.nan
                 print("Warning: Valid pole time indices were not found in the DataFrame index. 'quali_diff_from_pole_ms' column created with NaNs.")
        else:
            # If no valid pole times found at all, create the column with NaNs
            qualifying_copy['quali_diff_from_pole_ms'] = np.nan
            print("Warning: No valid pole times found. 'quali_diff_from_pole_ms' column created with NaNs.")

        # Now, regardless of the if/else, the column 'quali_diff_from_pole_ms' should exist
        # Select data for final merge, ensuring the column exists first
        if 'quali_diff_from_pole_ms' in qualifying_copy.columns:
            quali_diff_data = qualifying_copy[['raceId', 'driverId', 'constructorId', 'quali_diff_from_pole_ms']].drop_duplicates(['raceId', 'driverId', 'constructorId'], keep='last')
            # Merge into df_featured
            df_featured = pd.merge(df_featured, quali_diff_data, on=['raceId', 'driverId', 'constructorId'], how='left')

            # Fill any remaining NaNs in the final column in df_featured
            if 'quali_diff_from_pole_ms' in df_featured.columns and df_featured['quali_diff_from_pole_ms'].notna().any():
                mean_quali_diff = df_featured['quali_diff_from_pole_ms'].mean()
                df_featured['quali_diff_from_pole_ms'] = df_featured['quali_diff_from_pole_ms'].fillna(mean_quali_diff)
                print("Filled NaNs in df_featured['quali_diff_from_pole_ms'] using the mean difference.")
            elif 'quali_diff_from_pole_ms' in df_featured.columns:
                 # Handle case where column exists but is all NaN after merge
                 df_featured['quali_diff_from_pole_ms'] = df_featured['quali_diff_from_pole_ms'].fillna(999999) # Example fill with large value
                 print("Filled NaNs in df_featured['quali_diff_from_pole_ms'] using a default large value (999999).")
        else:
             # Fallback if column wasn't created properly
             print("Warning: 'quali_diff_from_pole_ms' column missing in qualifying_copy before final merge. Setting default in df_featured.")
             df_featured['quali_diff_from_pole_ms'] = 999999

    except Exception as e:
        print(f"Error calculating qualifying diff: {e}")
        # Ensure column exists even if error occurs, fill with default
        if 'quali_diff_from_pole_ms' not in df_featured.columns:
             df_featured['quali_diff_from_pole_ms'] = 999999
        else: # If column exists from merge but error happened during fill
             df_featured['quali_diff_from_pole_ms'] = df_featured['quali_diff_from_pole_ms'].fillna(999999)

else:
    print("Skipping qualifying diff feature: qualifying DataFrame missing.")
    # Ensure column exists for consistency downstream, fill with default
    df_featured['quali_diff_from_pole_ms'] = 999999


# 2e. Teammate Comparison (Grid Position Difference)
if 'grid' in df_featured.columns:
    try:
        print("Calculating teammate grid difference...")
        teammate_data = df_featured[['raceId', 'constructorId', 'driverId', 'grid']].rename(
            columns={'grid': 'teammate_grid', 'driverId': 'teammate_driverId'}
        )
        df_merged_teammates = pd.merge(df_featured[['raceId', 'driverId', 'constructorId', 'grid']], teammate_data, on=['raceId', 'constructorId'], how='left')
        df_merged_teammates = df_merged_teammates[df_merged_teammates['driverId'] != df_merged_teammates['teammate_driverId']]
        df_merged_teammates['grid_diff'] = df_merged_teammates['grid'] - df_merged_teammates['teammate_grid']
        teammate_grid_diff = df_merged_teammates.groupby(['raceId', 'driverId'])['grid_diff'].mean().reset_index().rename(columns={'grid_diff': 'teammate_grid_diff'})
        df_featured = pd.merge(df_featured, teammate_grid_diff, on=['raceId', 'driverId'], how='left')
        df_featured['teammate_grid_diff'] = df_featured['teammate_grid_diff'].fillna(0) # Fill NaNs (no teammate) with 0 diff
        print("Engineered feature: 'teammate_grid_diff'.")
    except Exception as e:
        print(f"Error calculating teammate grid diff: {e}")
        df_featured['teammate_grid_diff'] = 0
else:
    print("Skipping teammate grid diff: 'grid' column missing.")
    df_featured['teammate_grid_diff'] = 0


# 2f. Granular History (DNF Rate Last 5 Races)
if 'is_dnf' in df_featured.columns:
    df_featured['prev_dnf'] = df_featured.groupby('driverId')['is_dnf'].shift(1)
    df_featured['dnf_rate_last_5'] = df_featured.groupby('driverId')['prev_dnf'].transform(
        lambda x: x.rolling(window=5, min_periods=0).mean() # Rate instead of count
    ).fillna(0)
    print("Engineered feature: 'dnf_rate_last_5'.")
else:
    print("Skipping DNF rate feature: 'is_dnf' column missing.")
    df_featured['dnf_rate_last_5'] = 0


# --- Add Constructor DNF Rate (Optional but potentially useful) ---
# ... (Similar logic to driver DNF rate but grouping by constructor) ...
# --- Skipping for now to keep complexity manageable ---


# 2g. Lap Time Data (Avg Lap Time ms Last 5 Races)
if lap_times is not None and races is not None:
    try:
        print("Calculating lap time history...")
        # Filter lap times for only drivers/races in our main df
        relevant_races = df_featured['raceId'].unique()
        relevant_drivers = df_featured['driverId'].unique()
        laptimes_filtered = lap_times[lap_times['raceId'].isin(relevant_races) & lap_times['driverId'].isin(relevant_drivers)].copy()
        laptimes_filtered['milliseconds'] = pd.to_numeric(laptimes_filtered['milliseconds'], errors='coerce')
        laptimes_filtered.dropna(subset=['milliseconds'], inplace=True)

        avg_lap_times = laptimes_filtered.groupby(['raceId', 'driverId'])['milliseconds'].mean().reset_index()
        avg_lap_times = avg_lap_times.rename(columns={'milliseconds': 'avg_lap_time_ms'})
        avg_lap_times = pd.merge(avg_lap_times, races[['raceId', 'date']], on='raceId', how='left').dropna(subset=['date'])
        avg_lap_times = avg_lap_times.sort_values(by=['date', 'raceId'])
        avg_lap_times['prev_avg_lap_time'] = avg_lap_times.groupby('driverId')['avg_lap_time_ms'].shift(1)
        avg_lap_times['avg_lap_time_ms_last_5'] = avg_lap_times.groupby('driverId')['prev_avg_lap_time'].transform(
            lambda x: x.rolling(window=5, min_periods=1).mean()
        )
        mean_overall_lap_time = avg_lap_times['avg_lap_time_ms'].mean() # Calculate mean before filling NaNs
        avg_lap_times['avg_lap_time_ms_last_5'] = avg_lap_times['avg_lap_time_ms_last_5'].fillna(mean_overall_lap_time)

        lap_time_history = avg_lap_times[['raceId', 'driverId', 'avg_lap_time_ms_last_5']].drop_duplicates(['raceId','driverId'], keep='last')
        df_featured = pd.merge(df_featured, lap_time_history, on=['raceId', 'driverId'], how='left')
        df_featured['avg_lap_time_ms_last_5'] = df_featured['avg_lap_time_ms_last_5'].fillna(mean_overall_lap_time)
        print("Engineered feature: 'avg_lap_time_ms_last_5'.")
    except Exception as e:
        print(f"Error calculating lap time history: {e}")
        df_featured['avg_lap_time_ms_last_5'] = df_featured['avg_lap_time_ms_last_5'].mean() if 'avg_lap_time_ms_last_5' in df_featured else 95000 # Default backup
else:
    print("Skipping lap time history: lap_times or races DataFrame missing.")
    df_featured['avg_lap_time_ms_last_5'] = 95000 # Approx default ms


# 2h. Pit Stop Data (Avg Stops Last 5 Races)
if pit_stops is not None and races is not None:
    try:
        print("Calculating pit stop history...")
        num_stops = pit_stops.groupby(['raceId', 'driverId'])['stop'].max().reset_index()
        num_stops = num_stops.rename(columns={'stop': 'num_pit_stops'})
        num_stops = pd.merge(num_stops, races[['raceId', 'date']], on='raceId', how='left').dropna(subset=['date'])
        num_stops = num_stops.sort_values(by=['date', 'raceId'])
        num_stops['prev_num_stops'] = num_stops.groupby('driverId')['num_pit_stops'].shift(1)
        num_stops['avg_stops_last_5'] = num_stops.groupby('driverId')['prev_num_stops'].transform(
            lambda x: x.rolling(window=5, min_periods=0).mean()
        ).fillna(0)

        pit_stop_history = num_stops[['raceId', 'driverId', 'avg_stops_last_5']].drop_duplicates(['raceId','driverId'], keep='last')
        df_featured = pd.merge(df_featured, pit_stop_history, on=['raceId', 'driverId'], how='left')
        df_featured['avg_stops_last_5'] = df_featured['avg_stops_last_5'].fillna(0) # Fill drivers with no stops in history with 0
        print("Engineered feature: 'avg_stops_last_5'.")
    except Exception as e:
        print(f"Error calculating pit stop history: {e}")
        df_featured['avg_stops_last_5'] = 1 # Default fallback
else:
    print("Skipping pit stop history: pit_stops or races DataFrame missing.")
    df_featured['avg_stops_last_5'] = 1


# 2i. Sprint Race Data
if sprint_results is not None:
    try:
        print("Incorporating sprint race data...")
        sprint_data = sprint_results[['raceId', 'driverId', 'positionOrder', 'points']].copy()
        sprint_data = sprint_data.rename(columns={'positionOrder': 'sprint_pos', 'points': 'sprint_points'})
        df_featured = pd.merge(df_featured, sprint_data, on=['raceId', 'driverId'], how='left')
        df_featured['participated_in_sprint'] = df_featured['sprint_pos'].notna().astype(int)
        # Fill NaNs for non-sprint races or drivers not in sprint
        df_featured['sprint_pos'] = df_featured['sprint_pos'].fillna(25) # High value for position
        df_featured['sprint_points'] = df_featured['sprint_points'].fillna(0) # 0 points
        print("Engineered features: 'sprint_pos', 'sprint_points', 'participated_in_sprint'.")
    except Exception as e:
        print(f"Error incorporating sprint data: {e}")
        df_featured['sprint_pos'] = 25
        df_featured['sprint_points'] = 0
        df_featured['participated_in_sprint'] = 0
else:
    print("Skipping sprint results data: sprint_results DataFrame missing.")
    df_featured['sprint_pos'] = 25
    df_featured['sprint_points'] = 0
    df_featured['participated_in_sprint'] = 0


# --- Final Check ---
print("\n--- Checking NaNs after ALL Feature Engineering ---")
final_feature_cols = [col for col in df_featured.columns if col not in df_cleaned.columns or col in ['avg_finish_pos_last_5', 'avg_points_last_5', 'avg_finish_at_circuit', 'quali_diff_from_pole_ms', 'teammate_grid_diff', 'dnf_rate_last_5', 'avg_lap_time_ms_last_5', 'avg_stops_last_5', 'sprint_pos', 'sprint_points', 'participated_in_sprint']]
final_feature_cols = [col for col in final_feature_cols if col in df_featured.columns] # Ensure they exist
if final_feature_cols:
    print(df_featured[final_feature_cols].isnull().sum())
else:
    print("No new feature columns found to check for NaNs.")

# Optional: Clean intermediate columns if needed (e.g., 'prev_finish_pos')
df_featured = df_featured.drop(columns=['prev_finish_pos', 'prev_finish_at_circuit'], errors='ignore')


print(f"--- STAGE 2 COMPLETE --- Shape after feature engineering: {df_featured.shape} ---")
# The final DataFrame with all features is df_featured

--- STAGE 2 START ---
Defined 96 status IDs as DNF indicators.
Dropped potentially existing feature columns for fresh calculation.
Created 'is_dnf' column.
Engineered feature: 'avg_finish_pos_last_5' (driver).
Engineered feature: 'avg_points_last_5' (constructor).
Engineered feature: 'avg_finish_at_circuit' (driver).
Calculating qualifying time difference from pole...
Calculated 'quali_diff_from_pole_ms' for races with valid pole times.
Filled NaNs in df_featured['quali_diff_from_pole_ms'] using the mean difference.
Calculating teammate grid difference...
Engineered feature: 'teammate_grid_diff'.
Engineered feature: 'dnf_rate_last_5'.
Calculating lap time history...
Engineered feature: 'avg_lap_time_ms_last_5'.
Calculating pit stop history...
Engineered feature: 'avg_stops_last_5'.
Incorporating sprint race data...
Engineered features: 'sprint_pos', 'sprint_points', 'participated_in_sprint'.

--- Checking NaNs after ALL Feature Engineering ---
is_dnf                        0
prev_finis

In [82]:
# --- STAGE 3: Define Features, Targets, Split Data, Define Preprocessor ---

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Keep necessary imports for this block
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

print("--- STAGE 3 START ---")
# Make sure df_featured exists from Block 2
if 'df_featured' not in locals() or df_featured is None:
     raise NameError("DataFrame 'df_featured' not found. Please run Block 1 and 2 first.")

# --- 3a. Define Features and Targets ---
# ** IMPORTANT: Ensure this list matches columns created in Block 2 **
categorical_features = [
    'driverId',
    'constructorId',
    'circuitId',
    'participated_in_sprint' # Added as categorical/flag
]
numerical_features = [
    'grid',
    'year',
    'avg_finish_pos_last_5',
    'avg_points_last_5',
    'avg_finish_at_circuit',
    'quali_diff_from_pole_ms',
    'teammate_grid_diff',
    'dnf_rate_last_5',
    'avg_lap_time_ms_last_5',
    'avg_stops_last_5',
    'sprint_pos',
    'sprint_points'
]

target_regr = 'target_finishing_position'
target_clas = 'target_podium_finish'

# Check if all defined features and targets exist
all_needed_columns = numerical_features + categorical_features + [target_regr, target_clas]
missing_cols = [col for col in all_needed_columns if col not in df_featured.columns]
if missing_cols:
    raise ValueError(f"Missing required columns in df_featured: {missing_cols}.")

# Select final features (X) and targets (y)
features = numerical_features + categorical_features
X = df_featured[features].copy()
y_regr = df_featured[target_regr]
y_clas = df_featured[target_clas]
print(f"Features selected for modeling: {features}")

# Handle potential NaNs in X
if X.isnull().sum().sum() > 0:
    print("Warning: NaNs found in final feature set X. Imputing with 0.")
    X = X.fillna(0)

# --- 3b. Data Splitting ---
# We split ONCE here. Optuna will use X_train, y_train for cross-validation.
# X_test, y_test are held out for FINAL evaluation ONLY.
X_train, X_test, y_train_regr, y_test_regr = train_test_split(
    X, y_regr, test_size=0.2, random_state=42, stratify=df_featured['year']
)
y_train_clas = y_clas.loc[X_train.index]
y_test_clas = y_clas.loc[X_test.index]
print(f"Data split complete. Train Shape: {X_train.shape}, Test Shape: {X_test.shape}")

# --- 3c. Define Preprocessor (Used in Tuning and Final Model) ---
numerical_cols_final = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols_final = [col for col in categorical_features if col in X_train.columns] # Use predefined list

print(f"Numerical columns for pipeline: {numerical_cols_final}")
print(f"Categorical columns for pipeline: {categorical_cols_final}")

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols_final),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols_final)
    ],
    remainder='passthrough'
)
print("Preprocessor defined.")

# --- REMOVED: Pipeline definitions, fitting, and evaluation on test set ---
# --- Those steps are moved to Block 5, after tuning ---

print("--- STAGE 3 COMPLETE ---")

--- STAGE 3 START ---
Features selected for modeling: ['grid', 'year', 'avg_finish_pos_last_5', 'avg_points_last_5', 'avg_finish_at_circuit', 'quali_diff_from_pole_ms', 'teammate_grid_diff', 'dnf_rate_last_5', 'avg_lap_time_ms_last_5', 'avg_stops_last_5', 'sprint_pos', 'sprint_points', 'driverId', 'constructorId', 'circuitId', 'participated_in_sprint']
Data split complete. Train Shape: (20298, 16), Test Shape: (5075, 16)
Numerical columns for pipeline: ['grid', 'year', 'avg_finish_pos_last_5', 'avg_points_last_5', 'avg_finish_at_circuit', 'quali_diff_from_pole_ms', 'teammate_grid_diff', 'dnf_rate_last_5', 'avg_lap_time_ms_last_5', 'avg_stops_last_5', 'sprint_pos', 'sprint_points', 'driverId', 'constructorId', 'circuitId', 'participated_in_sprint']
Categorical columns for pipeline: ['driverId', 'constructorId', 'circuitId', 'participated_in_sprint']
Preprocessor defined.
--- STAGE 3 COMPLETE ---


In [83]:
# --- STAGE 4: Hyperparameter Tuning with Optuna ---

import optuna
import numpy as np
import xgboost as xgb # Import XGBoost
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier # Keep RF if you want to compare later
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# Make sure necessary variables from Block 3 are available:
# preprocessor, X_train, y_train_regr, y_train_clas

print("--- STAGE 4 START ---")

N_TRIALS = 50 # Number of Optuna trials (increase for better search, e.g., 50-100)
CV_FOLDS = 5 # Number of cross-validation folds

# --- (Optional) Keep Original RF Tuning ---
# You can keep or comment out the RF tuning if you want
# print("\n--- Running Random Forest Tuning (as reference) ---")
# ... (Objective functions and studies for RF from previous Block 4 can remain here if desired) ...
# best_params_regr = study_regr.best_params # Store RF results if kept
# best_params_clas = study_clas.best_params # Store RF results if kept


# --- 4e. Objective Function for XGBoost Regression ---
def objective_regr_xgb(trial):
    # Suggest hyperparameters for XGBoost Regressor
    param = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0), # Fraction of samples used per tree
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0), # Fraction of features used per tree
        'gamma': trial.suggest_float('gamma', 0, 5), # Minimum loss reduction required to make a further partition
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True), # L1 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True), # L2 regularization
        'random_state': 42,
        'n_jobs': -1
    }

    model = xgb.XGBRegressor(**param)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])
    score = cross_val_score(pipeline, X_train, y_train_regr, cv=CV_FOLDS, scoring='neg_mean_absolute_error', n_jobs=-1)
    return score.mean()

# --- 4f. Run Optuna Study for XGBoost Regression ---
print(f"\n--- Running XGBoost Tuning for Regression ---")
print(f"Starting Optuna study for XGBoost Regression (MAE, {N_TRIALS} trials, {CV_FOLDS}-fold CV)...")
study_regr_xgb = optuna.create_study(direction='maximize') # Maximize neg_mean_absolute_error
study_regr_xgb.optimize(objective_regr_xgb, n_trials=N_TRIALS)

print("Optuna study for XGBoost Regression complete.")
print("Best parameters (XGBoost Regression): ", study_regr_xgb.best_params)
print("Best value (Neg MAE): ", study_regr_xgb.best_value)
best_params_regr_xgb = study_regr_xgb.best_params


# --- 4g. Objective Function for XGBoost Classification ---
# Calculate scale_pos_weight (needs y_train_clas)
neg_count = np.sum(y_train_clas == 0)
pos_count = np.sum(y_train_clas == 1)
scale_pos_weight_val = neg_count / pos_count if pos_count > 0 else 1
print(f"\nCalculated scale_pos_weight for XGBClassifier Tuning: {scale_pos_weight_val:.2f}")

def objective_clas_xgb(trial):
    # Suggest hyperparameters for XGBoost Classifier
    param = {
        'objective': 'binary:logistic',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'scale_pos_weight': scale_pos_weight_val, # Use pre-calculated value
        'random_state': 42,
        'n_jobs': -1
    }

    model = xgb.XGBClassifier(**param)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    score = cross_val_score(pipeline, X_train, y_train_clas, cv=CV_FOLDS, scoring='f1', n_jobs=-1) # Optimize F1
    return score.mean()

# --- 4h. Run Optuna Study for XGBoost Classification ---
print(f"\n--- Running XGBoost Tuning for Classification ---")
print(f"Starting Optuna study for XGBoost Classification (F1 Score, {N_TRIALS} trials, {CV_FOLDS}-fold CV)...")
study_clas_xgb = optuna.create_study(direction='maximize') # Maximize F1 score
study_clas_xgb.optimize(objective_clas_xgb, n_trials=N_TRIALS)

print("Optuna study for XGBoost Classification complete.")
print("Best parameters (XGBoost Classification): ", study_clas_xgb.best_params)
print("Best value (F1 Score): ", study_clas_xgb.best_value)
best_params_clas_xgb = study_clas_xgb.best_params


print("--- STAGE 4 COMPLETE ---")

[I 2025-04-06 15:04:25,323] A new study created in memory with name: no-name-27d0b3b1-8ecb-444d-abcf-cbfa58073d47


--- STAGE 4 START ---

--- Running XGBoost Tuning for Regression ---
Starting Optuna study for XGBoost Regression (MAE, 50 trials, 5-fold CV)...


[I 2025-04-06 15:04:36,945] Trial 0 finished with value: -4.815522003173828 and parameters: {'n_estimators': 400, 'learning_rate': 0.11284802133425699, 'max_depth': 5, 'subsample': 0.9456040052527537, 'colsample_bytree': 0.5696922940375366, 'gamma': 1.0200399378811364, 'reg_alpha': 0.0021560960788677857, 'reg_lambda': 0.0033851780773969213}. Best is trial 0 with value: -4.815522003173828.
[I 2025-04-06 15:04:42,642] Trial 1 finished with value: -4.907850074768066 and parameters: {'n_estimators': 100, 'learning_rate': 0.10492842394256792, 'max_depth': 3, 'subsample': 0.6736783727927032, 'colsample_bytree': 0.7508047613125373, 'gamma': 2.501727098948248, 'reg_alpha': 0.5406923056646545, 'reg_lambda': 0.000932662723573297}. Best is trial 0 with value: -4.815522003173828.
[I 2025-04-06 15:05:13,618] Trial 2 finished with value: -4.942002391815185 and parameters: {'n_estimators': 1000, 'learning_rate': 0.1006941049639469, 'max_depth': 9, 'subsample': 0.5979461629352489, 'colsample_bytree': 

Optuna study for XGBoost Regression complete.
Best parameters (XGBoost Regression):  {'n_estimators': 900, 'learning_rate': 0.038633681436004576, 'max_depth': 6, 'subsample': 0.960801107370461, 'colsample_bytree': 0.6263699801226762, 'gamma': 4.926498329907147, 'reg_alpha': 0.5687887554840704, 'reg_lambda': 5.810428411766778e-05}
Best value (Neg MAE):  -4.799409198760986

Calculated scale_pos_weight for XGBClassifier Tuning: 6.42

--- Running XGBoost Tuning for Classification ---
Starting Optuna study for XGBoost Classification (F1 Score, 50 trials, 5-fold CV)...


[I 2025-04-06 15:21:12,267] Trial 0 finished with value: 0.5214314053991451 and parameters: {'n_estimators': 200, 'learning_rate': 0.08645717198597459, 'max_depth': 4, 'subsample': 0.7972351582165182, 'colsample_bytree': 0.8954887644347789, 'gamma': 3.283383429573518, 'reg_alpha': 0.0012075953732307383, 'reg_lambda': 1.06799011583065e-08}. Best is trial 0 with value: 0.5214314053991451.
[I 2025-04-06 15:21:29,246] Trial 1 finished with value: 0.5221784532327052 and parameters: {'n_estimators': 700, 'learning_rate': 0.1665745342346827, 'max_depth': 3, 'subsample': 0.8378550466365733, 'colsample_bytree': 0.5468272003669729, 'gamma': 1.8920384463035667, 'reg_alpha': 1.1143732254766286e-08, 'reg_lambda': 0.00429446908032998}. Best is trial 1 with value: 0.5221784532327052.
[I 2025-04-06 15:21:36,048] Trial 2 finished with value: 0.5245703666148304 and parameters: {'n_estimators': 100, 'learning_rate': 0.02723146334007002, 'max_depth': 7, 'subsample': 0.8175712139757143, 'colsample_bytree':

Optuna study for XGBoost Classification complete.
Best parameters (XGBoost Classification):  {'n_estimators': 400, 'learning_rate': 0.02404725697718134, 'max_depth': 10, 'subsample': 0.8464161577053163, 'colsample_bytree': 0.5384129296787337, 'gamma': 1.284020793833157, 'reg_alpha': 0.0010937946130121804, 'reg_lambda': 0.00034656017495160576}
Best value (F1 Score):  0.5381897693492729
--- STAGE 4 COMPLETE ---


In [85]:
# --- STAGE 5: Final Model Training & Evaluation (Using Tuned Random Forest) ---

# --- Imports for this block ---
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, accuracy_score, classification_report
# Import Random Forest models
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
# Make sure necessary variables are available from previous blocks:
# preprocessor,
# best_params_regr, best_params_clas, # <<< Use the ORIGINAL RF params from Optuna
# X_train, y_train_regr, y_train_clas, X_test, y_test_regr, y_test_clas

print("--- STAGE 5 START (Using Tuned Random Forest with FULL features) ---")

# --- 5a. Define Final Pipelines with Tuned Random Forest Parameters ---
print("Defining final pipelines using best parameters found previously by Optuna for Random Forest...")

# Check if best parameters for RF were found/are available
# If you didn't store them previously, you might need to re-run the RF tuning part of Block 4
# or use the last known good parameters manually.
if 'best_params_regr' not in locals():
    print("Warning: Best RandomForest Regressor params ('best_params_regr') not found! Using defaults as fallback.")
    best_params_regr = {} # Fallback to defaults
if 'best_params_clas' not in locals():
    print("Warning: Best RandomForest Classifier params ('best_params_clas') not found! Using defaults as fallback.")
    best_params_clas = {} # Fallback to defaults

# Define RandomForest Regressor using parameters found previously by Optuna
final_rf_regressor = RandomForestRegressor(
    **best_params_regr, # Unpack the best RF parameters
    random_state=42,
    n_jobs=-1
)

# Define RandomForest Classifier using parameters found previously by Optuna
final_rf_classifier = RandomForestClassifier(
    **best_params_clas, # Unpack the best RF parameters
    random_state=42,
    class_weight='balanced', # Ensure class_weight is retained
    n_jobs=-1
)

# Create full pipelines with TUNED Random Forest models
final_pipeline_regr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', final_rf_regressor) # Use Tuned RF Regressor
])

final_pipeline_clas = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', final_rf_classifier) # Use Tuned RF Classifier
])

# --- 5b. Train Final Models on FULL Training Data ---
print("Training final Tuned Random Forest Regression model on full training data...")
final_pipeline_regr.fit(X_train, y_train_regr)
print("Final Tuned Random Forest Regression model trained.")

print("Training final Tuned Random Forest Classification model on full training data...")
final_pipeline_clas.fit(X_train, y_train_clas)
print("Final Tuned Random Forest Classification model trained.")

# --- 5c. Evaluate Final Models on HELD-OUT Test Data ---
print("\n--- FINAL Tuned Random Forest Regression Model Evaluation (on Test Set with FULL features) ---")
y_pred_regr_final = final_pipeline_regr.predict(X_test)
mae_final = mean_absolute_error(y_test_regr, y_pred_regr_final) # RF usually predicts integers if target is int
print(f"FINAL Mean Absolute Error (MAE) on Test Set: {mae_final:.4f}")

print("\n--- FINAL Tuned Random Forest Classification Model Evaluation (on Test Set with FULL features) ---")
y_pred_clas_final = final_pipeline_clas.predict(X_test)
accuracy_final = accuracy_score(y_test_clas, y_pred_clas_final)
print(f"FINAL Accuracy on Test Set: {accuracy_final:.4f}")
print("\nFINAL Classification Report (on Test Set):")
print(classification_report(y_test_clas, y_pred_clas_final, target_names=['No Podium', 'Podium']))

print("--- STAGE 5 COMPLETE (Using Tuned Random Forest with FULL features) ---")

--- STAGE 5 START (Using Tuned Random Forest with FULL features) ---
Defining final pipelines using best parameters found previously by Optuna for Random Forest...
Training final Tuned Random Forest Regression model on full training data...
Final Tuned Random Forest Regression model trained.
Training final Tuned Random Forest Classification model on full training data...
Final Tuned Random Forest Classification model trained.

--- FINAL Tuned Random Forest Regression Model Evaluation (on Test Set with FULL features) ---
FINAL Mean Absolute Error (MAE) on Test Set: 4.9244

--- FINAL Tuned Random Forest Classification Model Evaluation (on Test Set with FULL features) ---
FINAL Accuracy on Test Set: 0.8300

FINAL Classification Report (on Test Set):
              precision    recall  f1-score   support

   No Podium       0.96      0.84      0.90      4414
      Podium       0.42      0.75      0.54       661

    accuracy                           0.83      5075
   macro avg       0.69  