In [17]:
import fastf1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure matplotlib for inline plotting
%matplotlib inline

In [18]:
# Enable FastF1's cache, rooted at '../data' (relative to the notebook's working directory).
fastf1.Cache.enable_cache('../data')  # Cache data in the 'data' directory

In [19]:
def collect_race_data(years, race_names):
    """Collect race and qualifying data from FastF1 for every (year, race) pair.

    For each combination the race session ('R') and the qualifying session
    ('Q') are loaded. Per-session frames are tagged with ``Year`` and
    ``RaceName`` and accumulated. If anything fails for a combination the
    error is printed and the loop moves on; frames appended before the
    failure stay in the accumulated lists.

    Args:
        years: Iterable of season years passed to ``fastf1.get_session``.
        race_names: Iterable of race identifiers passed to
            ``fastf1.get_session``.

    Returns:
        Tuple of six concatenated DataFrames, in this order: race results,
        race laps joined with weather, qualifying positions (columns
        ``DriverNumber`` and ``QualiPosition``), qualifying laps joined with
        weather, car telemetry, position telemetry.

    Raises:
        ValueError: Propagated from ``pd.concat`` when one of the
            accumulated lists is empty (nothing could be loaded).
    """
    all_results = []
    all_laps = []
    all_qualy = []
    all_qualy_laps = []
    all_car_data = []
    all_positions = []

    def _laps_with_weather(session_laps, year, race_name):
        # Join each lap row with its weather row column-wise, then tag the session.
        weather = session_laps.get_weather_data().reset_index(drop=True)
        lap_rows = session_laps.reset_index(drop=True)
        joined = pd.concat(
            [lap_rows, weather.loc[:, ~(weather.columns == 'Time')]], axis=1
        )
        # Tag the joined frame itself: it is the object that gets returned.
        # (Previously the race-lap tags were written to the pre-join frame and lost.)
        joined['Year'] = year
        joined['RaceName'] = race_name
        return joined

    def _append_driver_frames(per_driver, drivers, year, race_name, sink):
        # Tag each driver's telemetry frame and append it to ``sink``.
        for driver in drivers:
            if driver in per_driver:
                frame = per_driver[driver]
                frame['DriverNumber'] = driver
                frame['Year'] = year
                frame['RaceName'] = race_name
                sink.append(frame)

    for year in years:
        for race_name in race_names:
            try:
                fastf1.Cache.offline_mode(enabled=True)
                # Load race session
                session = fastf1.get_session(year, race_name, 'R')

                if session is None:
                    print(f"Session not found for {year} {race_name}")
                    continue

                session.load()

                results = session.results
                results['Year'] = year
                results['RaceName'] = race_name
                all_results.append(results)

                # Race laps, joined with weather and tagged with year/race
                all_laps.append(_laps_with_weather(session.laps, year, race_name))

                # Per-driver car telemetry and positional data
                _append_driver_frames(
                    session.car_data, session.drivers, year, race_name, all_car_data
                )
                _append_driver_frames(
                    session.pos_data, session.drivers, year, race_name, all_positions
                )

                # Load qualifying session
                qualy_session = fastf1.get_session(year, race_name, 'Q')
                qualy_session.load()
                qualy_results = qualy_session.results
                qualy_results['Year'] = year
                qualy_results['RaceName'] = race_name
                # NOTE(review): only DriverNumber/QualiPosition are kept, so the
                # Year/RaceName tags above do not reach the returned frame —
                # confirm whether they should be included.
                all_qualy.append(
                    qualy_results[['DriverNumber', 'Position']].rename(
                        columns={'Position': 'QualiPosition'}
                    )
                )

                # Qualifying laps, joined with weather and tagged with year/race
                all_qualy_laps.append(
                    _laps_with_weather(qualy_session.laps, year, race_name)
                )

                print(f"Loaded data for {race_name} {year}")
            except Exception as e:
                # Best effort: report the failing combination and keep going.
                print(f"Error loading {race_name} {year}: {e}")

    # Concatenate the per-session frames into one frame per data kind
    all_results_df = pd.concat(all_results, ignore_index=True)
    all_laps_df = pd.concat(all_laps, ignore_index=True)
    all_qualy_df = pd.concat(all_qualy, ignore_index=True)
    all_qualy_laps_df = pd.concat(all_qualy_laps, ignore_index=True)
    all_car_data = pd.concat(all_car_data, ignore_index=True)
    all_positions_df = pd.concat(all_positions, ignore_index=True)

    return all_results_df, all_laps_df, all_qualy_df, all_qualy_laps_df, all_car_data, all_positions_df


In [None]:
# Seasons 2019-2024 inclusive, and identifiers 1-24 for every season.
# NOTE(review): the same 1-24 range is requested for every season; combinations
# that cannot be loaded are reported by collect_race_data's error handler.
years = [*range(2019, 2025)]
race_names = [*range(1, 25)]
# years = [2020]
# race_names = [1,2,3]
# Collect everything in one pass and unpack the six frames
(
    results,
    laps,
    qualy,
    qualylaps,
    car_data,
    positions,
) = collect_race_data(years, race_names)

In [5]:
# Write the collected frames to CSV (index omitted), one file per frame.
# NOTE(review): qualylaps is not written here — confirm that is intentional.
csv_targets = {
    '../data/historical_results.csv': results,
    '../data/historical_laps.csv': laps,
    '../data/historical_qualy.csv': qualy,
    '../data/historical_car_data.csv': car_data,
    '../data/historical_positions.csv': positions,
}
for csv_path, frame in csv_targets.items():
    frame.to_csv(csv_path, index=False)

In [22]:
# Average lap time per driver.
# The mean is taken straight from `laps`. Joining every result row to every lap
# row of the same driver first (as was done before) only repeats each lap the
# same number of times, so it leaves the per-driver mean unchanged (up to
# floating-point rounding) while building a very large intermediate frame.
avg_lap_times = (
    laps.groupby('DriverNumber')['LapTime']
    .mean()
    .reset_index()
    .rename(columns={'LapTime': 'AvgLapTime'})
)

# Base frame: the result columns used downstream, inner-joined with the
# average lap time (drivers without any laps are dropped by the inner join).
# NOTE(review): `results` holds one row per driver per collected race while the
# features are keyed on DriverNumber only — confirm that is the intended grain.
results_df = results[['DriverNumber', 'FullName', 'TeamName', 'Position', 'Points', 'Status']]
results_with_features = pd.merge(results_df, avg_lap_times, on='DriverNumber')

In [23]:
# Top-speed feature: use 'SpeedST' when the laps frame has it, otherwise 'Speed'.
speed_column = next(
    (name for name in ('SpeedST', 'Speed') if name in laps.columns),
    None,
)

if speed_column is None:
    # Neither speed column exists: report it and leave the feature frame untouched.
    print("Speed data not available in laps dataframe.")
    print("Top Speed feature cannot be added due to missing speed data.")
else:
    # Maximum of the chosen speed column per driver
    top_speeds = (
        laps.groupby('DriverNumber')[speed_column]
        .max()
        .reset_index()
        .rename(columns={speed_column: 'TopSpeed'})
    )

    # Left join keeps every existing feature row
    results_with_features = pd.merge(results_with_features, top_speeds, on='DriverNumber', how='left')


In [24]:
# Per-driver telemetry aggregates, written with named aggregation so the
# output columns get their final names directly.
car_telemetry_features = (
    car_data.groupby('DriverNumber')
    .agg(
        AvgSpeed=('Speed', 'mean'),
        MaxSpeed=('Speed', 'max'),
        AvgRPM=('RPM', 'mean'),
        AvgThrottle=('Throttle', 'mean'),
        AvgBrake=('Brake', 'mean'),
    )
    .reset_index()
)

# Attach the telemetry aggregates to the feature frame (left join on driver)
results_with_features = results_with_features.merge(
    car_telemetry_features, on='DriverNumber', how='left'
)

In [25]:
# Keep position samples whose X and Y are both non-zero and whose status is not 'OffTrack'
keep_row = (
    (positions['X'] != 0)
    & (positions['Y'] != 0)
    & (positions['Status'] != 'OffTrack')
)
valid_positions = positions[keep_row]

# Per-driver mean coordinates plus the number of retained samples
positional_features = (
    valid_positions.groupby('DriverNumber')
    .agg(
        AvgXPosition=('X', 'mean'),
        AvgYPosition=('Y', 'mean'),
        AvgZPosition=('Z', 'mean'),
        NumPositionSamples=('Time', 'count'),
    )
    .reset_index()
)

# Attach to the feature frame (left join on driver)
results_with_features = results_with_features.merge(
    positional_features, on='DriverNumber', how='left'
)

In [None]:
# Laps on which the driver entered the pits ('PitInTime' is set)
pit_laps = laps[laps['PitInTime'].notnull()]
# Number of such laps per driver
pit_stop_counts = pit_laps.groupby('DriverNumber').size().reset_index(name='NumPitStops')

# Merge with results_with_features
results_with_features = pd.merge(results_with_features, pit_stop_counts, on='DriverNumber', how='left')
# Drivers without any pit lap get 0. Assign the filled column back instead of
# calling fillna(inplace=True) on the column selection: that chained in-place
# form is deprecated and does not update the frame under Copy-on-Write.
results_with_features['NumPitStops'] = results_with_features['NumPitStops'].fillna(0)


In [27]:
# Attach the qualifying position to each feature row (left join on driver).
# NOTE(review): `qualy` gets one row per driver per collected session, so a join
# on DriverNumber alone can multiply rows — confirm the intended join keys.
results_with_features = results_with_features.merge(qualy, on='DriverNumber', how='left')

In [28]:
# Compound used on each lap, per driver
tyre_compounds = laps[['DriverNumber', 'Compound']]

# Most frequent compound per driver (first entry of value_counts)
most_used_compound = (
    tyre_compounds.groupby('DriverNumber')['Compound']
    .agg(lambda compounds: compounds.value_counts().index[0])
    .reset_index()
    .rename(columns={'Compound': 'MostUsedCompound'})
)

# One indicator column per compound, prefixed with 'Tyre'
tyre_dummies = pd.get_dummies(most_used_compound['MostUsedCompound'], prefix='Tyre')

# Put the indicator columns next to the driver rows
most_used_compound = pd.concat([most_used_compound, tyre_dummies], axis=1)

# Join the indicators (without the raw compound label) onto the feature frame
compound_indicators = most_used_compound.drop('MostUsedCompound', axis=1)
results_with_features = pd.merge(
    results_with_features, compound_indicators, on='DriverNumber', how='left'
)

In [None]:
# Weather columns carried on every lap row
weather_columns = ['AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindDirection', 'WindSpeed']
weather_data = laps[['DriverNumber'] + weather_columns]

# Mean, min, max and standard deviation of each weather column per driver
weather_variability = weather_data.groupby('DriverNumber').agg(
    {column: ['mean', 'min', 'max', 'std'] for column in weather_columns}
).reset_index()

# Flatten the two-level column names: Avg/Min/Max/Std + column name, in that order
weather_variability.columns = ['DriverNumber'] + [
    f'{stat}{column}'
    for column in weather_columns
    for stat in ('Avg', 'Min', 'Max', 'Std')
]

# Lap-to-lap change of AirTemp and WindSpeed.
# The difference is taken within each driver (and within each Year/RaceName when
# those columns exist), so the first lap of a group no longer receives a bogus
# "change" relative to the previous driver's or previous session's last lap.
change_keys = [key for key in ('Year', 'RaceName', 'DriverNumber') if key in laps.columns]
laps['AirTempChange'] = laps.groupby(change_keys)['AirTemp'].diff()
laps['WindSpeedChange'] = laps.groupby(change_keys)['WindSpeed'].diff()

# Mean and standard deviation of the lap-to-lap changes per driver
weather_rate_of_change = laps.groupby('DriverNumber').agg({
    'AirTempChange': ['mean', 'std'],
    'WindSpeedChange': ['mean', 'std']
}).reset_index()

# Flatten the column names
weather_rate_of_change.columns = ['DriverNumber', 'AvgAirTempChange', 'StdAirTempChange',
                                  'AvgWindSpeedChange', 'StdWindSpeedChange']


# Merge weather variability features into results_with_features
results_with_features = pd.merge(results_with_features, weather_variability, on='DriverNumber', how='left')

# Merge weather rate of change features into results_with_features
results_with_features = pd.merge(results_with_features, weather_rate_of_change, on='DriverNumber', how='left')

results_with_features.columns


In [30]:
# Average pit stop duration per driver.
# A stop's duration is the pit-exit time recorded on the following lap minus the
# pit-entry time recorded on the in-lap. Averaging 'PitOutTime' of the in-laps
# (the previous approach) averages a timestamp, not a duration.
# NOTE(review): assumes PitInTime/PitOutTime are session timestamps and that the
# pit exit is recorded on the lap after the in-lap — confirm against FastF1 docs.
pit_keys = [key for key in ('Year', 'RaceName', 'DriverNumber') if key in laps.columns]

# Order laps within each group so "next lap" is well defined
ordered_laps = laps.sort_values('LapNumber', kind='stable') if 'LapNumber' in laps.columns else laps
pit_in_time = pd.to_timedelta(ordered_laps['PitInTime'])
next_pit_out_time = (
    pd.to_timedelta(ordered_laps['PitOutTime'])
    .groupby([ordered_laps[key] for key in pit_keys])
    .shift(-1)
)
stop_duration = next_pit_out_time - pit_in_time

# Keep only laps that have both timestamps and a positive duration
valid_stop = stop_duration.notna() & (stop_duration > pd.Timedelta(0))
pit_stop_durations = (
    stop_duration[valid_stop]
    .groupby(ordered_laps.loc[valid_stop, 'DriverNumber'])
    .mean()
    .rename('AvgPitStopDuration')
    .reset_index()
)

# Merge with results_with_features
results_with_features = pd.merge(results_with_features, pit_stop_durations, on='DriverNumber', how='left')

In [31]:
# Convert the sector-time columns of `laps` to float seconds.
# The conversion is skipped for columns that are already numeric, so re-running
# the cell does not reinterpret seconds as nanoseconds (pd.to_timedelta treats
# plain numbers as nanoseconds by default).
for sector_column in ('Sector1Time', 'Sector2Time', 'Sector3Time'):
    if not pd.api.types.is_numeric_dtype(laps[sector_column]):
        laps[sector_column] = pd.to_timedelta(laps[sector_column]).dt.total_seconds()

# Average of each sector time per driver
sector_times = laps.groupby('DriverNumber').agg({
    'Sector1Time': 'mean',
    'Sector2Time': 'mean',
    'Sector3Time': 'mean'
}).reset_index()

# Rename columns for clarity
sector_times = sector_times.rename(columns={
    'Sector1Time': 'AvgSector1Time',
    'Sector2Time': 'AvgSector2Time',
    'Sector3Time': 'AvgSector3Time'
})

# Merge sector times with the main results_with_features DataFrame
results_with_features = pd.merge(results_with_features, sector_times, on='DriverNumber', how='left')

In [32]:
# Convert the qualifying sector and lap times to float seconds (in place, later
# cells read qualylaps['LapTime'] in seconds). Columns that are already numeric
# are left alone so re-running the cell does not reinterpret seconds as
# nanoseconds.
for time_column in ('Sector1Time', 'Sector2Time', 'Sector3Time', 'LapTime'):
    if not pd.api.types.is_numeric_dtype(qualylaps[time_column]):
        qualylaps[time_column] = pd.to_timedelta(qualylaps[time_column]).dt.total_seconds()

# Minimum lap time and minimum of each sector time per driver.
# Each minimum is taken independently, so the sector minima need not come from
# the same lap as the fastest lap.
fastest_qualy_lap = qualylaps.groupby('DriverNumber').agg({
    'LapTime': 'min',
    'Sector1Time': 'min',
    'Sector2Time': 'min',
    'Sector3Time': 'min'
}).reset_index()

# Rename the columns for clarity
fastest_qualy_lap = fastest_qualy_lap.rename(columns={
    'LapTime': 'FastestQualyLapTime',
    'Sector1Time': 'FastestQualySector1Time',
    'Sector2Time': 'FastestQualySector2Time',
    'Sector3Time': 'FastestQualySector3Time'
})

# Merge fastest qualifying lap and sector times with the main dataset
results_with_features = pd.merge(results_with_features, fastest_qualy_lap, on='DriverNumber', how='left')

In [33]:
# Mean qualifying lap time per driver, as a two-column frame
average_qualy_lap = (
    qualylaps.groupby('DriverNumber')
    .agg({'LapTime': 'mean'})
    .reset_index()
    .rename(columns={'LapTime': 'AvgQualyLapTime'})
)

# Attach to the feature frame (left join on driver)
results_with_features = results_with_features.merge(
    average_qualy_lap, on='DriverNumber', how='left'
)

In [34]:
# Convert timedelta features to float seconds.
# Columns that are already numeric are skipped so that re-running the cell does
# not reinterpret seconds as nanoseconds (pd.to_timedelta treats plain numbers
# as nanoseconds by default).
time_features = ['AvgLapTime', 'AvgPitStopDuration']
for feature in time_features:
    if not pd.api.types.is_numeric_dtype(results_with_features[feature]):
        results_with_features[feature] = pd.to_timedelta(results_with_features[feature]).dt.total_seconds()

In [None]:
# Drop the duplicated columns that a merge marked with the '_y' suffix.
# Match the suffix at the END of the name only; a substring test would also
# drop any column that merely contains '_y' somewhere in its name.
duplicate_columns = [col for col in results_with_features.columns if col.endswith('_y')]
results_with_features = results_with_features.drop(columns=duplicate_columns)

# Strip a trailing '_x' suffix (again anchored at the end of the name)
results_with_features.columns = results_with_features.columns.str.replace(r'_x$', '', regex=True)

results_with_features.columns

In [35]:
# Continuous features, assembled from thematic groups. The concatenation order
# defines the column order of X.
race_pace_features = ['AvgLapTime', 'TopSpeed', 'NumPitStops', 'QualiPosition']
telemetry_features = ['AvgThrottle', 'AvgBrake', 'AvgSpeed', 'MaxSpeed', 'AvgRPM']
position_features = ['AvgXPosition', 'AvgYPosition', 'AvgZPosition',
                     'AvgPitStopDuration', 'NumPositionSamples']
sector_features = ['AvgSector1Time', 'AvgSector2Time', 'AvgSector3Time']
qualifying_features = ['FastestQualyLapTime', 'FastestQualySector1Time',
                       'FastestQualySector2Time', 'FastestQualySector3Time',
                       'AvgQualyLapTime']
# Min/Max/Std for each weather measurement (the Avg* columns are not used)
weather_spread_features = [
    f'{stat}{measurement}'
    for measurement in ('AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindDirection', 'WindSpeed')
    for stat in ('Min', 'Max', 'Std')
]
weather_change_features = ['AvgAirTempChange', 'StdAirTempChange',
                           'AvgWindSpeedChange', 'StdWindSpeedChange']

continuous_features = (
    race_pace_features
    + telemetry_features
    + position_features
    + sector_features
    + qualifying_features
    + weather_spread_features
    + weather_change_features
)

# Model inputs: continuous features followed by the tyre indicator columns
features = continuous_features + list(tyre_dummies.columns)
target = 'Position'

# Remove rows that miss any model input or the target
results_with_features.dropna(subset=[*features, target], inplace=True)


# Design matrix and target vector
X = results_with_features[features]
y = results_with_features[target]


In [36]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [37]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
X_train_continuous = scaler.fit_transform(X_train[continuous_features])
X_test_continuous = scaler.transform(X_test[continuous_features])

# Wrap the scaled arrays back into frames that keep the original row index
X_train_continuous_df = pd.DataFrame(
    X_train_continuous,
    index=X_train.index,
    columns=continuous_features,
)
X_test_continuous_df = pd.DataFrame(
    X_test_continuous,
    index=X_test.index,
    columns=continuous_features,
)

In [111]:
# Scaled continuous columns first, then the remaining (unscaled) columns of each split
X_train_unscaled_rest = X_train.drop(columns=continuous_features)
X_test_unscaled_rest = X_test.drop(columns=continuous_features)
X_train_final = pd.concat([X_train_continuous_df, X_train_unscaled_rest], axis=1)
X_test_final = pd.concat([X_test_continuous_df, X_test_unscaled_rest], axis=1)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed
rf_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**rf_params)

# NOTE(review): the fit uses the unscaled X_train, not X_train_final — confirm
# which of the two is meant to feed the model.
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error

# Predict on the held-out rows and report the mean absolute error
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

In [None]:
import joblib

# Destination files for the fitted estimator and the fitted scaler
model_path = '../models/f1_model_2.pkl'
scaler_path = '../models/scaler_2.pkl'

# Persist the model
joblib.dump(model, model_path)

# Persist the scaler.
# NOTE(review): `model` is fitted on unscaled inputs in this notebook; confirm
# whether consumers of these files are expected to apply the scaler.
joblib.dump(scaler, scaler_path)


In [None]:
# Pair each model input with its importance from the fitted forest
feature_names = features
importances = model.feature_importances_

# Importance table, most important feature first
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the importance table
plt.figure(figsize=(15, 10))
sns.barplot(data=feature_importance_df, y='Feature', x='Importance')
plt.title('Feature Importances')
plt.show()


In [None]:
# Display the random-forest importance table (sorted by importance, descending).
feature_importance_df

In [None]:
# Residuals of the random-forest predictions (actual minus predicted)
residuals = y_test - y_pred

# Histogram with a density estimate overlay
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True, bins=20)
plt.title('Residuals Distribution')
plt.xlabel('Residual (Actual - Predicted Position)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Rows of the feature frame that have no qualifying position
missing_quali_mask = results_with_features['QualiPosition'].isnull()
mismatches = results_with_features[missing_quali_mask]
print(f"Number of drivers without qualifying position: {len(mismatches)}")

In [None]:
# Standard deviation of each averaged weather feature across the feature rows.
# Note: this rebinds `weather_variability` (a per-driver frame earlier in the
# notebook) to a Series of standard deviations.
weather_features = [
    'AvgAirTemp',
    'AvgHumidity',
    'AvgPressure',
    'AvgTrackTemp',
    'AvgWindDirection',
    'AvgWindSpeed',
]
weather_variability = results_with_features[weather_features].std()
print(weather_variability)

In [45]:
# Columns that are stored as generic objects in the training split
object_columns = [col for col in X_train.columns if X_train[col].dtype == 'object']

# Parse those columns as numbers in both splits; unparseable values become NaN
for col in object_columns:
    X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
    X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

# Replace every remaining missing value with 0 in both splits
for split_frame in (X_train, X_test):
    split_frame.fillna(0, inplace=True)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# XGBoost regressor configuration
xgb_params = dict(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model = XGBRegressor(**xgb_params)

# Fit on the training split
xgb_model.fit(X_train, y_train)

# Score on the held-out split
y_pred_xgb = xgb_model.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print(f"XGBoost Mean Absolute Error: {mae_xgb}")

In [None]:
# Importances reported by the fitted XGBoost model
importances_xgb = xgb_model.feature_importances_

# Importance table, most important feature first
feature_importance_xgb_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances_xgb}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the importance table
plt.figure(figsize=(15, 10))
sns.barplot(data=feature_importance_xgb_df, y='Feature', x='Importance')
plt.title('XGBoost Feature Importances')
plt.show()

In [None]:
# Residuals of the XGBoost predictions (actual minus predicted)
residuals = y_test - y_pred_xgb

# Histogram with a density estimate overlay
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True, bins=20)
plt.title('Residuals Distribution')
plt.xlabel('Residual (Actual - Predicted Position)')
plt.ylabel('Frequency')
plt.show()

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

# LightGBM regressor configuration
lgb_params = dict(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
lgb_model = lgb.LGBMRegressor(**lgb_params)

# Fit on the training split
lgb_model.fit(X_train, y_train)

# Score on the held-out split
y_pred_lgb = lgb_model.predict(X_test)
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
print(f"LightGBM Mean Absolute Error: {mae_lgb}")

In [None]:
# Importances reported by the fitted LightGBM model
importances_lgb = lgb_model.feature_importances_

# Importance table, most important feature first
feature_importance_lgb_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances_lgb}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the importance table
plt.figure(figsize=(15, 10))
sns.barplot(data=feature_importance_lgb_df, y='Feature', x='Importance')
plt.title('LightGBM Feature Importances')
plt.show()

In [None]:
# Residuals of the LightGBM predictions (actual minus predicted)
residuals = y_test - y_pred_lgb

# Histogram with a density estimate overlay
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True, bins=20)
plt.title('Residuals Distribution')
plt.xlabel('Residual (Actual - Predicted Position)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Randomized hyperparameter search for the random forest
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Sampling distributions / candidate values for each hyperparameter
param_dist = dict(
    n_estimators=randint(50, 200),
    max_depth=randint(2, 10),
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 20),
    max_features=['sqrt', 'log2'],
)

# 20 sampled configurations, 5-fold CV, scored by negated mean absolute error
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='neg_mean_absolute_error',
    random_state=42,
)

# Run the search on the training split
random_search.fit(X_train, y_train)

# Report the winning configuration, its CV score and the refitted estimator
for search_result in (
    random_search.best_params_,
    random_search.best_score_,
    random_search.best_estimator_,
):
    print(search_result)

In [None]:
# Print the column index of results_with_features to inspect which features are present.
print(results_with_features.columns)

In [63]:
# Stamp every row with the same constant year and race identifier.
# NOTE(review): 2018 and 1 are placeholders; because every row receives the same
# values, a later filter on Year/RaceName keeps all rows — replace with the real
# per-row year and race when available.
results_with_features = results_with_features.assign(Year=2018, RaceName=1)

In [64]:
# Select the rows of the target race. The filter values are Year == 2018 and
# RaceName == 1, even though the variable names say "monza_2024".
# .copy() makes the selection an independent frame, so the columns added to it
# in later cells do not trigger chained-assignment warnings or touch
# results_with_features.
monza_2024_data = results_with_features[
    (results_with_features['Year'] == 2018) & (results_with_features['RaceName'] == 1)
].copy()

# Keep exactly the columns the models were trained on ('features')
X_1_2024 = monza_2024_data[features]


In [65]:

# Predict with the best estimator found by the randomized search
tuned_estimator = random_search.best_estimator_
predicted_positions = tuned_estimator.predict(X_1_2024)

# Store the predictions next to the input rows for inspection
monza_2024_data['PredictedPosition'] = predicted_positions


In [None]:

# Show driver identity columns together with the predicted position
prediction_columns = ['DriverNumber', 'FullName', 'TeamName', 'PredictedPosition']
print(monza_2024_data[prediction_columns])


In [None]:

# Copy the recorded finishing position into an explicitly named column
monza_2024_data['ActualPosition'] = monza_2024_data['Position']

# Predicted and actual positions side by side
comparison_columns = ['DriverNumber', 'FullName', 'TeamName', 'PredictedPosition', 'ActualPosition']
comparison = monza_2024_data[comparison_columns]
print(comparison)

In [69]:
# Collapse the selected rows to one row per (driver number, name, team):
# the listed features are averaged, 'Position' keeps the first value of each group.
averaged_columns = [
    'AvgLapTime',
    'TopSpeed',
    'AvgSpeed',
    'MaxSpeed',
    'AvgRPM',
    'AvgThrottle',
    'AvgBrake',
    'NumPitStops',
    'QualiPosition',
]
aggregation_spec = {column: 'mean' for column in averaged_columns}
aggregation_spec['Position'] = 'first'

monza_2024_agg = (
    monza_2024_data
    .groupby(['DriverNumber', 'FullName', 'TeamName'])
    .agg(aggregation_spec)
    .reset_index()
)

In [None]:
# The tuned estimator was fitted on every column in `features`, so the
# aggregated frame must provide all of them. Passing only a 9-column subset
# makes predict() fail with a feature mismatch.
group_keys = ['DriverNumber', 'FullName', 'TeamName']
missing_features = [name for name in features if name not in monza_2024_agg.columns]
if missing_features:
    # Average the features that the aggregation has not produced yet and attach them
    extra_features = (
        monza_2024_data.groupby(group_keys)[missing_features].mean().reset_index()
    )
    monza_2024_agg = monza_2024_agg.merge(extra_features, on=group_keys, how='left')

# Model inputs in the same column order that was used for training
X_monza_2024_agg = monza_2024_agg[features]

# Predict the driver positions
predicted_positions = random_search.best_estimator_.predict(X_monza_2024_agg)

# Add the predicted positions to the aggregated DataFrame
monza_2024_agg['PredictedPosition'] = predicted_positions

# Compare with actual positions
comparison = monza_2024_agg[['DriverNumber', 'FullName', 'TeamName', 'PredictedPosition', 'Position']]
print(comparison)

In [None]:
# Hyperparameter tuning for XGBoost: exhaustive grid over learning_rate,
# max_depth and n_estimators, scored by negated MAE with 3-fold cross-validation.

from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter
param_grid_xgb = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 6, 9],
    n_estimators=[100, 200, 300],
)

# Run the search on the training split
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='neg_mean_absolute_error',
    cv=3,
)
grid_search_xgb.fit(X_train, y_train)

# best_score_ is the negated MAE, so flip the sign for reporting
print("Best parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best MAE for XGBoost:", -grid_search_xgb.best_score_)

In [None]:
# Hyperparameter tuning for LightGBM: exhaustive grid over learning_rate,
# max_depth and n_estimators, scored by negated MAE with 3-fold cross-validation.

from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter
param_grid_lgb = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 6, 9],
    n_estimators=[100, 200, 300],
)

# Run the search on the training split
grid_search_lgb = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid_lgb,
    scoring='neg_mean_absolute_error',
    cv=3,
)
grid_search_lgb.fit(X_train, y_train)

# best_score_ is the negated MAE, so flip the sign for reporting
print("Best parameters for LightGBM:", grid_search_lgb.best_params_)
print("Best MAE for LightGBM:", -grid_search_lgb.best_score_)

In [None]:
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

# Both boosters share the same hard-coded settings
shared_params = dict(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42)
xgb_model = XGBRegressor(**shared_params)
lgb_model = lgb.LGBMRegressor(**shared_params)

# Fit both models on the training split
xgb_model.fit(X_train, y_train)
lgb_model.fit(X_train, y_train)

# Predict on the held-out split
y_pred_xgb = xgb_model.predict(X_test)
y_pred_lgb = lgb_model.predict(X_test)

# Simple ensemble: unweighted mean of the two predictions
y_pred_avg = (y_pred_xgb + y_pred_lgb) / 2

# Score the ensemble
mae_avg = mean_absolute_error(y_test, y_pred_avg)
print(f"Model Averaging Mean Absolute Error: {mae_avg}")

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions of the base models on the training data.
# The meta-model must learn from predictions the base models made on rows they
# were NOT fitted on. In-sample predictions are over-optimistic and teach the
# meta-model to trust the base models too much. cross_val_predict fits clones
# per fold, so the already fitted xgb_model / lgb_model are left untouched.
y_train_pred_xgb = cross_val_predict(xgb_model, X_train, y_train, cv=5)
y_train_pred_lgb = cross_val_predict(lgb_model, X_train, y_train, cv=5)

# Stack the predictions together as new features for the meta-model
stacked_train = np.column_stack((y_train_pred_xgb, y_train_pred_lgb))

# Train the meta-model (Linear Regression)
meta_model = LinearRegression()
meta_model.fit(stacked_train, y_train)

# Predictions of the (fully fitted) base models on the test set
y_test_pred_xgb = xgb_model.predict(X_test)
y_test_pred_lgb = lgb_model.predict(X_test)

# Stack the test set predictions
stacked_test = np.column_stack((y_test_pred_xgb, y_test_pred_lgb))

# Meta-model makes final predictions
y_pred_stacked = meta_model.predict(stacked_test)

# Evaluate the stacked model
mae_stacked = mean_absolute_error(y_test, y_pred_stacked)
print(f"Model Stacking Mean Absolute Error: {mae_stacked}")

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
import numpy as np

# Candidate values for each hyperparameter
learning_rates = [0.01, 0.05, 0.1]
max_depths = [3, 6, 9]
n_estimators = [100, 200, 300]

# Every (learning_rate, max_depth, n_estimators) combination, in nested-loop order
param_combinations = [
    (lr, depth, n_est)
    for lr in learning_rates
    for depth in max_depths
    for n_est in n_estimators
]

# Best configuration seen so far
best_mae = np.inf
best_params = {}

# NOTE(review): each configuration is scored on X_test, so the reported best MAE
# is selected on the test split — confirm this is acceptable for the comparison.
for lr, depth, n_est in param_combinations:
    # Fit a LightGBM model with the current configuration
    lgb_model = lgb.LGBMRegressor(n_estimators=n_est, learning_rate=lr, max_depth=depth, random_state=42)
    lgb_model.fit(X_train, y_train)

    # Score it on the held-out split
    y_pred_lgb = lgb_model.predict(X_test)
    mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
    print(f"Parameters: learning_rate={lr}, max_depth={depth}, n_estimators={n_est} -> MAE: {mae_lgb}")

    # Remember the configuration when it strictly improves the MAE
    if mae_lgb < best_mae:
        best_mae = mae_lgb
        best_params = {'learning_rate': lr, 'max_depth': depth, 'n_estimators': n_est}

print(f"Best Parameters: {best_params} with MAE: {best_mae}")

In [None]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# Candidate values for each hyperparameter
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 6, 9],
    n_estimators=[100, 200, 300],
)

# Fresh LightGBM regressor with a fixed seed as the search template
lgb_model = lgb.LGBMRegressor(random_state=42)

# Exhaustive search, 3-fold CV, scored by negated mean absolute error
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
)
grid_search.fit(X_train, y_train)

# best_score_ is the negated MAE, so flip the sign before reporting
best_params = grid_search.best_params_
best_mae = -grid_search.best_score_
print(f"Best Parameters: {best_params}")
print(f"Best Mean Absolute Error: {best_mae}")

In [None]:
import concurrent.futures
import time

# Switch FastF1 offline mode off again (collect_race_data switches it on);
# presumably this lets the loader below fetch sessions that are not cached yet — confirm.
fastf1.Cache.offline_mode(enabled=False)
def collect_and_store_race_data(years, race_names):
    """Load the race and qualifying session for every (year, race) pair.

    Nothing is returned or written explicitly; the function only calls
    ``load()`` on each session and prints progress. Presumably the loaded
    data ends up in the FastF1 cache when caching is enabled — confirm.

    Args:
        years: Iterable of season years passed to ``fastf1.get_session``.
        race_names: Iterable of race identifiers passed to
            ``fastf1.get_session``.
    """
    for year in years:
        for race_name in race_names:
            try:
                session = fastf1.get_session(year, race_name, 'R')

                if session is None:
                    print(f"Session not found for {year} {race_name}")
                    # Skip only this race. Returning here would abandon every
                    # remaining race and year (collect_race_data also continues).
                    continue

                session.load()
                print(f"Race data for {race_name} {year} loaded.")

                qualy_session = fastf1.get_session(year, race_name, 'Q')
                qualy_session.load()
                print(f"Qualifying data for {race_name} {year} loaded.")

                print(f"Saved data for {race_name} {year}")
            except Exception as e:
                # Best effort: report the failing combination and keep going.
                print(f"Error loading {race_name} {year}: {e}")

# Load the first two identifiers of the 2017 season.
# Note: this rebinds the notebook-level `years` and `race_names`.
years, race_names = [2017], [1, 2]
collect_and_store_race_data(years, race_names)