## Predictor from https://www.kaggle.com/code/sevrussnape/data-analytics-project/notebook#3.-Race-Strategy-Optimization-via-Simulation

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from utils import getFiles

In [5]:
# Load the datasets
files = getFiles()

# Load all datasets into a dictionary
data = {name: pd.read_csv(path) for name, path in files.items()}

In [6]:
# Filter races for the Italian Grand Prix from the past 10 years
recent_years = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
races = data['races']
italian_gp_recent = races[(races['name'].str.contains("British Grand Prix")) & (races['year'].isin(recent_years))]

# Merge qualifying data with Italian GP data
qualifying = data['qualifying']
qualifying_italian_gp_recent = qualifying[qualifying['raceId'].isin(italian_gp_recent['raceId'])]

# Merge results data with Italian GP data
results = data['results']
results_italian_gp_recent = results[results['raceId'].isin(italian_gp_recent['raceId'])]

# Calculate average qualifying position and race position
avg_qualifying_position = qualifying_italian_gp_recent.groupby('driverId')['position'].mean().reset_index().rename(columns={'position': 'avg_qualifying_position'})
avg_race_position = results_italian_gp_recent.groupby('driverId')['positionOrder'].mean().reset_index().rename(columns={'positionOrder': 'avg_race_position'})

# Merge features
features = avg_qualifying_position.merge(avg_race_position, on='driverId')

# Create a binary target variable for race winners
results_italian_gp_recent = results_italian_gp_recent.copy()  # Create a copy to avoid SettingWithCopyWarning
results_italian_gp_recent.loc[:, 'winner'] = results_italian_gp_recent['positionOrder'] == 1
race_winners_recent = results_italian_gp_recent[['raceId', 'driverId', 'winner']]
race_winners_recent = race_winners_recent[race_winners_recent['winner']].drop(columns=['winner'])

# Merge target with features
data_recent = features.merge(race_winners_recent, on='driverId', how='left').fillna(0)

# Prepare data for model
X = data_recent[['avg_qualifying_position', 'avg_race_position']]
y = data_recent['driverId'].apply(lambda x: 1 if x in race_winners_recent['driverId'].values else 0)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Filter drivers participating in the 2024 championship
races_2024 = races[races['year'] == 2024]
results_2024 = results[results['raceId'].isin(races_2024['raceId'])]

# Get unique driver IDs for the 2024 season
drivers_2024_ids = results_2024['driverId'].unique()

# Sample average qualifying and race positions for drivers competing in 2024
latest_qualifying_position = qualifying[qualifying['driverId'].isin(drivers_2024_ids) & qualifying['raceId'].isin(races_2024['raceId'])].groupby('driverId')['position'].mean().reset_index().rename(columns={'position': 'avg_qualifying_position'})
latest_race_position = results[results['driverId'].isin(drivers_2024_ids) & results['raceId'].isin(races_2024['raceId'])].groupby('driverId')['positionOrder'].mean().reset_index().rename(columns={'positionOrder': 'avg_race_position'})

# Merge features for prediction
latest_features = latest_qualifying_position.merge(latest_race_position, on='driverId')

# Predict
latest_features['predicted_winner'] = model.predict(latest_features[['avg_qualifying_position', 'avg_race_position']])

# Map driver IDs to names
drivers = data['drivers']
driver_id_to_name = dict(zip(drivers['driverId'], drivers['surname']))
latest_features['driver'] = latest_features['driverId'].map(driver_id_to_name)

# Show predictions
print(latest_features[['driver', 'avg_qualifying_position', 'avg_race_position', 'predicted_winner']])

Accuracy: 0.9285714285714286
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       0.67      1.00      0.80         2

    accuracy                           0.93        14
   macro avg       0.83      0.96      0.88        14
weighted avg       0.95      0.93      0.93        14

        driver  avg_qualifying_position  avg_race_position  predicted_winner
0     Hamilton                 7.750000           6.916667                 1
1       Alonso                10.000000           9.833333                 0
2   Hülkenberg                11.666667          10.916667                 0
3        Pérez                 8.250000           7.833333                 0
4    Ricciardo                13.166667          13.583333                 0
5       Bottas                15.166667          15.916667                 0
6    Magnussen                15.666667          13.583333                 0
7   