In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score

In [100]:
data = pandas.read_csv("dataset.csv")

In [101]:
data

Unnamed: 0.1,Unnamed: 0,year,name_y,driverRef,constructorRef,grid,position,points,status,fastestLapTime,fastestLapSpeed,laps,milliseconds
0,0,2008,Australian Grand Prix,hamilton,mclaren,1,1.0,10.0,Finished,01:27.5,218.300,58,5690616.0
1,1,2008,Australian Grand Prix,heidfeld,bmw_sauber,5,2.0,8.0,Finished,01:27.7,217.586,58,5696094.0
2,2,2008,Australian Grand Prix,rosberg,williams,7,3.0,6.0,Finished,01:28.1,216.719,58,5698779.0
3,3,2008,Australian Grand Prix,alonso,renault,11,4.0,5.0,Finished,01:28.6,215.464,58,5707797.0
4,4,2008,Australian Grand Prix,kovalainen,mclaren,3,5.0,4.0,Finished,01:27.4,218.385,58,5708630.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26754,26754,2024,Abu Dhabi Grand Prix,kevin_magnussen,haas,14,16.0,0.0,+1 Lap,01:25.6,222.002,57,
26755,26755,2024,Abu Dhabi Grand Prix,lawson,rb,12,17.0,0.0,Engine,01:28.8,214.212,55,
26756,26756,2024,Abu Dhabi Grand Prix,bottas,sauber,9,,0.0,Collision damage,01:29.5,212.462,30,
26757,26757,2024,Abu Dhabi Grand Prix,colapinto,williams,20,,0.0,Engine,01:29.4,212.631,26,


In [102]:
data["averageFinish"] = data.groupby("driverRef")["position"].transform(lambda x: x.rolling(10, min_periods=1).mean())
data["careerWins"] = data.groupby("driverRef")["position"].transform(lambda x: (x == 1).cumsum())
data["podiums"] = data.groupby("driverRef")["position"].transform(lambda x: (x <= 3).cumsum())
data["totalRaces"] = data.groupby("driverRef").cumcount() + 1
data["constructorAvgPoints"] = data.groupby("constructorRef")["points"].transform(lambda x: x.rolling(5, min_periods=1).mean())
data["constructorReliability"] = data.groupby("constructorRef")["status"].transform(lambda x: (x == "Finished").sum() / len(x))
data["trackPerformance"] = data.groupby(["driverRef", "name_y"])["position"].transform(lambda x: x.expanding().mean())
data["gridEffect"] = data["grid"] - data["position"]




In [103]:
data["position"] = pandas.to_numeric(data["position"], errors="coerce")
data = data.fillna(0)
features = ["grid", "averageFinish", "careerWins", "podiums", "totalRaces", 
            "constructorAvgPoints", "constructorReliability", "trackPerformance", 
            "gridEffect", "fastestLapSpeed", "laps"]
X = data[features]
y = (data["position"] == 1).astype(int)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring="f1", n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)
best_model = grid_search.best_estimator_
best_model.fit(X_train_resampled, y_train_resampled)
y_pred = best_model.predict(X_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5126
           1       0.96      0.92      0.94       226

    accuracy                           1.00      5352
   macro avg       0.98      0.96      0.97      5352
weighted avg       1.00      1.00      1.00      5352



In [None]:

y_pred = best_model.predict(X_test)
test_results = X_test.copy()
test_results["Actual_Winner"] = y_test
test_results["Predicted_Winner"] = y_pred
incorrect_preds = test_results[test_results["Actual_Winner"] != test_results["Predicted_Winner"]]
accuracy = accuracy_score(y_test, y_pred)
print(f"🏁 Model Test Accuracy: {accuracy:.4f}")
print(incorrect_preds)




🏁 Model Test Accuracy: 0.9951
