# Predictive Exploration

In [1]:
import pandas as pd

In [61]:
# Load the pre-race feature table from the processed-data folder
# (path is relative to the notebook's working directory).
features = pd.read_csv(
    filepath_or_buffer="../data/processed/features_pre_race.csv",
)

In [62]:
# Bare last expression: let the notebook render the freshly loaded frame.
features

Unnamed: 0,Year,GrandPrix,Driver,TeamName,grid_position,is_top10_start,grid_vs_team_avg,driver_avg_finish_last5,driver_std_finish_last5,driver_avg_quali_last5,driver_points_last5,driver_dnf_rate_last5,driver_points_ytd,team_avg_finish_last5,team_avg_quali_last5,team_points_last5,team_dnf_rate_last5,team_points_ytd,Position
0,2024,Abu Dhabi Grand Prix,Alexander Albon,Williams,18.0,0,-1.0,,,,,,0.0,,,,,0.0,11.0
1,2024,Abu Dhabi Grand Prix,Fernando Alonso,Aston Martin,8.0,1,-2.5,,,,,,0.0,,,,,0.0,9.0
2,2024,Abu Dhabi Grand Prix,Valtteri Bottas,Kick Sauber,9.0,1,-3.0,,,,,,2.0,,,,,2.0,18.0
3,2024,Abu Dhabi Grand Prix,Franco Colapinto,Williams,20.0,0,1.0,,,,,,0.0,11.0,18.0,0.0,1.0,0.0,19.0
4,2024,Abu Dhabi Grand Prix,Jack Doohan,Alpine,17.0,0,6.0,,,,,,0.0,,,,,0.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,2025,United States Grand Prix,Oscar Piastri,McLaren,6.0,1,2.0,2.6,1.516575,2.4,17.8,0.0,26.0,2.4,2.6,17.6,0.0,58.0,5.0
914,2025,United States Grand Prix,George Russell,Mercedes,4.0,1,-1.5,4.4,4.098780,4.6,14.4,0.2,347.0,8.2,4.4,9.4,0.2,704.0,6.0
915,2025,United States Grand Prix,Carlos Sainz,Williams,9.0,1,-4.5,10.4,2.190890,14.0,1.4,0.6,266.0,14.2,16.8,0.2,0.6,368.0,20.0
916,2025,United States Grand Prix,Lance Stroll,Aston Martin,19.0,0,4.5,14.6,1.516575,16.6,0.0,1.0,30.0,11.0,12.2,1.8,0.4,100.0,12.0


In [63]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [64]:
# Integer-encode the three categorical identifier columns into "<col>_enc".
#
# One LabelEncoder is fitted per column and stored in `encoders`, keyed by the
# source column name, so codes can be mapped back later with
# `encoders[col].inverse_transform(...)`. Re-fitting a single shared encoder
# would overwrite its classes on every call and keep only the last mapping.
#
# NOTE(review): the resulting integers are arbitrary labels; their numeric
# order carries no meaning for the downstream models.
encoders = {}
for col in ("TeamName", "GrandPrix", "Driver"):
    le = LabelEncoder()
    features[f"{col}_enc"] = le.fit_transform(features[col])
    encoders[col] = le
# After the loop `le` is the encoder fitted on "Driver" (the last column).

In [65]:
# Remove the raw string identifier columns, leaving only numeric columns.
raw_categorical_cols = ["GrandPrix", "TeamName", "Driver"]
features = features.drop(columns=raw_categorical_cols)

In [66]:
# Bare last expression: render the frame after the string columns were dropped.
features

Unnamed: 0,Year,grid_position,is_top10_start,grid_vs_team_avg,driver_avg_finish_last5,driver_std_finish_last5,driver_avg_quali_last5,driver_points_last5,driver_dnf_rate_last5,driver_points_ytd,team_avg_finish_last5,team_avg_quali_last5,team_points_last5,team_dnf_rate_last5,team_points_ytd,Position,TeamName_enc,GrandPrix_enc,Driver_enc
0,2024,18.0,0,-1.0,,,,,,0.0,,,,,0.0,11.0,10,0,0
1,2024,8.0,1,-2.5,,,,,,0.0,,,,,0.0,9.0,1,0,6
2,2024,9.0,1,-3.0,,,,,,2.0,,,,,2.0,18.0,4,0,26
3,2024,20.0,0,1.0,,,,,,0.0,11.0,18.0,0.0,1.0,0.0,19.0,10,0,7
4,2024,17.0,0,6.0,,,,,,0.0,,,,,0.0,15.0,0,0,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,2025,6.0,1,2.0,2.6,1.516575,2.4,17.8,0.0,26.0,2.4,2.6,17.6,0.0,58.0,5.0,5,24,23
914,2025,4.0,1,-1.5,4.4,4.098780,4.6,14.4,0.2,347.0,8.2,4.4,9.4,0.2,704.0,6.0,6,24,9
915,2025,9.0,1,-4.5,10.4,2.190890,14.0,1.4,0.6,266.0,14.2,16.8,0.2,0.6,368.0,20.0,10,24,2
916,2025,19.0,0,4.5,14.6,1.516575,16.6,0.0,1.0,30.0,11.0,12.2,1.8,0.4,100.0,12.0,1,24,15


In [67]:
# Build the model inputs.
#   X: every column of `features` except "Position", standardized per column.
#   y: binary target, 1 when Position <= 3 and 0 otherwise.
#
# "Position" is the source of the label, so it is removed *before* fitting the
# scaler. StandardScaler works column by column, so the scaled values of the
# remaining columns are unchanged, and the fitted `scaler` no longer expects a
# column that is unavailable at prediction time.
#
# NOTE(review): the scaler is still fitted on all rows, i.e. before the
# train/test split performed later, so test-row statistics influence the
# scaling. Fitting on the training rows only (e.g. via a Pipeline) avoids this.
feature_frame = features.drop(columns=["Position"])

scaler = StandardScaler()
X = pd.DataFrame(
    scaler.fit_transform(feature_frame),
    columns=feature_frame.columns,
    index=feature_frame.index,  # keep X's row labels identical to y's
)

# Vectorized comparison instead of a per-row lambda; a missing Position
# compares False and therefore maps to 0, same as the lambda did.
y = (features["Position"] <= 3).astype(int)

In [68]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [69]:
# Hold out 30% of the rows, keeping the class ratio of y in both parts.
# NOTE(review): this is a random row-level split; rows belonging to the same
# race can land on both sides. Confirm that is acceptable, or consider a
# time- or group-based split instead.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [70]:
# Baseline model: random forest with default hyper-parameters and a fixed seed.
# `fit` returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [71]:
# Score whatever predictions are currently bound to `y_pred` on the hold-out set.
acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

print(f"Accuracy: {acc}")
print(f"F1: {f1}")

Accuracy: 0.9094202898550725
F1: 0.6575342465753424


In [72]:
# Confusion matrix for the current predictions (scikit-learn layout:
# rows are true classes, columns are predicted classes).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[227,   8],
       [ 17,  24]])

In [73]:
# Per-class precision / recall / F1 summary for the current predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.93      0.97      0.95       235
           1       0.75      0.59      0.66        41

    accuracy                           0.91       276
   macro avg       0.84      0.78      0.80       276
weighted avg       0.90      0.91      0.90       276



In [74]:
# Second candidate: gradient-boosted trees with default settings and a fixed seed.
# The estimator is only constructed here; it is fitted in a later cell.
xgb = XGBClassifier(random_state=42)

In [75]:
# Train the boosted model on the same training split as the random forest.
# Left as the last expression so the notebook shows the fitted estimator.
xgb.fit(X=X_train, y=y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [76]:
# Predict the hold-out rows with the boosted model.
# NOTE(review): this rebinds `y_pred`, which previously held the random-forest
# predictions; any metrics cell re-run after this point scores the XGBoost output.
y_pred = xgb.predict(X_test)

In [77]:
# Same two hold-out metrics as before, computed on the current `y_pred`.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {acc}")
print(f"F1: {f1}")

Accuracy: 0.8985507246376812
F1: 0.6410256410256411


In [78]:
# Confusion matrix for the current predictions (scikit-learn layout:
# rows are true classes, columns are predicted classes).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[223,  12],
       [ 16,  25]])

In [79]:
# Per-class precision / recall / F1 summary for the current predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       235
           1       0.68      0.61      0.64        41

    accuracy                           0.90       276
   macro avg       0.80      0.78      0.79       276
weighted avg       0.89      0.90      0.90       276

