In [90]:
import pandas as pd
import numpy as np

DATA_PATH ="/Users/kriti/Desktop/f1-podium-predictor/data/raw/"

races = pd.read_csv(DATA_PATH + "races.csv")
results = pd.read_csv(DATA_PATH + "results.csv")
qualifying = pd.read_csv(DATA_PATH + "qualifying.csv")
drivers = pd.read_csv(DATA_PATH + "drivers.csv")
constructors = pd.read_csv(DATA_PATH + "constructors.csv")
pit_stops = pd.read_csv(DATA_PATH + "pit_stops.csv")

In [91]:
#checking data
print("Races:", races.shape)
print("Results:", results.shape)
print("Qualifying:", qualifying.shape)
print("Drivers:", drivers.shape)
print("Constructors:", constructors.shape)
print("Pit Stops:", pit_stops.shape)


races[['raceId','year','round','circuitId']].head()


Races: (1149, 18)
Results: (27238, 18)
Qualifying: (10973, 9)
Drivers: (864, 9)
Constructors: (212, 5)
Pit Stops: (12192, 7)


Unnamed: 0,raceId,year,round,circuitId
0,1,2009,1,1
1,2,2009,2,2
2,3,2009,3,17
3,4,2009,4,3
4,5,2009,5,4


In [92]:
results[['raceId','driverId','constructorId','grid','positionOrder','statusId']].head()

Unnamed: 0,raceId,driverId,constructorId,grid,positionOrder,statusId
0,18,1,1,1,1,1
1,18,2,2,5,2,1
2,18,3,3,7,3,1
3,18,4,4,11,4,1
4,18,5,1,3,5,1


In [93]:
qualifying[['raceId','driverId','position','q1','q2','q3']].head()

Unnamed: 0,raceId,driverId,position,q1,q2,q3
0,18,1,1,1:26.572,1:25.187,1:26.714
1,18,9,2,1:26.103,1:25.315,1:26.869
2,18,5,3,1:25.664,1:25.452,1:27.079
3,18,13,4,1:25.994,1:25.691,1:27.178
4,18,2,5,1:25.960,1:25.518,1:27.236


In [94]:
#Filtering year from 2014 - new upgrades.

In [95]:
START_YEAR = 2014

modern_races = races[races["year"] >= START_YEAR].copy()
modern_race_ids = set(modern_races["raceId"])

results = results[results["raceId"].isin(modern_race_ids)].copy()
qualifying = qualifying[qualifying["raceId"].isin(modern_race_ids)].copy()
pit_stops = pit_stops[pit_stops["raceId"].isin(modern_race_ids)].copy()

print("Modern races:", modern_races.shape)
print("Results:", results.shape)
print("Qualifying:", qualifying.shape)
print(qualifying["raceId"].nunique()) # = count of races = 252

Modern races: (252, 18)
Results: (5105, 18)
Qualifying: (5089, 9)
252


In [96]:
#cleaning data

In [97]:
results["grid"] = pd.to_numeric(results["grid"], errors="coerce") #converts string to int

results = results[
    (results["grid"] > 0) &
    (results["positionOrder"].notna())
].copy()

print("Results after cleaning:", results.shape)

qualifying = qualifying[
    qualifying["position"].notna()
].copy()

print("Qualifying after cleaning:", qualifying.shape)


Results after cleaning: (5006, 18)
Qualifying after cleaning: (5089, 9)


In [98]:
drivers_per_race = results.groupby("raceId")["driverId"].nunique()
drivers_per_race.describe()


count    251.000000
mean      19.944223
std        1.045407
min       16.000000
25%       20.000000
50%       20.000000
75%       20.000000
max       22.000000
Name: driverId, dtype: float64

In [99]:
#CREATING PUDIUM TARGET - WE NEED <3 POSITION TO BE IN THE NEW COLUMN = PODIUM --> 0 is not podium and 1 is podium
results["positionOrder"] = pd.to_numeric(
    results["positionOrder"], errors="coerce"
)
results["podium"] = (results["positionOrder"] <= 3).astype(int)
results[["raceId", "driverId", "positionOrder", "podium"]].head(10)

Unnamed: 0,raceId,driverId,positionOrder,podium
22127,900,3,1,1
22128,900,825,2,1
22129,900,18,3,1
22130,900,4,4,0
22131,900,822,5,0
22132,900,807,6,0
22133,900,8,7,0
22134,900,818,8,0
22135,900,826,9,0
22136,900,815,10,0


In [100]:
results["podium"].value_counts(normalize=True)

0    0.849581
1    0.150419
Name: podium, dtype: float64

In [101]:
#Creating a new table which holds all our required data in one.

results_base = results[
    [
        "raceId",
        "driverId",
        "constructorId",
        "positionOrder",
        "podium"
    ]
].copy()

qualifying_base = qualifying[
    [
        "raceId",
        "driverId",
        "position",
        "q1",
        "q2",
        "q3"
    ]
].copy()

qualifying_base["position"] = pd.to_numeric(
    qualifying_base["position"], errors="coerce"
)

base_df = results_base.merge(
    qualifying_base,
    on=["raceId", "driverId"],
    how="inner"
)

base_df = base_df.merge(
    modern_races[["raceId", "year", "round", "circuitId"]],
    on="raceId",
    how="left"
)

base_df = base_df.merge(
    drivers[["driverId", "forename", "surname"]],
    on="driverId",
    how="left"
)

base_df = base_df.merge(
    constructors[["constructorId", "name"]],
    on="constructorId",
    how="left",
    suffixes=("", "_constructor")
)
base_df = base_df.rename(columns={
    "position": "qualifying_position",
    "name": "constructor_name"
})

In [102]:
base_df.head()

Unnamed: 0,raceId,driverId,constructorId,positionOrder,podium,qualifying_position,q1,q2,q3,year,round,circuitId,forename,surname,constructor_name
0,900,3,131,1,1,3,1:32.564,1:42.264,1:44.595,2014,1,1,Nico,Rosberg,Mercedes
1,900,825,1,2,1,4,1:30.949,1:43.247,1:45.745,2014,1,1,Kevin,Magnussen,McLaren
2,900,18,1,3,1,11,1:31.396,1:44.437,\N,2014,1,1,Jenson,Button,McLaren
3,900,4,6,4,0,5,1:31.388,1:42.805,1:45.819,2014,1,1,Fernando,Alonso,Ferrari
4,900,822,3,5,0,10,1:31.601,1:43.852,1:48.147,2014,1,1,Valtteri,Bottas,Williams


In [103]:
base_df.isna().sum().sort_values(ascending=False)

raceId                 0
driverId               0
constructorId          0
positionOrder          0
podium                 0
qualifying_position    0
q1                     0
q2                     0
q3                     0
year                   0
round                  0
circuitId              0
forename               0
surname                0
constructor_name       0
dtype: int64

In [104]:
#Adding features for drivers

In [105]:
#sorting the data based on driver id, year and round
base_df = base_df.sort_values(
    by=["driverId", "year", "round"]
).reset_index(drop=True)
base_df.head()

Unnamed: 0,raceId,driverId,constructorId,positionOrder,podium,qualifying_position,q1,q2,q3,year,round,circuitId,forename,surname,constructor_name
0,900,1,131,19,0,1,1:31.699,1:42.890,1:44.231,2014,1,1,Lewis,Hamilton,Mercedes
1,901,1,131,1,1,1,1:57.202,1:59.041,1:59.431,2014,2,2,Lewis,Hamilton,Mercedes
2,902,1,131,1,1,2,1:35.323,1:33.872,1:33.464,2014,3,3,Lewis,Hamilton,Mercedes
3,903,1,131,1,1,1,1:55.516,1:54.029,1:53.860,2014,4,17,Lewis,Hamilton,Mercedes
4,904,1,131,1,1,1,1:27.238,1:26.210,1:25.232,2014,5,4,Lewis,Hamilton,Mercedes


In [106]:
#creating new column prev_race_finish to add prev race finish to that row

base_df["prev_race_finish"] = (
    base_df
    .groupby("driverId")["positionOrder"]
    .shift(1) # basically taking position order and shifting 1 downwards
)
base_df[
    ["driverId", "year", "round", "positionOrder", "prev_race_finish"]
].head(10)


Unnamed: 0,driverId,year,round,positionOrder,prev_race_finish
0,1,2014,1,19,
1,1,2014,2,1,19.0
2,1,2014,3,1,1.0
3,1,2014,4,1,1.0
4,1,2014,5,1,1.0
5,1,2014,6,2,1.0
6,1,2014,7,17,2.0
7,1,2014,8,2,17.0
8,1,2014,9,1,2.0
9,1,2014,10,3,1.0


In [107]:
# creating new column to calculate avg position finished in last 5 races
N = 5

base_df["rolling_avg_finish"] = (
    base_df
    .groupby("driverId")["positionOrder"]
    .shift(1)
    .rolling(window=N, min_periods=1)
    .mean()
)
base_df[
    ["driverId", "year", "round", "positionOrder", "rolling_avg_finish"]
].head(15)


Unnamed: 0,driverId,year,round,positionOrder,rolling_avg_finish
0,1,2014,1,19,
1,1,2014,2,1,19.0
2,1,2014,3,1,10.0
3,1,2014,4,1,7.0
4,1,2014,5,1,5.5
5,1,2014,6,2,4.6
6,1,2014,7,17,1.2
7,1,2014,8,2,4.4
8,1,2014,9,1,4.6
9,1,2014,10,3,4.6


In [108]:
# new column - probablity of podium in last 5 races
base_df["rolling_podium_rate"] = (
    base_df
    .groupby("driverId")["podium"]
    .shift(1)
    .rolling(window=N, min_periods=1)
    .mean()
)
base_df[
    ["driverId", "year", "round", "podium", "rolling_podium_rate"]
].head(15)


Unnamed: 0,driverId,year,round,podium,rolling_podium_rate
0,1,2014,1,0,
1,1,2014,2,1,0.0
2,1,2014,3,1,0.5
3,1,2014,4,1,0.666667
4,1,2014,5,1,0.75
5,1,2014,6,1,0.8
6,1,2014,7,0,1.0
7,1,2014,8,1,0.8
8,1,2014,9,1,0.8
9,1,2014,10,1,0.8


In [109]:
# filling first row for new columns
base_df["prev_race_finish"].fillna(
    base_df["positionOrder"].median(), inplace=True
)

base_df["rolling_avg_finish"].fillna(
    base_df["positionOrder"].median(), inplace=True
)

base_df["rolling_podium_rate"].fillna(0, inplace=True)

base_df.head()


Unnamed: 0,raceId,driverId,constructorId,positionOrder,podium,qualifying_position,q1,q2,q3,year,round,circuitId,forename,surname,constructor_name,prev_race_finish,rolling_avg_finish,rolling_podium_rate
0,900,1,131,19,0,1,1:31.699,1:42.890,1:44.231,2014,1,1,Lewis,Hamilton,Mercedes,11.0,11.0,0.0
1,901,1,131,1,1,1,1:57.202,1:59.041,1:59.431,2014,2,2,Lewis,Hamilton,Mercedes,19.0,19.0,0.0
2,902,1,131,1,1,2,1:35.323,1:33.872,1:33.464,2014,3,3,Lewis,Hamilton,Mercedes,1.0,10.0,0.5
3,903,1,131,1,1,1,1:55.516,1:54.029,1:53.860,2014,4,17,Lewis,Hamilton,Mercedes,1.0,7.0,0.666667
4,904,1,131,1,1,1,1:27.238,1:26.210,1:25.232,2014,5,4,Lewis,Hamilton,Mercedes,1.0,5.5,0.75


In [110]:
#Adding features for Constructors

In [111]:
#new table to calculate constructor avg finish for this race
constructor_race_avg = (
    base_df
    .groupby(["constructorId", "raceId"])["positionOrder"]
    .mean()
    .reset_index(name="constructor_race_avg_finish")
)

In [112]:
#adding new column in df to find avg finish of both drivers in prev race -> 1. sorting, 2. shifting down 1
constructor_race_avg = constructor_race_avg.sort_values(
    by=["constructorId", "raceId"]
)

constructor_race_avg["constructor_prev_avg_finish"] = (
    constructor_race_avg
    .groupby("constructorId")["constructor_race_avg_finish"]
    .shift(1)
)

In [113]:
#merging both df's
base_df = base_df.merge(
    constructor_race_avg[
        ["constructorId", "raceId", "constructor_prev_avg_finish"]
    ],
    on=["constructorId", "raceId"],
    how="left"
)

In [114]:
base_df[
    ["constructorId", "raceId", "positionOrder", "constructor_prev_avg_finish"]
].head(10)


Unnamed: 0,constructorId,raceId,positionOrder,constructor_prev_avg_finish
0,131,900,19,
1,131,901,1,10.0
2,131,902,1,1.5
3,131,903,1,1.5
4,131,904,1,1.5
5,131,905,2,1.5
6,131,906,17,1.5
7,131,907,2,9.5
8,131,908,1,1.5
9,131,909,3,9.5


In [115]:
#Rolling avg of last 5 races
N = 5

constructor_race_avg["constructor_rolling_avg_finish"] = (
    constructor_race_avg
    .groupby("constructorId")["constructor_race_avg_finish"]
    .shift(1)
    .rolling(window=N, min_periods=1)
    .mean()
)

#Merge
base_df = base_df.merge(
    constructor_race_avg[
        ["constructorId", "raceId", "constructor_rolling_avg_finish"]
    ],
    on=["constructorId", "raceId"],
    how="left"
)


In [116]:
base_df[
    ["constructorId", "raceId", "constructor_rolling_avg_finish"]
].head(10)

Unnamed: 0,constructorId,raceId,constructor_rolling_avg_finish
0,131,900,14.125
1,131,901,13.875
2,131,902,10.25
3,131,903,6.875
4,131,904,3.625
5,131,905,3.2
6,131,906,1.5
7,131,907,3.1
8,131,908,3.1
9,131,909,4.7


In [117]:
#new table to track podium rate
constructor_podium = (
    base_df
    .groupby(["constructorId", "raceId"])["podium"]
    .sum()
    .reset_index(name="constructor_podium_count")
)


In [118]:
#sort, find rate of podium per last 5 races based on constructor
constructor_podium = constructor_podium.sort_values(
    by=["constructorId", "raceId"]
)

constructor_podium["constructor_rolling_podium_rate"] = (
    constructor_podium
    .groupby("constructorId")["constructor_podium_count"]
    .shift(1)
    .rolling(window=N, min_periods=1)
    .mean() / 2
)

#merge
base_df = base_df.merge(
    constructor_podium[
        ["constructorId", "raceId", "constructor_rolling_podium_rate"]
    ],
    on=["constructorId", "raceId"],
    how="left"
)


In [120]:
base_df.head(10)

Unnamed: 0,raceId,driverId,constructorId,positionOrder,podium,qualifying_position,q1,q2,q3,year,...,circuitId,forename,surname,constructor_name,prev_race_finish,rolling_avg_finish,rolling_podium_rate,constructor_prev_avg_finish,constructor_rolling_avg_finish,constructor_rolling_podium_rate
0,900,1,131,19,0,1,1:31.699,1:42.890,1:44.231,2014,...,1,Lewis,Hamilton,Mercedes,11.0,11.0,0.0,,14.125,0.0
1,901,1,131,1,1,1,1:57.202,1:59.041,1:59.431,2014,...,2,Lewis,Hamilton,Mercedes,19.0,19.0,0.0,10.0,13.875,0.125
2,902,1,131,1,1,2,1:35.323,1:33.872,1:33.464,2014,...,3,Lewis,Hamilton,Mercedes,1.0,10.0,0.5,1.5,10.25,0.375
3,903,1,131,1,1,1,1:55.516,1:54.029,1:53.860,2014,...,17,Lewis,Hamilton,Mercedes,1.0,7.0,0.666667,1.5,6.875,0.625
4,904,1,131,1,1,1,1:27.238,1:26.210,1:25.232,2014,...,4,Lewis,Hamilton,Mercedes,1.0,5.5,0.75,1.5,3.625,0.875
5,905,1,131,2,1,2,1:17.823,1:16.354,1:16.048,2014,...,6,Lewis,Hamilton,Mercedes,1.0,4.6,0.8,1.5,3.2,0.9
6,906,1,131,17,0,2,1:15.750,1:15.054,1:14.953,2014,...,7,Lewis,Hamilton,Mercedes,2.0,1.2,1.0,1.5,1.5,1.0
7,907,1,131,2,1,9,1:09.514,1:09.092,\N,2014,...,70,Lewis,Hamilton,Mercedes,17.0,4.4,0.8,9.5,3.1,0.9
8,908,1,131,1,1,6,1:41.058,1:34.870,1:39.232,2014,...,9,Lewis,Hamilton,Mercedes,2.0,4.6,0.8,1.5,3.1,0.9
9,909,1,131,3,1,16,1:18.683,\N,\N,2014,...,10,Lewis,Hamilton,Mercedes,1.0,4.6,0.8,9.5,4.7,0.8


In [122]:
#handing NAN data
base_df["constructor_prev_avg_finish"].fillna(
    base_df["positionOrder"].median(), inplace=True
)

base_df["constructor_rolling_avg_finish"].fillna(
    base_df["positionOrder"].median(), inplace=True
)

base_df["constructor_rolling_podium_rate"].fillna(0, inplace=True)


In [124]:
base_df[
    [
        "constructor_prev_avg_finish",
        "constructor_rolling_avg_finish",
        "constructor_rolling_podium_rate"
    ]
].describe()


Unnamed: 0,constructor_prev_avg_finish,constructor_rolling_avg_finish,constructor_rolling_podium_rate
count,4991.0,4991.0,4991.0
mean,10.553797,10.555311,0.149953
std,4.981686,4.050197,0.242675
min,1.0,1.4,0.0
25%,6.5,7.4,0.0
50%,11.0,11.1,0.0
75%,14.5,13.7,0.2
max,21.5,19.6,1.0


In [130]:
base_df[
    [
        "qualifying_position",
        "prev_race_finish",
        "rolling_avg_finish",
        "constructor_rolling_avg_finish",
        "podium"
    ]
].corr()["podium"].sort_values(ascending=False)


podium                            1.000000
prev_race_finish                 -0.402064
rolling_avg_finish               -0.513284
constructor_rolling_avg_finish   -0.516992
qualifying_position              -0.522044
Name: podium, dtype: float64

In [134]:
#Features set

FEATURES = [
    "qualifying_position",
    "prev_race_finish",
    "rolling_avg_finish",
    "rolling_podium_rate",
    "constructor_prev_avg_finish",
    "constructor_rolling_avg_finish",
    "constructor_rolling_podium_rate"
]

TARGET = "podium"


In [135]:
X = base_df[FEATURES].copy()
y = base_df[TARGET].copy()

In [137]:
#train - test split
latest_year = base_df["year"].max()

train_idx = base_df["year"] < latest_year
test_idx = base_df["year"] == latest_year

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print("Train samples:", X_train.shape)
print("Test samples:", X_test.shape)


Train samples: (4533, 7)
Test samples: (458, 7)


## Logistic Regression

In [138]:
#scaling data to 0-1 so that larger value data does not dominate the smaller calue data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [139]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    solver="lbfgs"
)

log_reg.fit(X_train_scaled, y_train)


In [140]:
from sklearn.metrics import (
    classification_report,
    roc_auc_score
)

y_pred = log_reg.predict(X_test_scaled)
y_prob = log_reg.predict_proba(X_test_scaled)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


              precision    recall  f1-score   support

           0       0.99      0.85      0.91       389
           1       0.53      0.93      0.67        69

    accuracy                           0.86       458
   macro avg       0.76      0.89      0.79       458
weighted avg       0.92      0.86      0.88       458

ROC-AUC: 0.9415073954025559


In [141]:
test_df = base_df[test_idx].copy()
test_df["podium_prob"] = y_prob


In [142]:
def precision_at_3(df):
    correct = 0
    total = 0

    for race_id, race_df in df.groupby("raceId"):
        top3_pred = race_df.sort_values(
            "podium_prob", ascending=False
        ).head(3)

        correct += top3_pred["podium"].sum()
        total += 3

    return correct / total


In [143]:
p_at_3 = precision_at_3(test_df)
print("Precision@3:", p_at_3)


Precision@3: 0.6956521739130435


## XGBoost

In [145]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report


In [146]:
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="auc",
    random_state=42,
    scale_pos_weight=(len(y_train) - y_train.sum()) / y_train.sum()
)


In [147]:
xgb_model.fit(X_train, y_train)


In [148]:
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]
y_pred_xgb = (y_prob_xgb >= 0.5).astype(int)

print(classification_report(y_test, y_pred_xgb))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_xgb))


              precision    recall  f1-score   support

           0       0.99      0.89      0.93       389
           1       0.59      0.93      0.72        69

    accuracy                           0.89       458
   macro avg       0.79      0.91      0.83       458
weighted avg       0.93      0.89      0.90       458

ROC-AUC: 0.957453150031668


In [151]:
test_df_xgb = base_df[test_idx].copy()
test_df_xgb["podium_prob"] = y_prob_xgb


In [152]:
def precision_at_3(df):
    correct = 0
    total = 0

    for _, race_df in df.groupby("raceId"):
        top3 = race_df.sort_values("podium_prob", ascending=False).head(3)
        correct += top3["podium"].sum()
        total += 3

    return correct / total


In [153]:
p_at_3_xgb = precision_at_3(test_df_xgb)
print("Precision@3 (XGBoost):", p_at_3_xgb)


Precision@3 (XGBoost): 0.7536231884057971


In [154]:
print("Logistic Precision@3:", p_at_3)
print("XGBoost Precision@3:", p_at_3_xgb)


Logistic Precision@3: 0.6956521739130435
XGBoost Precision@3: 0.7536231884057971


In [155]:
import pandas as pd

importance_df = pd.DataFrame({
    "feature": FEATURES,
    "importance": xgb_model.feature_importances_
}).sort_values(by="importance", ascending=False)

importance_df


Unnamed: 0,feature,importance
0,qualifying_position,0.363937
6,constructor_rolling_podium_rate,0.355087
3,rolling_podium_rate,0.089779
5,constructor_rolling_avg_finish,0.05699
2,rolling_avg_finish,0.049058
4,constructor_prev_avg_finish,0.044396
1,prev_race_finish,0.040753
