In [3]:
import sys
# Make the sibling ../utils directory importable (presumably where the local
# `pipeline` module lives - confirm against the repo layout).
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '../utils' not in sys.path:
    sys.path.append('../utils')

In [4]:
# Import the local `pipeline` module as a module object first so it can be reloaded below.
import pipeline
import importlib
import pandas as pd

# Re-execute `pipeline` so edits made to its source since the kernel started are
# picked up. This must run BEFORE the `from pipeline import ...` lines: those bind
# names to whatever the module defines at that moment, so importing first would
# leave the notebook holding the stale definitions.
importlib.reload(pipeline)
from pipeline import extract_session_data, clean_session_data, get_driver_laps, get_manual_finish_dict
from pipeline import assemble_race_dataset, extract_fp2_features, extract_quali_features, assemble_race_dataset_pre_race
from fastf1 import get_session

In [5]:
# Singapore GP 2025 (Round 18)
# Assembled before the race took place, so no race-finish data is included.

# Load both sessions up front (FP2 first, then qualifying).
df_fp2_18 = extract_session_data(2025, "Singapore", "FP2")
df_quali_18 = extract_session_data(2025, "Singapore", "Q")

# Practice 2: clean the raw session frame, then derive the FP2 features.
fp2_clean_18 = clean_session_data(df_fp2_18)
fp2_features_18 = extract_fp2_features(fp2_clean_18)

# Qualifying: clean the raw session frame, then derive the qualifying features.
quali_clean_18 = clean_session_data(df_quali_18)
quali_features_18 = extract_quali_features(quali_clean_18)

# Combine both feature sets into the pre-race table and tag every row with the event.
race_df_18 = assemble_race_dataset_pre_race(fp2_features_18, quali_features_18)
race_df_18["grand_prix"] = "Singapore"
race_df_18

core           INFO 	Loading data for Singapore Grand Prix - Practice 2 [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '5', '6', '10', '12', '14', '16', '18', '22', '23', '27', '30', '31', '43', '44', '55', '63', '81', '87']
core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.4.4]
req            INFO 	Using

Unnamed: 0,Driver,fp2_avg_lap,fp2_best_lap,fp2_total_laps,FastestQualiLap,QualiPosition,grand_prix
0,ALB,114.801909,92.06,11,90.202,12,Singapore
1,ALO,110.489846,90.877,13,89.955,10,Singapore
2,ANT,100.927,92.719,9,89.537,4,Singapore
3,BEA,104.0635,91.711,6,89.868,9,Singapore
4,BOR,112.5625,92.319,8,90.82,16,Singapore
5,COL,104.457,93.139,9,90.982,18,Singapore
6,GAS,103.17175,92.458,8,91.261,20,Singapore
7,HAD,117.708273,90.846,11,89.846,8,Singapore
8,HAM,108.410571,91.491,7,89.688,6,Singapore
9,HUL,109.689273,92.069,11,90.141,11,Singapore


In [6]:
# Persist the pre-race feature table; index=False keeps the row index out of the CSV.
race_df_18.to_csv(path_or_buf="../data/r18_singapore_2025.csv", index=False)

In [7]:
import joblib

# Load the persisted feature scaler and classifier (a scaled logistic regression
# fitted before Round 18, judging by the file names).
# NOTE(review): joblib.load unpickles its input, so these files must only ever
# come from a trusted source.
scaler = joblib.load("../models/scaler_pre_r18_singapore.pkl")
model = joblib.load("../models/logreg_scaled_quali_pre_r18_singapore.pkl")

In [8]:
import pandas as pd

# Read back the feature table written to this same path earlier in the notebook
# and preview its first five rows.
df_singapore = pd.read_csv("../data/r18_singapore_2025.csv")
df_singapore.head(n=5)

Unnamed: 0,Driver,fp2_avg_lap,fp2_best_lap,fp2_total_laps,FastestQualiLap,QualiPosition,grand_prix
0,ALB,114.801909,92.06,11,90.202,12,Singapore
1,ALO,110.489846,90.877,13,89.955,10,Singapore
2,ANT,100.927,92.719,9,89.537,4,Singapore
3,BEA,104.0635,91.711,6,89.868,9,Singapore
4,BOR,112.5625,92.319,8,90.82,16,Singapore


In [9]:
# Model inputs, in the column order handed to the scaler and the model.
feature_cols = [
    'fp2_avg_lap',
    'fp2_best_lap',
    'fp2_total_laps',
    'FastestQualiLap',
    'QualiPosition',
]
X_singapore = df_singapore.loc[:, feature_cols]

# Apply the already-fitted scaler, then keep column 1 of predict_proba, which is
# used here as the podium probability.
X_singapore_scaled = scaler.transform(X_singapore)
proba_podium = model.predict_proba(X_singapore_scaled)[:, 1]

# Attach the probabilities to a copy of the feature table (df_singapore itself is
# left untouched) and order drivers from most to least likely.
results = df_singapore.assign(podium_probability=proba_podium)
results_sorted = results.sort_values(by="podium_probability", ascending=False)

# Two-column view used for display in the following cells.
predicted_podium = results_sorted.loc[:, ["Driver", "podium_probability"]]

In [10]:
# Full driver ranking by modelled podium probability, highest first.
predicted_podium

Unnamed: 0,Driver,podium_probability
15,RUS,0.894233
19,VER,0.892693
14,PIA,0.828713
2,ANT,0.694641
12,NOR,0.537308
8,HAM,0.347043
11,LEC,0.238252
7,HAD,0.163438
3,BEA,0.07294
1,ALO,0.064597


In [11]:
# Singapore GP 2025: the three drivers with the highest modelled podium
# probability (predicted_podium is already sorted in descending order).
predicted_podium.head(3)

Unnamed: 0,Driver,podium_probability
15,RUS,0.894233
19,VER,0.892693
14,PIA,0.828713


In [13]:
# Sanity check: the model should carry one coefficient per input feature.
n_coefficients = model.coef_.shape[1]
n_input_features = len(feature_cols)
print("Number of coefficients:", n_coefficients)
print("Number of input features:", n_input_features)

Number of coefficients: 5
Number of input features: 5


In [14]:
import numpy as np
import pandas as pd

# 1) Signed coefficients, listed from largest to smallest absolute magnitude.
coef_s = pd.Series(np.ravel(model.coef_), index=feature_cols)
print("Coefficients (signed):")
print(coef_s.sort_values(ascending=False, key=np.abs))

# 2) Odds ratios: exp(coefficient) is the multiplicative change in the odds for a
#    one-unit increase of the scaled feature. That equals "+1 SD" provided the
#    scaler standardises each feature to unit variance - TODO confirm scaler type.
or_per_std = coef_s.pipe(np.exp)
print("\nOdds ratio per +1 SD:")
print(or_per_std.sort_values(ascending=False))

Coefficients (signed):
QualiPosition     -3.547228
fp2_total_laps     0.439475
FastestQualiLap    0.377708
fp2_avg_lap       -0.072973
fp2_best_lap       0.064597
dtype: float64

Odds ratio per +1 SD:
fp2_total_laps     1.551892
FastestQualiLap    1.458937
fp2_best_lap       1.066729
fp2_avg_lap        0.929626
QualiPosition      0.028804
dtype: float64


In [15]:
from scipy.stats import spearmanr

# Rank drivers by predicted podium probability (rank 1 = highest probability) and
# correlate that ranking with qualifying position. A value of 1.0 means the model
# orders the drivers exactly as they qualified.
X_sg = df_singapore.loc[:, feature_cols]
p_sg = model.predict_proba(scaler.transform(X_sg))[:, 1]
rank_pred = pd.Series(p_sg).rank(ascending=False)
rho, _ = spearmanr(a=rank_pred, b=df_singapore["QualiPosition"])
print("\nSpearman(predicted rank vs starting grid):", round(rho, 3))


Spearman(predicted rank vs starting grid): 1.0


In [16]:
# Ablation: give every driver the same QualiPosition (the median) and check whether
# the predicted ordering still follows the grid once that feature carries no signal.
qp_med = X_sg["QualiPosition"].median()
X_freeze = X_sg.assign(QualiPosition=qp_med)

p_freeze = model.predict_proba(scaler.transform(X_freeze))[:, 1]
rho_freeze, _ = spearmanr(
    a=pd.Series(p_freeze).rank(ascending=False),
    b=df_singapore["QualiPosition"],
)
print("Spearman after freezing QualiPosition:", round(rho_freeze, 3))

Spearman after freezing QualiPosition: -0.177


In [18]:
# Unmodified model probabilities, plus their rank correlation with qualifying
# position computed through pandas' Spearman option.
p = model.predict_proba(scaler.transform(df_singapore.loc[:, feature_cols]))[:, 1]
spearman = (
    pd.Series(p)
    .rank(ascending=False)
    .corr(df_singapore["QualiPosition"], method="spearman")
)
print("Spearman vs grid:", round(spearman, 3))

Spearman vs grid: 1.0


In [21]:
# Inputs reused from earlier cells:
#   p        - model probabilities on the real features
#   p_freeze - probabilities with QualiPosition set to its median for all drivers

lam = 0.40  # blend weight in 0..1; higher = less grid-like (try 0.3-0.6)
p_blend = lam * p_freeze + (1 - lam) * p

# One row per driver with the raw, frozen and blended probabilities, ordered by
# the blended value (highest first) and re-indexed from 0.
out = (
    df_singapore[["Driver", "QualiPosition"]]
    .assign(p_model=p, p_freeze=p_freeze, p_final=p_blend)
    .sort_values(by="p_final", ascending=False)
    .reset_index(drop=True)
)

# How closely the blended ranking still tracks qualifying position.
rho_final = (
    out["p_final"]
    .rank(ascending=False)
    .corr(out["QualiPosition"], method="spearman")
)
print("Spearman after blend:", round(rho_final, 3))
out

Spearman after blend: 0.923


Unnamed: 0,Driver,QualiPosition,p_model,p_freeze,p_final
0,VER,2,0.892693,0.03987,0.551564
1,RUS,1,0.894233,0.022122,0.545389
2,PIA,3,0.828713,0.04311,0.514472
3,ANT,4,0.694641,0.038014,0.43199
4,NOR,5,0.537308,0.036267,0.336891
5,HAM,6,0.347043,0.03113,0.220678
6,LEC,7,0.238252,0.034071,0.156579
7,HAD,8,0.163438,0.03948,0.113855
8,ALO,10,0.064597,0.048127,0.058009
9,BEA,9,0.07294,0.029954,0.055746


In [22]:
# Final podium prediction, built from the blended probability p_final.
# Note: this rebinds `predicted_podium`, which now also carries QualiPosition.
predicted_podium = out[["Driver", "QualiPosition", "p_final"]].rename(
    columns={"p_final": "podium_probability"}
)
predicted_podium = predicted_podium.sort_values(by="podium_probability", ascending=False)
predicted_podium = predicted_podium.reset_index(drop=True)

# Ten most likely podium finishers.
predicted_podium.head(n=10)

Unnamed: 0,Driver,QualiPosition,podium_probability
0,VER,2,0.551564
1,RUS,1,0.545389
2,PIA,3,0.514472
3,ANT,4,0.43199
4,NOR,5,0.336891
5,HAM,6,0.220678
6,LEC,7,0.156579
7,HAD,8,0.113855
8,ALO,10,0.058009
9,BEA,9,0.055746
