In [1]:
"""
DECISION TREE & RANDOM FOREST BASELINE
Answers 3 questions:
  1. Do they beat the true online baseline?
  2. How do they compare to each other?
  3. What patterns do they learn?
"""

import numpy as np
import pandas as pd
import json
import warnings
import os
warnings.filterwarnings('ignore')

np.random.seed(42)

# Create output folder for non-linear models
os.makedirs('csv_output/nonlinear', exist_ok=True)

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid, GroupKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:


print("Loading data")
df_train = pd.read_excel('csv_output/Train_set.xlsx')
df_val = pd.read_excel('csv_output/Validation_set.xlsx')
df_test = pd.read_excel('csv_output/Test_set.xlsx')
print(f"  Train: {len(df_train):,} | Val: {len(df_val):,} | Test: {len(df_test):,}")

# Define features
TARGET = 'LapTime_next'

NUMERICAL_FEATURES = [
    'LapInStint', 'LapInStint_squared', 'is_new_tyre', 'TyreAgeAtStart',
    'laptime_rolling_std_3', 'laptime_cumulative_trend', 'laptime_change_prev', #'stint_mean_so_far',
    'AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindSpeed', 'wind_sin', 'wind_cos',
]

# Add binary flags if they exist
for flag in ['is_leader', 'in_drs_range', 'in_clean_air', 'in_dirty_air', 'pushing']:
    if flag in df_train.columns:
        NUMERICAL_FEATURES.append(flag)

CATEGORICAL_FEATURES = ['Compound']

# Add PCA geometry
geom_pca_cols = [c for c in df_train.columns if c.startswith('geom_PC')]
NUMERICAL_FEATURES.extend(geom_pca_cols)
NUMERICAL_FEATURES = [f for f in NUMERICAL_FEATURES if f in df_train.columns]
CATEGORICAL_FEATURES = [f for f in CATEGORICAL_FEATURES if f in df_train.columns]

print(f"  Features: {len(NUMERICAL_FEATURES)} numerical + {len(CATEGORICAL_FEATURES)} categorical")

Loading data
  Train: 28,244 | Val: 5,035 | Test: 7,147
  Features: 18 numerical + 1 categorical


In [3]:
print("\nComputing true online baseline (past laps only)")

for df in [df_train, df_val, df_test]:
    df['race_id'] = df['year'].astype(str) + '_' + df['round'].astype(str) + '_' + df['name']

# Sort test data by race and lap number (to reconstruct chronological order for online baseline)
sort_cols = ['year', 'round', 'name']
if 'lap_number' in df_test.columns:
    sort_cols.append('lap_number')
elif 'LapNumber' in df_test.columns:
    sort_cols.append('LapNumber')
df_test_sorted = df_test.sort_values(sort_cols).reset_index(drop=True)

baseline_preds = np.full(len(df_test_sorted), np.nan, dtype=float)

for (year, round_no, circuit), group_df in df_test_sorted.groupby(['year', 'round', 'name'], sort=False):
    past_laps = []
    for idx in group_df.index:
        lap_idx_in_test = df_test_sorted.index.get_loc(idx)
        if past_laps:
            baseline_preds[lap_idx_in_test] = float(np.median(past_laps))
        past_laps.append(df_test_sorted.at[idx, TARGET])

mask = ~np.isnan(baseline_preds)
baseline_preds_valid = baseline_preds[mask]
y_test_valid = df_test_sorted[TARGET].values[mask]

# Per-race baseline
baseline_per_race = []
for (year, round_no, circuit), group_df in df_test_sorted.groupby(['year', 'round', 'name'], sort=False):
    group_indices = group_df.index
    group_mask = np.zeros(len(df_test_sorted), dtype=bool)
    group_mask[group_indices] = True
    group_mask = group_mask & mask
    if group_mask.sum() > 0:
        baseline_per_race.append(mean_absolute_error(df_test_sorted[TARGET].values[group_mask], baseline_preds[group_mask]))

baseline_median_mae_per_race = float(np.median(baseline_per_race))
print(f"  Baseline (median MAE/race): {baseline_median_mae_per_race:.4f}s")




Computing true online baseline (past laps only)
  Baseline (median MAE/race): 0.7637s


In [4]:
# Prepare datasets

X_train = df_train[NUMERICAL_FEATURES + CATEGORICAL_FEATURES].copy()
y_train = df_train[TARGET].copy()
race_train = df_train['race_id'].copy()

X_val = df_val[NUMERICAL_FEATURES + CATEGORICAL_FEATURES].copy()
y_val = df_val[TARGET].copy()
race_val = df_val['race_id'].copy()

X_test = df_test_sorted[NUMERICAL_FEATURES + CATEGORICAL_FEATURES].copy()
y_test = df_test_sorted[TARGET].copy()
race_test = df_test_sorted['race_id'].copy()

X_trainval = pd.concat([X_train, X_val], axis=0).reset_index(drop=True)
y_trainval = pd.concat([y_train, y_val], axis=0).reset_index(drop=True)
race_trainval = pd.concat([race_train, race_val], axis=0).reset_index(drop=True)

# Race-balanced weights
vc = race_trainval.value_counts()
w_trainval = race_trainval.map(lambda r: 1.0 / vc.loc[r])

# FIX 4: OneHotEncoder compatibility shim
ohe_args = dict(handle_unknown="ignore")
try:
    OHE = OneHotEncoder(sparse_output=False, **ohe_args)
except TypeError:
    OHE = OneHotEncoder(sparse=False, **ohe_args)

# Single preprocessor (reused for both DT and RF)
has_cat = len(CATEGORICAL_FEATURES) > 0
if has_cat:
    preprocessor = ColumnTransformer([
        ('num', SimpleImputer(strategy='median'), NUMERICAL_FEATURES),
        ('cat', OHE, CATEGORICAL_FEATURES)
    ], remainder='drop')
else:
    preprocessor = ColumnTransformer([
        ('num', SimpleImputer(strategy='median'), NUMERICAL_FEATURES),
    ], remainder='drop')

# FIX 1: Precompute valid-subset indices once (don't fit preprocessor externally)
valid_idx = np.where(mask)[0]

# Map each race to positions within the valid subset
race_groups_valid = []
for _, g in df_test_sorted.groupby(['year', 'round', 'name'], sort=False):
    full_idx = g.index.values
    in_valid = np.intersect1d(valid_idx, full_idx)
    if in_valid.size > 0:
        pos_in_valid = np.searchsorted(valid_idx, in_valid)
        race_groups_valid.append(pos_in_valid)

# FIX 3: Helper to get feature names per pipeline
def get_feature_names(num_feats, cat_feats, pipeline=None):
    names = list(num_feats)
    if len(cat_feats) > 0:
        enc = pipeline.named_steps['preprocess'].named_transformers_['cat']
        names += list(enc.get_feature_names_out(cat_feats))
    return names

# FIX 2: Rewritten helper for valid-subset permutation importance (transforms inside using pipeline's preprocessor)
def compute_permutation_importance_valid(pipeline, X_test_raw, y_test_raw, valid_idx, race_groups_valid, feature_names, n_repeats=10):
    """Compute race-aware permutation importance on valid subset using pipeline's preprocessor."""
    rng = np.random.RandomState(42)
    preproc = pipeline.named_steps['preprocess']
    mdl = pipeline.named_steps['model']

    # Transform test data using pipeline's preprocessor (ensures consistency)
    X_test_t = preproc.transform(X_test_raw)
    X_valid = X_test_t[valid_idx]
    y_valid = y_test_raw[valid_idx]

    base_pred = mdl.predict(X_valid)
    base_med = np.median([mean_absolute_error(y_valid[g], base_pred[g]) for g in race_groups_valid if g.size > 0])

    imps = []
    for j, fname in enumerate(feature_names):
        drops = []
        for _ in range(n_repeats):
            Xs = X_valid.copy()
            rng.shuffle(Xs[:, j])
            yp = mdl.predict(Xs)
            med = np.median([mean_absolute_error(y_valid[g], yp[g]) for g in race_groups_valid if g.size > 0])
            drops.append(med - base_med)
        imps.append((fname, float(np.median(drops))))
    
    imps.sort(key=lambda x: x[1], reverse=True)
    return imps[:15]


In [18]:

# DECISION TREE

print("\n[DT] Tuning Decision Tree...")

pipe_dt = Pipeline([("preprocess", preprocessor), ("model", DecisionTreeRegressor(random_state=42))])

# param_dt = {
#     "model__max_depth": [4, 6, 8, 12],
#     "model__min_samples_leaf": [5, 10, 20],
#     "model__min_samples_split": [2, 10],
#     "model__max_features": [None, "sqrt", 0.5],
#     "model__max_leaf_nodes": [None, 64, 128, 256],
#     "model__ccp_alpha": [0.0, 1e-4, 5e-4],
# }

param_dt = {
    "model__max_depth": [6, 10],
    "model__min_samples_leaf": [5, 10],
    "model__max_features": ["sqrt", 0.5],
    "model__ccp_alpha": [0.0, 1e-4],
}

best_dt_score, best_dt_params = np.inf, None
gkf = GroupKFold(n_splits=min(5, race_trainval.nunique()))

for i, params in enumerate(list(ParameterGrid(param_dt)), 1):
    fold_scores = []
    for tr, va in gkf.split(X_trainval, y_trainval, groups=race_trainval):
        pipe_dt.set_params(**params)
        w_tr = race_trainval.iloc[tr].map(lambda r: 1.0 / race_trainval.iloc[tr].value_counts().loc[r])
        pipe_dt.fit(X_trainval.iloc[tr], y_trainval.iloc[tr], model__sample_weight=w_tr.values)
        yp = pipe_dt.predict(X_trainval.iloc[va])
        per_race = [mean_absolute_error(y_trainval.iloc[va][race_trainval.iloc[va] == rid], yp[race_trainval.iloc[va] == rid]) 
                    for rid in race_trainval.iloc[va].unique()]
        fold_scores.append(float(np.median(per_race)))
    score = float(np.median(fold_scores))
    if score < best_dt_score:
        best_dt_score = score
        best_dt_params = params
        if i % 50 == 0: print(f"  [{i}] Best: {score:.4f}s")

print(f"  Best CV: {best_dt_score:.4f}s")

# Evaluate DT
pipe_dt.set_params(**best_dt_params)
pipe_dt.fit(X_trainval, y_trainval, model__sample_weight=w_trainval.values)

dt_pred_all = pipe_dt.predict(X_test)
dt_pred = dt_pred_all[mask]
dt_mae = mean_absolute_error(y_test_valid, dt_pred)
dt_rmse = np.sqrt(mean_squared_error(y_test_valid, dt_pred))
dt_r2 = r2_score(y_test_valid, dt_pred)

# FIX 3: Use race_groups_valid consistently for per-race errors
dt_per_race = []
dt_race_ids = []
for (year, round_no, circuit), g in df_test_sorted.groupby(['year', 'round', 'name'], sort=False):
    group_indices = g.index.values
    group_mask = np.zeros(len(df_test_sorted), dtype=bool)
    group_mask[group_indices] = True
    group_mask = group_mask & mask
    if group_mask.sum() > 0:
        dt_per_race.append(mean_absolute_error(df_test_sorted[TARGET].values[group_mask], dt_pred_all[group_mask]))
        dt_race_ids.append(f"{year}_{round_no}_{circuit}")

dt_median_mae_per_race = float(np.median(dt_per_race))
tree_depth = pipe_dt.named_steps['model'].get_depth()
tree_leaves = pipe_dt.named_steps['model'].get_n_leaves()

print(f"\n  DT Results:")
print(f"    Median MAE/race: {dt_median_mae_per_race:.4f}s")
print(f"    Overall MAE: {dt_mae:.4f}s, RMSE: {dt_rmse:.4f}s, R²: {dt_r2:.4f}")
print(f"    Tree: depth={tree_depth}, leaves={tree_leaves}")

# DT permutation importance (FIX 2: transform inside helper)
print(f"  Computing permutation importance...")
dt_feats = get_feature_names(NUMERICAL_FEATURES, CATEGORICAL_FEATURES, pipe_dt)
dt_importances = compute_permutation_importance_valid(pipe_dt, X_test, y_test.values, valid_idx, race_groups_valid, dt_feats)

# Save DT
pd.DataFrame({
    "Model": ["Decision Tree"],
    "CV_median_race_MAE": [best_dt_score],
    "Test_median_race_MAE": [dt_median_mae_per_race],
    "Test_MAE": [dt_mae],
    "Test_RMSE": [dt_rmse],
    "Test_R2": [dt_r2],
    "Tree_Depth": [tree_depth],
}).to_csv('csv_output/nonlinear/dt_results.csv', index=False)

# FIX 3: Use consistent race identifiers
pd.DataFrame({'Race': dt_race_ids, 'MAE': dt_per_race}).sort_values('MAE', ascending=False).to_csv('csv_output/nonlinear/dt_per_race_mae.csv', index=False)

pd.DataFrame(dt_importances, columns=['Feature', 'Importance']).to_csv('csv_output/nonlinear/dt_feature_importances_perm.csv', index=False)

with open('csv_output/nonlinear/dt_best_hyperparameters.json', 'w') as f:
    json.dump({'DecisionTree': best_dt_params, 'CV_score': float(best_dt_score), 'Test_score': float(dt_median_mae_per_race)}, f, indent=2)




[DT] Tuning Decision Tree...
  Best CV: 7.7178s

  DT Results:
    Median MAE/race: 5.2309s
    Overall MAE: 6.4656s, RMSE: 8.4255s, R²: 0.0102
    Tree: depth=6, leaves=60
  Computing permutation importance...


In [5]:

# RANDOM FOREST

print("\n[RF] Tuning Random Forest...")

# FIX 5: Make RF OOB explicit + keep GroupKFold robust
pipe_rf = Pipeline([
    ("preprocess", preprocessor),
    ("model", RandomForestRegressor(random_state=42, n_jobs=-1, oob_score=True, bootstrap=True))
])

param_rf = {
    "model__n_estimators": [600],
    "model__max_depth": [12, 20, None],
    "model__min_samples_leaf": [5, 10, 20],
    "model__min_samples_split": [2, 10, 20],
    "model__max_features": ["sqrt", 0.5],
}

best_rf_score, best_rf_params = np.inf, None
gkf_rf = GroupKFold(n_splits=min(5, race_trainval.nunique()))

for i, params in enumerate(list(ParameterGrid(param_rf)), 1):
    fold_scores = []
    for tr, va in gkf_rf.split(X_trainval, y_trainval, groups=race_trainval):
        pipe_rf.set_params(**params)
        w_tr = race_trainval.iloc[tr].map(lambda r: 1.0 / race_trainval.iloc[tr].value_counts().loc[r])
        pipe_rf.fit(X_trainval.iloc[tr], y_trainval.iloc[tr], model__sample_weight=w_tr.values)
        yp = pipe_rf.predict(X_trainval.iloc[va])
        per_race = [mean_absolute_error(y_trainval.iloc[va][race_trainval.iloc[va] == rid], yp[race_trainval.iloc[va] == rid]) 
                    for rid in race_trainval.iloc[va].unique()]
        fold_scores.append(float(np.median(per_race)))
    score = float(np.median(fold_scores))
    if score < best_rf_score:
        best_rf_score = score
        best_rf_params = params
        if i % 10 == 0: print(f"  [{i}] Best: {score:.4f}s")

print(f"  Best CV: {best_rf_score:.4f}s")

# Evaluate RF
pipe_rf.set_params(**best_rf_params)
pipe_rf.fit(X_trainval, y_trainval, model__sample_weight=w_trainval.values)

rf_pred_all = pipe_rf.predict(X_test)
rf_pred = rf_pred_all[mask]
rf_mae = mean_absolute_error(y_test_valid, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test_valid, rf_pred))
rf_r2 = r2_score(y_test_valid, rf_pred)
rf_oob = pipe_rf.named_steps['model'].oob_score_

rf_per_race = []
rf_race_ids = []
for (year, round_no, circuit), g in df_test_sorted.groupby(['year', 'round', 'name'], sort=False):
    group_indices = g.index.values
    group_mask = np.zeros(len(df_test_sorted), dtype=bool)
    group_mask[group_indices] = True
    group_mask = group_mask & mask
    if group_mask.sum() > 0:
        rf_per_race.append(mean_absolute_error(df_test_sorted[TARGET].values[group_mask], rf_pred_all[group_mask]))
        rf_race_ids.append(f"{year}_{round_no}_{circuit}")

rf_median_mae_per_race = float(np.median(rf_per_race))

print(f"\n  RF Results:")
print(f"    Median MAE/race: {rf_median_mae_per_race:.4f}s")
print(f"    Overall MAE: {rf_mae:.4f}s, RMSE: {rf_rmse:.4f}s, R²: {rf_r2:.4f}")
print(f"    OOB R²: {rf_oob:.4f} (diagnostic)")

# RF permutation importance (FIX 2: transform inside helper)
print(f"  Computing permutation importance...")
rf_feats = get_feature_names(NUMERICAL_FEATURES, CATEGORICAL_FEATURES, pipe_rf)
rf_importances = compute_permutation_importance_valid(pipe_rf, X_test, y_test.values, valid_idx, race_groups_valid, rf_feats)


# Save RF
pd.DataFrame({
    "Model": ["Random Forest"],
    "CV_median_race_MAE": [best_rf_score],
    "Test_median_race_MAE": [rf_median_mae_per_race],
    "Test_MAE": [rf_mae],
    "Test_RMSE": [rf_rmse],
    "Test_R2": [rf_r2],
    "OOB_R2": [rf_oob],
}).to_csv('csv_output/nonlinear/rf_results.csv', index=False)

# FIX 3: Use consistent race identifiers
pd.DataFrame({'Race': rf_race_ids, 'MAE': rf_per_race}).sort_values('MAE', ascending=False).to_csv('csv_output/nonlinear/rf_per_race_mae.csv', index=False)

pd.DataFrame(rf_importances, columns=['Feature', 'Importance']).to_csv('csv_output/nonlinear/rf_feature_importances_perm.csv', index=False)

with open('csv_output/nonlinear/rf_best_hyperparameters.json', 'w') as f:
    json.dump({'RandomForest': best_rf_params, 'CV_score': float(best_rf_score), 'Test_score': float(rf_median_mae_per_race)}, f, indent=2)




[RF] Tuning Random Forest...
  Best CV: 6.4333s

  RF Results:
    Median MAE/race: 2.6484s
    Overall MAE: 4.4563s, RMSE: 5.8316s, R²: 0.5258
    OOB R²: 0.9875 (diagnostic)
  Computing permutation importance...


In [6]:

# k-NEAREST NEIGHBORS (kNN)

print("\n[kNN] Tuning k-Nearest Neighbors...")

# kNN-specific preprocessing: StandardScaler is critical for distance-based models
#              numeric features scaled to N(0,1), categorical one-hot in {0,1}
#              This asymmetry is acceptable for initial implementation.
#              If kNN performs poorly, consider full-matrix scaling with StandardScaler(with_mean=False)


preprocessor_knn = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),  # Essential for kNN: normalize feature magnitudes
        ]), NUMERICAL_FEATURES),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),  # FIXED: use sparse to save RAM
        ]), CATEGORICAL_FEATURES),
    ],
    remainder="drop"
)

pipe_knn = Pipeline([("preprocess", preprocessor_knn), ("model", KNeighborsRegressor())])

param_knn = {
    "model__n_neighbors": [3, 5, 7, 11, 15, 25],      # Odd numbers preferred (symmetry/stability)
    "model__weights": ["uniform", "distance"],        # uniform vs. distance-weighted
    "model__p": [1, 2],                               # 1=Manhattan, 2=Euclidean (Minkowski)
}

best_knn_score = np.inf
best_knn_params = None
knn_fold_scores = []

gkf = GroupKFold(n_splits=min(5, df_train['race_id'].nunique()))

for params in ParameterGrid(param_knn):
    fold_mae = []
    for tr_idx, val_idx in gkf.split(X_train, y_train.values, groups=df_train['race_id']):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[val_idx]
        y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        
        pipe_knn.set_params(**params)
        pipe_knn.fit(X_tr, y_tr)
        pred = pipe_knn.predict(X_va)
        fold_mae.append(mean_absolute_error(y_va, pred))
    
    score = np.median(fold_mae)  # Consistent with per-race grouping philosophy
    knn_fold_scores.append(score)
    
    if score < best_knn_score:
        best_knn_score = score
        best_knn_params = params

print(f"  Best params: {best_knn_params}")
print(f"  CV median MAE (fold-level): {best_knn_score:.4f}s")

# Train final kNN on full train set
pipe_knn.set_params(**best_knn_params)
pipe_knn.fit(X_train, y_train.values)

# Test predictions
knn_pred_all = pipe_knn.predict(X_test)
knn_mae = mean_absolute_error(y_test, knn_pred_all)
knn_rmse = np.sqrt(mean_squared_error(y_test, knn_pred_all))
knn_r2 = r2_score(y_test, knn_pred_all)

# Per-race evaluation (FIXED: use positions, not index labels)
knn_per_race = []
knn_race_ids = []
for (year, round_no, circuit), group_df in df_test_sorted.groupby(['year', 'round', 'name'], sort=False):
    # Convert index labels to positions (handles non-contiguous indices)
    pos = df_test_sorted.index.get_indexer(group_df.index.to_numpy())
    # Guard: filter out -1 (invalid) positions
    pos = pos[pos >= 0]
    # Filter to valid (non-NaN) positions
    valid_pos = pos[mask[pos]]
    if len(valid_pos) > 0:
        knn_per_race.append(mean_absolute_error(df_test_sorted[TARGET].values[valid_pos], knn_pred_all[valid_pos]))
        knn_race_ids.append(f"{year}_{round_no}_{circuit}")

knn_median_mae_per_race = float(np.median(knn_per_race))

print(f"\n  kNN Results:")
print(f"    Median MAE/race: {knn_median_mae_per_race:.4f}s")
print(f"    Overall MAE: {knn_mae:.4f}s, RMSE: {knn_rmse:.4f}s, R²: {knn_r2:.4f}")

# kNN permutation importance (same approach as DT/RF)
print(f"  Computing permutation importance...")
knn_feats = get_feature_names(NUMERICAL_FEATURES, CATEGORICAL_FEATURES, pipe_knn)
knn_importances = compute_permutation_importance_valid(pipe_knn, X_test, y_test.values, valid_idx, race_groups_valid, knn_feats)

# Save kNN
pd.DataFrame({
    "Model": ["k-Nearest Neighbors"],
    "CV_median_race_MAE": [best_knn_score],
    "Test_median_race_MAE": [knn_median_mae_per_race],
    "Test_MAE": [knn_mae],
    "Test_RMSE": [knn_rmse],
    "Test_R2": [knn_r2],
}).to_csv('csv_output/nonlinear/knn_results.csv', index=False)

pd.DataFrame({'Race': knn_race_ids, 'MAE': knn_per_race}).sort_values('MAE', ascending=False).to_csv('csv_output/nonlinear/knn_per_race_mae.csv', index=False)

pd.DataFrame(knn_importances, columns=['Feature', 'Importance']).to_csv('csv_output/nonlinear/knn_feature_importances_perm.csv', index=False)

with open('csv_output/nonlinear/knn_best_hyperparameters.json', 'w') as f:
    json.dump({'kNN': best_knn_params, 'CV_score': float(best_knn_score), 'Test_score': float(knn_median_mae_per_race)}, f, indent=2)



[kNN] Tuning k-Nearest Neighbors...
  Best params: {'model__n_neighbors': 25, 'model__p': 2, 'model__weights': 'distance'}
  CV median MAE (fold-level): 8.9625s

  kNN Results:
    Median MAE/race: 6.7547s
    Overall MAE: 8.2754s, RMSE: 11.6704s, R²: -0.8987
  Computing permutation importance...


In [7]:

# SMALL DIAGNOSTIC MLP (Shallow Neural Network)
# Purpose: Detect smooth non-linear interactions

print("\n[MLP] Training diagnostic MLP (small, shallow network)...")

# MLP-specific preprocessing: StandardScaler is REQUIRED for neural networks
preprocessor_mlp = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),  # CRITICAL: neural nets need normalization
        ]), NUMERICAL_FEATURES),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),  # FIXED: dense for MLP compatibility
        ]), CATEGORICAL_FEATURES),
    ],
    remainder="drop"
)

# Simple MLPRegressor: [(32, 16)] architecture
# - Shallow: only 2 hidden layers
# - Low capacity: 32→16 neurons (approximates smooth interactions)
# - Early stopping + L2 regularization: prevents overfitting
mlp_model = MLPRegressor(
    hidden_layer_sizes=(32, 16),
    activation="relu",
    solver="adam",
    learning_rate_init=1e-3,       # Learning rate (stable default)
    batch_size=256,                # Batch size (stable)
    alpha=1e-3,                    # L2 regularization (mild)
    max_iter=500,
    early_stopping=True,           # Stop if validation doesn't improve
    validation_fraction=0.1,       # Use 10% of train for early stopping validation
    n_iter_no_change=20,           # Stop after 20 no-improvement epochs
    random_state=42,
    verbose=0
)

pipe_mlp = Pipeline([
    ("preprocess", preprocessor_mlp),
    ("model", mlp_model)
])

# Train on full training set (no hyperparameter tuning for MLP — keep it diagnostic)
print("  Fitting MLP on full training set...")
pipe_mlp.fit(X_train, y_train.values)

# Test predictions
mlp_pred_all = pipe_mlp.predict(X_test)
mlp_mae = mean_absolute_error(y_test, mlp_pred_all)
mlp_rmse = np.sqrt(mean_squared_error(y_test, mlp_pred_all))
mlp_r2 = r2_score(y_test, mlp_pred_all)

# Per-race evaluation (using corrected indexing from kNN)
mlp_per_race = []
mlp_race_ids = []
for (year, round_no, circuit), group_df in df_test_sorted.groupby(['year', 'round', 'name'], sort=False):
    # Convert index labels to positions (handles non-contiguous indices)
    pos = df_test_sorted.index.get_indexer(group_df.index.to_numpy())
    # Guard: filter out -1 (invalid) positions
    pos = pos[pos >= 0]
    # Filter to valid (non-NaN) positions
    valid_pos = pos[mask[pos]]
    if len(valid_pos) > 0:
        mlp_per_race.append(mean_absolute_error(df_test_sorted[TARGET].values[valid_pos], mlp_pred_all[valid_pos]))
        mlp_race_ids.append(f"{year}_{round_no}_{circuit}")

mlp_median_mae_per_race = float(np.median(mlp_per_race))

# Get n_iter from fitted model inside pipeline
mlp_fitted_model = pipe_mlp.named_steps["model"]
mlp_n_iter = mlp_fitted_model.n_iter_

print(f"\n  MLP Results (diagnostic, non-optimized):")
print(f"    Median MAE/race: {mlp_median_mae_per_race:.4f}s")
print(f"    Overall MAE: {mlp_mae:.4f}s, RMSE: {mlp_rmse:.4f}s, R²: {mlp_r2:.4f}")
print(f"    Network: input → [32 ReLU] → [16 ReLU] → output")
print(f"    Early stopping: converged after {mlp_n_iter} iterations")

# MLP permutation importance (same approach as others)
print(f"  Computing permutation importance...")
mlp_feats = get_feature_names(NUMERICAL_FEATURES, CATEGORICAL_FEATURES, pipe_mlp)
mlp_importances = compute_permutation_importance_valid(pipe_mlp, X_test, y_test.values, valid_idx, race_groups_valid, mlp_feats)

# Save MLP
pd.DataFrame({
    "Model": ["Small MLP"],
    "Architecture": ["(32, 16)"],
    "Test_median_race_MAE": [mlp_median_mae_per_race],
    "Test_MAE": [mlp_mae],
    "Test_RMSE": [mlp_rmse],
    "Test_R2": [mlp_r2],
    "Iterations": [mlp_n_iter],
}).to_csv('csv_output/nonlinear/mlp_results.csv', index=False)

pd.DataFrame({'Race': mlp_race_ids, 'MAE': mlp_per_race}).sort_values('MAE', ascending=False).to_csv('csv_output/nonlinear/mlp_per_race_mae.csv', index=False)

pd.DataFrame(mlp_importances, columns=['Feature', 'Importance']).to_csv('csv_output/nonlinear/mlp_feature_importances_perm.csv', index=False)

with open('csv_output/nonlinear/mlp_best_hyperparameters.json', 'w') as f:
    json.dump({
        'MLP': {
            'hidden_layer_sizes': (32, 16),
            'activation': 'relu',
            'alpha': 1e-3,
            'early_stopping': True,
            'n_iter_no_change': 20
        },
        'Test_score': float(mlp_median_mae_per_race),
        'Iterations': int(mlp_n_iter),
        'Purpose': 'Diagnostic - detect smooth non-linear interactions'
    }, f, indent=2)



[MLP] Training diagnostic MLP (small, shallow network)...
  Fitting MLP on full training set...

  MLP Results (diagnostic, non-optimized):
    Median MAE/race: 4.7458s
    Overall MAE: 7.1505s, RMSE: 10.3437s, R²: -0.4915
    Network: input → [32 ReLU] → [16 ReLU] → output
    Early stopping: converged after 478 iterations
  Computing permutation importance...


In [None]:
# ══════════════════════════════════════════════════════════════════════════
# SUMMARY
# ══════════════════════════════════════════════════════════════════════════

print("\n" + "="*80)
print("SUMMARY: NON-LINEAR BASELINE")
print("="*80)
print(f"\nBaseline (online, past laps):      {baseline_median_mae_per_race:.4f}s")
print(f"Decision Tree:                     {dt_median_mae_per_race:.4f}s")
print(f"Random Forest:                     {rf_median_mae_per_race:.4f}s")
print(f"\nWinner: {'Random Forest' if rf_median_mae_per_race < dt_median_mae_per_race else 'Decision Tree'}")
print("\nFiles saved to csv_output/nonlinear/:")
print("  - dt_results.csv, dt_per_race_mae.csv, dt_feature_importances_perm.csv, dt_best_hyperparameters.json")
print("  - rf_results.csv, rf_per_race_mae.csv, rf_feature_importances_perm.csv, rf_best_hyperparameters.json")
print("\n✓ Non-linear baseline complete!\n")



SUMMARY: NON-LINEAR BASELINE

Baseline (online, past laps):      0.7905s
Decision Tree:                     0.7417s
Random Forest:                     0.5688s

Winner: Random Forest

Files saved to csv_output/nonlinear/:
  - dt_results.csv, dt_per_race_mae.csv, dt_feature_importances_perm.csv, dt_best_hyperparameters.json
  - rf_results.csv, rf_per_race_mae.csv, rf_feature_importances_perm.csv, rf_best_hyperparameters.json

✓ Non-linear baseline complete!



Why Random Forest and Decision Tree underperdform ? 
- Are they underfitting ? 
- Feature interactions 
- GB 

why it might be happening ?  relationship is fairly linear , df/rf may be overfitting, Stint mean so far does most of the job 
