In [7]:
# Initial libraries, seed set, data and model directories, metrics functions

import os
import json
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor


RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
sns.set(style='whitegrid')
DATA_DIR = r'C:\Users\ASUS\Desktop\F1 Predictions & Visualizations\F1-ML-Project\data\features'
MODEL_DIR = r'C:\Users\ASUS\Desktop\F1 Predictions & Visualizations\F1-ML-Project\data\models'

os.makedirs(MODEL_DIR, exist_ok = True)

def mae(y_true, y_pred): return mean_absolute_error(y_true, y_pred)
def rmse(y_true, y_pred): return np.sqrt(mean_squared_error(y_true, y_pred))
def r2(y_true, y_pred): return r2_score(y_true, y_pred)

def evaluate_regression(y_true, y_pred):
    return {
        'MAE': float(mae(y_true, y_pred)),
        'RMSE': float(rmse(y_true,y_pred)),
        'R2': float(r2(y_true, y_pred))
    }


In [8]:
df = pd.read_csv(r'C:\Users\ASUS\Desktop\F1 Predictions & Visualizations\F1-ML-Project\data\features\season_2021_round_1_features.csv')
df.shape

(1027, 57)

In [9]:
df.head()
df.isna().sum().sort_values(ascending=False).head(20)

Unnamed: 0       0
season           0
round            0
session          0
driver_name      0
driver_number    0
team             0
lap_number       0
sector1_time     0
sector2_time     0
sector3_time     0
is_outlap        0
is_inlap         0
position         0
speed_trap       0
compound         0
tyre_age         0
stint_number     0
air_temp         0
track_temp       0
dtype: int64

In [10]:
target = 'lap_time'
drop_cols =  ['lap_time']

candidate_features = [c for c in df.columns if c not in drop_cols and c not in ['race_name', 'driver_name', 'gp', 'season', 'round', 'race_date']]

num_features = df[candidate_features].select_dtypes(include='number').columns.tolist()

X = df[num_features].copy()
y = df[target].copy()

groups = df['round'] ## using race/round as the group for GroupKFold
drivers = df['driver_name']

In [11]:
## do this when we're using the full features csv
# rounds = df['round'].unique()
# train_round = np.random.choice(rounds, size = int(len(rounds)*0.7), replace = False)

# train_mask = df['round'].isin(train_round)

#-------------------------------------------------
# for now use 0.7 quantilee


lap_cutoff = df['lap_time'].quantile(0.7)
train_mask = df['lap_time'] <= lap_cutoff
val_mask = df['lap_time'] > lap_cutoff

#--------------------------------------------------

X_train, X_val  = X[train_mask], X[val_mask]
y_train, y_val = y[train_mask], y[val_mask]



In [12]:
## BASELINE 1 - GLOBAL MEAN
## Naive Baseline = Creating a baseline to compare, this is the simplest possible guess
## global mean is the y_train mean, and pred_mean is an array full of the same y_training data mean, basically comparing the mean of the training data
## and the predicted mean, the simplest prediction of the varaince between the training data and the test data, using the simplest metric 

global_mean = y_train.mean()
pred_mean = np.full(len(y_val), global_mean)

evaluate_regression(y_val, pred_mean)


{'MAE': 12.671586027671909,
 'RMSE': 19.608533942106142,
 'R2': -0.7170647587667422}

In [14]:
## Baseline 2: Feature based baseline, moved from earlier, 'what the average of everyone, to whats the average of every driver', to see how much my traget
## variable is affected by the drivers

# we're checking if knowing the drivers name, helps making my predictions better

driver_mean = y_train.groupby(df.loc[train_mask,'driver_name']).mean() ## per driver average for the initial 70% of the race

## map to validation

pred_driver_mean = df.loc[~train_mask, 'driver_name'].map(driver_mean).fillna(global_mean).values  
evaluate_regression(y_val, pred_driver_mean)

{'MAE': 12.277970995223836,
 'RMSE': 19.449801049554967,
 'R2': -0.6893776834242131}

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

nums_cols = num_features

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', Ridge(alpha=1.0, random_state=RANDOM_SEED))
])


pipeline.fit(X_train[nums_cols], y_train)
pred_lr = pipeline.predict(X_val[nums_cols])
evaluate_regression(y_val, pred_lr)

{'MAE': 1.3392664856200909,
 'RMSE': 2.0721457744821836,
 'R2': 0.9808249265864898}

In [16]:
val_df = df.loc[~train_mask].copy()
val_df['pred_lr'] = pred_lr

val_df['error'] = val_df['pred_lr'] - val_df['lap_time']
val_by_driver = val_df.groupby('driver_name')['error'].apply(lambda s : np.mean(abs(s))).sort_values()
val_by_driver.head(10)

driver_name
VET    0.700592
MSC    0.797701
LAT    0.829032
ALO    1.189792
OCO    1.319799
RUS    1.338028
RIC    1.417728
GAS    1.425539
TSU    1.485703
RAI    1.544448
Name: error, dtype: float64