# ML Modelling

In [None]:
# Manipulation and Visualization
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
import unidecode
import plotly.graph_objects as go
import plotly.express as px
import json
from IPython.display import display, HTML

# API and Web Scraping
import requests
from datetime import datetime
from bs4 import BeautifulSoup

# Machine Learning - Preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.preprocessing import normalize, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, auc

from sklearn.metrics import confusion_matrix, plot_confusion_matrix,\
    precision_score, recall_score, accuracy_score, f1_score, log_loss,\
    roc_curve, roc_auc_score, classification_report
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier
from collections import Counter
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import RFECV

# Machine Learning - Modelling & Metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier

# NN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.wrappers import scikit_learn
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping
from keras.constraints import maxnorm
from tensorflow.keras import activations

import warnings
warnings.filterwarnings('ignore')


### (1) Final Data Cleaning and Feature Engineering

In [7]:
f1_data = pd.read_csv(r'/Users/julianterenzio/Desktop/f1_final_df.csv')
f1_races = pd.read_csv(r'/Users/julianterenzio/Desktop/f1_races_df.csv')

# Convert dates into datetimes and find relative driver age

f1_races['race_date'] = pd.to_datetime(f1_races['race_date'])
f1_races['dateOfBirth'] = pd.to_datetime(f1_races['dateOfBirth'])
f1_races['driver_age'] = [relativedelta(x, y).years for x, y in zip(f1_races['race_date'], f1_races['dateOfBirth'])]

f1_data['race_date'] = pd.to_datetime(f1_data['race_date'])
f1_data['dateOfBirth'] = pd.to_datetime(f1_data['dateOfBirth'])
f1_data['driver_age'] = [relativedelta(x, y).years for x, y in zip(f1_data['race_date'], f1_data['dateOfBirth'])]


# Clean DNF status data to reflect concise categorical variables

def accident(x):
    if x in ['collision damage','accident','collision','disqualified','damage','spun off','retired']:
        return 'accident'
    else:
        return x

def categories(x):
    if x == 'finished' or x == 'lapped' or x == 'accident':
        return x
    else:
        return 'mechanical_issue'

f1_data['DNF_status'] = f1_data['DNF_status'].apply(lambda x: x.lower())
f1_data.loc[f1_data['DNF_status'].str.contains('lap'), 'DNF_status'] = 'lapped'
f1_data['DNF_status'] = f1_data['DNF_status'].apply(accident)
f1_data['DNF_status'] = f1_data['DNF_status'].apply(categories)

f1_data = f1_data.sort_values(['season', 'round', 'podium'], ascending=(True, True, True))


# Clean qualifying data from string object to seconds float

def clean_times(factor, df_series):
    f1_data[df_series].fillna('00:00.000', inplace = True)
    f1_data.replace({df_series: {'DEL': '00:00.000', 'DNF': '00:00.000', 'DNS': '00:00.000'}}, inplace = True) 
    f1_data[df_series+'_v2'] = [(sum([x * y for x, y in zip(factor, map(float, n.split(':')))]) if ':' in n else float(n)) for n in f1_data[df_series]]

clean_times([60, 1], 'Time')
clean_times([60, 1], 'Q1')
clean_times([60, 1], 'Q2')
clean_times([60, 1], 'Q3')


# Engineer qualifying time feature to reflect cumulative difference from fastest qualifier

f1_data['qualy_list'] = f1_data[['Time_v2', 'Q1_v2', 'Q2_v2', 'Q3_v2']].values.tolist()
f1_data['qualy_list'] = [[item for item in x if item != 0.0] for x in f1_data['qualy_list']]
f1_data = f1_data[f1_data['qualy_list'].map(lambda x: len(x)) > 0]
f1_data['qualy_time'] = [min(x) for x in f1_data['qualy_list']]
f1_data.drop(['Time', 'Q1', 'Q2', 'Q3', 'Time_v2', 'Q1_v2', 'Q2_v2', 'Q3_v2', 'qualy_list'], axis = 1, inplace = True)
f1_data.sort_values(['season', 'round', 'podium'], inplace = True)
f1_data['qualifying_time_diff'] = f1_data.groupby(['season', 'round']).qualy_time.diff()
f1_data['qualy_sec'] = f1_data.groupby(['season', 'round']).qualifying_time_diff.cumsum().fillna(0.000)
f1_data.drop(['qualifying_time_diff', 'qualy_time'], axis = 1, inplace = True)


# Engineer/clean finishing time data to reflect cumulative difference from fastest (first place) driver
## Null finish times for drivers that have been "lapped" or "dnf" reflect the slowest finish time + 10 second error factor

max_finish = f1_data.groupby(['season', 'round']).max().reset_index()[['season', 'round','finish_ms']]
max_finish.rename(columns = {'finish_ms': 'finish_ms_v2'}, inplace = True)
f1_data['finish_ms'] = f1_data['finish_ms'].fillna(0.0)
merged_df = pd.merge(f1_data, 
                     max_finish, 
                     how='inner', 
                     on=['season', 'round']).fillna(0.0)
merged_df['finish_ms_v2'] = merged_df['finish_ms_v2'] + (merged_df['podium']*1000.0)
f1_data['finish_ms'] = [merged_df.iloc[i,-1] if x == 0.0 else x for i, x in enumerate(f1_data['finish_ms'])]
f1_data['finish_sec_v1'] = f1_data['finish_ms'] / 1000.0
f1_data['finish_sec_diff'] = f1_data.groupby(['season', 'round']).finish_sec_v1.diff()
f1_data['finish_sec'] = f1_data.groupby(['season', 'round']).finish_sec_diff.cumsum().fillna(0.000)
f1_data.drop(['finish_sec_diff', 'finish_sec_v1'], axis = 1, inplace = True)

# Turn DNF_status into binary variable columns for better analysis and feature engineering

for status in f1_data['DNF_status'].unique():
    f1_data[status] = f1_data['DNF_status'].apply(lambda x: 1 if (status == x) else 0)

f1_data['cum_finish_ratio'] = f1_data.groupby(['full_name']).finished.cumsum() / (f1_data.groupby(['full_name']).finished.cumcount() + 1)
f1_data['cum_accident_ratio'] = f1_data.groupby(['full_name']).accident.cumsum() / (f1_data.groupby(['full_name']).finished.cumcount() + 1)
f1_data['cum_lapped_ratio'] = f1_data.groupby(['full_name']).lapped.cumsum() / (f1_data.groupby(['full_name']).finished.cumcount() + 1)
f1_data[['cum_finish_ratio', 'cum_accident_ratio', 'cum_lapped_ratio']] = f1_data[['cum_finish_ratio', 'cum_accident_ratio', 'cum_lapped_ratio']].round(5)

f1_data = f1_data.drop(['finished', 'lapped', 'mechanical_issue', 'accident'], axis=1)

# Clean "0" Qualifying Errors

f1_data = f1_data[(f1_data['grid_position'] != 0)].reset_index()


# Only as good as your last race: calcualte the 3-race rolling average finishing time

final_df = pd.DataFrame()
for name in f1_data['full_name'].unique():
    test_df = f1_data
    test_df = test_df[(test_df['full_name'] == name)].reset_index()
    test_df['rolling_3_finish_sec'] = test_df['finish_sec'].transform(lambda x: round(x.rolling(3, min_periods=0).mean().shift().bfill(), 3))
    test_df['rolling_3_qualy_sec'] = test_df['qualy_sec'].transform(lambda x: round(x.rolling(3, min_periods=0).mean().shift().bfill(), 3))
    final_df = pd.concat([final_df, test_df])

final_df['rolling_3_finish_sec'] = final_df['rolling_3_finish_sec'].fillna(0.000)
final_df['rolling_3_qualy_sec'] = final_df['rolling_3_qualy_sec'].fillna(0.000)
final_df = final_df.drop(['index'], axis=1)
final_df = final_df.sort_values(['season', 'round', 'podium'], ascending=(True, True, True)).reset_index(drop=True)


# Voila!

pd.set_option('display.max_columns', None)
print(f1_data.shape)
print(final_df.shape)
display(final_df.tail())


(14039, 36)
(14039, 38)


Unnamed: 0,level_0,season,round,circuit_id,country,lat,long,race_date,grid_position,podium,points_earned,finish_ms,DNF_status,constructor,full_name,dateOfBirth,nationality,points_after,wins_after,points_before,wins_before,c_points_after,c_wins_after,c_points_before,c_wins_before,warm_weather,cold_weather,dry_weather,wet_weather,cloudy_weather,driver_age,qualy_sec,finish_sec,cum_finish_ratio,cum_accident_ratio,cum_lapped_ratio,rolling_3_finish_sec,rolling_3_qualy_sec
14034,14034,2020,17,yas_marina,UAE,24.4672,54.6031,2020-12-13,14,16,0.0,5894363.0,lapped,alfa,antonio_giovinazzi,1993-12-14,Italian,4.0,0.0,4.0,0.0,8.0,0.0,8.0,0.0,True,False,False,False,False,26,1.829,105.718,0.30769,0.12821,0.53846,59.458,1.229
14035,14035,2020,17,yas_marina,UAE,24.4672,54.6031,2020-12-13,18,17,0.0,5895363.0,lapped,williams,nicholas_latifi,1995-06-29,Canadian,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,25,3.197,106.718,0.35294,0.11765,0.47059,39.599,2.069
14036,14036,2020,17,yas_marina,UAE,24.4672,54.6031,2020-12-13,20,18,0.0,5896363.0,lapped,haas,kevin_magnussen,1992-10-05,Danish,1.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,True,False,False,False,False,28,2.617,107.718,0.31356,0.07627,0.47458,61.544,6.404
14037,14037,2020,17,yas_marina,UAE,24.4672,54.6031,2020-12-13,17,19,0.0,5897363.0,lapped,haas,pietro_fittipaldi,1996-06-25,Brazilian,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,True,False,False,False,False,24,2.927,108.718,0.5,0.0,0.5,36.858,1.639
14038,14038,2020,17,yas_marina,UAE,24.4672,54.6031,2020-12-13,19,20,0.0,5898363.0,mechanical_issue,racing_point,sergio_perez,1990-01-26,Mexican,125.0,1.0,125.0,1.0,195.0,1.0,194.0,1.0,True,False,False,False,False,30,0.788,109.718,0.51832,0.05759,0.34031,24.145,-0.727


### (2) Train, Test, Split

In [15]:
model = final_df
model = model.drop(['level_0', 'finish_sec', 'qualy_sec', 'lat', 'finish_ms', 
                    'race_date', 'long', 'dateOfBirth', 'points_earned', 
                    'points_after', 'wins_after', 'c_points_after', 'c_wins_after'], axis=1)
model = model[model['full_name'] != 'pietro_fittipaldi']
model.reset_index()
model['podium'] = model['podium'].apply(lambda x: 1 if (x < 11) else 0)

for column in ['warm_weather', 'cold_weather','dry_weather', 'wet_weather', 'cloudy_weather']:
    model[column] = model[column].apply(lambda x: float(x)) 

    
# Train, Test, Split

X_train = model[model['season'] != 2020].drop(columns = ['podium'])
y_train = model.loc[model['season'] != 2020, ['season', 'round', 'full_name', 'podium']]
X_test = model[model['season'] == 2020].drop(columns = ['podium'])
y_test = model.loc[(model['season'] == 2020), ['season', 'round', 'full_name', 'podium']]


# Set ML model index

X_train = X_train.set_index(['season', 'round', 'full_name'])
y_train = y_train.set_index(['season', 'round', 'full_name'])
X_test = X_test.set_index(['season', 'round', 'full_name'])
y_test = y_test.set_index(['season', 'round', 'full_name'])

numeric_features = ['grid_position', 'points_before', 'wins_before', 
                    'c_points_before', 'c_wins_before', 
                    'driver_age', 'cum_finish_ratio', 
                    'cum_accident_ratio', 'cum_lapped_ratio', 
                    'rolling_3_finish_sec', 'rolling_3_qualy_sec']
categorical_features = ['DNF_status', 'constructor', 'circuit_id', 'country', 'nationality']

display(X_test.head())
display(y_test.head())


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,circuit_id,country,grid_position,DNF_status,constructor,nationality,points_before,wins_before,c_points_before,c_wins_before,warm_weather,cold_weather,dry_weather,wet_weather,cloudy_weather,driver_age,cum_finish_ratio,cum_accident_ratio,cum_lapped_ratio,rolling_3_finish_sec,rolling_3_qualy_sec
season,round,full_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2020,1,valtteri_bottas,red_bull_ring,Austria,1,finished,mercedes,Finnish,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,30,0.76596,0.02837,0.13475,27.479,0.188
2020,1,charles_leclerc,red_bull_ring,Austria,7,finished,ferrari,Monegasque,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,22,0.5814,0.13953,0.2093,43.911,0.258
2020,1,lando_norris,red_bull_ring,Austria,3,finished,mclaren,British,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,20,0.40909,0.09091,0.31818,58.108,1.389
2020,1,lewis_hamilton,red_bull_ring,Austria,5,finished,mercedes,British,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,35,0.8583,0.05263,0.04049,5.096,0.163
2020,1,carlos_sainz,red_bull_ring,Austria,8,finished,mclaren,Spanish,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,25,0.38144,0.09278,0.39175,87.015,1.033


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,podium
season,round,full_name,Unnamed: 3_level_1
2020,1,valtteri_bottas,1
2020,1,charles_leclerc,1
2020,1,lando_norris,1
2020,1,lewis_hamilton,1
2020,1,carlos_sainz,1


### (3) Model Scaling and Model Prediction Functions

- I decided to use the `pipeline` feature from the `scikit-learn` package to create a sequence of scaling transformations and model-fitting operations on the dataset. Applying a transformer and model estimator separately (i.e. not using pipeline) will result in fitted training features being wrongly included in the test-fold of `GridSearchCV`. 


- According to the `pipeline` documentation, "pipelines help avoid leaking statistics from your test data into the trained model in cross-validation, by ensuring that the same samples are used to train the transformers and predictors."


- In laymen’s terms, if you separate feature scaling and model-fitting functions while using `GridSearchCV`, you will be creating a biased testing dataset that already contains information about the training set — not good. 

In [75]:
# Formalize transformer objects for scaling the numerical features and one hot encoding the categorical features
prediction_scorecard = {'model':[],
                        'accuracy_score':[],
                        'precision_score':[],
                        'recall_score':[],
                        'best_params':[]}

def prediction_model(model_type, model_id):
    # Scale numeric features using 'StandardScaler' and 'One-Hot Encode' categorical features
    scoring = ['neg_log_loss', 'accuracy']
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown = 'ignore'))])
    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                                   ('cat', categorical_transformer, categorical_features)])
    pipeline = Pipeline(steps=[('prep', preprocessor), 
                               (model_id, model_type)])
    return pipeline


def model_results(X_test, model, model_id):
    # Predict!
    pred = model.predict(X_test)
    pred_proba = model.predict_proba(X_test)
    df_pred = pd.DataFrame(np.around(pred_proba, 4), index=X_test.index, columns=['prob_0', 'prob_1'])
    df_pred['prediction'] = list(pred)
    df_pred['actual'] = y_test['podium']
    df_pred['grid_position'] = X_test['grid_position']

    # Include row if an 'actual' or 'predicted' podium occured for calculating accuracy
    df_pred['sort'] = df_pred['prediction'] + df_pred['actual']
    df_pred = df_pred[df_pred['sort'] > 0]
    df_pred.reset_index(inplace=True)
    df_pred = df_pred.groupby(['round']).apply(pd.DataFrame.sort_values, 'prob_1', ascending=False)
    df_pred.drop(['sort'], axis=1, inplace=True)
    df_pred.reset_index(drop=True, inplace=True) 
    
    # Save Accuracy, Precision, 
    prediction_scorecard['model'].append(model_id)
    prediction_scorecard['accuracy_score'].append(accuracy_score(df_pred['actual'], df_pred['prediction']))
    prediction_scorecard['precision_score'].append(precision_score(df_pred['actual'], df_pred['prediction']))
    prediction_scorecard['recall_score'].append(recall_score(df_pred['actual'], df_pred['prediction']))
    prediction_scorecard['best_params'].append(str(model.best_params_))
    display(df_pred.head(10))


### (4) Machine Learning Algorithms

In [76]:
# Logistic Regression
scoring = ['neg_log_loss', 'accuracy']
lr_params = {'lr__solver' : ['liblinear', 'lbfgs', 'newton-cg'],
             'lr__penalty': ['l1', 'l2'],
             'lr__C': [.00001, .0001, .001, .01, .05, 0.1],
             'lr__class_weight': [None]}

lr_cv = GridSearchCV(prediction_model(LogisticRegression(max_iter=10000), 'lr'), 
                     param_grid=lr_params, 
                     cv=5, 
                     scoring=scoring, 
                     refit='neg_log_loss',  
                     verbose=1)
# Train Model
lr_cv.fit(X_train, y_train)

# Test Model
model_results(X_test, lr_cv, 'Logistic Regression')
pd.set_option('display.max_colwidth', 0)
display(pd.DataFrame(prediction_scorecard))


Fitting 5 folds for each of 36 candidates, totalling 180 fits


Unnamed: 0,season,round,full_name,prob_0,prob_1,prediction,actual,grid_position
0,2020,1,valtteri_bottas,0.0274,0.9726,1,1,1
1,2020,1,lando_norris,0.0313,0.9687,1,1,3
2,2020,1,lewis_hamilton,0.0381,0.9619,1,1,5
3,2020,1,sergio_perez,0.067,0.933,1,1,6
4,2020,1,charles_leclerc,0.0701,0.9299,1,1,7
5,2020,1,carlos_sainz,0.0997,0.9003,1,1,8
6,2020,1,pierre_gasly,0.1134,0.8866,1,1,12
7,2020,1,sebastian_vettel,0.15,0.85,1,1,11
8,2020,1,esteban_ocon,0.2266,0.7734,1,1,14
9,2020,1,antonio_giovinazzi,0.3468,0.6532,1,1,18


Unnamed: 0,model,accuracy_score,precision_score,recall_score,best_params
0,Logistic Regression,0.75576,0.769953,0.97619,"{'lr__C': 0.1, 'lr__class_weight': None, 'lr__penalty': 'l1', 'lr__solver': 'liblinear'}"


In [78]:
# Support Vector Machines
svm_params= {'svm__C': [0.1, 0.01, 0.001],
             'svm__kernel': ['linear', 'poly', 'rbf'],
             'svm__degree': [1, 2, 3],
             'svm__gamma': [0.1, 0.01, 0.001]}

svm_cv = GridSearchCV(prediction_model(SVC(probability=True), 'svm'),
                      param_grid=svm_params,
                      scoring=scoring, 
                      refit='neg_log_loss',  
                      verbose=10)

# Train Model
svm_cv.fit(X_train, y_train)

# Test Model
model_results(X_test, svm_cv, 'Support Vector Machines')
display(pd.DataFrame(prediction_scorecard))


Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,season,round,full_name,prob_0,prob_1,prediction,actual,grid_position
0,2020,1,valtteri_bottas,0.0178,0.9822,1,1,1
1,2020,1,lando_norris,0.0227,0.9773,1,1,3
2,2020,1,lewis_hamilton,0.0237,0.9763,1,1,5
3,2020,1,charles_leclerc,0.0604,0.9396,1,1,7
4,2020,1,sergio_perez,0.063,0.937,1,1,6
5,2020,1,pierre_gasly,0.0949,0.9051,1,1,12
6,2020,1,carlos_sainz,0.1018,0.8982,1,1,8
7,2020,1,sebastian_vettel,0.1038,0.8962,1,1,11
8,2020,1,esteban_ocon,0.2006,0.7994,1,1,14
9,2020,1,antonio_giovinazzi,0.4036,0.5964,1,1,18


Unnamed: 0,model,accuracy_score,precision_score,recall_score,best_params
0,Logistic Regression,0.75576,0.769953,0.97619,"{'lr__C': 0.1, 'lr__class_weight': None, 'lr__penalty': 'l1', 'lr__solver': 'liblinear'}"
1,Support Vector Machines,0.77619,0.795122,0.970238,"{'svm__C': 0.01, 'svm__degree': 3, 'svm__gamma': 0.01, 'svm__kernel': 'linear'}"


In [79]:
# Decision Tree Classifier
dt_params = {'dt__criterion': ['gini', 'entropy'],
             'dt__max_depth': [6,8,10,12,14,16,18,20,22,24,26,28],
             'dt__min_samples_leaf': [2,3,4,5,6,7,8],
             'dt__min_samples_split': [2,3,4,5,6,7,8]}

dt_cv = GridSearchCV(prediction_model(DecisionTreeClassifier(), 'dt'), 
                     param_grid=dt_params, 
                     cv=5, 
                     scoring=scoring, 
                     refit='neg_log_loss', 
                     verbose=10)

# Train Model
dt_cv.fit(X_train, y_train)

# Test Model
model_results(X_test, dt_cv, 'Decision Tree Classifier')
display(pd.DataFrame(prediction_scorecard))


Fitting 5 folds for each of 36 candidates, totalling 180 fits


Unnamed: 0,season,round,full_name,prob_0,prob_1,prediction,actual,grid_position
0,2020,1,valtteri_bottas,0.008,0.992,1,1,1
1,2020,1,lando_norris,0.008,0.992,1,1,3
2,2020,1,lewis_hamilton,0.008,0.992,1,1,5
3,2020,1,sergio_perez,0.008,0.992,1,1,6
4,2020,1,charles_leclerc,0.0577,0.9423,1,1,7
5,2020,1,carlos_sainz,0.0577,0.9423,1,1,8
6,2020,1,nicholas_latifi,0.1489,0.8511,1,0,20
7,2020,1,pierre_gasly,0.1502,0.8498,1,1,12
8,2020,1,sebastian_vettel,0.1502,0.8498,1,1,11
9,2020,1,esteban_ocon,0.4839,0.5161,1,1,14


Unnamed: 0,model,accuracy_score,precision_score,recall_score,best_params
0,Logistic Regression,0.75576,0.769953,0.97619,"{'lr__C': 0.1, 'lr__class_weight': None, 'lr__penalty': 'l1', 'lr__solver': 'liblinear'}"
1,Support Vector Machines,0.77619,0.795122,0.970238,"{'svm__C': 0.01, 'svm__degree': 3, 'svm__gamma': 0.01, 'svm__kernel': 'linear'}"
2,Decision Tree Classifier,0.725664,0.738739,0.97619,"{'dt__criterion': 'gini', 'dt__max_depth': 6, 'dt__min_samples_leaf': 4, 'dt__min_samples_split': 3}"


In [85]:
# Random Forest Classifier

rfc_params = {'rfc__bootstrap': [True, False],
              'rfc__max_depth': [8, 10, 12, 14, 16, 18, 20, 22, 24],
              'rfc__max_features': ['auto', 'sqrt'],
              'rfc__min_samples_leaf': [2, 4, 6],
              'rfc__min_samples_split': [2, 4, 6],
              'rfc__n_estimators': [64, 128, 256, 512]}

rfc_cv = GridSearchCV(prediction_model(RandomForestClassifier(), 'rfc'), 
                      param_grid=rfc_params, 
                      cv=5, 
                      scoring=scoring, 
                      refit='neg_log_loss', 
                      verbose=10)

# Train Model
rfc_cv.fit(X_train, y_train)

# Test Model
model_results(X_test, rfc_cv, 'Random Forest Classifier')

from IPython.display import display, HTML

def wrap_df_text(df):
    return display(HTML(df.to_html().replace("\\n","<br>")))

df = pd.DataFrame(prediction_scorecard)
df['best_params'] = df['best_params'].str.wrap(30)
wrap_df_text(df)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,season,round,full_name,prob_0,prob_1,prediction,actual,grid_position
0,2020,1,lewis_hamilton,0.0564,0.9436,1,1,5
1,2020,1,valtteri_bottas,0.0592,0.9408,1,1,1
2,2020,1,charles_leclerc,0.0771,0.9229,1,1,7
3,2020,1,sebastian_vettel,0.0785,0.9215,1,1,11
4,2020,1,lando_norris,0.0987,0.9013,1,1,3
5,2020,1,carlos_sainz,0.1247,0.8753,1,1,8
6,2020,1,sergio_perez,0.1415,0.8585,1,1,6
7,2020,1,pierre_gasly,0.2095,0.7905,1,1,12
8,2020,1,nicholas_latifi,0.4067,0.5933,1,0,20
9,2020,1,esteban_ocon,0.4366,0.5634,1,1,14


Unnamed: 0,model,accuracy_score,precision_score,recall_score,best_params
0,Logistic Regression,0.75576,0.769953,0.97619,"{'lr__C': 0.1, 'lr__class_weight': None, 'lr__penalty': 'l1', 'lr__solver': 'liblinear'}"
1,Support Vector Machines,0.77619,0.795122,0.970238,"{'svm__C': 0.01, 'svm__degree': 3, 'svm__gamma': 0.01, 'svm__kernel': 'linear'}"
2,Decision Tree Classifier,0.725664,0.738739,0.97619,"{'dt__criterion': 'gini', 'dt__max_depth': 6, 'dt__min_samples_leaf': 4, 'dt__min_samples_split': 3}"
3,Random Forest Classifier,0.769953,0.784689,0.97619,"{'rfc__bootstrap': False, 'rfc__max_depth': 22, 'rfc__max_features': 'sqrt', 'rfc__min_samples_leaf': 2, 'rfc__min_samples_split': 6, 'rfc__n_estimators': 256}"
4,Random Forest Classifier,0.773585,0.788462,0.97619,"{'rfc__bootstrap': False, 'rfc__max_depth': 22, 'rfc__max_features': 'sqrt', 'rfc__min_samples_leaf': 2, 'rfc__min_samples_split': 6, 'rfc__n_estimators': 256}"


In [None]:
# Neural Network

nn_params = {'nn__epochs': [8],
             'nn__optimizer' : ['Adam'], 
             'nn__activation' : ['hard_sigmoid','relu'],
             'nn__neurons' : [12],
             'nn__weight_constraint': [1,3],
             'nn__dropout_rate' : [0.3,0.6]}

def build_model(optimizer='adam', activation='relu', neurons=8, learn_rate=0.01, dropout_rate=0.0, weight_constraint=0):
    model = Sequential()
    model.add(Dense(neurons, activation=activation, input_dim=187, kernel_constraint=maxnorm(weight_constraint)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(4, activation=activation))
    model.add(Dense(1, activation = 'relu'))
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

keras_model = KerasClassifier(build_fn=build_model, verbose=1)
nn_cv = GridSearchCV(prediction_model(keras_model, 'nn'), 
                     param_grid=nn_params, 
                     cv=3, 
                     scoring=scoring, 
                     refit='neg_log_loss', 
                     verbose=1)

# Train Model
nn_cv.fit(X_train, y_train)

# Test Model
model_results(X_test, nn_cv, 'Neural Network')
display(pd.DataFrame(prediction_scorecard))
