In [1]:
#%pip install scikit-learn
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from datetime import datetime

import warnings

# machine learning
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor)

In [2]:
df_train = pd.read_csv('horse_racing_training_data.csv')
df_test = pd.read_csv('horse_racing_holdout_data.csv')
combine = [df_train, df_test]

The data:
- race_url: a (made up) URL for the race run;
- X1: The pre race price for the horse that finished 1st;
- X2: The pre race price for the horse that finished 2nd;
- X3: The pre race price for the horse that finished 3rd;
- runners: How many horses were in the race;
- favourite: The price of the pre-race favourite of that race;
- jumps/flat: The type of course the race is on (whether there is fences for the horses to
jump over or not)
- places: number of places being offered in the race
- sf: the straight forecast price (your target variable)

In [3]:
print(df_train.columns.values)

['race_url' 'X1' 'X2' 'X3' 'runners' 'favourite' 'favourite_cloth_number'
 'jumps.flat' 'places' 'expert_opinion' 'sf']


In [4]:
print(df_train['race_url'].head(1))

0    https://www.fakehorsedata.com/racing/results/2...
Name: race_url, dtype: object


We can see that we can extract some features from the URL like the date, place, race_id, race_name

In [5]:
aux = df_train
df2 =aux['race_url'].str.split('/', expand = True, n = -1)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,https:,,www.fakehorsedata.com,racing,results,2019-03-13,charles-town,516636,race-2-allowance
1,https:,,www.fakehorsedata.com,racing,results,2019-03-01,delta-downs,515195,race-8-allowance
2,https:,,www.fakehorsedata.com,racing,results,2019-03-01,santa-anita,515185,race-2-allowance-optional-claiming
3,https:,,www.fakehorsedata.com,racing,results,2019-03-20,charles-town,517491,race-7-allowance
4,https:,,www.fakehorsedata.com,racing,results,2019-03-15,hawthorne,516919,race-3-starter-optional-claiming
...,...,...,...,...,...,...,...,...,...
3207,https:,,www.fakehorsedata.com,racing,results,2019-03-02,laurel-park,515369,race-2-maiden-claiming
3208,https:,,www.fakehorsedata.com,racing,results,2019-03-10,naas,515819,toalsbet-casino-mares-proam-flat-race
3209,https:,,www.fakehorsedata.com,racing,results,2019-02-17,huntingdon,512843,mansionbet-handicap-hurdle-div-2
3210,https:,,www.fakehorsedata.com,racing,results,2019-03-20,charles-town,517489,race-5-optional-claiming


In [6]:
df2 = df2.drop(columns=[0,1,2,3,4])
df2

Unnamed: 0,5,6,7,8
0,2019-03-13,charles-town,516636,race-2-allowance
1,2019-03-01,delta-downs,515195,race-8-allowance
2,2019-03-01,santa-anita,515185,race-2-allowance-optional-claiming
3,2019-03-20,charles-town,517491,race-7-allowance
4,2019-03-15,hawthorne,516919,race-3-starter-optional-claiming
...,...,...,...,...
3207,2019-03-02,laurel-park,515369,race-2-maiden-claiming
3208,2019-03-10,naas,515819,toalsbet-casino-mares-proam-flat-race
3209,2019-02-17,huntingdon,512843,mansionbet-handicap-hurdle-div-2
3210,2019-03-20,charles-town,517489,race-5-optional-claiming


In [7]:
df2.rename(columns={5: 'date', 6: 'place', 7: 'race_id', 8: 'race_name'}, inplace= True)
df2


Unnamed: 0,date,place,race_id,race_name
0,2019-03-13,charles-town,516636,race-2-allowance
1,2019-03-01,delta-downs,515195,race-8-allowance
2,2019-03-01,santa-anita,515185,race-2-allowance-optional-claiming
3,2019-03-20,charles-town,517491,race-7-allowance
4,2019-03-15,hawthorne,516919,race-3-starter-optional-claiming
...,...,...,...,...
3207,2019-03-02,laurel-park,515369,race-2-maiden-claiming
3208,2019-03-10,naas,515819,toalsbet-casino-mares-proam-flat-race
3209,2019-02-17,huntingdon,512843,mansionbet-handicap-hurdle-div-2
3210,2019-03-20,charles-town,517489,race-5-optional-claiming


We might get even more information and features about the 'race_name'. With the scoop of time we are not doing it.

In [8]:
print(len(df2['place'].unique()))
l_place =df2['place'].unique()
l_place

103


array(['charles-town', 'delta-downs', 'santa-anita', 'hawthorne',
       'market-rasen', 'turfway-park', 'laurel-park', 'kelso', 'vire',
       'sam-houston-race-park', 'dundalk', 'marseille', 'tampa-bay-downs',
       'meydan', 'catterick', 'penn-national', 'turf-paradise',
       'vincennes', 'golden-gate-fields', 'gulfstream', 'southwell',
       'sandown', 'caen', 'wolverhampton', 'newcastle', 'enghien',
       'mahoning-valley', 'jebel-ali', 'newbury', 'lyon-la-soie',
       'aqueduct', 'kempton', 'wincanton', 'stratford', 'chelmsford-city',
       'toulouse', 'angers', 'sunland-park', 'thurles', 'chepstow',
       'cagnes-sur-mer', 'leopardstown', 'mauquenchy', 'vaal', 'hereford',
       'fontainebleau', 'agen', 'amiens', 'ludlow', 'huntingdon',
       'taunton', 'chantilly', 'leicester', 'punchestown', 'plumpton',
       'cordemais', 'wetherby', 'wexford', 'neuss', 'le-croise-laroche',
       'carlisle', 'navan', 'uttoxeter', 'naas', 'down-royal',
       'mont-de-marsan', 'chelt

With more information, it would be nice to cluster the places to countries or regions. It could also be needed to go through a spelling code detector (to avoid different duplicates due to mispelling).

In [9]:
#One-hot encoding
df2_encoded = pd.get_dummies(df2['place'])
df2_encoded

Unnamed: 0,agen,amiens,angers,aqueduct,ascot,auteuil,ayr,bordeaux-le-bouscat,caen,cagnes-sur-mer,...,turfway-park,uttoxeter,vaal,vincennes,vire,warwick,wetherby,wexford,wincanton,wolverhampton
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3207,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3208,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3209,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3210,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
df2[['year', 'month', 'day']]= df2['date'].str.split('-', expand = True, n = -1)
df2

Unnamed: 0,date,place,race_id,race_name,year,month,day
0,2019-03-13,charles-town,516636,race-2-allowance,2019,03,13
1,2019-03-01,delta-downs,515195,race-8-allowance,2019,03,01
2,2019-03-01,santa-anita,515185,race-2-allowance-optional-claiming,2019,03,01
3,2019-03-20,charles-town,517491,race-7-allowance,2019,03,20
4,2019-03-15,hawthorne,516919,race-3-starter-optional-claiming,2019,03,15
...,...,...,...,...,...,...,...
3207,2019-03-02,laurel-park,515369,race-2-maiden-claiming,2019,03,02
3208,2019-03-10,naas,515819,toalsbet-casino-mares-proam-flat-race,2019,03,10
3209,2019-02-17,huntingdon,512843,mansionbet-handicap-hurdle-div-2,2019,02,17
3210,2019-03-20,charles-town,517489,race-5-optional-claiming,2019,03,20


In [11]:
df2['year'].unique()
# We do not add the year

array(['2019'], dtype=object)

In [12]:
df2['month']= df2['month'].astype(int)
df2['day']= df2['day'].astype(int)
df_date = df2.drop(columns=['date', 'place', 'race_id', 'race_name', 'year'])

In [13]:
aux['expert_opinion'].describe()

count    191.000000
mean      54.220105
std       88.935587
min        2.890000
25%       11.310000
50%       26.420000
75%       56.385000
max      773.010000
Name: expert_opinion, dtype: float64

We do not get the meaning of this variable. And we get very few in the training dataset 191/3212. Filling the NA values would be practically random. Therefore, we drop this variable. The cloth number of the favourite should be independent of the target variable (from a logical point of view and we could corraborate this). We drop also this. We change the categorical variable 'jumps.falt' to a boolean one (0,1) being 1 if it contains jumps.

In [14]:
aux['jumps.flat'].unique()

array(['Flat', 'Jumps'], dtype=object)

In [15]:
aux['jumps'] = (aux['jumps.flat']=='Jumps')
aux['jumps']

0       False
1       False
2       False
3       False
4       False
        ...  
3207    False
3208    False
3209     True
3210    False
3211     True
Name: jumps, Length: 3212, dtype: bool

In [16]:
l_var = ['race_url' 'X1' 'X2' 'X3' 'runners' 'favourite' 'favourite_cloth_number'
 'jumps.flat' 'places' 'expert_opinion' 'sf']
Y = aux['sf']
aux = aux.drop(columns=['race_url', 'favourite_cloth_number', 'jumps.flat', 'expert_opinion', 'sf'])

X = pd.concat([aux, df2_encoded, df_date], axis = 1)
sorted_col = sorted(X.columns)
X = X[sorted_col]
X

Unnamed: 0,X1,X2,X3,agen,amiens,angers,aqueduct,ascot,auteuil,ayr,...,turfway-park,uttoxeter,vaal,vincennes,vire,warwick,wetherby,wexford,wincanton,wolverhampton
0,3.750000,29.000000,10.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,4.500000,2.750000,13.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,4.000000,1.727273,8.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1.727273,7.500000,5.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2.500000,6.500000,6.500000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3207,4.000000,19.000000,2.375000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3208,1.909091,26.000000,4.333333,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3209,8.500000,6.000000,11.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3210,1.250000,15.000000,12.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
X.isnull().sum()

X1               0
X2               0
X3               0
agen             0
amiens           0
                ..
warwick          0
wetherby         0
wexford          0
wincanton        0
wolverhampton    0
Length: 112, dtype: int64

In [18]:
X = X.fillna(X.mean())

In [19]:
Y.describe()

count    3212.000000
mean       44.820109
std        87.624639
min         1.350000
25%         9.375000
50%        20.215000
75%        46.867500
max      2483.900000
Name: sf, dtype: float64

In [20]:
Y_discrete = pd.qcut(Y, q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])

In [21]:
# Classification
#Some examples. And we could try to find the best hyper-parameters
models={
    'Logistic Regression': LogisticRegression(),
    'SVC': SVC(),
    '3-KNN': KNeighborsClassifier(n_neighbors=3),
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear SVC': LinearSVC(),
    'SGDClassifier': SGDClassifier(),
    'Random Forest': RandomForestClassifier()
}

skf = StratifiedKFold(n_splits =5, shuffle= True, random_state= 35)

for name, model in models.items():
    scores = cross_val_score(model, X, Y_discrete, cv=skf, scoring='accuracy')
    print(f'{name}: Mean accuracy= {scores.mean():.4f}, Std ={scores.std(): .4f}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression: Mean accuracy= 0.7407, Std = 0.0189
SVC: Mean accuracy= 0.8182, Std = 0.0100
3-KNN: Mean accuracy= 0.7095, Std = 0.0142
Gaussian Naive Bayes: Mean accuracy= 0.3586, Std = 0.0106




Linear SVC: Mean accuracy= 0.6834, Std = 0.0245
SGDClassifier: Mean accuracy= 0.6221, Std = 0.0470
Random Forest: Mean accuracy= 0.8543, Std = 0.0133


In [22]:


# Regression
#Some examples. And we could try to find the best hyper-parameters
models={
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'KNN Regressor': KNeighborsRegressor(),
    'Decission Treee Regressor': DecisionTreeRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor(),
    'Adaboost Regressor': AdaBoostRegressor(),
    'Random Forest Regressor': RandomForestRegressor()
}

kf = KFold(n_splits =5, shuffle= True, random_state= 35)

for name, model in models.items():
    RMSE = np.sqrt((-1)* cross_val_score(model, X, Y, cv=kf, scoring='neg_mean_squared_error').mean())
    r2 = cross_val_score(model, X, Y, cv=kf, scoring='r2').mean()
    print(f'{name}: RMSE= {RMSE:.4f}, r2 ={r2: .4f}')


Linear Regression: RMSE= 51.1074, r2 = 0.6499
Ridge Regression: RMSE= 50.9041, r2 = 0.6533
Lasso Regression: RMSE= 50.2822, r2 = 0.6639
KNN Regressor: RMSE= 43.0971, r2 = 0.7935
Decission Treee Regressor: RMSE= 34.3193, r2 = 0.8733
Gradient Boosting Regressor: RMSE= 27.9989, r2 = 0.9279
Adaboost Regressor: RMSE= 55.0868, r2 = 0.5721
Random Forest Regressor: RMSE= 30.6735, r2 = 0.9113


We can see that the lowest RSME with highest r2 are Gradient Boosting Regressor, Random Forest Regressor and Decission Tree Regressor. We have two options:
- Try to tune the Gradient Boosting Regressor to find the best model.
- Try to tune a Decission Tree Regressor to get a good performance model easy to explain (if this were relevant).

In [23]:
GradientBoostingRegressor()

In [24]:
#tuning for number of trees
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators':range(20,1021,100),
              'max_depth':[10], #range(2,33,10), 
              'min_samples_split':[15], #range(2,33,10), 
              'learning_rate':[0.2]}
clf = GridSearchCV(GradientBoostingRegressor(random_state=1), 
                   param_grid = param_grid, scoring='r2', 
                   cv=kf).fit(X, Y)
print(clf.best_estimator_) 
print("R Squared:",clf.best_score_)

GradientBoostingRegressor(learning_rate=0.2, max_depth=10, min_samples_split=15,
                          n_estimators=520, random_state=1)
R Squared: 0.9277406445113294


In [25]:
#tuning the tree specific parameters
param_grid = {'n_estimators': [230],
              'max_depth': range(2,33,10), 
              'min_samples_split': range(2,33,10), 
              'learning_rate':[0.2]}
clf = GridSearchCV(GradientBoostingRegressor(random_state=1), 
                   param_grid = param_grid, scoring='r2', 
                   cv=kf).fit(X, Y)
print(clf.best_estimator_) 
print("R Squared:",clf.best_score_)

GradientBoostingRegressor(learning_rate=0.2, max_depth=2, n_estimators=230,
                          random_state=1)
R Squared: 0.9322479816839196


In [26]:
aux = df_test
df2 =aux['race_url'].str.split('/', expand = True, n = -1)
df2 = df2.drop(columns=[0,1,2,3,4])
df2.rename(columns={5: 'date', 6: 'place', 7: 'race_id', 8: 'race_name'}, inplace= True)
print(len(df2['place'].unique()))
nl_place = df2['place'].unique()

97


In [27]:
cont = 0
for x in nl_place:
    if x not in l_place:
        cont += 1

print(cont)

0


In [28]:
add_places = []
for x in l_place:
    if x not in nl_place:
        add_places.append(x)

print(add_places)

['stratford', 'chepstow', 'punchestown', 'krefeld', 'deauville', 'turffontein']


No problem of a new place in the new dataset from the training set.

In [29]:
df2_encoded = pd.get_dummies(df2['place'])
for x in add_places:
    df2_encoded[x]= False

In [30]:
aux['jumps'] = (aux['jumps.flat']=='Jumps')
aux = aux.drop(columns=['race_url', 'favourite_cloth_number', 'jumps.flat', 'expert_opinion'])

df2[['year', 'month', 'day']]= df2['date'].str.split('-', expand = True, n = -1)
df2['month']= df2['month'].astype(int)
df2['day']= df2['day'].astype(int)
df_date = df2.drop(columns=['date', 'place', 'race_id', 'race_name', 'year'])
X_test = pd.concat([aux, df2_encoded, df_date], axis = 1)

In [31]:
X_test

Unnamed: 0,X1,X2,X3,runners,favourite,places,jumps,agen,amiens,angers,...,wincanton,wolverhampton,stratford,chepstow,punchestown,krefeld,deauville,turffontein,month,day
0,3.750000,4.500000,3.50,7,3.500000,2,True,False,False,False,...,False,False,False,False,False,False,False,False,2,13
1,5.500000,1.250000,13.00,3,1.250000,0,True,False,False,False,...,False,False,False,False,False,False,False,False,2,13
2,4.500000,10.000000,6.00,8,3.500000,2,False,False,False,False,...,False,False,False,False,False,False,False,False,2,13
3,9.000000,4.333333,5.00,13,4.333333,3,False,False,False,False,...,False,False,False,False,False,False,False,False,2,13
4,2.375000,2.750000,7.00,5,2.375000,2,True,False,False,False,...,False,False,False,False,False,False,False,False,2,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
798,2.625000,4.000000,3.25,9,2.625000,3,False,False,False,False,...,False,False,False,False,False,False,False,False,3,20
799,2.375000,4.000000,17.00,8,2.375000,2,False,False,False,False,...,False,False,False,False,False,False,False,False,3,20
800,1.285714,13.000000,34.00,6,1.285714,2,False,False,False,False,...,False,False,False,False,False,False,False,False,3,20
801,2.250000,34.000000,2.50,9,2.250000,3,False,False,False,False,...,False,False,False,False,False,False,False,False,3,20


In [32]:
X_test.isnull().sum()
#X_test = X_test.fillna(X_test.mean())

X1             0
X2             0
X3             0
runners        0
favourite      0
              ..
krefeld        0
deauville      0
turffontein    0
month          0
day            0
Length: 112, dtype: int64

In [33]:
X_test = X_test[sorted_col]

In [34]:
clf = GradientBoostingRegressor().fit(X, Y)

out=clf.predict(X_test)
df_test['sf']=out
df_test['sf'].describe()

count     803.000000
mean       44.458039
std        89.741047
min        -3.015299
25%        10.721990
50%        20.884588
75%        47.124735
max      1752.676431
Name: sf, dtype: float64

In [35]:
df_train['sf'].describe()

count    3212.000000
mean       44.820109
std        87.624639
min         1.350000
25%         9.375000
50%        20.215000
75%        46.867500
max      2483.900000
Name: sf, dtype: float64

The test predictions follow a similar distribution that the training sample. Possible things to do:
- Scale the data (training and evaluating)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns = X.columns)
- Penalize negative outputs, forcing them to be positive (ie by an affine transformation)
- Fine-tuning hyperparameters of the model

In [36]:
clf = GradientBoostingRegressor(learning_rate=0.2, max_depth=2, n_estimators=520,
                          random_state=1).fit(X, Y)
r2 = cross_val_score(clf, X, Y, cv=kf, scoring='r2').mean()
print(f'New R2 is {r2: .4f}')

out=clf.predict(X_test)
df_test['sf_2']=out
df_test['sf_2'].describe()

New R2 is  0.9342


count     803.000000
mean       43.544429
std        80.827625
min        -8.461102
25%         9.794758
50%        19.437447
75%        46.659381
max      1319.535291
Name: sf_2, dtype: float64

Better R2 but larger negative numbers with this new model

In [37]:
df_test.to_csv('predictions.csv', index=False)