In [155]:
import tensorflow as tf
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping

# Clean and Preprocess Data for Machine Learning
import numpy as np
import pandas as pd

In [156]:
#Import and create DataFrames
kderby_stats = pd.read_csv('K_Derby_Stats_2019-2008.csv')
kderby_placing_order = pd.read_csv('Kentucky_Derby_Placing_Order2008-2019.csv')

In [157]:
kderby_stats_df = pd.DataFrame(kderby_stats) 
kderby_placing_order_df = pd.DataFrame(kderby_placing_order) 

In [158]:
# Merge CSV's to create one dataframe 
merge_tables = pd.merge(kderby_stats_df, kderby_placing_order_df, how='left', left_on=['Horse','Year'], right_on = ['Horse','Year'])
merge_tables.head()

Unnamed: 0,Year,Post_Position,WinnersatPost_last49yrs,ITM,Starters,ITM%atPost_last49yrs,Horse,Morning _Line,Derby_Points,Trainer,...,Dam,Dams_Sire,Tomlinson_DST,Tomlinson_Wet,BRIS_Pace,AWD_Sire,AWD_Dams_Sire,LastPrep_Finish,Dosage_Index,Finish_Position
0,2019,1,1,6.0,47.0,13%,War of Will,15-1,60,Casse,...,Visions of Clarity,Sadler's Wells,370.0,329.0,117.0,7.3,10.7,"9th, Louisiana Derby",1.72,7.0
1,2019,2,4,12.0,49.0,24%,Tax,20-1,52,Gargan,...,Toll,Giant's Causeway,338.0,405.0,116.0,7.9,8.1,"2nd, Wood Memorial",1.56,14.0
2,2019,3,4,11.0,49.0,22%,By My Standards,15-1,100,Calhoun,...,A Jealous Woman,Muqtarib,252.0,292.0,104.0,6.3,5.8,"1st, Louisiana Derby",3.0,11.0
3,2019,4,6,9.0,49.0,18%,Gray Magician,50-1,41,Miller,...,Burg Berg,Johannesburg,227.0,393.0,105.0,6.7,7.0,"2nd, UAE Derby",5.0,19.0
4,2019,5,6,13.0,49.0,27%,Improbable,1-May,65,Baffert,...,Rare Event,A.P. Indy,308.0,443.0,112.0,6.5,8.2,"2nd, Arkansas Derby",4.23,4.0


In [159]:
#Now create new dataframe with only the below columns 
merge_tables_clean = merge_tables.dropna(subset=["Post_Position", "WinnersatPost_last49yrs", "ITM", "Starters", "Finish_Position","Dosage_Index","Derby_Points"])
merge_tables_clean.head()



Unnamed: 0,Year,Post_Position,WinnersatPost_last49yrs,ITM,Starters,ITM%atPost_last49yrs,Horse,Morning _Line,Derby_Points,Trainer,...,Dam,Dams_Sire,Tomlinson_DST,Tomlinson_Wet,BRIS_Pace,AWD_Sire,AWD_Dams_Sire,LastPrep_Finish,Dosage_Index,Finish_Position
0,2019,1,1,6.0,47.0,13%,War of Will,15-1,60,Casse,...,Visions of Clarity,Sadler's Wells,370.0,329.0,117.0,7.3,10.7,"9th, Louisiana Derby",1.72,7.0
1,2019,2,4,12.0,49.0,24%,Tax,20-1,52,Gargan,...,Toll,Giant's Causeway,338.0,405.0,116.0,7.9,8.1,"2nd, Wood Memorial",1.56,14.0
2,2019,3,4,11.0,49.0,22%,By My Standards,15-1,100,Calhoun,...,A Jealous Woman,Muqtarib,252.0,292.0,104.0,6.3,5.8,"1st, Louisiana Derby",3.0,11.0
3,2019,4,6,9.0,49.0,18%,Gray Magician,50-1,41,Miller,...,Burg Berg,Johannesburg,227.0,393.0,105.0,6.7,7.0,"2nd, UAE Derby",5.0,19.0
4,2019,5,6,13.0,49.0,27%,Improbable,1-May,65,Baffert,...,Rare Event,A.P. Indy,308.0,443.0,112.0,6.5,8.2,"2nd, Arkansas Derby",4.23,4.0


In [160]:
#merge_tables_clean.isna

In [161]:
merge_tables_clean["Finish_Position"].unique()

array([ 7., 14., 11., 19.,  4., 12., 17.,  3.,  8., 10.,  2.,  9.,  5.,
       15., 16., 18.,  1.,  6., 20., 13.])

In [162]:
X = merge_tables_clean[["Post_Position", "WinnersatPost_last49yrs", "ITM", "Starters","Dosage_Index","Derby_Points"]]
y = merge_tables_clean["Finish_Position"]
print(X.shape, y.shape)

(119, 6) (119,)


In [163]:
# Dummy Encoding transforms each categorical feature into new columns with a 1 (True) or 0 (False) encoding 
# to represent if that categorical label was present or not in the original row.  
# Pandas provides a shortcut to create Binary Encoded data.
data_x = X.copy()
data_y = y.copy()

#encode only specific columns using: data_binary_encoded_x = pd.get_dummies(data, columns=["JOCKEY", "TRK"])
#encode all columns using:
data_binary_encoded_x = pd.get_dummies(data_x)
data_binary_encoded_y = pd.get_dummies(data_y)

#data_binary_encoded_x.head()
data_binary_encoded_y.head()

Unnamed: 0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [164]:
#One-Hot Encoding 
label_encoder = LabelEncoder()
label_encoder.fit(data_y)
encoded_y_train = label_encoder.transform(data_y)

#encoded_y_test = label_encoder.transform(y_test)
#y_train_categorical = to_categorical(encoded_y_train)
#y_test_categorical = to_categorical(encoded_y_test)

In [165]:
#One-Hot Encoding 
for label, original_class in zip(encoded_y_train, y):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: 7.0
Encoded Label: 6
------------
Original Class: 14.0
Encoded Label: 13
------------
Original Class: 11.0
Encoded Label: 10
------------
Original Class: 19.0
Encoded Label: 18
------------
Original Class: 4.0
Encoded Label: 3
------------
Original Class: 12.0
Encoded Label: 11
------------
Original Class: 17.0
Encoded Label: 16
------------
Original Class: 3.0
Encoded Label: 2
------------
Original Class: 8.0
Encoded Label: 7
------------
Original Class: 10.0
Encoded Label: 9
------------
Original Class: 2.0
Encoded Label: 1
------------
Original Class: 9.0
Encoded Label: 8
------------
Original Class: 5.0
Encoded Label: 4
------------
Original Class: 15.0
Encoded Label: 14
------------
Original Class: 16.0
Encoded Label: 15
------------
Original Class: 18.0
Encoded Label: 17
------------
Original Class: 1.0
Encoded Label: 0
------------
Original Class: 11.0
Encoded Label: 10
------------
Original Class: 16.0
Encoded Label: 15
------------
Original Class: 15.0
Encoded 

In [166]:
#One-Hot Encoding 
one_hot_y = to_categorical(encoded_y_train)
one_hot_y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]], dtype=float32)

In [167]:
from sklearn.model_selection import train_test_split

X_train = pd.get_dummies(data_binary_encoded_x)

X_train, X_test, y_train, y_test = train_test_split(data_binary_encoded_x, one_hot_y, random_state=42)

X_train.head()

Unnamed: 0,Post_Position,ITM,Starters,Dosage_Index,WinnersatPost_last49yrs_0,WinnersatPost_last49yrs_1,WinnersatPost_last49yrs_2,WinnersatPost_last49yrs_3,WinnersatPost_last49yrs_4,WinnersatPost_last49yrs_5,...,Derby_Points_63,Derby_Points_65,Derby_Points_70,Derby_Points_74,Derby_Points_76,Derby_Points_80,Derby_Points_84,Derby_Points_85,Derby_Points_90,Derby_Points_93
81,18,6.0,25.0,2.33,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28,8,11.0,48.0,2.71,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
46,5,11.0,47.0,5.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
70,7,4.0,46.0,2.53,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101,16,8.0,32.0,2.73,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [182]:
#from sklearn.preprocessing import StandardScaler
#X_scaler = StandardScaler().fit(X_train)
#y_scaler = StandardScaler().fit(y_train)

#X_train_scaled = X_scaler.transform(X_train)
#X_test_scaled = X_scaler.transform(X_test)
#y_train_scaled = y_scaler.transform(y_train)
#y_test_scaled = y_scaler.transform(y_test)

In [183]:
#from sklearn.linear_model import LinearRegression
#model = LinearRegression()
#model.fit(X_train_scaled, y_train_scaled)

In [184]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X, y)
rf.score(X, y)

1.0

In [185]:
feature_names = ["Post_Position", "WinnersatPost_last49yrs", "ITM", "Starters","Dosage_Index","Derby_Points"]
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.22351323246293084, 'Derby_Points'),
 (0.2164670513201554, 'Starters'),
 (0.20921523318567115, 'Dosage_Index'),
 (0.14661765056529702, 'Post_Position'),
 (0.12052774573741198, 'ITM'),
 (0.08365908672853363, 'WinnersatPost_last49yrs')]

In [186]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [187]:
print(f"Training Data Score: {model.score(X, y)}")

Training Data Score: 0.24369747899159663


In [190]:
#Validate the model using the test data
print(f"Training Data Score: {model.score(X_train_scaled, y_train_scaled)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test_scaled)}")

ValueError: X has 67 features per sample; expecting 6

In [42]:
#Make predictions
predictions = model.predict(X_test_scaled)
print(f"First 10 Predictions: {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions: [[ 0.06422197 -0.13083541 -0.10004461  0.04084667 -0.04534392 -0.00316948
  -0.04387434 -0.0208421   0.05687751 -0.04658879  0.00330326 -0.05900223
   0.06857613  0.01820834 -0.03544705 -0.00891484  0.02315322  0.12095939
   0.02159632  0.08570734]
 [ 0.04461851 -0.14575705 -0.10221902  0.02457426 -0.01468227  0.0283835
  -0.00256976 -0.02597062  0.10707795  0.01387378 -0.02809722 -0.08843974
   0.03788021 -0.01135899 -0.15879267 -0.03092603  0.04757856  0.18666919
   0.01637496  0.14406883]
 [ 0.1323912  -0.21153393 -0.17154601  0.08823824 -0.11131332 -0.03897022
  -0.12004656 -0.03077938  0.04560021 -0.14508097  0.03904093 -0.07131186
   0.15172135  0.06299809  0.06921214  0.00785152  0.01433091  0.14051023
   0.04306622  0.08704417]
 [-0.04011574  0.03619272  0.03534064 -0.02867199  0.04228479  0.02736576
   0.05057278  0.00348175  0.02054836  0.06748175 -0.02728882 -0.00252629
  -0.05093912 -0.03130142 -0.08921573 -0.01495932  0.0116765   0.00962828
  -0.01235

In [18]:
from sklearn.metrics import mean_squared_error

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 2.343605528877624e+22, R2: -2.2000155245081455e+22




In [19]:
# Save the model
import pickle
filename = 'finalized_winning_horsemodel.sav'
pickle.dump(model, open(filename, 'wb'))

In [20]:
# How to load the model from disk (perhaps at a later date when needed):   
 loaded_model = pickle.load(open(filename, 'rb'))
 result = loaded_model.score(X_test_scaled, y_test_scaled)
 print(result)

-2.2000155245081455e+22


