# ML Model Horse Racing Predictions
#### Create an ML model that can predict whether a horse has a good chance of winning a race, based on the horse's features

## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
cleaned_data_df = pd.read_csv("Data_Racing/International_Data/cleaned_data_df.csv")
cleaned_data_df.head()



2023-07-19 19:08:02.929270: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0.1,Unnamed: 0,ages,metric,ncond,age,saddle,decimalPrice,isFav,RPR,TR,OR,father,weight,res_win
0,1,4yo+,3218.0,10,6.0,7.0,0.444444,1,96.0,89.0,110.0,Getaway,65,0.0
1,26,5yo+,3218.0,5,8.0,1.0,0.266667,1,115.0,33.0,109.0,Beneficial,73,0.0
2,27,5yo+,3218.0,5,9.0,2.0,0.125,0,108.0,29.0,103.0,Beneficial,68,0.0
3,43,4yo+,4827.0,6,7.0,13.0,0.076923,0,95.0,39.0,83.0,Kalanisi,63,1.0
4,44,4yo+,4827.0,6,8.0,2.0,0.076923,0,119.0,64.0,108.0,Ask,71,0.0


In [2]:
cleaned_data_df = cleaned_data_df.drop(columns = "Unnamed: 0")

In [3]:
# Determine the number of unique values in each column.
cleaned_data_df.nunique()

ages             16
metric           52
ncond            13
age              14
saddle           29
decimalPrice     79
isFav             2
RPR             165
TR              157
OR              149
father          327
weight           32
res_win           2
dtype: int64

## Select equal res_win values

In [4]:
# Look at value counts for binning
res_win_counts = cleaned_data_df["res_win"].value_counts()
class_count = res_win_counts.min()
class_count

3169

In [5]:
res_win_equal = pd.concat([cleaned_data_df[cleaned_data_df.res_win==0].sample(n=class_count), 
                           cleaned_data_df[cleaned_data_df.res_win==1].sample(n=class_count)])
res_win_equal = res_win_equal.sort_index()
res_win_equal

Unnamed: 0,ages,metric,ncond,age,saddle,decimalPrice,isFav,RPR,TR,OR,father,weight,res_win
3,4yo+,4827.0,6,7.0,13.0,0.076923,0,95.0,39.0,83.0,Kalanisi,63,1.0
10,4yo+,1005.0,0,5.0,4.0,0.047619,0,91.0,69.0,83.0,Showcasing,58,1.0
16,5yo+,4122.5,10,9.0,1.0,0.090909,0,150.0,56.0,140.0,Dylan Thomas,73,1.0
18,5yo+,4122.5,10,8.0,11.0,0.105263,0,135.0,40.0,127.0,Fair Mix,69,0.0
21,5yo+,5832.0,10,9.0,4.0,0.222222,0,113.0,39.0,102.0,September Storm,74,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24897,3yo+,1609.0,0,6.0,9.0,0.181818,0,90.0,41.0,82.0,Rio De La Plata,56,1.0
24905,4yo+,3821.0,5,6.0,2.0,0.222222,0,134.0,40.0,135.0,Jeremy,69,1.0
24913,4yo+,4022.0,5,7.0,5.0,0.153846,0,144.0,82.0,129.0,Poliglote,70,1.0
24919,4yo+,4725.5,9,8.0,1.0,0.076923,0,141.0,3.0,139.0,Shantou,75,1.0


## Convert and Standardize Data

In [25]:
# Convert categorical data to numeric with `pd.get_dummies`
converted_data = pd.get_dummies(res_win_equal)
converted_data.head()

Unnamed: 0,metric,ncond,age,saddle,decimalPrice,isFav,RPR,TR,OR,weight,...,father_War Front,father_Well Chosen,father_Westerner,father_Whipper,father_Winged Love,father_Wootton Bassett,father_Yeats,father_Youmzain,father_Zebedee,father_Zoffany
3,4827.0,6,7.0,13.0,0.076923,0,95.0,39.0,83.0,63,...,0,0,0,0,0,0,0,0,0,0
10,1005.0,0,5.0,4.0,0.047619,0,91.0,69.0,83.0,58,...,0,0,0,0,0,0,0,0,0,0
16,4122.5,10,9.0,1.0,0.090909,0,150.0,56.0,140.0,73,...,0,0,0,0,0,0,0,0,0,0
18,4122.5,10,8.0,11.0,0.105263,0,135.0,40.0,127.0,69,...,0,0,0,0,0,0,0,0,0,0
21,5832.0,10,9.0,4.0,0.222222,0,113.0,39.0,102.0,74,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Split our preprocessed data into our features and target arrays
y = converted_data['res_win']
X = converted_data.drop(columns=['res_win'])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [27]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [28]:
input_features = len(X_train_scaled[0])
input_features

337

## Compile, Train and Evaluate the Model

In [29]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = len(X_train_scaled[0])
neurons_hidden_layer1 = 356
neurons_hidden_layer2 = 268
neurons_hidden_layer3 = 176

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = neurons_hidden_layer1, activation="relu", input_dim = input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = neurons_hidden_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units = neurons_hidden_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 356)               120328    
                                                                 
 dense_5 (Dense)             (None, 268)               95676     
                                                                 
 dense_6 (Dense)             (None, 176)               47344     
                                                                 
 dense_7 (Dense)             (None, 1)                 177       
                                                                 
Total params: 263,525
Trainable params: 263,525
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [31]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs = 150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


In [32]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

50/50 - 0s - loss: 1.3882 - accuracy: 0.7836 - 101ms/epoch - 2ms/step
Loss: 1.3882046937942505, Accuracy: 0.7835962176322937


In [33]:
(y_test == 0).mean()

0.5072555205047319

In [34]:
# Make predictions on the test data
y_pred = nn.predict(X_test_scaled)
y_pred_binary = (y_pred > 0.5).astype(int)



In [35]:
from sklearn.metrics import precision_score

precision = precision_score(y_test, y_pred_binary)
print("Precision:", precision)

Precision: 0.7765151515151515


In [36]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_binary))

              precision    recall  f1-score   support

         0.0       0.79      0.78      0.79       804
         1.0       0.78      0.79      0.78       781

    accuracy                           0.78      1585
   macro avg       0.78      0.78      0.78      1585
weighted avg       0.78      0.78      0.78      1585



In [18]:
# Export our model to HDF5 file
# nn.save('h5_files/AlphabetSoup_Model4.h5')