## Horse racing prediction  

This is an experiment in predicting the outcome of horse races from the past five race results, the jockey, and the trainer.



## Prepare data

In [1]:
# Prepare the MongoDB connection and select the collections used below.
import numpy as np
from pymongo import MongoClient

client = MongoClient()
db = client["keiba"]

# Collection that contains the training rows.
training_data = db["training_data_Kisyu_Kyusya_1_race_5_with_odds"]
# Collection that contains only the std and mean data.
data_models = db["data_models_Kisyu_Kyusya_1_race_5_with_odds"]

In [2]:
# Open a MongoDB cursor over the training collection using an empty filter.
# This shared cursor is what prepare_training_data() and
# prepare_training_data_binary() iterate further down.
all_data_cursor = training_data.find({})

In [3]:
# Number of training documents; it sizes the arrays allocated by the loaders.
# Cursor.count() is deprecated and missing from recent PyMongo releases, so
# prefer Collection.count_documents(). The check is made on the class because
# attribute access on a Collection *instance* returns a sub-collection for any
# unknown name, which would make a plain hasattr() always succeed.
if hasattr(type(training_data), "count_documents"):
    all_data_count = training_data.count_documents({})
else:
    # Older drivers only offer the cursor-level count.
    all_data_count = all_data_cursor.count()
print("all_data_count: {}".format(all_data_count))

all_data_count: 9245046


In [4]:
# Fetch one document from the statistics collection. Its 'mean_and_std' entry
# supplies the per-feature and target mean / std values used for normalization
# in the loaders below.
mean_and_std = data_models.find_one({})

In [6]:
# 
# get all data from mongodb and keep them as numpy.array
# target Y is float value
#
def prepare_training_data():
    """Load all training rows into normalized NumPy arrays (float target).

    Reads the notebook-level names ``all_data_cursor``, ``all_data_count`` and
    ``mean_and_std``. Every feature value ``x`` is stored as
    ``(x - mean) / std`` using the statistics named ``input_x_avg_<key>`` and
    ``input_x_std_<key>``; the target is normalized the same way with
    ``target_y_mean`` / ``target_y_stddev``. The feature key, converted with
    ``int()``, is also the column index. Features missing from a row stay 0.

    Returns:
        tuple: ``(input_X, target_Y)`` with shapes ``(all_data_count, 105)``
        and ``(all_data_count, 1)``, both float arrays.
    """
    # The cursor is shared notebook state and can only be consumed once.
    # Rewind it so this function yields real data even when another cell (or a
    # previous call) has already iterated it, instead of silently returning
    # all-zero arrays.
    all_data_cursor.rewind()

    # Loop-invariant lookups, done once instead of once per row.
    stats = mean_and_std['mean_and_std']
    y_mean_value = stats['target_y_mean']
    y_std_value = stats['target_y_stddev']

    input_X = np.zeros(shape=(all_data_count, 105), dtype=float)
    target_Y = np.zeros(shape=(all_data_count, 1), dtype=float)

    for row_idx, row in enumerate(all_data_cursor):

        # normalize x values
        for feature_key, raw_value in row['input_x_object'].items():
            mean_value = stats['input_x_avg_' + feature_key]
            std_value = stats['input_x_std_' + feature_key]
            input_X[row_idx, int(feature_key)] = (raw_value - mean_value) / std_value

        # normalize y value
        target_Y[row_idx] = (row['target_y'] - y_mean_value) / y_std_value

    return (input_X, target_Y)


In [7]:
# Actually build the float-target dataset: normalized inputs and normalized targets.
training_x, training_y = prepare_training_data()

In [9]:
# Persist the float-target dataset so later sessions can skip the MongoDB pass.
import pickle

float_dataset = (training_x, training_y)
with open('filename.pickle', 'wb') as pickle_file:
    pickle.dump(float_dataset, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [42]:
# 
# get all data from mongodb and keep them as numpy.array
# target y is a binary value, 0 or 1: 1 is a win, 0 is a loss
#

def prepare_training_data_binary():
    """Load all training rows into NumPy arrays with a binary target.

    Reads the notebook-level names ``all_data_cursor``, ``all_data_count`` and
    ``mean_and_std``. Every feature value ``x`` is stored as
    ``(x - mean) / std`` using the statistics named ``input_x_avg_<key>`` and
    ``input_x_std_<key>``; the feature key, converted with ``int()``, is also
    the column index. Features missing from a row stay 0. The target is 1 when
    the row's ``target_y`` is strictly positive and 0 otherwise.

    Returns:
        tuple: ``(input_X, target_Y)`` with shapes ``(all_data_count, 105)``
        and ``(all_data_count, 1)``, both float arrays.
    """
    # The cursor is shared notebook state and can only be consumed once.
    # Rewind it so this function still sees the data after another loader has
    # already iterated the cursor, instead of silently returning all zeros.
    all_data_cursor.rewind()

    # Loop-invariant lookup, done once instead of once per feature.
    stats = mean_and_std['mean_and_std']

    input_X = np.zeros(shape=(all_data_count, 105), dtype=float)
    target_Y = np.zeros(shape=(all_data_count, 1), dtype=float)

    for row_idx, row in enumerate(all_data_cursor):

        # normalize x values
        for feature_key, raw_value in row['input_x_object'].items():
            mean_value = stats['input_x_avg_' + feature_key]
            std_value = stats['input_x_std_' + feature_key]
            input_X[row_idx, int(feature_key)] = (raw_value - mean_value) / std_value

        # binarize y value: strictly positive -> 1, everything else -> 0
        target_Y[row_idx] = 1 if row['target_y'] > 0 else 0

    return (input_X, target_Y)


In [44]:
# Build the dataset whose target y is binary (1 when target_y > 0, else 0).
training_x_binary, training_y_binary = prepare_training_data_binary()

In [45]:
# Persist the binary-target dataset so later sessions can skip the MongoDB pass.
import pickle

binary_dataset = (training_x_binary, training_y_binary)
with open('filename_binary.pickle', 'wb') as pickle_file:
    pickle.dump(binary_dataset, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

## Restart from here

In [7]:
# Restore the float-target dataset (inputs and float y) from its pickle file.
import pickle

with open('filename.pickle', 'rb') as pickle_file:
    float_dataset = pickle.load(pickle_file)
training_x, training_y = float_dataset

In [5]:
# Restore the binary-target dataset (inputs and 0/1 y) from its pickle file.
import pickle

with open('filename_binary.pickle', 'rb') as pickle_file:
    binary_dataset = pickle.load(pickle_file)
training_x_binary, training_y_binary = binary_dataset

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the float-target dataset as a test set.
# The split is seeded so that re-running this cell after a kernel restart
# reproduces the same partition; with an unseeded split, a model restored from
# a checkpoint could end up being evaluated on rows it was trained on.
x_train, x_test, y_train, y_test = train_test_split(
    training_x, training_y, test_size=0.1, random_state=42)

## Create model 

In [7]:
# import dependancies

# allocate 50% of GPU memory (if you like, feel free to change this)
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf 

# GPU specific: limit this process to a 0.5 fraction of GPU memory and
# register the resulting session with Keras.
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.5))
set_session(tf.Session(config=config))

import keras
from keras import metrics, initializers

from keras_tqdm import TQDMNotebookCallback
from keras.layers import Dropout, Dense, LeakyReLU, BatchNormalization, Activation
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD, Adam, RMSprop




Using TensorFlow backend.


In [13]:
# model 1: three hidden Dense layers (128 -> 256 -> 128), each followed by a
# LeakyReLU, then one linear output unit. Dropout is currently not applied.
model_1 = Sequential()

for layer_index, units in enumerate((128, 256, 128)):
    if layer_index == 0:
        # only the first layer declares the 105-feature input shape
        model_1.add(Dense(units, input_shape=(105,), activation=None))
    else:
        model_1.add(Dense(units, activation=None))
    model_1.add(LeakyReLU(alpha=0.3))

# single linear output
model_1.add(Dense(1, activation=None))

model_1.compile(loss='mean_absolute_error',
                optimizer='rmsprop',
                metrics=[metrics.mae])

model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               13568     
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               33024     
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               32896     
_________________________________________________________________
leaky_re_lu_3 (LeakyReLU)    (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total para

In [14]:
# training model_1
import os

# add checkpointer: the model file is written under results/
save_model_name = "keiba_model_g1.h5"
# Create the output directory up front so the checkpoint write cannot fail on
# a missing folder after a long epoch has already been computed.
os.makedirs('results', exist_ok=True)
checkpointer = ModelCheckpoint(filepath='results/'+save_model_name, verbose=0)

# minibatch_size = 32

# steps_per_epoch = training_data_count // minibatch_size
# validation_steps = validation_data_count // minibatch_size


# Fit on the training split; 20% of it is set aside by Keras for validation.
model_1.fit(x=x_train, 
            y=y_train, 
            batch_size=64, 
            epochs=5, 
            verbose=1, 
            callbacks=[checkpointer],
            validation_split=0.2,
            shuffle=True)


# model_1.fit_generator(generator=data_generator(batch_size=minibatch_size, data_type='training'),
#                     steps_per_epoch=steps_per_epoch,
#                     validation_data=data_generator(batch_size=minibatch_size, data_type='validation'),
#                     validation_steps=validation_steps,
#                     epochs=20,
#                     callbacks=[checkpointer])

Train on 6656432 samples, validate on 1664109 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc7603d7940>

## evaluate model  
inference to probability

In [49]:
# modify y_test to binary data

# y_mean_value = mean_and_std['mean_and_std']['target_y_mean']
# y_std_value = mean_and_std['mean_and_std']['target_y_stddev']

# y1 = y_test * y_std_value + y_mean_value

# pred_normalized = model_1.predict(x_train)
# pred1 = pred_normalized * y_std_value + y_mean_value

# # multiply prediction and actuall value. if sing is same the result should be positive
# check_1 = pred1 * y1

# idx = 0
# for item in check_1:
#     if item * y_train[idx] > 0:
#         check_1[idx] = 1
#     else:
#         check_1[idx] = 0
    
#     idx = idx + 1


# accuracy1 = 100*np.sum(check_1) / len(check_1)
# print("accuracy1:{}".format(accuracy1))

# normalized_y = (y1 - y_mean_value) / y_std_value

# De-normalize targets and predictions for ten test rows (indices 2..11) and
# measure how often the prediction has the same sign as the actual value.
target_stats = mean_and_std['mean_and_std']
y_mean_value = target_stats['target_y_mean']
y_std_value = target_stats['target_y_stddev']

sample_x = x_test[2:12]
sample_y = y_test[2:12]

# actual values back in their original scale
y1 = sample_y * y_std_value + y_mean_value

# predictions, first normalized and then back in the original scale
pred_normalized = model_1.predict(sample_x)
pred1 = pred_normalized * y_std_value + y_mean_value

print("y_test[2:12]:{}".format(sample_y))
print("y1:{}".format(y1))
print("")
print("pred_normalized:{}".format(pred_normalized))
print("pred1:{}".format(pred1))

# a positive product means prediction and actual value share the same sign
aaa = pred1 * y1
print("aaa: {}".format(aaa))

# 1 where the signs agree, 0 otherwise (dtype of the product is kept)
aaa = (aaa > 0).astype(aaa.dtype)

print("aaa : {}".format(aaa))
print(len(aaa))

accuracy1 = 100*np.sum(aaa) / len(aaa)
print("accuracy1:{}".format(accuracy1))


y_test[2:12]:[[ 0.02262738]
 [ 0.08081207]
 [ 0.04525476]
 [-0.02262738]
 [ 0.08404455]
 [-0.00646497]
 [-0.00969745]
 [-0.04202228]
 [-0.00323248]
 [ 0.04525476]]
y1:[[  7.]
 [ 25.]
 [ 14.]
 [ -7.]
 [ 26.]
 [ -2.]
 [ -3.]
 [-13.]
 [ -1.]
 [ 14.]]

pred_normalized:[[-0.03939489]
 [-0.01960922]
 [-0.02069423]
 [-0.00197923]
 [ 0.23016065]
 [-0.00114011]
 [-0.01365444]
 [-0.01148138]
 [-0.01751973]
 [-0.0013935 ]]
pred1:[[-12.18719101]
 [ -6.06630373]
 [ -6.40196085]
 [ -0.61229342]
 [ 71.20243073]
 [ -0.35270301]
 [ -4.22413254]
 [ -3.55187583]
 [ -5.41989899]
 [ -0.43109414]]
aaa: [[ -8.53103371e+01]
 [ -1.51657593e+02]
 [ -8.96274519e+01]
 [  4.28605396e+00]
 [  1.85126320e+03]
 [  7.05406010e-01]
 [  1.26723976e+01]
 [  4.61743858e+01]
 [  5.41989899e+00]
 [ -6.03531796e+00]]
aaa : [[ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]]
10
accuracy1:60.0


## Model 2  


In [12]:
# model 2: 128 -> 256 -> 128 Dense stack with LeakyReLU activations and a
# sigmoid output, compiled as a binary classifier. The first layer uses small
# truncated-normal initializers; BatchNormalization and Dropout are not used.
model_2 = Sequential([
    Dense(128,
          input_shape=(105,),
          kernel_initializer=initializers.TruncatedNormal(mean=0.0, stddev=0.01, seed=None),
          bias_initializer=initializers.TruncatedNormal(mean=0.0, stddev=0.01, seed=None),
          activation=None),
    LeakyReLU(alpha=0.3),

    Dense(256, activation=None),
    LeakyReLU(alpha=0.3),

    Dense(128, activation=None),
    LeakyReLU(alpha=0.3),

    # single output unit squashed by a sigmoid
    Dense(1, activation=None),
    Activation('sigmoid'),
])

adam_optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model_2.compile(loss='binary_crossentropy',
                optimizer=adam_optimizer,
                metrics=[metrics.binary_accuracy])

model_2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 128)               13568     
_________________________________________________________________
leaky_re_lu_8 (LeakyReLU)    (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 256)               33024     
_________________________________________________________________
leaky_re_lu_9 (LeakyReLU)    (None, 256)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 128)               32896     
_________________________________________________________________
leaky_re_lu_10 (LeakyReLU)   (None, 128)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 129       
__________

In [14]:
# training model_2
import os

# add checkpointer: the model file is written under results/
save_model_name = "keiba_model_g2.h5"
# Create the output directory up front so the checkpoint write cannot fail on
# a missing folder after a long epoch has already been computed.
os.makedirs('results', exist_ok=True)
checkpointer = ModelCheckpoint(filepath='results/'+save_model_name, verbose=0)

# Fit on the full binary-target dataset (training_x_binary, training_y_binary);
# 20% of it is set aside by Keras for validation.
model_2.fit(x=training_x_binary, 
            y=training_y_binary, 
            batch_size=128, 
            epochs=20, 
            verbose=1, 
            callbacks=[checkpointer],
            validation_split=0.2,
            shuffle=True)

Train on 7396036 samples, validate on 1849010 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f5e44b43b00>

## other helper functions

In [5]:
# get batch as generator: not used here
def data_generator(batch_size, data_type):
    """Yield normalized ``(input_X, target_Y)`` mini-batches read from a cursor.

    Reads the notebook-level ``mean_and_std`` plus one of the cursors
    ``validation_data_cursor`` / ``training_data_cursor``. Every feature value
    ``x`` is stored as ``(x - mean) / std`` using the statistics named
    ``input_x_avg_<key>`` / ``input_x_std_<key>``; the target is normalized
    with ``target_y_mean`` / ``target_y_stddev``.

    Args:
        batch_size: number of rows in every yielded batch.
        data_type: ``'validation'`` reads from ``validation_data_cursor``;
            any other value reads from ``training_data_cursor``.

    Yields:
        tuple: ``(input_X, target_Y)`` float arrays of shape
        ``(batch_size, 105)`` and ``(batch_size, 1)``. One tuple is yielded
        per completely filled batch. The generator stops when the cursor is
        exhausted; a trailing partial batch is dropped.
    """
    if data_type == 'validation':
        cursor = validation_data_cursor
    else:
        cursor = training_data_cursor

    # Loop-invariant lookups, done once instead of once per row.
    stats = mean_and_std['mean_and_std']
    y_mean_value = stats['target_y_mean']
    y_std_value = stats['target_y_stddev']

    while True:

        # Fresh arrays for every batch: batches already handed to the consumer
        # are never mutated, and features absent from a row cannot inherit
        # stale values from the previous batch.
        input_X = np.zeros(shape=(batch_size, 105), dtype=float)
        target_Y = np.zeros(shape=(batch_size, 1), dtype=float)

        for idx1 in range(batch_size):

            # get one row
            try:
                data1 = next(cursor)
            except StopIteration:
                # End the generator cleanly; letting StopIteration escape a
                # generator body raises RuntimeError on Python 3.7+.
                return

            # normalize x values
            for idx2, x1 in data1['input_x_object'].items():
                mean_value = stats['input_x_avg_' + idx2]
                std_value = stats['input_x_std_' + idx2]
                input_X[idx1, int(idx2)] = (x1 - mean_value) / std_value

            # normalize y value
            target_Y[idx1] = (data1['target_y'] - y_mean_value) / y_std_value

        # Yield only once the whole batch has been filled.
        yield (input_X, target_Y)

In [13]:
# get batch (y is binary data) as generator: not used here
def data_generator_binary(batch_size, data_type):
    """Yield ``(input_X, target_Y)`` mini-batches with a binary target.

    Reads the notebook-level ``mean_and_std`` plus one of the cursors
    ``validation_data_cursor`` / ``training_data_cursor``. Every feature value
    ``x`` is stored as ``(x - mean) / std`` using the statistics named
    ``input_x_avg_<key>`` / ``input_x_std_<key>``. The target is 1 when the
    row's ``target_y`` is strictly positive and 0 otherwise.

    Args:
        batch_size: number of rows in every yielded batch.
        data_type: ``'validation'`` reads from ``validation_data_cursor``;
            any other value reads from ``training_data_cursor``.

    Yields:
        tuple: ``(input_X, target_Y)`` float arrays of shape
        ``(batch_size, 105)`` and ``(batch_size, 1)``. One tuple is yielded
        per completely filled batch. The generator stops when the cursor is
        exhausted; a trailing partial batch is dropped.
    """
    if data_type == 'validation':
        cursor = validation_data_cursor
    else:
        cursor = training_data_cursor

    # Loop-invariant lookup, done once instead of once per feature.
    stats = mean_and_std['mean_and_std']

    while True:

        # Fresh arrays for every batch: batches already handed to the consumer
        # are never mutated, and features absent from a row cannot inherit
        # stale values from the previous batch.
        input_X = np.zeros(shape=(batch_size, 105), dtype=float)
        target_Y = np.zeros(shape=(batch_size, 1), dtype=float)

        for idx1 in range(batch_size):

            # get one row
            try:
                data1 = next(cursor)
            except StopIteration:
                # End the generator cleanly; letting StopIteration escape a
                # generator body raises RuntimeError on Python 3.7+.
                return

            # normalize x values
            for idx2, x1 in data1['input_x_object'].items():
                mean_value = stats['input_x_avg_' + idx2]
                std_value = stats['input_x_std_' + idx2]
                input_X[idx1, int(idx2)] = (x1 - mean_value) / std_value

            # Binarize y with the same rule as prepare_training_data_binary():
            # only a strictly positive target_y is labelled 1.
            # NOTE(review): this used to be `>= 0`, which labelled a zero
            # target_y differently from the in-memory loader - confirm that
            # `> 0` is the intended definition.
            target_Y[idx1] = 1 if data1['target_y'] > 0 else 0

        # Yield only once the whole batch has been filled.
        yield (input_X, target_Y)
