## label as normalized order of finish based on number of horses in race
the label should be resricted to a 0 - 1 range taking into account the number of horses per race
so a horse that places 5th in a race of 10 horses gets a score of 0.5

$
\text{label} = 1 - \dfrac{(\text{finish order} - 1)}{(\text{number of horses in race} - 1)}
$

In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from IPython.display import display

sys.path.insert(0, '..')
np.random.seed(42)
pd.options.display.max_columns = None
%matplotlib inline

In [2]:
from scripts.utils import read_netkeiba

df = read_netkeiba()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240471 entries, 0 to 240470
Data columns (total 66 columns):
c_id                                      240471 non-null int64
c_weight_carried                          240471 non-null float64
c_post_position                           240471 non-null int64
c_order_of_finish                         240471 non-null int64
c_order_of_finish_lowered                 240471 non-null int64
c_finish_time                             240471 non-null float64
c_horse_weight                            240471 non-null int64
c_horse_weight_diff                       240471 non-null int64
c_popularity                              240471 non-null int64
c_first_place_odds                        240471 non-null float64
r_id                                      240471 non-null int64
r_key                                     240471 non-null object
r_racetrack                               240471 non-null object
r_course_type                             240471 

In [4]:
df.head()

Unnamed: 0,c_id,c_weight_carried,c_post_position,c_order_of_finish,c_order_of_finish_lowered,c_finish_time,c_horse_weight,c_horse_weight_diff,c_popularity,c_first_place_odds,r_id,r_key,r_racetrack,r_course_type,r_weather,r_url,r_distance,r_date,r_dirt_condition,r_turf_condition,r_impost_category,r_is_non_winner_regional_horse_allowed,r_is_winner_regional_horse_allowed,r_is_regional_jockey_allowed,r_is_foreign_horse_allowed,r_is_foreign_horse_and_trainer_allowed,r_is_apprentice_jockey_allowed,r_is_female_only,h_id,h_key,h_url,h_total_races,h_total_wins,h_sex,h_birthday,h_user_rating,j_id,j_key,j_url,j_career_1st_place_count,j_career_2nd_place_count,j_career_3rd_place_count,j_career_4th_place_or_below_count,j_career_turf_race_count,j_career_turf_win_count,j_career_dirt_race_count,j_career_dirt_win_count,j_career_1st_place_rate,j_career_1st_2nd_place_rate,j_career_any_place_rate,j_career_earnings,t_id,t_key,t_url,t_career_1st_place_count,t_career_2nd_place_count,t_career_3rd_place_count,t_career_4th_place_or_below_count,t_career_turf_race_count,t_career_turf_win_count,t_career_dirt_race_count,t_career_dirt_win_count,t_career_1st_place_rate,t_career_1st_2nd_place_rate,t_career_any_place_rate,t_career_earnings
0,1,54.0,7,1,0,109.9,514,-2,1,3.7,1,201803030101,fuma,dirt,sunny,http://db.netkeiba.com/race/201803030101/,1700,2018-11-03,good,,age_based,0,0,0,0,0,0,1,1,2016104408,http://db.netkeiba.com/horse/2016104408/,2,1,female,2016-04-03,3.17,1,1092,http://db.netkeiba.com/jockey/result/01092,423,477,565,6501,4156,222,3810,201,0.053,0.113,0.184,829574.0,1,359,http://db.netkeiba.com/trainer/result/00359,556,577,557,5185,3613,299,3052,241,0.081,0.165,0.246,892578.6
1,2,54.0,8,2,0,110.1,490,0,3,6.3,1,201803030101,fuma,dirt,sunny,http://db.netkeiba.com/race/201803030101/,1700,2018-11-03,good,,age_based,0,0,0,0,0,0,1,2,2016100908,http://db.netkeiba.com/horse/2016100908/,3,0,female,2016-02-07,3.25,2,1109,http://db.netkeiba.com/jockey/result/01109,142,182,189,3746,2091,74,2167,68,0.033,0.076,0.12,253625.0,2,1147,http://db.netkeiba.com/trainer/result/01147,92,84,81,866,508,36,537,45,0.082,0.157,0.229,136755.2
2,3,54.0,4,3,0,110.4,464,10,4,6.4,1,201803030101,fuma,dirt,sunny,http://db.netkeiba.com/race/201803030101/,1700,2018-11-03,good,,age_based,0,0,0,0,0,0,1,3,2016105892,http://db.netkeiba.com/horse/2016105892/,2,0,female,2016-04-29,,3,1128,http://db.netkeiba.com/jockey/result/01128,330,299,350,4005,2498,146,2486,184,0.066,0.126,0.196,521757.2,3,1006,http://db.netkeiba.com/trainer/result/01006,286,255,279,3530,1854,111,2237,154,0.066,0.124,0.189,454950.2
3,4,54.0,6,4,0,110.4,450,-6,2,4.5,1,201803030101,fuma,dirt,sunny,http://db.netkeiba.com/race/201803030101/,1700,2018-11-03,good,,age_based,0,0,0,0,0,0,1,4,2016106313,http://db.netkeiba.com/horse/2016106313/,3,0,female,2016-03-19,,4,1015,http://db.netkeiba.com/jockey/result/01015,461,528,542,7051,4271,197,4311,264,0.054,0.115,0.178,852023.5,4,1099,http://db.netkeiba.com/trainer/result/01099,269,246,289,2686,1831,136,1558,118,0.077,0.148,0.23,493068.1
4,5,54.0,6,5,0,110.5,426,-8,10,43.3,1,201803030101,fuma,dirt,sunny,http://db.netkeiba.com/race/201803030101/,1700,2018-11-03,good,,age_based,0,0,0,0,0,0,1,5,2016102321,http://db.netkeiba.com/horse/2016102321/,2,0,female,2016-05-21,,5,1117,http://db.netkeiba.com/jockey/result/01117,372,429,435,5399,3408,201,3227,171,0.056,0.121,0.186,660899.9,5,1131,http://db.netkeiba.com/trainer/result/01131,60,71,77,1541,703,16,1008,43,0.034,0.075,0.119,94999.5


In [5]:
df['r_contender_count'] = df.groupby('r_id').c_id.count().loc[df.r_id].values
df['c_norm_order_of_finish'] = 1.0 - (df.c_order_of_finish - 1) / (df.r_contender_count - 1)

index_cols = ['c_id', 'r_id', 'h_id', 'j_id', 't_id', 'r_key', 'r_url', 'h_key', 'h_url', 'j_key', 'j_url', 't_key', 't_url']
label_cols = ['c_norm_order_of_finish', 'c_order_of_finish', 'c_finish_time']

X = df.drop(columns=index_cols + label_cols)
y = df.c_norm_order_of_finish

X.shape, y.shape

((240471, 52), (240471,))

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, test_idx in sss.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((192376, 52), (192376,), (48095, 52), (48095,))

In [81]:
# fillna('')
# OneHotEncoder(handle_unknown='ignore')
cat_attribs = [
    'r_racetrack', 'r_course_type', 'r_weather', 'h_sex', 
    'r_impost_category', 'r_dirt_condition', 'r_turf_condition'
]

bool_attribs = [
    'r_is_non_winner_regional_horse_allowed', 'r_is_winner_regional_horse_allowed',
    'r_is_regional_jockey_allowed', 'r_is_foreign_horse_allowed',
    'r_is_foreign_horse_and_trainer_allowed', 'r_is_apprentice_jockey_allowed',
    'r_is_female_only', 'c_order_of_finish_lowered'
]

# CombinedDateAttributesAdder
date_attribs = [
    'r_date', 'h_birthday'
]

# Imputer(strategy='median')
# CombinedNumericAttributesAdder
# StandardScaler
num_attribs = [
    'c_weight_carried', 'c_post_position',
    'c_horse_weight', 'c_horse_weight_diff', 'c_popularity',
    'c_first_place_odds', 'r_distance', 'r_contender_count',
    'h_total_races', 'h_total_wins', 'h_user_rating',
    'j_career_1st_place_count', 'j_career_2nd_place_count',
    'j_career_3rd_place_count', 'j_career_4th_place_or_below_count',
    'j_career_turf_race_count', 'j_career_turf_win_count',
    'j_career_dirt_race_count', 'j_career_dirt_win_count',
    'j_career_1st_place_rate', 'j_career_1st_2nd_place_rate',
    'j_career_any_place_rate', 'j_career_earnings',
    't_career_1st_place_count', 't_career_2nd_place_count',
    't_career_3rd_place_count', 't_career_4th_place_or_below_count',
    't_career_turf_race_count', 't_career_turf_win_count',
    't_career_dirt_race_count', 't_career_dirt_win_count',
    't_career_1st_place_rate', 't_career_1st_2nd_place_rate',
    't_career_any_place_rate', 't_career_earnings'
]

In [88]:
from sklearn.base import BaseEstimator, TransformerMixin


class CombinedDateAttributesAdder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        df = pd.DataFrame(X)
        df['h_age_days'] = pd.to_datetime(df['r_date']) - pd.to_datetime(df['h_birthday'])
        return df.drop(columns=['r_date', 'h_birthday']).values

    
class CombinedNumericAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.j_turf_wins_ix = columns.index('j_career_turf_win_count')
        self.j_turf_races_ix = columns.index('j_career_turf_race_count')
        self.j_dirt_wins_ix = columns.index('j_career_dirt_win_count')
        self.j_dirt_races_ix = columns.index('j_career_dirt_race_count')
        self.t_turf_wins_ix = columns.index('t_career_turf_win_count')
        self.t_turf_races_ix = columns.index('t_career_turf_race_count')
        self.t_dirt_wins_ix = columns.index('t_career_dirt_win_count')
        self.t_dirt_races_ix = columns.index('t_career_dirt_race_count')
        self.h_total_wins_ix = columns.index('h_total_wins')
        self.h_total_races_ix = columns.index('h_total_races')
        
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        j_turf_wins, j_turf_races = X[:, self.j_turf_wins_ix], X[:, self.j_turf_races_ix]
        j_turf_win_rate = np.divide(j_turf_wins, j_turf_races, out=np.zeros_like(j_turf_wins), where=j_turf_wins!=0.)

        j_dirt_wins, j_dirt_races = X[:, self.j_dirt_wins_ix], X[:, self.j_dirt_races_ix]
        j_dirt_win_rate = np.divide(j_dirt_wins, j_dirt_races, out=np.zeros_like(j_dirt_wins), where=j_dirt_wins!=0.)

        t_turf_wins, t_turf_races = X[:, self.t_turf_wins_ix], X[:, self.t_turf_races_ix]
        t_turf_win_rate = np.divide(t_turf_wins, t_turf_races, out=np.zeros_like(t_turf_wins), where=t_turf_wins!=0.)

        t_dirt_wins, t_dirt_races = X[:, self.t_dirt_wins_ix], X[:, self.t_dirt_races_ix]
        t_dirt_win_rate = np.divide(t_dirt_wins, t_dirt_races, out=np.zeros_like(t_dirt_wins), where=t_dirt_wins!=0.)

        h_wins, h_races = X[:, self.h_total_wins_ix], X[:, self.h_total_races_ix]
        h_win_rate = np.divide(h_wins, h_races, out=np.zeros_like(h_wins), where=h_wins!=0.)

        return np.c_[X, j_turf_win_rate, j_dirt_win_rate, t_turf_win_rate, t_dirt_win_rate, h_win_rate]

In [83]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('num_attribs_adder', CombinedNumericAttributesAdder(num_attribs)),
    ('std_scaler', StandardScaler())
])

added_columns = ['j_career_turf_win_rate', 'j_career_dirt_win_rate', 
                 't_career_turf_win_rate', 't_career_dirt_win_rate', 'h_win_rate']

X_train_num = num_pipeline.fit_transform(X[num_attribs])
X_train_num_df = pd.DataFrame(X_train_num, columns=num_attribs+added_columns)
X_train_num_df.describe()

Unnamed: 0,c_weight_carried,c_post_position,c_horse_weight,c_horse_weight_diff,c_popularity,c_first_place_odds,r_distance,r_contender_count,h_total_races,h_total_wins,h_user_rating,j_career_1st_place_count,j_career_2nd_place_count,j_career_3rd_place_count,j_career_4th_place_or_below_count,j_career_turf_race_count,j_career_turf_win_count,j_career_dirt_race_count,j_career_dirt_win_count,j_career_1st_place_rate,j_career_1st_2nd_place_rate,j_career_any_place_rate,j_career_earnings,t_career_1st_place_count,t_career_2nd_place_count,t_career_3rd_place_count,t_career_4th_place_or_below_count,t_career_turf_race_count,t_career_turf_win_count,t_career_dirt_race_count,t_career_dirt_win_count,t_career_1st_place_rate,t_career_1st_2nd_place_rate,t_career_any_place_rate,t_career_earnings,j_career_turf_win_rate,j_career_dirt_win_rate,t_career_turf_win_rate,t_career_dirt_win_rate,h_win_rate
count,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0,240471.0
mean,1.794034e-15,-1.315475e-16,-7.453177e-16,-2.5293050000000003e-17,-9.266240000000001e-17,-8.923484000000001e-17,-1.954302e-16,-1.891069e-17,-4.5385670000000006e-17,6.429636000000001e-17,-2.69241e-16,-2.6002200000000002e-17,-6.736935e-17,8.734377e-17,6.80785e-17,2.17473e-17,4.2549060000000005e-17,3.640309e-17,-7.942492e-17,4.188719e-16,2.651043e-16,1.705508e-16,-9.289878000000001e-17,-5.578655000000001e-17,9.171687000000001e-17,1.132278e-16,-3.025711e-17,-4.3021830000000004e-17,1.1819180000000002e-17,2.4111130000000002e-17,6.666020000000001e-17,-8.338434e-16,-9.651545e-16,-4.120167e-16,4.758403e-16,1.310747e-16,2.919338e-16,-6.760573e-16,4.896097e-16,2.486756e-16
std,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002
min,-4.496857,-1.644975,-4.456702,-7.555114,-1.544014,-0.6893252,-3.924796,-4.171941,-1.446032,-1.101964,-3.293532,-0.932973,-1.050803,-1.145341,-1.500885,-1.334453,-0.8828953,-1.44183,-0.96419,-1.811911,-2.141232,-2.476202,-0.88993,-1.464802,-1.635806,-1.752109,-2.104924,-1.736076,-1.155583,-2.128275,-1.671901,-2.875665,-3.482781,-4.053646,-1.255234,-1.60284,-1.90303,-2.50763,-2.932909,-1.149225
25%,-0.5628975,-0.7652313,-0.6781131,-0.6694141,-0.8717282,-0.6084657,-0.5876076,-0.74179,-0.739503,-0.6839368,0.2596911,-0.7486741,-0.8197801,-0.8596617,-0.945429,-0.9484583,-0.7516211,-0.89131,-0.7645817,-0.701178,-0.7168785,-0.7401268,-0.7358911,-0.7184793,-0.7777596,-0.8010489,-0.7305498,-0.7480102,-0.6858802,-0.7617424,-0.790528,-0.6320579,-0.6508884,-0.6415498,-0.7066177,-0.6648138,-0.7583222,-0.741097,-0.6267767,-0.6543154
50%,-0.0009032722,0.1145121,-0.01520274,-0.04344137,0.02465251,-0.4299235,-0.1101843,0.5445165,-0.1507289,-0.2659096,0.2596911,-0.3046813,-0.1917887,-0.1718476,-0.02253637,-0.05260161,-0.2974124,-0.07938717,-0.2596901,-0.1962994,-0.2520893,-0.1961564,-0.2839126,-0.2255377,-0.2302942,-0.1439529,-0.1003158,-0.1263447,-0.318925,0.04921649,-0.1546005,-0.1377037,-0.07058267,-0.04261808,-0.296945,-0.2368304,-0.2421059,-0.1718807,-0.07251707,-0.1594056
75%,0.561091,0.9942556,0.6477076,0.5825313,0.6969381,0.1626845,0.367239,0.5445165,0.4969226,0.570145,0.2596911,0.3696851,0.4459644,0.5432606,0.5673355,0.4458777,0.3720859,0.5752443,0.4271973,0.2580914,0.302659,0.4288308,0.3547411,0.7096692,0.6488089,0.7264111,0.8491277,0.5404697,0.3489336,0.6988797,0.6375198,0.5467867,0.5097231,0.5381642,0.4542958,0.2820651,0.3979793,0.6369339,0.4689352,0.4441431
max,5.057045,1.434127,5.619536,8.40719,2.265604,10.15128,6.215674,1.402054,8.73976,12.69294,2.273184,4.668597,3.784406,3.218093,2.257998,2.505411,4.777648,2.24403,4.483942,6.771026,5.430333,5.243547,4.779057,5.113895,3.838848,3.222223,3.425312,3.445627,6.1982,2.541459,3.013881,16.13796,8.123335,14.0958,5.505559,6.615149,6.403929,15.01049,9.727288,9.738792


In [84]:
X[date_attribs].describe()

Unnamed: 0,r_date,h_birthday
count,240471,240471
unique,535,1673
top,2015-02-22,2012-04-30
freq,565,785


In [90]:
date_pipeline = Pipeline([
    ('date_attribs_adder', CombinedDateAttributesAdder()),
    ('imputer', SimpleImputer(strategy='median'))
])

pd.DataFrame(date_pipeline.fit_transform(X[date_attribs])).describe()

Unnamed: 0,0
count,240471.0
mean,1.226805e+17
std,4.147111e+16
min,6.35904e+16
25%,9.21888e+16
50%,1.077408e+17
75%,1.462752e+17
max,3.742848e+17


In [9]:
X.r_dirt_condition.value_counts(dropna=False)

NaN               120464
good               70944
slightly_heavy     25888
heavy              15379
bad                 7796
Name: r_dirt_condition, dtype: int64

In [10]:
X.r_turf_condition.value_counts(dropna=False)

NaN               117982
good               98109
slightly_heavy     17023
heavy               6211
bad                 1146
Name: r_turf_condition, dtype: int64

In [11]:
from sklearn.preprocessing import OneHotEncoder

X['r_dirt_condition'] = X['r_dirt_condition'].fillna('')
X['r_turf_condition'] = X['r_turf_condition'].fillna('')

cat_attribs = [
    'r_racetrack', 'r_course_type', 'r_weather', 'h_sex', 
    'r_impost_category', 'r_dirt_condition', 'r_turf_condition'
]

cat_encoder = OneHotEncoder(handle_unknown='ignore')
cat_encoder.fit_transform(X[cat_attribs]).toarray()

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.]])

In [12]:
cat_encoder.categories_

[array(['chukyo', 'fuma', 'hakodate', 'hanshin', 'kyoto', 'nakayama',
        'niigata', 'ogura', 'sapporo', 'tokyo'], dtype=object),
 array(['dirt', 'obstacle', 'turf'], dtype=object),
 array(['cloudy', 'rainy', 'snowy', 'sunny'], dtype=object),
 array(['castrated', 'female', 'male'], dtype=object),
 array(['age_based', 'age_sex_based', 'decided_per_race', 'handicap'],
       dtype=object),
 array(['', 'bad', 'good', 'heavy', 'slightly_heavy'], dtype=object),
 array(['', 'bad', 'good', 'heavy', 'slightly_heavy'], dtype=object)]