## label as normalized order of finish based on number of horses in race
the label should be resricted to a 0 - 1 range taking into account the number of horses per race
so a horse that places 5th in a race of 10 horses gets a score of 0.5

$
\text{label} = 1 - \dfrac{(\text{finish order} - 1)}{(\text{number of horses in race} - 1)}
$

In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from IPython.display import display

sys.path.insert(0, '..')
np.random.seed(42)
pd.options.display.max_columns = None
%matplotlib inline

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
%%html
<style>
    /* force tables to display at left of cell */
    table {
        display: inline-block
    }
</style>

In [3]:
from scripts.utils import read_netkeiba

df = read_netkeiba()

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240471 entries, 0 to 240470
Data columns (total 66 columns):
c_id                                      240471 non-null int64
c_weight_carried                          240471 non-null float64
c_post_position                           240471 non-null int64
c_order_of_finish                         240471 non-null int64
c_order_of_finish_lowered                 240471 non-null int64
c_finish_time                             240471 non-null float64
c_horse_weight                            240471 non-null int64
c_horse_weight_diff                       240471 non-null int64
c_popularity                              240471 non-null int64
c_first_place_odds                        240471 non-null float64
r_id                                      240471 non-null int64
r_key                                     240471 non-null object
r_racetrack                               240471 non-null object
r_course_type                             240471 

In [5]:
df.head()

Unnamed: 0,c_id,c_weight_carried,c_post_position,c_order_of_finish,c_order_of_finish_lowered,c_finish_time,c_horse_weight,c_horse_weight_diff,c_popularity,c_first_place_odds,r_id,r_key,r_racetrack,r_course_type,r_weather,r_url,r_distance,r_date,r_dirt_condition,r_turf_condition,r_impost_category,r_is_non_winner_regional_horse_allowed,r_is_winner_regional_horse_allowed,r_is_regional_jockey_allowed,r_is_foreign_horse_allowed,r_is_foreign_horse_and_trainer_allowed,r_is_apprentice_jockey_allowed,r_is_female_only,h_id,h_key,h_url,h_total_races,h_total_wins,h_sex,h_birthday,h_user_rating,j_id,j_key,j_url,j_career_1st_place_count,j_career_2nd_place_count,j_career_3rd_place_count,j_career_4th_place_or_below_count,j_career_turf_race_count,j_career_turf_win_count,j_career_dirt_race_count,j_career_dirt_win_count,j_career_1st_place_rate,j_career_1st_2nd_place_rate,j_career_any_place_rate,j_career_earnings,t_id,t_key,t_url,t_career_1st_place_count,t_career_2nd_place_count,t_career_3rd_place_count,t_career_4th_place_or_below_count,t_career_turf_race_count,t_career_turf_win_count,t_career_dirt_race_count,t_career_dirt_win_count,t_career_1st_place_rate,t_career_1st_2nd_place_rate,t_career_any_place_rate,t_career_earnings
0,1,54.0,7,1,0,109.9,514,-2,1,3.7,1,201803030101,fuma,dirt,sunny,http://db.netkeiba.com/race/201803030101/,1700,2018-11-03,good,,age_based,0,0,0,0,0,0,1,1,2016104408,http://db.netkeiba.com/horse/2016104408/,2,1,female,2016-04-03,3.17,1,1092,http://db.netkeiba.com/jockey/result/01092,423,477,565,6501,4156,222,3810,201,0.053,0.113,0.184,829574.0,1,359,http://db.netkeiba.com/trainer/result/00359,556,577,557,5185,3613,299,3052,241,0.081,0.165,0.246,892578.6
1,2,54.0,8,2,0,110.1,490,0,3,6.3,1,201803030101,fuma,dirt,sunny,http://db.netkeiba.com/race/201803030101/,1700,2018-11-03,good,,age_based,0,0,0,0,0,0,1,2,2016100908,http://db.netkeiba.com/horse/2016100908/,3,0,female,2016-02-07,3.25,2,1109,http://db.netkeiba.com/jockey/result/01109,142,182,189,3746,2091,74,2167,68,0.033,0.076,0.12,253625.0,2,1147,http://db.netkeiba.com/trainer/result/01147,92,84,81,866,508,36,537,45,0.082,0.157,0.229,136755.2
2,3,54.0,4,3,0,110.4,464,10,4,6.4,1,201803030101,fuma,dirt,sunny,http://db.netkeiba.com/race/201803030101/,1700,2018-11-03,good,,age_based,0,0,0,0,0,0,1,3,2016105892,http://db.netkeiba.com/horse/2016105892/,2,0,female,2016-04-29,,3,1128,http://db.netkeiba.com/jockey/result/01128,330,299,350,4005,2498,146,2486,184,0.066,0.126,0.196,521757.2,3,1006,http://db.netkeiba.com/trainer/result/01006,286,255,279,3530,1854,111,2237,154,0.066,0.124,0.189,454950.2
3,4,54.0,6,4,0,110.4,450,-6,2,4.5,1,201803030101,fuma,dirt,sunny,http://db.netkeiba.com/race/201803030101/,1700,2018-11-03,good,,age_based,0,0,0,0,0,0,1,4,2016106313,http://db.netkeiba.com/horse/2016106313/,3,0,female,2016-03-19,,4,1015,http://db.netkeiba.com/jockey/result/01015,461,528,542,7051,4271,197,4311,264,0.054,0.115,0.178,852023.5,4,1099,http://db.netkeiba.com/trainer/result/01099,269,246,289,2686,1831,136,1558,118,0.077,0.148,0.23,493068.1
4,5,54.0,6,5,0,110.5,426,-8,10,43.3,1,201803030101,fuma,dirt,sunny,http://db.netkeiba.com/race/201803030101/,1700,2018-11-03,good,,age_based,0,0,0,0,0,0,1,5,2016102321,http://db.netkeiba.com/horse/2016102321/,2,0,female,2016-05-21,,5,1117,http://db.netkeiba.com/jockey/result/01117,372,429,435,5399,3408,201,3227,171,0.056,0.121,0.186,660899.9,5,1131,http://db.netkeiba.com/trainer/result/01131,60,71,77,1541,703,16,1008,43,0.034,0.075,0.119,94999.5


In [38]:
df.corr()['c_order_of_finish'].sort_values(ascending=False)

c_order_of_finish                         1.000000
c_popularity                              0.587627
c_first_place_odds                        0.468498
r_contender_count                         0.261869
h_id                                      0.058853
j_id                                      0.047443
h_total_races                             0.028172
r_is_female_only                          0.020186
t_career_4th_place_or_below_count         0.016382
t_career_dirt_race_count                  0.013492
c_post_position                           0.003866
r_is_foreign_horse_and_trainer_allowed    0.001678
r_is_apprentice_jockey_allowed            0.001003
c_order_of_finish_lowered                -0.006146
c_horse_weight_diff                      -0.006669
t_id                                     -0.009606
c_id                                     -0.019052
r_id                                     -0.019170
t_career_turf_race_count                 -0.030850
c_finish_time                  

In [6]:
df['r_contender_count'] = df.groupby('r_id').c_id.count().loc[df.r_id].values
df['c_norm_order_of_finish'] = 1.0 - (df.c_order_of_finish - 1) / (df.r_contender_count - 1)

index_cols = ['c_id', 'r_id', 'h_id', 'j_id', 't_id', 'r_key', 'r_url', 'h_key', 'h_url', 'j_key', 'j_url', 't_key', 't_url']
label_cols = ['c_norm_order_of_finish', 'c_order_of_finish', 'c_finish_time', 'c_order_of_finish_lowered']

X = df.drop(columns=index_cols + label_cols)
y = df.c_norm_order_of_finish

X.shape, y.shape

((240471, 51), (240471,))

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, test_idx in sss.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((192376, 51), (192376,), (48095, 51), (48095,))

In [8]:
# fillna('')
# OneHotEncoder(handle_unknown='ignore')
cat_attribs = [
    'r_racetrack', 'r_course_type', 'r_weather', 'h_sex', 
    'r_impost_category', 'r_dirt_condition', 'r_turf_condition'
]

# Use as-is
bool_attribs = [
    'r_is_non_winner_regional_horse_allowed', 'r_is_winner_regional_horse_allowed',
    'r_is_regional_jockey_allowed', 'r_is_foreign_horse_allowed',
    'r_is_foreign_horse_and_trainer_allowed', 'r_is_apprentice_jockey_allowed',
    'r_is_female_only'
]

# CombinedDateAttributesAdder
# Imputer(strategy='median')
# StandardScaler
date_attribs = [
    'r_date', 'h_birthday'
]

# Imputer(strategy='median')
# CombinedNumericAttributesAdder
# StandardScaler
num_attribs = [
    'c_weight_carried', 'c_post_position',
    'c_horse_weight', 'c_horse_weight_diff', 'c_popularity',
    'c_first_place_odds', 'r_distance', 'r_contender_count',
    'h_total_races', 'h_total_wins', 'h_user_rating',
    'j_career_1st_place_count', 'j_career_2nd_place_count',
    'j_career_3rd_place_count', 'j_career_4th_place_or_below_count',
    'j_career_turf_race_count', 'j_career_turf_win_count',
    'j_career_dirt_race_count', 'j_career_dirt_win_count',
    'j_career_1st_place_rate', 'j_career_1st_2nd_place_rate',
    'j_career_any_place_rate', 'j_career_earnings',
    't_career_1st_place_count', 't_career_2nd_place_count',
    't_career_3rd_place_count', 't_career_4th_place_or_below_count',
    't_career_turf_race_count', 't_career_turf_win_count',
    't_career_dirt_race_count', 't_career_dirt_win_count',
    't_career_1st_place_rate', 't_career_1st_2nd_place_rate',
    't_career_any_place_rate', 't_career_earnings'
]

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin


class CombinedDateAttributesAdder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        df = pd.DataFrame(X)
        df['h_age_days'] = (pd.to_datetime(df['r_date']) - pd.to_datetime(df['h_birthday'])).dt.days
        return df.drop(columns=['r_date', 'h_birthday']).values

    
class CombinedNumericAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        # passing 'num_attribs' via init is preferable, but 'not found' errors occurs in pipeline
        self.j_turf_wins_ix = num_attribs.index('j_career_turf_win_count')
        self.j_turf_races_ix = num_attribs.index('j_career_turf_race_count')
        self.j_dirt_wins_ix = num_attribs.index('j_career_dirt_win_count')
        self.j_dirt_races_ix = num_attribs.index('j_career_dirt_race_count')
        self.t_turf_wins_ix = num_attribs.index('t_career_turf_win_count')
        self.t_turf_races_ix = num_attribs.index('t_career_turf_race_count')
        self.t_dirt_wins_ix = num_attribs.index('t_career_dirt_win_count')
        self.t_dirt_races_ix = num_attribs.index('t_career_dirt_race_count')
        self.h_total_wins_ix = num_attribs.index('h_total_wins')
        self.h_total_races_ix = num_attribs.index('h_total_races')
        
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        j_turf_wins, j_turf_races = X[:, self.j_turf_wins_ix], X[:, self.j_turf_races_ix]
        j_turf_win_rate = np.divide(j_turf_wins, j_turf_races, out=np.zeros_like(j_turf_wins), where=j_turf_wins!=0.)

        j_dirt_wins, j_dirt_races = X[:, self.j_dirt_wins_ix], X[:, self.j_dirt_races_ix]
        j_dirt_win_rate = np.divide(j_dirt_wins, j_dirt_races, out=np.zeros_like(j_dirt_wins), where=j_dirt_wins!=0.)

        t_turf_wins, t_turf_races = X[:, self.t_turf_wins_ix], X[:, self.t_turf_races_ix]
        t_turf_win_rate = np.divide(t_turf_wins, t_turf_races, out=np.zeros_like(t_turf_wins), where=t_turf_wins!=0.)

        t_dirt_wins, t_dirt_races = X[:, self.t_dirt_wins_ix], X[:, self.t_dirt_races_ix]
        t_dirt_win_rate = np.divide(t_dirt_wins, t_dirt_races, out=np.zeros_like(t_dirt_wins), where=t_dirt_wins!=0.)

        h_wins, h_races = X[:, self.h_total_wins_ix], X[:, self.h_total_races_ix]
        h_win_rate = np.divide(h_wins, h_races, out=np.zeros_like(h_wins), where=h_wins!=0.)

        return np.c_[X, j_turf_win_rate, j_dirt_win_rate, t_turf_win_rate, t_dirt_win_rate, h_win_rate]

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric Attributes

In [11]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('num_attribs_adder', CombinedNumericAttributesAdder()),
    ('std_scaler', StandardScaler())
])

added_columns = ['j_career_turf_win_rate', 'j_career_dirt_win_rate', 
                 't_career_turf_win_rate', 't_career_dirt_win_rate', 'h_win_rate']

X_train_num = num_pipeline.fit_transform(X_train[num_attribs])
X_train_num_df = pd.DataFrame(X_train_num, columns=num_attribs+added_columns)
X_train_num_df.describe()

Unnamed: 0,c_weight_carried,c_post_position,c_horse_weight,c_horse_weight_diff,c_popularity,c_first_place_odds,r_distance,r_contender_count,h_total_races,h_total_wins,h_user_rating,j_career_1st_place_count,j_career_2nd_place_count,j_career_3rd_place_count,j_career_4th_place_or_below_count,j_career_turf_race_count,j_career_turf_win_count,j_career_dirt_race_count,j_career_dirt_win_count,j_career_1st_place_rate,j_career_1st_2nd_place_rate,j_career_any_place_rate,j_career_earnings,t_career_1st_place_count,t_career_2nd_place_count,t_career_3rd_place_count,t_career_4th_place_or_below_count,t_career_turf_race_count,t_career_turf_win_count,t_career_dirt_race_count,t_career_dirt_win_count,t_career_1st_place_rate,t_career_1st_2nd_place_rate,t_career_any_place_rate,t_career_earnings,j_career_turf_win_rate,j_career_dirt_win_rate,t_career_turf_win_rate,t_career_dirt_win_rate,h_win_rate
count,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0,192376.0
mean,1.300319e-15,-6.419321e-17,5.915526e-16,-2.4524910000000002e-17,4.3878900000000005e-17,-9.924462e-17,1.483683e-16,-5.917003e-17,-5.861601000000001e-17,2.7553590000000003e-17,5.594929e-16,2.1939450000000002e-17,-1.8615290000000002e-17,-2.031431e-17,-1.082383e-16,-6.796059e-17,-2.77752e-17,4.967771e-18,4.2844720000000004e-17,3.560544e-17,4.10349e-16,3.338933e-17,6.426708e-17,9.036173000000001e-17,-5.84498e-18,1.042678e-16,6.914251e-17,6.581835e-17,-2.5198970000000002e-17,-3.763687e-17,1.578976e-16,-3.618163e-16,-1.243236e-16,-6.644994e-16,-1.502889e-16,4.2992460000000004e-17,-3.89296e-17,1.3003e-16,2.852867e-16,-3.527302e-17
std,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003
min,-4.50014,-1.647437,-4.457277,-7.092334,-1.544255,-0.6894195,-3.924732,-4.172835,-1.446184,-1.102005,-3.290279,-0.9327761,-1.050533,-1.144817,-1.500234,-1.333858,-0.8825783,-1.441293,-0.9640961,-1.812043,-2.141611,-2.476578,-0.8896405,-1.464245,-1.635323,-1.752141,-2.105824,-1.735867,-1.154751,-2.129329,-1.67202,-2.873765,-3.480223,-4.049515,-1.254713,-1.602492,-1.903641,-2.505775,-2.93235,-1.149429
25%,-0.5628271,-0.7676061,-0.6780781,-0.6696089,-0.8718885,-0.6090987,-0.8259402,-0.7419648,-0.7398654,-0.6838309,0.2593818,-0.7509091,-0.8191047,-0.8587348,-0.9443246,-0.9474602,-0.7510709,-0.8901905,-0.7640513,-0.7004926,-0.7162134,-0.7395057,-0.7352807,-0.7181309,-0.7618059,-0.8010976,-0.7312749,-0.7482462,-0.6854016,-0.7623145,-0.7903872,-0.6305861,-0.6492817,-0.6397179,-0.7062319,-0.6641036,-0.7577131,-0.7397111,-0.6340793,-0.6543274
50%,-0.0003537825,0.1122244,-0.01506071,-0.04300157,0.0246006,-0.4310904,-0.1097327,0.5446116,-0.1512662,-0.2656565,0.2593818,-0.3032367,-0.1900098,-0.1699514,-0.02067953,-0.05066788,-0.2960552,-0.07740826,-0.2580557,-0.1952423,-0.2510836,-0.195223,-0.2823603,-0.2253271,-0.2303284,-0.1440131,-0.1009607,-0.1268608,-0.3187221,0.04893028,-0.1319525,-0.1363263,-0.06917076,-0.05932694,-0.2966602,-0.235955,-0.2409468,-0.1706461,-0.07100164,-0.1592257
75%,0.5621195,0.9920549,0.6479567,0.5836058,0.6969674,0.1615469,0.367739,0.5446116,0.4961929,0.5706924,0.2593818,0.3724688,0.4064894,0.5461646,0.5696733,0.448332,0.3746326,0.5779161,0.4303336,0.259483,0.3040714,0.4301232,0.3576244,0.7096183,0.6484513,0.7263358,0.8486036,0.5396532,0.3486347,0.6988225,0.6380813,0.5480333,0.5109402,0.5392012,0.4543953,0.283141,0.3998206,0.6379535,0.4706316,0.4445567
max,5.061906,1.43197,5.620587,8.416197,2.265823,10.15172,6.216767,1.402329,8.736581,12.69775,2.270856,4.679917,3.793171,3.224767,2.261714,2.510016,4.788022,2.248468,4.495949,6.777212,5.435504,5.247604,4.79116,5.112613,3.837316,3.222104,3.425115,3.443502,6.193507,2.542051,3.015142,16.13623,8.121996,14.0877,5.504412,6.618671,6.41217,15.00769,6.565973,9.742807


# DateTime Attributes

In [12]:
X_train[date_attribs].describe()

Unnamed: 0,r_date,h_birthday
count,192376,192376
unique,535,1660
top,2015-05-23,2012-04-30
freq,461,643


In [13]:
date_pipeline = Pipeline([
    ('date_attribs_adder', CombinedDateAttributesAdder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

pd.DataFrame(date_pipeline.fit_transform(X_train[date_attribs])).describe()

Unnamed: 0,0
count,192376.0
mean,-1.528375e-16
std,1.000003
min,-1.424788
25%,-0.734929
50%,-0.3597788
75%,0.5676759
max,6.069879


# Boolean Attributes

In [14]:
X_train[bool_attribs].astype(bool).describe()

Unnamed: 0,r_is_non_winner_regional_horse_allowed,r_is_winner_regional_horse_allowed,r_is_regional_jockey_allowed,r_is_foreign_horse_allowed,r_is_foreign_horse_and_trainer_allowed,r_is_apprentice_jockey_allowed,r_is_female_only
count,192376,192376,192376,192376,192376,192376,192376
unique,1,2,1,1,2,2,2
top,False,False,False,False,False,False,False
freq,192376,171472,192376,192376,184494,190652,162794


# Categorical Attributes

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
X.r_dirt_condition.value_counts(dropna=False)

NaN               120464
good               70944
slightly_heavy     25888
heavy              15379
bad                 7796
Name: r_dirt_condition, dtype: int64

In [17]:
X.r_turf_condition.value_counts(dropna=False)

NaN               117982
good               98109
slightly_heavy     17023
heavy               6211
bad                 1146
Name: r_turf_condition, dtype: int64

In [18]:
one_hot_enc = OneHotEncoder(
    sparse=False, 
    categories=[['bad', 'good', 'heavy', 'slightly_heavy']],
    handle_unknown='ignore')
one_hot_enc.fit_transform(X_train[['r_turf_condition']]), one_hot_enc.categories_

(array([[0., 0., 0., 1.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 0., 0., 0.],
        [0., 1., 0., 0.]]),
 [array(['bad', 'good', 'heavy', 'slightly_heavy'], dtype=object)])

In [19]:
cat_attrib_categories = {
    'r_racetrack': ['chukyo', 'fuma', 'hakodate', 'hanshin', 'kyoto', 'nakayama', 'niigata', 'ogura', 'sapporo', 'tokyo'],
    'r_course_type': ['dirt', 'obstacle', 'turf'],
    'r_weather': ['cloudy', 'rainy', 'snowy', 'sunny'],
    'h_sex': ['castrated', 'female', 'male'],
    'r_impost_category': ['age_based', 'age_sex_based', 'decided_per_race', 'handicap'],
    'r_dirt_condition': ['bad', 'good', 'heavy', 'slightly_heavy'],
    'r_turf_condition': ['bad', 'good', 'heavy', 'slightly_heavy']
}

cat_attribs = list(cat_attrib_categories.keys())
categories = list(cat_attrib_categories.values())

cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore', categories=categories)
cat_encoder.fit_transform(X_train[cat_attribs])

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [20]:
cat_encoder.categories_

[array(['chukyo', 'fuma', 'hakodate', 'hanshin', 'kyoto', 'nakayama',
        'niigata', 'ogura', 'sapporo', 'tokyo'], dtype=object),
 array(['dirt', 'obstacle', 'turf'], dtype=object),
 array(['cloudy', 'rainy', 'snowy', 'sunny'], dtype=object),
 array(['castrated', 'female', 'male'], dtype=object),
 array(['age_based', 'age_sex_based', 'decided_per_race', 'handicap'],
       dtype=object),
 array(['bad', 'good', 'heavy', 'slightly_heavy'], dtype=object),
 array(['bad', 'good', 'heavy', 'slightly_heavy'], dtype=object)]

In [21]:
cat_pipeline = Pipeline([
    ('one_hot', OneHotEncoder(sparse=False, handle_unknown='ignore', categories=categories))
])

# Full Pipeline

In [22]:
from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
    ('date', date_pipeline, date_attribs)
])

full_pipeline.fit_transform(X_train)

array([[-0.56282706,  0.1122244 ,  0.31644799, ...,  0.        ,
         1.        , -1.15384668],
       [-0.56282706,  0.1122244 , -0.61177637, ...,  0.        ,
         0.        , -0.41188295],
       [-1.12530035,  0.55213968, -0.34656941, ...,  0.        ,
         0.        ,  0.09665398],
       ...,
       [ 2.81201263,  1.43197022, -0.81068159, ...,  0.        ,
         0.        ,  1.63268564],
       [ 0.5621195 , -0.32769087,  0.78056017, ...,  0.        ,
         0.        ,  0.75733517],
       [ 1.12459278, -0.32769087, -0.14766419, ...,  0.        ,
         0.        ,  1.67436899]])

# Model Selection

In [23]:
X_train_prepared = full_pipeline.fit_transform(X_train)

### LinearRegression

In [24]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression(n_jobs=-1)
lin_reg.fit(X_train_prepared, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [25]:
X_train_sample = full_pipeline.fit_transform(X_train[:10])
y_train_sample = y_train[:10]

sample_pred = lin_reg.predict(X_train_sample)
pred_vs_actual = np.c_[sample_pred, y_train_sample.values]
pd.DataFrame(np.c_[pred_vs_actual, sample_pred - y_train_sample.values], columns=['prediction', 'label', 'diff'])

Unnamed: 0,prediction,label,diff
0,0.643799,0.8,-0.156201
1,0.51001,0.357143,0.152867
2,0.366211,0.1,0.266211
3,0.359375,0.466667,-0.107292
4,0.285278,0.2,0.085278
5,0.626831,0.9,-0.273169
6,0.604614,0.285714,0.3189
7,0.647339,0.25,0.397339
8,0.688721,0.733333,-0.044613
9,0.358276,0.533333,-0.175057


In [40]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

lin_reg_pred = lin_reg.predict(X_train_prepared)
lin_mse = mean_squared_error(y_train, lin_reg_pred)
lin_mae = mean_absolute_error(y_train, lin_reg_pred)

print(f'MSRE: {np.sqrt(lin_mse)}')
print(f'MAE: {lin_mae}')

MSRE: 0.25601567709799133
MAE: 0.21162276250179368


### DecisionTreeRegressor

In [27]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train_prepared, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best')

In [41]:
tree_predictions = tree_reg.predict(X_train_prepared)
tree_mse = mean_squared_error(y_train, tree_predictions)
tree_mae = mean_absolute_error(y_train, tree_predictions)

print(f'MSRE: {np.sqrt(tree_mse)}')
print(f'MAE: {tree_mae}')

MSRE: 1.2879864487311744e-17
MAE: 1.7085370651239115e-18


### RandomForestRegressor

In [29]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X_train_prepared, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [42]:
forest_predictions = forest_reg.predict(X=X_train_prepared)
forest_mse = mean_squared_error(y_true=y_train, y_pred=forest_predictions)
forest_mae = mean_absolute_error(y_train, forest_predictions)

print(f'MSRE: {np.sqrt(forest_mse)}')
print(f'MAE: {forest_mae}')

MSRE: 0.11138481808170068
MAE: 0.08411162258162252


### LinearSVR

In [31]:
from sklearn.svm import LinearSVR

lin_svreg = LinearSVR(random_state=42, verbose=5)
lin_svreg.fit(X_train_prepared, y_train)

[LibLinear]



LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=42, tol=0.0001, verbose=5)

In [43]:
lin_svreg_predictions = lin_svreg.predict(X_train_prepared)
lin_svreg_mse = mean_squared_error(y_true=y_train, y_pred=lin_svreg_predictions)
lin_svreg_mae = mean_absolute_error(y_true=y_train, y_pred=lin_svreg_predictions)

print(f'MSRE: {np.sqrt(lin_svreg_mse)}')
print(f'MAE: {lin_svreg_mae}')

MSRE: 0.2618646663196695
MAE: 0.20968300354267466


### SGDRegressor
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor

In [33]:
from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor(random_state=42)
sgd_reg.fit(X_train_prepared, y_train)



SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=42, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [44]:
sgd_predictions = sgd_reg.predict(X_train_prepared)
sgd_mse = mean_squared_error(y_true=y_train, y_pred=sgd_predictions)
sgd_mae = mean_absolute_error(y_train, sgd_predictions)

print(f'RMSE: {np.sqrt(sgd_mse)}')
print(f'MAE: {sgd_mae}')

RMSE: 0.25915695591579146
MAE: 0.21279224556823823


### TheilSenRegressor
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.TheilSenRegressor.html#sklearn.linear_model.TheilSenRegressor

In [35]:
from sklearn.linear_model import TheilSenRegressor

ts_reg = TheilSenRegressor(random_state=42)
ts_reg.fit(X_train_prepared, y_train)

TheilSenRegressor(copy_X=True, fit_intercept=True, max_iter=300,
         max_subpopulation=10000, n_jobs=None, n_subsamples=None,
         random_state=42, tol=0.001, verbose=False)

In [45]:
ts_predictions = ts_reg.predict(X_train_prepared)
ts_mse = mean_squared_error(y_true=y_train, y_pred=ts_predictions)
ts_mae = mean_absolute_error(y_train, ts_predictions)

print(f'RMSE: {np.sqrt(ts_mse)}')
print(f'MAE: {ts_mae}')

RMSE: 0.25632188340399753
MAE: 0.21179997014019325


## Quick n' dirty model performance comparison

| model | rmse | mae |
| :- | :- | :- |
| LinearRegression | 0.256 | 0.212 |
| DecisionTreeRegressor | 1.288e-17 | 1.708e-18 |
| RandomForestRegressor | 0.111 | 0.0841 |
| LinearSVR | 0.262 | 0.2097 |
| SGDRegressor | 0.259 | 0.213 |
| TheilSenRegressor | 0.256 | 0.212 |