In [44]:
import os
import sys

import numpy as np

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Keep the vendored copy as the first choice so environments where this
# notebook already worked behave exactly as before, and fall back to the
# standalone `joblib` package only when the vendored module no longer exists.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# Put the parent directory first on the import path so the `trainer` package
# can be imported from this notebook.
sys.path.insert(0, '..')
from trainer.pipeline import num_attrs, cat_attr_categories, date_attrs, bool_attrs

# Seed NumPy's global RNG so any NumPy-based randomness below is repeatable.
np.random.seed(42)

In [45]:
# Load the persisted estimator from the dated training-run directory
# (path is relative to the notebook's working directory).
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load artifacts that come from a trusted source.
model = joblib.load('../rand_forest_reg_2018-12-10T0830/model.joblib')

In [46]:
# Display the loaded estimator's repr to confirm its type and hyperparameters.
model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=7, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=190, n_jobs=-1, oob_score=False, random_state=42,
           verbose=5, warm_start=False)

In [47]:
# Same hyperparameters as the repr, but as a dict that can be inspected by key.
model.get_params()

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 7,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 190,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 5,
 'warm_start': False}

In [48]:
# Raw importance scores as exposed by the fitted estimator; they carry no
# feature names, so they are paired with names in a later cell.
model.feature_importances_

array([1.20460564e-02, 1.80015777e-02, 2.80795643e-02, 2.24938338e-02,
       1.17496363e-01, 1.58801700e-01, 1.71425194e-02, 2.05753909e-02,
       2.78031057e-02, 1.80427286e-02, 1.66185359e-02, 1.13537088e-02,
       1.08928602e-02, 1.05632339e-02, 1.01704232e-02, 1.07302457e-02,
       1.07476893e-02, 1.01232697e-02, 1.07969151e-02, 1.49902279e-02,
       1.49597675e-02, 1.46636851e-02, 1.09085107e-02, 1.38032961e-02,
       1.40310653e-02, 1.40559198e-02, 1.46911913e-02, 1.45970478e-02,
       1.43086524e-02, 1.50510154e-02, 1.46240891e-02, 1.41281197e-02,
       1.51820828e-02, 1.51716002e-02, 1.42754664e-02, 1.49681818e-02,
       1.42288817e-02, 1.61384810e-02, 1.65199310e-02, 2.89884229e-02,
       2.83794739e-03, 2.51210767e-03, 1.71194433e-03, 3.31650954e-03,
       3.43324785e-03, 3.28427068e-03, 2.85981290e-03, 2.49848742e-03,
       1.68064082e-03, 3.34591814e-03, 3.64000717e-03, 3.66169718e-04,
       3.68383654e-03, 4.73584629e-03, 2.90540364e-03, 1.30870377e-04,
      

In [49]:
# Feature names that are not exported by trainer.pipeline and are therefore
# listed by hand.
# NOTE(review): these look like numeric win rates despite the "cat" in the
# variable name — confirm how the pipeline treats them.
extra_cat_attrs = [
    'j_turf_win_rate',
    'j_dirt_win_rate',
    't_turf_win_rate',
    't_dirt_win_rate',
    'h_win_rate',
]

# Flat list of feature names: numeric, categorical (the keys of the
# categories mapping), the hand-listed extras, date, then boolean attributes.
# NOTE(review): this order presumably has to mirror the column order the
# model was trained on — verify against the pipeline.
attributes = (
    num_attrs
    + list(cat_attr_categories)
    + extra_cat_attrs
    + date_attrs
    + bool_attrs
)

In [50]:
import warnings

import pandas as pd

# Importance scores taken from the fitted estimator.
importances = model.feature_importances_

# zip() stops at the shorter input, so a length mismatch would silently drop
# the unmatched entries and may pair names with the wrong importances.
# Surface the mismatch instead of hiding it; the table is still built so the
# rest of the notebook keeps running.
if len(attributes) != len(importances):
    warnings.warn(
        f"{len(attributes)} attribute names but {len(importances)} feature importances; "
        "name/importance pairs may be misaligned and unmatched entries are dropped."
    )

# (name, importance) pairs, then a table ordered from most to least important.
features = list(zip(attributes, importances))
df = pd.DataFrame(features, columns=['feature', 'importance']).sort_values(by='importance', ascending=False)

In [51]:
# Display the importance table (already sorted by descending importance).
df

Unnamed: 0,feature,importance
5,c_first_place_odds,0.158802
4,c_popularity,0.117496
39,r_impost_category,0.028988
2,c_horse_weight,0.02808
8,h_total_races,0.027803
3,c_horse_weight_diff,0.022494
7,r_contender_count,0.020575
9,h_total_wins,0.018043
1,c_post_position,0.018002
6,r_distance,0.017143


In [52]:
# Cut-off used by the following cells to split features into
# "at or above" and "below" groups.
importance_thresh = 0.015

# Rows whose importance is strictly under the cut-off.
df.loc[df['importance'].lt(importance_thresh)]

Unnamed: 0,feature,importance
19,j_career_1st_place_rate,0.01499
35,r_racetrack,0.014968
20,j_career_1st_2nd_place_rate,0.01496
26,t_career_4th_place_or_below_count,0.014691
21,j_career_any_place_rate,0.014664
30,t_career_dirt_win_count,0.014624
27,t_career_turf_race_count,0.014597
28,t_career_turf_win_count,0.014309
34,t_career_earnings,0.014275
36,r_course_type,0.014229


In [53]:
# Count the features on each side of the cut-off by summing the boolean masks.
print(f">= thresh: {int(df['importance'].ge(importance_thresh).sum())}")
print(f"< thresh: {int(df['importance'].lt(importance_thresh).sum())}")

>= thresh: 16
< thresh: 40


In [54]:
# Rows whose importance reaches the cut-off (the features to keep).
df.loc[df['importance'].ge(importance_thresh)]

Unnamed: 0,feature,importance
5,c_first_place_odds,0.158802
4,c_popularity,0.117496
39,r_impost_category,0.028988
2,c_horse_weight,0.02808
8,h_total_races,0.027803
3,c_horse_weight_diff,0.022494
7,r_contender_count,0.020575
9,h_total_wins,0.018043
1,c_post_position,0.018002
6,r_distance,0.017143


In [57]:
# Names of the features that reach the cut-off, as a NumPy array.
df.loc[df['importance'] >= importance_thresh, 'feature'].values

array(['c_first_place_odds', 'c_popularity', 'r_impost_category',
       'c_horse_weight', 'h_total_races', 'c_horse_weight_diff',
       'r_contender_count', 'h_total_wins', 'c_post_position',
       'r_distance', 'h_user_rating', 'h_sex', 'r_weather',
       't_career_1st_2nd_place_rate', 't_career_any_place_rate',
       't_career_dirt_race_count'], dtype=object)