# Hyperparams tuning 

## Features 
    - lag / mean encoding (numerical)
    - hashing/tfidf svd text features
## cross validation 
Use only simple hold out scheme
- train on `[0-32]`
- validate at `date_block_num = 33`
- via `hyperopt ` packages

In [2]:
import pandas as pd 
import gc

## Load data

- basic numerical features (lag/mean encoding)

In [3]:
with pd.HDFStore('../data/feat/data.h5') as store:
    print(store.keys())
#     X_train = store['X_train']
#     X_cv = store['X_cv']
    y_train = store['y_train']
    y_cv = store['y_cv']
#     X_test = store['X_test']

['/X_cv', '/X_test', '/X_train', '/y_cv', '/y_train']


In [4]:
y_train = y_train.clip(0,20)
y_cv = y_cv.clip(0,20)

- text features

In [6]:
with pd.HDFStore('../data/feat/all_feat_df_all.h5') as feat:
    print(feat.keys())
    all_feats_df_all = feat['all_feats_df_all']

['/all_feats_df_all']


### shrink date_block_num for memory issud

In [8]:
mask = all_feats_df_all.date_block_num>=12
all_feats_df_all = all_feats_df_all[mask]
y_cv = y_cv[mask]
y_train = y_train[mask]

In [9]:
train_mask = all_feats_df_all.date_block_num <= 32
cv_mask = all_feats_df_all.date_block_num == 33
test_mask = all_feats_df_all.date_block_num == 34

In [10]:
gc.collect()

4279

# Modeling

In [11]:
import lightgbm as lgb

### Naive testing 
without hyper param optimization

In [12]:
lgb_train = lgb.Dataset(all_feats_df_all[train_mask], 
                        y_train,
                        free_raw_data=False) ## if you want to reuse data --> remember to set free_raw_data=False
lgb_cv = lgb.Dataset(all_feats_df_all[cv_mask], 
                     y_cv, 
                     reference=lgb_train,
                     free_raw_data=False)

In [13]:
''' simple test on params tuning for validation data (33)'''
# train_mask = all_feats_df_all.date_block_num <= 32
# cv_mask = all_feats_df_all.date_block_num == 33
### this one I want to repeat previous benchmark: rmse~ 0.90905
opt_params = {
    'max_depth': 9, 
    'num_leaves': 80,
    'metric': 'rmse',
    'bagging_fraction': 0.7257895345351948, 
    'boosting_type': 'gbdt', 
    'feature_fraction': 0.6932767211963933, 
    'gamma': 0.3004315340493337,  
    'reg_lambda': 0.519291939932697
}

params = {    
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 500,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5    
}

In [11]:
# del all_feats_df_all;gc.collect()

0

In [18]:
reg = lgb.train(opt_params,
                train_set = lgb_train,
                valid_sets=lgb_cv,
                early_stopping_rounds = 5,
                verbose_eval = True)

[1]	valid_0's rmse: 1.02277
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's rmse: 1.00788
[3]	valid_0's rmse: 0.996426
[4]	valid_0's rmse: 0.986937
[5]	valid_0's rmse: 0.977267
[6]	valid_0's rmse: 0.968423
[7]	valid_0's rmse: 0.961645
[8]	valid_0's rmse: 0.95564
[9]	valid_0's rmse: 0.951398
[10]	valid_0's rmse: 0.945285
[11]	valid_0's rmse: 0.940741
[12]	valid_0's rmse: 0.937988
[13]	valid_0's rmse: 0.935239
[14]	valid_0's rmse: 0.933181
[15]	valid_0's rmse: 0.931515
[16]	valid_0's rmse: 0.928914
[17]	valid_0's rmse: 0.928424
[18]	valid_0's rmse: 0.92708
[19]	valid_0's rmse: 0.925929
[20]	valid_0's rmse: 0.923982
[21]	valid_0's rmse: 0.922608
[22]	valid_0's rmse: 0.921129
[23]	valid_0's rmse: 0.921481
[24]	valid_0's rmse: 0.920943
[25]	valid_0's rmse: 0.91925
[26]	valid_0's rmse: 0.918267
[27]	valid_0's rmse: 0.917091
[28]	valid_0's rmse: 0.91712
[29]	valid_0's rmse: 0.916605
[30]	valid_0's rmse: 0.916102
[31]	valid_0's rmse: 0.915631
[32]	valid_0's rmse: 0.9

In [15]:
reg.save_model('../data/model/model_block_12_32_prev.txt')

In [16]:
gc.collect()

941

#### continue train `reg` model

In [17]:
# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
opt_params = {
    'max_depth': 9, 
    'num_leaves': 80,
    'metric': 'rmse',
    'bagging_fraction': 0.7257895345351948, 
    'boosting_type': 'gbdt', 
    'feature_fraction': 0.6932767211963933, 
    'gamma': 0.3004315340493337,  
    'reg_lambda': 0.519291939932697
}
params = {    
    'boosting_type': 'dart',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 500,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5    
}
gbm = lgb.train(opt_params,                
                lgb_train,
                early_stopping_rounds=5,
                num_boost_round=500,
                init_model='../data/model/model_tmp.txt',                
                valid_sets=lgb_cv)

[101]	valid_0's rmse: 1.02277
Training until validation scores don't improve for 5 rounds.
[102]	valid_0's rmse: 1.00788
[103]	valid_0's rmse: 0.996426
[104]	valid_0's rmse: 0.986937
[105]	valid_0's rmse: 0.977267
[106]	valid_0's rmse: 0.968423
[107]	valid_0's rmse: 0.961645
[108]	valid_0's rmse: 0.95564
[109]	valid_0's rmse: 0.951398
[110]	valid_0's rmse: 0.945285
[111]	valid_0's rmse: 0.940741
[112]	valid_0's rmse: 0.937988
[113]	valid_0's rmse: 0.935239
[114]	valid_0's rmse: 0.933181
[115]	valid_0's rmse: 0.931515
[116]	valid_0's rmse: 0.928914
[117]	valid_0's rmse: 0.928424
[118]	valid_0's rmse: 0.92708
[119]	valid_0's rmse: 0.925929
[120]	valid_0's rmse: 0.923982
[121]	valid_0's rmse: 0.922608
[122]	valid_0's rmse: 0.921129
[123]	valid_0's rmse: 0.921481
[124]	valid_0's rmse: 0.920943
[125]	valid_0's rmse: 0.91925
[126]	valid_0's rmse: 0.918267
[127]	valid_0's rmse: 0.917091
[128]	valid_0's rmse: 0.91712
[129]	valid_0's rmse: 0.916605
[130]	valid_0's rmse: 0.916102
[131]	valid_0's

In [15]:
# del all_feats_df_all;
gc.collect()

68

# Hyperparam tunning

In [27]:
from hyperopt import fmin, tpe, hp, STATUS_OK,Trials
import numpy as np 

In [None]:
lgb_train = lgb.Dataset(X_train,y_train)
lgb_cv = lgb.Dataset(X_cv,y_cv,reference=lgb_train)

In [39]:
reg.best_score['valid_0']['rmse']

0.90845843207123433

In [74]:
from pprint import pprint

In [63]:
def lgbm_objective(params):
    
    reg = lgb.train(params,
                    train_set = lgb_train,
                    valid_sets = lgb_cv,
                    num_boost_round = 300,
                    early_stopping_rounds = 5,
                    verbose_eval = False)
    print('rmse:{:.5f}'.format(reg.best_score['valid_0']['rmse']))
    print('best iteration:{}'.format(reg.best_iteration))
    
    pprint('params:{}'.format(params))
    print('-------------'*5)
    
    return {'loss':reg.best_score['valid_0']['rmse'], 'status': STATUS_OK }

In [83]:
lgbm_space = {
#     'boosting_type'   : hp.choice('boosting_type',['gbdt', 'dart']), ## gbdt 
    'boosting_type'   : 'gbdt',  
#     'max_depth'       : hp.choice("max_depth", np.arange(20, 500, dtype=int)),
    'learning_rate'   : hp.uniform('learning_rate', 0.05, 0.1),    
    'num_leaves'      : hp.choice('num_leaves', np.arange(8,129,2,dtype=int)), ## no limit 
    'min_data_in_leaf': 200, ## prevent overfit
    'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0), ## colsample_bytree
    'bagging_fraction': hp.uniform ('bagging_fraction', 0.7, 1), ## can be used to speed up training/ deal with overfitting
    'reg_lambda'      : hp.uniform('reg_lambda',0,1),
    'gamma'           : hp.uniform('gamma', 0.1,0.5),
    'metric'          : 'rmse'
}


In [84]:
trials = Trials()
best =fmin(fn = lgbm_objective,
           space = lgbm_space,
           algo = tpe.suggest,
           trials = trials,
           max_evals = 30
           )



rmse:0.89445
params:{'bagging_fraction': 0.8135469584425394, 'boosting_type': 'gbdt', 'feature_fraction': 0.6879735800500992, 'gamma': 0.3394655342808342, 'learning_rate': 0.09310523644963001, 'metric': 'rmse', 'min_data_in_leaf': 200, 'num_leaves': 96, 'reg_lambda': 0.27046222035396306, 'verbose': 1}
best iteration:61
-----------------------------------------------------------------
rmse:0.88947
params:{'bagging_fraction': 0.8981770831336265, 'boosting_type': 'gbdt', 'feature_fraction': 0.5654124422725608, 'gamma': 0.26816163495935863, 'learning_rate': 0.060870030014356986, 'metric': 'rmse', 'min_data_in_leaf': 200, 'num_leaves': 108, 'reg_lambda': 0.9056219575093959, 'verbose': 1}
best iteration:119
-----------------------------------------------------------------
rmse:0.89294
params:{'bagging_fraction': 0.7411557240931375, 'boosting_type': 'gbdt', 'feature_fraction': 0.4824427800879857, 'gamma': 0.20757491134283598, 'learning_rate': 0.09274139830991965, 'metric': 'rmse', 'min_data_i

rmse:0.88918
params:{'bagging_fraction': 0.7988921010253146, 'boosting_type': 'gbdt', 'feature_fraction': 0.38493920907359375, 'gamma': 0.16801222922580134, 'learning_rate': 0.058749173945578076, 'metric': 'rmse', 'min_data_in_leaf': 200, 'num_leaves': 108, 'reg_lambda': 0.7230472621470556, 'verbose': 1}
best iteration:106
-----------------------------------------------------------------
rmse:0.89601
params:{'bagging_fraction': 0.7954414828427172, 'boosting_type': 'gbdt', 'feature_fraction': 0.3751414311868554, 'gamma': 0.15486838111034315, 'learning_rate': 0.05711998795029177, 'metric': 'rmse', 'min_data_in_leaf': 200, 'num_leaves': 52, 'reg_lambda': 0.7637739783385638, 'verbose': 1}
best iteration:151
-----------------------------------------------------------------
rmse:0.89681
params:{'bagging_fraction': 0.7916096246763383, 'boosting_type': 'gbdt', 'feature_fraction': 0.3850287544783591, 'gamma': 0.4899835249233203, 'learning_rate': 0.08117741093150559, 'metric': 'rmse', 'min_data_

In [87]:
trials.best_trial['result']

{'loss': 0.887297835603789, 'status': 'ok'}

In [88]:
best

{'bagging_fraction': 0.7004522170291932,
 'feature_fraction': 0.35210794640316456,
 'gamma': 0.15577564980992592,
 'learning_rate': 0.07432644801356227,
 'num_leaves': 58,
 'reg_lambda': 0.8820739482513653}

> best rmse: 0.8873
  
    -  params:

        {'bagging_fraction': 0.7004522170291932, 'boosting_type': 'gbdt', 'feature_fraction': 0.35210794640316456, 'gamma': 0.15577564980992592, 'learning_rate': 0.07432644801356227, 'metric': 'rmse', 'min_data_in_leaf': 200, 'num_leaves': 124, 'reg_lambda': 0.8820739482513653, 'verbose': 1}

  
    - best iteration:92

# submission

In [108]:
final_train_mask = all_feats_df_all.date_block_num <=33
test_mask = all_feats_df_all.date_block_num == 34
cv_mask = all_feats_df_all.date_block_num == 33
X_final_train = all_feats_df_all[final_train_mask]
X_test = all_feats_df_all[test_mask]
X_val = all_feats_df_all[cv_mask]##

In [107]:
y_final_train = pd.concat([y_train,y_cv])

In [110]:
lgb_train = lgb.Dataset(X_final_train, 
                        y_final_train
                        ) ## if you want to reuse data --> remember to set free_raw_data=False
lgb_cv = lgb.Dataset(X_val, 
                     y_cv,
                     reference=lgb_train)

In [114]:
print(X_final_train.shape,y_final_train.shape,X_val.shape,y_cv.shape);


(6425094, 136) (6425094,) (238172, 136) (238172,)


In [116]:
opt_params = {
    'bagging_fraction': 0.7004522170291932, 
    'boosting_type': 'gbdt',
    'feature_fraction': 0.35210794640316456,
    'gamma': 0.15577564980992592, 
    'learning_rate': 0.07432644801356227, 
    'metric': 'rmse',
    'min_data_in_leaf': 200, 
    'num_leaves': 124, 
    'reg_lambda': 0.8820739482513653, 
    'verbose': 1
}
reg_final = lgb.train(opt_params,
                train_set = lgb_train,
                valid_sets=lgb_cv,
                num_boost_round = 300,
                early_stopping_rounds = 5,
                verbose_eval = True)

[1]	valid_0's rmse: 1.10707
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's rmse: 1.08346
[3]	valid_0's rmse: 1.06079
[4]	valid_0's rmse: 1.04592
[5]	valid_0's rmse: 1.02478
[6]	valid_0's rmse: 1.00846
[7]	valid_0's rmse: 0.992945
[8]	valid_0's rmse: 0.980565
[9]	valid_0's rmse: 0.966726
[10]	valid_0's rmse: 0.957829
[11]	valid_0's rmse: 0.947502
[12]	valid_0's rmse: 0.939856
[13]	valid_0's rmse: 0.934401
[14]	valid_0's rmse: 0.924998
[15]	valid_0's rmse: 0.919699
[16]	valid_0's rmse: 0.915181
[17]	valid_0's rmse: 0.909639
[18]	valid_0's rmse: 0.904245
[19]	valid_0's rmse: 0.900393
[20]	valid_0's rmse: 0.894043
[21]	valid_0's rmse: 0.889206
[22]	valid_0's rmse: 0.885384
[23]	valid_0's rmse: 0.881927
[24]	valid_0's rmse: 0.879338
[25]	valid_0's rmse: 0.878028
[26]	valid_0's rmse: 0.874953
[27]	valid_0's rmse: 0.871652
[28]	valid_0's rmse: 0.868427
[29]	valid_0's rmse: 0.867221
[30]	valid_0's rmse: 0.864829
[31]	valid_0's rmse: 0.862701
[32]	valid_0's rmse: 0.8

[268]	valid_0's rmse: 0.788225
[269]	valid_0's rmse: 0.788061
[270]	valid_0's rmse: 0.787931
[271]	valid_0's rmse: 0.787855
[272]	valid_0's rmse: 0.787796
[273]	valid_0's rmse: 0.787695
[274]	valid_0's rmse: 0.787641
[275]	valid_0's rmse: 0.787606
[276]	valid_0's rmse: 0.787547
[277]	valid_0's rmse: 0.787456
[278]	valid_0's rmse: 0.787347
[279]	valid_0's rmse: 0.787263
[280]	valid_0's rmse: 0.787199
[281]	valid_0's rmse: 0.787118
[282]	valid_0's rmse: 0.787015
[283]	valid_0's rmse: 0.786907
[284]	valid_0's rmse: 0.786902
[285]	valid_0's rmse: 0.785911
[286]	valid_0's rmse: 0.785869
[287]	valid_0's rmse: 0.785774
[288]	valid_0's rmse: 0.785585
[289]	valid_0's rmse: 0.785427
[290]	valid_0's rmse: 0.785355
[291]	valid_0's rmse: 0.785344
[292]	valid_0's rmse: 0.785038
[293]	valid_0's rmse: 0.784953
[294]	valid_0's rmse: 0.784841
[295]	valid_0's rmse: 0.784736
[296]	valid_0's rmse: 0.784655
[297]	valid_0's rmse: 0.784545
[298]	valid_0's rmse: 0.784494
[299]	valid_0's rmse: 0.784433
[300]	va

In [None]:
lgb.train()

In [117]:
reg_final.save_model('../data/model/model_final_12_33_overfit.txt')

In [118]:
y_test = reg_final.predict(X_test)
y_test = y_test.clip(0,20)

In [120]:
len(y_test)

214200

In [122]:
y_submit = pd.DataFrame({'ID':range(len(y_test)), 'item_cnt_month':y_test})
# y_submit
y_submit.to_csv('../data/output/submission_lgbm_text_on_12_33_overfit.csv',index=False) # LB:0.89911 6/502

______

## submission 2

In [131]:
opt_params = {
    'bagging_fraction': 0.7004522170291932, 
    'boosting_type': 'gbdt',
    'feature_fraction': 0.35210794640316456,
    'gamma': 0.15577564980992592, 
    'learning_rate': 0.07432644801356227, 
    'metric': 'rmse',
    'min_data_in_leaf': 200, 
    'num_leaves': 124, 
    'reg_lambda': 0.8820739482513653, 
    'verbose': 1
}
reg_final = lgb.train(opt_params,
                train_set = lgb_train,
                valid_sets=lgb_cv,
                num_boost_round = 100, ## reduce num_boost
                early_stopping_rounds = 5,
                verbose_eval = True)

[1]	valid_0's rmse: 1.10707
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's rmse: 1.08346
[3]	valid_0's rmse: 1.06079
[4]	valid_0's rmse: 1.04592
[5]	valid_0's rmse: 1.02478
[6]	valid_0's rmse: 1.00846
[7]	valid_0's rmse: 0.992945
[8]	valid_0's rmse: 0.980565
[9]	valid_0's rmse: 0.966726
[10]	valid_0's rmse: 0.957829
[11]	valid_0's rmse: 0.947502
[12]	valid_0's rmse: 0.939856
[13]	valid_0's rmse: 0.934401
[14]	valid_0's rmse: 0.924998
[15]	valid_0's rmse: 0.919699
[16]	valid_0's rmse: 0.915181
[17]	valid_0's rmse: 0.909639
[18]	valid_0's rmse: 0.904245
[19]	valid_0's rmse: 0.900393
[20]	valid_0's rmse: 0.894043
[21]	valid_0's rmse: 0.889206
[22]	valid_0's rmse: 0.885384
[23]	valid_0's rmse: 0.881927
[24]	valid_0's rmse: 0.879338
[25]	valid_0's rmse: 0.878028
[26]	valid_0's rmse: 0.874953
[27]	valid_0's rmse: 0.871652
[28]	valid_0's rmse: 0.868427
[29]	valid_0's rmse: 0.867221
[30]	valid_0's rmse: 0.864829
[31]	valid_0's rmse: 0.862701
[32]	valid_0's rmse: 0.8

In [132]:
y_test = reg_final.predict(X_test)
y_test = y_test.clip(0,20)

y_submit = pd.DataFrame({'ID':range(len(y_test)), 'item_cnt_month':y_test})
# y_submit
y_submit.to_csv('../data/output/submission_lgbm_text_on_12_33_num_100.csv',index=False) # LB:0.89944 6/502 ... not that bad ^^

note: decrease/increase `num_iterations` will not effect final LB result. (compare submission1&2)

______
## Submission3