# Hyperparameter tuning

## Features
- lag / mean encoding (numerical)
- hashing / TF-IDF SVD text features
## Cross-validation
Use only a simple hold-out scheme:
- train on `date_block_num` in `[0-32]`
- validate on `date_block_num = 33`
- tune via the `hyperopt` package

In [1]:
import numpy as np
import pandas as pd

## Load data

In [2]:
# Read the train / validation / test splits from the HDF5 store.
# The store's key listing is printed first as a quick sanity check.
with pd.HDFStore('../data/feat/data.h5') as store:
    print(store.keys())
    X_train, X_cv, y_train, y_cv, X_test = (
        store[key]
        for key in ('X_train', 'X_cv', 'y_train', 'y_cv', 'X_test')
    )

['/X_cv', '/X_test', '/X_train', '/y_cv', '/y_train']


In [3]:
# Bound both targets to the closed interval [0, 20].
# NOTE(review): presumably this mirrors the range used for scoring — confirm.
y_train = y_train.clip(lower=0, upper=20)
y_cv = y_cv.clip(lower=0, upper=20)


In [4]:
# Load the combined feature frame; the key listing is printed as a sanity check.
with pd.HDFStore('../data/feat/all_feat_df_all.h5') as feat:
    print(feat.keys())
    all_feats_df_all = feat.get('all_feats_df_all')

['/all_feats_df_all']


# Modeling

In [6]:
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK

(11128004, 134)

In [11]:
# Simple baseline run before tuning: fit on date blocks <= 32 and
# monitor RMSE on the hold-out block 33.
train_mask = all_feats_df_all['date_block_num'] <= 32
cv_mask = all_feats_df_all['date_block_num'] == 33

# Hand-picked starting parameters for the baseline model.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    max_depth=100,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
)

# The validation Dataset references the training Dataset.
lgb_train = lgb.Dataset(all_feats_df_all.loc[train_mask], label=y_train)
lgb_cv = lgb.Dataset(all_feats_df_all.loc[cv_mask], label=y_cv, reference=lgb_train)

# Stop once the validation score has not improved for 5 consecutive rounds.
reg = lgb.train(
    params,
    train_set=lgb_train,
    valid_sets=lgb_cv,
    early_stopping_rounds=5,
    verbose_eval=True,
)

[1]	valid_0's rmse: 1.11852
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's rmse: 1.10167
[3]	valid_0's rmse: 1.08613
[4]	valid_0's rmse: 1.0737
[5]	valid_0's rmse: 1.06229
[6]	valid_0's rmse: 1.05183
[7]	valid_0's rmse: 1.04235
[8]	valid_0's rmse: 1.034
[9]	valid_0's rmse: 1.02549
[10]	valid_0's rmse: 1.01715
[11]	valid_0's rmse: 1.01078
[12]	valid_0's rmse: 1.00436
[13]	valid_0's rmse: 0.998014
[14]	valid_0's rmse: 0.993048
[15]	valid_0's rmse: 0.986648
[16]	valid_0's rmse: 0.98167
[17]	valid_0's rmse: 0.977493
[18]	valid_0's rmse: 0.973237
[19]	valid_0's rmse: 0.969171
[20]	valid_0's rmse: 0.966458
[21]	valid_0's rmse: 0.964129
[22]	valid_0's rmse: 0.960643
[23]	valid_0's rmse: 0.958231
[24]	valid_0's rmse: 0.956003
[25]	valid_0's rmse: 0.953993
[26]	valid_0's rmse: 0.952234
[27]	valid_0's rmse: 0.950771
[28]	valid_0's rmse: 0.948667
[29]	valid_0's rmse: 0.946728
[30]	valid_0's rmse: 0.945399
[31]	valid_0's rmse: 0.944374
[32]	valid_0's rmse: 0.942995
[33]

In [None]:
def lgbm_objective(params):
    """Hyperopt objective: train LightGBM with ``params`` and score the hold-out set.

    A model is trained on ``X_train`` / ``y_train`` with early stopping
    monitored on ``X_cv`` / ``y_cv`` (all read from the notebook's global
    namespace), then the hold-out RMSE at the best iteration is reported.

    Parameters
    ----------
    params : dict
        LightGBM training parameters, e.g. one sample drawn from a hyperopt
        search space.

    Returns
    -------
    dict
        ``{'loss': <hold-out RMSE as float>, 'status': STATUS_OK}``.
    """
    ## adapted from: https://github.com/hyperopt/hyperopt/issues/357
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_cv = lgb.Dataset(X_cv, y_cv, reference=lgb_train)
    # Train with the sampled parameters (the original passed an undefined name).
    reg = lgb.train(params,
                    early_stopping_rounds=5,
                    train_set=lgb_train,
                    valid_sets=lgb_cv,
                    verbose_eval=False)
    print('params:{}'.format(params))
    pred = reg.predict(X_cv, num_iteration=reg.best_iteration)
    # RMSE computed with numpy; both sides are flattened so a single-column
    # frame and a 1-D prediction array line up element by element.
    residual = np.asarray(y_cv, dtype=float).ravel() - np.asarray(pred, dtype=float).ravel()
    rmse = float(np.sqrt(np.mean(residual ** 2)))
    print("SCORE:{:.5f}".format(rmse))
    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Search space of LightGBM parameters, expressed with hyperopt `hp` nodes.
lgbm_space = {
    'boosting_type'   : hp.choice('boosting_type', ['gbdt', 'dart']),  ## gbdt
    # Integer grids: depth 4..16, leaves 8..128 in steps of 2.
    'max_depth'       : hp.choice('max_depth', np.arange(4, 17, dtype=int)),
    'num_leaves'      : hp.choice('num_leaves', np.arange(8, 129, 2, dtype=int)),
    'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1),
    # LightGBM only applies bagging_fraction when bagging_freq > 0;
    # 5 matches the value used in the baseline parameters.
    'bagging_freq'    : 5,
    'reg_lambda'      : hp.uniform('reg_lambda', 0, 1),
    # LightGBM's name for the minimum split gain (XGBoost calls it `gamma`,
    # which LightGBM does not recognise).
    'min_split_gain'  : hp.uniform('min_split_gain', 0.1, 0.5),
}
