# Lightgbm model

In [1]:
import pandas as pd 
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import gc

In [2]:
with pd.HDFStore('../data/feat/data.h5') as store:
    print(store.keys())
    X_train = store['X_train']
    X_cv = store['X_cv']
    y_train = store['y_train']
    y_cv = store['y_cv']
    X_test = store['X_test']

['/X_cv', '/X_test', '/X_train', '/y_cv', '/y_train']


# clip 
* `to [0,20]`
* `to [0,40]`

In [3]:
y_train = y_train.clip(0,20)
y_cv = y_cv.clip(0,20)


# training

In [4]:
feature_name = list(X_train.columns)
# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'rmse'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'feature_name' : feature_name
}

In [6]:
gbm = lgb.LGBMRegressor(**params,random_state=0)
gbm.fit(X_train,y_train,eval_metric='rmse',eval_set=[(X_cv,y_cv)],verbose=True)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


[1]	valid_0's rmse: 1.11882
[2]	valid_0's rmse: 1.1019
[3]	valid_0's rmse: 1.08643
[4]	valid_0's rmse: 1.07392
[5]	valid_0's rmse: 1.06105
[6]	valid_0's rmse: 1.05029
[7]	valid_0's rmse: 1.03988
[8]	valid_0's rmse: 1.03043
[9]	valid_0's rmse: 1.02194
[10]	valid_0's rmse: 1.01485
[11]	valid_0's rmse: 1.00792
[12]	valid_0's rmse: 1.00209
[13]	valid_0's rmse: 0.997167
[14]	valid_0's rmse: 0.992442
[15]	valid_0's rmse: 0.98783
[16]	valid_0's rmse: 0.984423
[17]	valid_0's rmse: 0.980853
[18]	valid_0's rmse: 0.977162
[19]	valid_0's rmse: 0.974068
[20]	valid_0's rmse: 0.971363
[21]	valid_0's rmse: 0.968459
[22]	valid_0's rmse: 0.966184
[23]	valid_0's rmse: 0.964173
[24]	valid_0's rmse: 0.96256
[25]	valid_0's rmse: 0.960972
[26]	valid_0's rmse: 0.959446
[27]	valid_0's rmse: 0.957661
[28]	valid_0's rmse: 0.956553
[29]	valid_0's rmse: 0.955508
[30]	valid_0's rmse: 0.954409
[31]	valid_0's rmse: 0.953578
[32]	valid_0's rmse: 0.952902
[33]	valid_0's rmse: 0.952213
[34]	valid_0's rmse: 0.951412
[35]

LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
       class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
       feature_name=['shop_id', 'item_id', 'date_block_num', 'item_category_id', 'item_id_avg_item_price_lag_1', 'item_id_sum_item_cnt_day_lag_1', 'item_id_avg_item_cnt_day_lag_1', 'shop_id_avg_item_price_lag_1', 'shop_id_sum_item_cnt_day_lag_1', 'shop_id_avg_item_cnt_day_lag_1', 'item_category_id_avg_item...ry_id_sum_item_cnt_day_lag_12', 'item_category_id_avg_item_cnt_day_lag_12', 'item_cnt_month_lag_12'],
       learning_rate=0.05, max_depth=-1, metric={'mse', 'l2'},
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective='regression',
       random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=1,
       task='train', verbose=0)

In [7]:
def extract_importance(gbm,topn=20):
    return pd.DataFrame(
        {'features':gbm.feature_name,'importance':gbm.feature_importances_}).sort_values(by='importance',ascending=False).head(topn)

In [8]:
extract_importance(gbm,40)

Unnamed: 0,features,importance
5,item_id_sum_item_cnt_day_lag_1,352
6,item_id_avg_item_cnt_day_lag_1,239
13,item_cnt_month_lag_1,224
3,item_category_id,213
23,item_cnt_month_lag_2,182
8,shop_id_sum_item_cnt_day_lag_1,168
2,date_block_num,139
33,item_cnt_month_lag_3,119
43,item_cnt_month_lag_5,112
0,shop_id,94


* use `lgb.train`
* `lgb.Dataset`

In [66]:
lgb_train = lgb.Dataset(X_train,y_train)
lgb_cv = lgb.Dataset(X_cv,y_cv,reference=lgb_train)

In [67]:
print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,                
                valid_sets=lgb_cv)
print('end training')
# print('Save model...')
# save model to file
# gbm.save_model('../data/model/lgb_model1.txt')

Start training...


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


[1]	valid_0's l2: 1.25112	valid_0's l2: 1.25112
[2]	valid_0's l2: 1.21476	valid_0's l2: 1.21476
[3]	valid_0's l2: 1.18125	valid_0's l2: 1.18125
[4]	valid_0's l2: 1.15188	valid_0's l2: 1.15188
[5]	valid_0's l2: 1.1268	valid_0's l2: 1.1268
[6]	valid_0's l2: 1.10553	valid_0's l2: 1.10553
[7]	valid_0's l2: 1.0829	valid_0's l2: 1.0829
[8]	valid_0's l2: 1.06514	valid_0's l2: 1.06514
[9]	valid_0's l2: 1.04681	valid_0's l2: 1.04681
[10]	valid_0's l2: 1.03218	valid_0's l2: 1.03218
[11]	valid_0's l2: 1.01917	valid_0's l2: 1.01917
[12]	valid_0's l2: 1.0073	valid_0's l2: 1.0073
[13]	valid_0's l2: 0.997146	valid_0's l2: 0.997146
[14]	valid_0's l2: 0.987576	valid_0's l2: 0.987576
[15]	valid_0's l2: 0.979684	valid_0's l2: 0.979684
[16]	valid_0's l2: 0.971877	valid_0's l2: 0.971877
[17]	valid_0's l2: 0.964579	valid_0's l2: 0.964579
[18]	valid_0's l2: 0.958288	valid_0's l2: 0.958288
[19]	valid_0's l2: 0.952578	valid_0's l2: 0.952578
[20]	valid_0's l2: 0.946659	valid_0's l2: 0.946659
[21]	valid_0's l2: 

In [9]:
print('Feature importances:', list(gbm.feature_importance()))

Feature importances: [3, 20, 19, 70, 1, 61, 36, 0, 33, 0, 3, 5, 0, 121, 2, 4, 2, 0, 7, 0, 0, 0, 4, 55, 0, 1, 0, 0, 6, 0, 0, 0, 7, 58, 2, 9, 3, 0, 4, 0, 0, 0, 1, 47, 0, 8, 0, 0, 1, 1, 0, 0, 0, 6]


In [22]:
pd.DataFrame({'features':gbm.feature_name(),'importance':gbm.feature_importance()}).sort_values(by='importance',ascending=False).head(20)

Unnamed: 0,features,importance
5,item_id_sum_item_cnt_day_lag_1,326
6,item_id_avg_item_cnt_day_lag_1,244
3,item_category_id,238
13,item_cnt_month_lag_1,230
23,item_cnt_month_lag_2,185
8,shop_id_sum_item_cnt_day_lag_1,167
2,date_block_num,159
33,item_cnt_month_lag_3,119
43,item_cnt_month_lag_5,106
0,shop_id,93


In [23]:
gc.collect()

2713

In [16]:
gbm.save_model('../data/model/lgb_model1.txt')

In [25]:
print('Start predicting...')
# predict
y_pred = gbm.predict(X_cv, num_iteration=gbm.best_iteration)
# eval
print('The mse of prediction on cv(test) is:', mean_squared_error(y_cv, y_pred)**0.5)

Start predicting...
The mse of prediction on cv(test) is: 0.935438500234


# pred 

In [58]:
tests = pd.read_csv('../data/test.csv')

In [59]:
print('shape of test :{}'.format(tests.shape))
tests.head()

shape of test :(214200, 3)


Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [63]:
X_test = X_test[X_test.date_block_num ==34].merge(tests, on = ['shop_id','item_id'],how='inner')
assert X_test.shape[0] == tests.shape[0]

In [64]:
X_test.loc[:,['ID','shop_id','item_id']].head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [65]:
X_test = X_test.drop('ID',axis=1)

In [69]:
y_test = gbm.predict(X_test)#,num_iteration=gbm.best_iteration)

In [70]:
y_submit = pd.DataFrame({'ID':range(len(y_test)), 'item_cnt_month':y_test})

In [71]:
y_submit.to_csv('../data/output/submission_lgbm2.csv',index=False)

____

# Grid search

In [7]:
from sklearn.model_selection import GridSearchCV

In [8]:
estimator = lgb.LGBMRegressor(boosting_type='gbdt',
                              objective='regression',                              
                              num_leaves=31,
                              learning_rate=0.01,n_estimators=40)

In [9]:
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}

In [10]:
gsearch = GridSearchCV(estimator, param_grid) ## memory use a lot
gsearch.fit(X_train,y_train)

KeyboardInterrupt: 

In [11]:
gc.collect()

352

# HyperOpt

In [38]:
from hyperopt import fmin,tpe,hp,Trials, STATUS_OK

In [7]:
# from sklearn.metrics import mean_squared_error
# import xgboost as xgb
# from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
# import numpy as np

# def objective(space):
#     ## stolen from :
#     print(space)
#     clf = xgb.XGBRegressor(n_estimators =1000,colsample_bytree=space['colsample_bytree'],
#                            learning_rate = .3,
#                             max_depth = int(space['max_depth']),
#                             min_child_weight = space['min_child_weight'],
#                             subsample = space['subsample'],
#                            gamma = space['gamma'],
#                            reg_lambda = space['reg_lambda'],)

#     eval_set  = [( X, y), ( Xcv, ycv)]

#     clf.fit(X, y,
#             eval_set=eval_set, eval_metric="rmse",
#             early_stopping_rounds=10,verbose=False)

#     pred = clf.predict(Xcv)
#     mse_scr = mean_squared_error(ycv, pred)
#     print "SCORE:", np.sqrt(mse_scr)
#     #change the metric if you like
#     return {'loss':mse_scr, 'status': STATUS_OK }


# space ={'max_depth': hp.quniform("x_max_depth", 4, 16, 1),
#         'min_child_weight': hp.quniform ('x_min_child', 1, 10, 1),
#         'subsample': hp.uniform ('x_subsample', 0.7, 1),
#         'gamma' : hp.uniform ('x_gamma', 0.1,0.5),
#         'colsample_bytree' : hp.uniform ('x_colsample_bytree', 0.7,1),
#         'reg_lambda' : hp.uniform ('x_reg_lambda', 0,1)
#     }


# trials = Trials()
# best = fmin(fn=objective,
#             space=space,
#             algo=tpe.suggest,
#             max_evals=100,
#             trials=trials)

# print best

<hyperopt.pyll.base.Apply at 0x9cd84c2080>

In [3]:
import pickle
import time
from hyperopt import fmin, tpe, hp, STATUS_OK

def objective(x):
    return {'loss': x ** 2, 'status': STATUS_OK }

best = fmin(objective,
    space=hp.uniform('x', -10, 10),
    algo=tpe.suggest,
    max_evals=100)

In [11]:
STATUS_OK

'ok'

In [10]:
from sklearn.metrics import mean_squared_error

In [34]:
import numpy as np 

In [None]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'mse'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [102]:
def lgbm_objective(params):
    ## stolen from :https://github.com/hyperopt/hyperopt/issues/357
    lgb_train = lgb.Dataset(X_train,y_train)
    lgb_cv = lgb.Dataset(X_cv,y_cv,reference=lgb_train)
    params_set = {
        'bossting_type':'gbdt',
        'objective': 'regression',
        'metric': 'rmse',        
        'max_depth' : int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'feature_fraction': '{:.3f}'.format(params['feature_fraction']),
        'bagging_fraction' : '{:.3f}'.format(params['bagging_fraction']),
        'reg_lambda' : '{:.3f}'.format(params['reg_lambda']),
        'gamma' : '{:.3f}'.format(params['gamma'])        
    }
    reg = lgb.train(params_set,
                    early_stopping_rounds = 5,
                    train_set = lgb_train,
                    valid_sets=lgb_cv,
                    verbose_eval = False)
#     score = cross_val_score(reg, X_train,y_train, cv=StratifiedKFold()).mean()
    print('params:{}'.format(params))
    pred = reg.predict(X_cv, num_iteration=reg.best_iteration)
    mse = mean_squared_error(y_cv, pred)
    rmse = mse**0.5
    print("SCORE:{:.5f}".format(rmse))
    return {'loss':rmse, 'status': STATUS_OK }

In [103]:
lgbm_space = { 
    'max_depth' : hp.quniform("max_depth", 4, 16, 1),
    'num_leaves': hp.quniform('num_leaves', 8, 128, 2),
    'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),
    'bagging_fraction': hp.uniform ('bagging_fraction', 0.7, 1),
    'reg_lambda': hp.uniform('reg_lambda',0,1),
    'gamma' : hp.uniform('gamma', 0.1,0.5),
}

trials = Trials()
lgbm_best = fmin(fn=lgbm_objective,
            space=lgbm_space,
            algo=tpe.suggest,
            trials = trials,
            max_evals=100)

params:{'bagging_fraction': 0.8460151149931732, 'feature_fraction': 0.8427806633874917, 'gamma': 0.24601059580847584, 'max_depth': 12.0, 'num_leaves': 60.0, 'reg_lambda': 0.5178707655430725}
SCORE:0.93338
params:{'bagging_fraction': 0.7946363213572958, 'feature_fraction': 0.44432171431103207, 'gamma': 0.30905812049880055, 'max_depth': 5.0, 'num_leaves': 110.0, 'reg_lambda': 0.3532894187706621}
SCORE:0.93351
params:{'bagging_fraction': 0.8994224108856765, 'feature_fraction': 0.9306706501441817, 'gamma': 0.17464294503316152, 'max_depth': 13.0, 'num_leaves': 120.0, 'reg_lambda': 0.2816150459059076}
SCORE:0.93293
params:{'bagging_fraction': 0.9487511904150518, 'feature_fraction': 0.3868872691181628, 'gamma': 0.4543479971044322, 'max_depth': 16.0, 'num_leaves': 44.0, 'reg_lambda': 0.8781978025348828}
SCORE:0.93484
params:{'bagging_fraction': 0.7408426448796761, 'feature_fraction': 0.49194718434949414, 'gamma': 0.39565802025828833, 'max_depth': 14.0, 'num_leaves': 122.0, 'reg_lambda': 0.8045

SCORE:0.95030
params:{'bagging_fraction': 0.9993507342849083, 'feature_fraction': 0.82587553407689, 'gamma': 0.16011545601200106, 'max_depth': 15.0, 'num_leaves': 108.0, 'reg_lambda': 0.426209054481027}
SCORE:0.93558
params:{'bagging_fraction': 0.7005601057321159, 'feature_fraction': 0.4786373593854183, 'gamma': 0.38045165222442645, 'max_depth': 13.0, 'num_leaves': 92.0, 'reg_lambda': 0.9108965249808549}
SCORE:0.93080
params:{'bagging_fraction': 0.90095845688285, 'feature_fraction': 0.8604838642395956, 'gamma': 0.32760502981527456, 'max_depth': 10.0, 'num_leaves': 40.0, 'reg_lambda': 8.594886003798763e-06}
SCORE:0.92835
params:{'bagging_fraction': 0.726831091241271, 'feature_fraction': 0.7307735605541289, 'gamma': 0.28801038586602246, 'max_depth': 6.0, 'num_leaves': 82.0, 'reg_lambda': 0.06213711415080825}
SCORE:0.92953
params:{'bagging_fraction': 0.9663375507516703, 'feature_fraction': 0.944412204410428, 'gamma': 0.4336892486435121, 'max_depth': 13.0, 'num_leaves': 74.0, 'reg_lambda':

params:{'bagging_fraction': 0.9462624700192365, 'feature_fraction': 0.7214391808996808, 'gamma': 0.3519592459749137, 'max_depth': 13.0, 'num_leaves': 74.0, 'reg_lambda': 0.03850405491267084}
SCORE:0.93254
params:{'bagging_fraction': 0.8407271471583676, 'feature_fraction': 0.6064289321013199, 'gamma': 0.22615109880768522, 'max_depth': 11.0, 'num_leaves': 32.0, 'reg_lambda': 0.34180778870025375}
SCORE:0.93818
params:{'bagging_fraction': 0.9297083541932376, 'feature_fraction': 0.49641041955590975, 'gamma': 0.15282870543806434, 'max_depth': 14.0, 'num_leaves': 102.0, 'reg_lambda': 0.18777533730295443}
SCORE:0.92752
params:{'bagging_fraction': 0.9595171460701322, 'feature_fraction': 0.6703540075166412, 'gamma': 0.1776425634981456, 'max_depth': 10.0, 'num_leaves': 78.0, 'reg_lambda': 0.30325268208965594}
SCORE:0.92934
params:{'bagging_fraction': 0.8876022881183826, 'feature_fraction': 0.7508498555324167, 'gamma': 0.26894455624139435, 'max_depth': 16.0, 'num_leaves': 94.0, 'reg_lambda': 0.572

In [104]:
lgbm_best['num_leaves'] = int(lgbm_best['num_leaves'])
lgbm_best['max_depth'] = int(lgbm_best['max_depth'])

In [105]:
lgbm_best

{'bagging_fraction': 0.9568845079308161,
 'feature_fraction': 0.6203248801718259,
 'gamma': 0.39624896070423066,
 'max_depth': 12,
 'num_leaves': 64,
 'reg_lambda': 0.38856229720270463}

In [107]:
trials.best_trial

{'book_time': datetime.datetime(2018, 2, 7, 9, 47, 10, 734000),
 'exp_key': None,
 'misc': {'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'idxs': {'bagging_fraction': [12],
   'feature_fraction': [12],
   'gamma': [12],
   'max_depth': [12],
   'num_leaves': [12],
   'reg_lambda': [12]},
  'tid': 12,
  'vals': {'bagging_fraction': [0.9568845079308161],
   'feature_fraction': [0.6203248801718259],
   'gamma': [0.39624896070423066],
   'max_depth': [12.0],
   'num_leaves': [64.0],
   'reg_lambda': [0.38856229720270463]},
  'workdir': None},
 'owner': None,
 'refresh_time': datetime.datetime(2018, 2, 7, 9, 48, 18, 902000),
 'result': {'loss': 0.9229616663760128, 'status': 'ok'},
 'spec': None,
 'state': 2,
 'tid': 12,
 'version': 0}

______

# SUBMISSION TASK

In [130]:
tests = pd.read_csv('../data/test.csv')

In [131]:
print('shape of test :{}'.format(tests.shape))
tests.head()

shape of test :(214200, 3)


Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [132]:
X_test = X_test[X_test.date_block_num ==34].merge(tests, on = ['shop_id','item_id'],how='inner')
assert X_test.shape[0] == tests.shape[0]

In [133]:
X_test.loc[:,['ID','shop_id','item_id']].head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [134]:
X_test = X_test.drop('ID',axis=1)

only verify data on 2015 Oct `rmse = 0.92296`, if we submit this model to LB

In [110]:
defalut_params = {
    'bossting_type':'gbdt',
    'objective': 'regression',
    'metric': 'rmse'
}

In [112]:
params_best = {**defalut_params,**lgbm_best}
params_best

{'bagging_fraction': 0.9568845079308161,
 'bossting_type': 'gbdt',
 'feature_fraction': 0.6203248801718259,
 'gamma': 0.39624896070423066,
 'max_depth': 12,
 'metric': 'rmse',
 'num_leaves': 64,
 'objective': 'regression',
 'reg_lambda': 0.38856229720270463,
 'verbose': 1}

In [113]:
lgb_train = lgb.Dataset(X_train,y_train)
lgb_cv = lgb.Dataset(X_cv,y_cv,reference=lgb_train)
reg = lgb.train(params_best,
                    early_stopping_rounds = 5,
                    train_set = lgb_train,
                    valid_sets=lgb_cv,
                    verbose_eval = True)

[1]	valid_0's rmse: 1.09962
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's rmse: 1.07412
[3]	valid_0's rmse: 1.05276
[4]	valid_0's rmse: 1.03218
[5]	valid_0's rmse: 1.01344
[6]	valid_0's rmse: 1.00072
[7]	valid_0's rmse: 0.988391
[8]	valid_0's rmse: 0.978198
[9]	valid_0's rmse: 0.970309
[10]	valid_0's rmse: 0.9634
[11]	valid_0's rmse: 0.957548
[12]	valid_0's rmse: 0.952815
[13]	valid_0's rmse: 0.950651
[14]	valid_0's rmse: 0.946677
[15]	valid_0's rmse: 0.943847
[16]	valid_0's rmse: 0.94178
[17]	valid_0's rmse: 0.939557
[18]	valid_0's rmse: 0.938239
[19]	valid_0's rmse: 0.937224
[20]	valid_0's rmse: 0.936522
[21]	valid_0's rmse: 0.936182
[22]	valid_0's rmse: 0.934861
[23]	valid_0's rmse: 0.933711
[24]	valid_0's rmse: 0.932294
[25]	valid_0's rmse: 0.931933
[26]	valid_0's rmse: 0.931333
[27]	valid_0's rmse: 0.930696
[28]	valid_0's rmse: 0.930352
[29]	valid_0's rmse: 0.930218
[30]	valid_0's rmse: 0.930417
[31]	valid_0's rmse: 0.930262
[32]	valid_0's rmse: 0.9294

In [128]:
lgb_train = lgb.Dataset(pd.concat([X_train,X_cv]),np.concatenate([y_train,y_cv]))

# lgb_cv = lgb.Dataset(X_cv,y_cv,reference=lgb_train)
reg_sub = lgb.train(params_best,                    
                    train_set = lgb_train,                    
                    verbose_eval = False)

In [135]:
y_pred = reg_sub.predict(X_test)

In [141]:
y_submit = pd.DataFrame({'ID':range(len(y_pred)), 'item_cnt_month':y_pred}) # validate on 2015oct : 0.92296, LB: 0.93418
y_submit.to_csv('../data/output/submission_lgbm3.csv',index=False)