# Lightgbm model

In [1]:
import pandas as pd 
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import gc

In [12]:
with pd.HDFStore('../data/feat/data.h5') as store:
    print(store.keys())
    X_train = store['X_train']
    X_cv = store['X_cv']
    y_train = store['y_train']
    y_cv = store['y_cv']
    X_test = store['X_test']

['/X_cv', '/X_test', '/X_train', '/y_cv', '/y_train']


# clip 
* `to [0,20]`
* `to [0,40]`

In [13]:
y_train = y_train.clip(0,20)
y_cv = y_cv.clip(0,20)


# training

In [4]:
lgb_train = lgb.Dataset(X_train,y_train)
lgb_cv = lgb.Dataset(X_cv,y_cv,reference=lgb_train)

In [35]:
feature_name = list(X_train.columns)
# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'mse'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
#     'feature_name' : feature_name
}

In [36]:
gbm = lgb.LGBMRegressor(**params)
gbm.fit(X_train,y_train,eval_metric='l2',eval_set=[(X_cv,y_cv)],verbose=True)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


[1]	valid_0's l2: 1.25112
[2]	valid_0's l2: 1.21476
[3]	valid_0's l2: 1.18125
[4]	valid_0's l2: 1.15188
[5]	valid_0's l2: 1.1268
[6]	valid_0's l2: 1.10553
[7]	valid_0's l2: 1.0829
[8]	valid_0's l2: 1.06514
[9]	valid_0's l2: 1.04681
[10]	valid_0's l2: 1.03218
[11]	valid_0's l2: 1.01917
[12]	valid_0's l2: 1.0073
[13]	valid_0's l2: 0.997146
[14]	valid_0's l2: 0.987576
[15]	valid_0's l2: 0.979684
[16]	valid_0's l2: 0.971877
[17]	valid_0's l2: 0.964579
[18]	valid_0's l2: 0.958288
[19]	valid_0's l2: 0.952578
[20]	valid_0's l2: 0.946659
[21]	valid_0's l2: 0.941658
[22]	valid_0's l2: 0.937524
[23]	valid_0's l2: 0.933119
[24]	valid_0's l2: 0.930093
[25]	valid_0's l2: 0.927006
[26]	valid_0's l2: 0.924199
[27]	valid_0's l2: 0.921861
[28]	valid_0's l2: 0.919828
[29]	valid_0's l2: 0.917089
[30]	valid_0's l2: 0.914766
[31]	valid_0's l2: 0.912982
[32]	valid_0's l2: 0.911246
[33]	valid_0's l2: 0.909394
[34]	valid_0's l2: 0.907603
[35]	valid_0's l2: 0.905799
[36]	valid_0's l2: 0.904743
[37]	valid_0's l

LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
       class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
       feature_name=['shop_id', 'item_id', 'date_block_num', 'item_category_id', 'item_id_avg_item_price_lag_1', 'item_id_sum_item_cnt_day_lag_1', 'item_id_avg_item_cnt_day_lag_1', 'shop_id_avg_item_price_lag_1', 'shop_id_sum_item_cnt_day_lag_1', 'shop_id_avg_item_cnt_day_lag_1', 'item_category_id_avg_item...ry_id_sum_item_cnt_day_lag_12', 'item_category_id_avg_item_cnt_day_lag_12', 'item_cnt_month_lag_12'],
       learning_rate=0.05, max_depth=-1, metric={'mse', 'l2'},
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective='regression',
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=1,
       task='train', verbose=0)

In [47]:
def extract_importance(gbm,topn=20):
    return pd.DataFrame(
        {'features':gbm.feature_name,'importance':gbm.feature_importances_}).sort_values(by='importance',ascending=False).head(topn)

In [48]:
extract_importance(gbm,40)

Unnamed: 0,features,importance
5,item_id_sum_item_cnt_day_lag_1,326
6,item_id_avg_item_cnt_day_lag_1,244
3,item_category_id,238
13,item_cnt_month_lag_1,230
23,item_cnt_month_lag_2,185
8,shop_id_sum_item_cnt_day_lag_1,167
2,date_block_num,159
33,item_cnt_month_lag_3,119
43,item_cnt_month_lag_5,106
0,shop_id,93


In [55]:
gbm.score(X_,y_train)

0.44538977481097519

In [8]:
print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_cv,
early_stopping_rounds=20)
print('end training')
# print('Save model...')
# save model to file
# gbm.save_model('../data/model/lgb_model1.txt')

Start training...
[1]	valid_0's l2: 1.25112	valid_0's l2: 1.25112
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 1.21476	valid_0's l2: 1.21476
[3]	valid_0's l2: 1.18125	valid_0's l2: 1.18125
[4]	valid_0's l2: 1.15188	valid_0's l2: 1.15188
[5]	valid_0's l2: 1.1268	valid_0's l2: 1.1268
[6]	valid_0's l2: 1.10553	valid_0's l2: 1.10553
[7]	valid_0's l2: 1.0829	valid_0's l2: 1.0829
[8]	valid_0's l2: 1.06514	valid_0's l2: 1.06514
[9]	valid_0's l2: 1.04681	valid_0's l2: 1.04681
[10]	valid_0's l2: 1.03218	valid_0's l2: 1.03218
[11]	valid_0's l2: 1.01917	valid_0's l2: 1.01917
[12]	valid_0's l2: 1.0073	valid_0's l2: 1.0073
[13]	valid_0's l2: 0.997146	valid_0's l2: 0.997146
[14]	valid_0's l2: 0.987576	valid_0's l2: 0.987576
[15]	valid_0's l2: 0.979684	valid_0's l2: 0.979684
[16]	valid_0's l2: 0.971877	valid_0's l2: 0.971877
[17]	valid_0's l2: 0.964579	valid_0's l2: 0.964579
[18]	valid_0's l2: 0.958288	valid_0's l2: 0.958288
[19]	valid_0's l2: 0.952578	valid_0's l2: 

In [9]:
print('Feature importances:', list(gbm.feature_importance()))

Feature importances: [3, 20, 19, 70, 1, 61, 36, 0, 33, 0, 3, 5, 0, 121, 2, 4, 2, 0, 7, 0, 0, 0, 4, 55, 0, 1, 0, 0, 6, 0, 0, 0, 7, 58, 2, 9, 3, 0, 4, 0, 0, 0, 1, 47, 0, 8, 0, 0, 1, 1, 0, 0, 0, 6]


In [16]:
pd.DataFrame({'features':gbm.feature_name(),'importance':gbm.feature_importance()}).sort_values(by='importance',ascending=False).head(20)

Unnamed: 0,features,importance
13,item_cnt_month_lag_1,121
3,item_category_id,70
5,item_id_sum_item_cnt_day_lag_1,61
33,item_cnt_month_lag_3,58
23,item_cnt_month_lag_2,55
43,item_cnt_month_lag_5,47
6,item_id_avg_item_cnt_day_lag_1,36
8,shop_id_sum_item_cnt_day_lag_1,33
1,item_id,20
2,date_block_num,19


In [19]:
gc.collect()

0

In [16]:
gbm.save_model('../data/model/lgb_model1.txt')

In [17]:
print('Start predicting...')
# predict
y_pred = gbm.predict(X_cv, num_iteration=gbm.best_iteration)
# eval
print('The mse of prediction is:', mean_squared_error(y_cv, y_pred))

Start predicting...
The mse of prediction is: 0.946658903874


# pred 

In [58]:
tests = pd.read_csv('../data/test.csv')

In [59]:
print('shape of test :{}'.format(tests.shape))
tests.head()

shape of test :(214200, 3)


Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [63]:
X_test = X_test[X_test.date_block_num ==34].merge(tests, on = ['shop_id','item_id'],how='inner')
assert X_test.shape[0] == tests.shape[0]

In [64]:
X_test.loc[:,['ID','shop_id','item_id']].head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [65]:
X_test = X_test.drop('ID',axis=1)

In [69]:
y_test = gbm.predict(X_test)#,num_iteration=gbm.best_iteration)

In [70]:
y_submit = pd.DataFrame({'ID':range(len(y_test)), 'item_cnt_month':y_test})

In [71]:
y_submit.to_csv('../data/output/submission_lgbm2.csv',index=False)

____

# Grid search

In [7]:
from sklearn.model_selection import GridSearchCV

In [8]:
estimator = lgb.LGBMRegressor(boosting_type='gbdt',
                              objective='regression',                              
                              num_leaves=31,
                              learning_rate=0.01,n_estimators=40)

In [9]:
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}

In [10]:
gsearch = GridSearchCV(estimator, param_grid) ## memory use a lot
gsearch.fit(X_train,y_train)

KeyboardInterrupt: 

In [11]:
gc.collect()

352

# HyperOpt

In [2]:
import hyperopt

In [3]:
import pickle
import time
from hyperopt import fmin, tpe, hp, STATUS_OK

def objective(x):
    return {'loss': x ** 2, 'status': STATUS_OK }

best = fmin(objective,
    space=hp.uniform('x', -10, 10),
    algo=tpe.suggest,
    max_evals=100)

In [11]:
STATUS_OK

'ok'

In [10]:
from sklearn.metrics import mean_squared_error

In [75]:
import numpy as np 

In [76]:
def lgbm_objective(params):
    ## stolen from :https://github.com/hyperopt/hyperopt/issues/357
    params = {
        'num_leaves': int(params['num_leaves']),
        'feature_fraction': '{:.3f}'.format(params['feature_fraction'])
    }
    reg = lgb.LGBMRegressor(
        n_estimators=100,
        learning_rate=0.05,
        **params
    )
    eval_set  = [( X_cv, y_cv)]
    reg.fit(X_train, y_train,
            eval_set = eval_set,
            eval_metric = 'l2',
            early_stopping_rounds = 5
            )
#     score = cross_val_score(reg, X_train,y_train, cv=StratifiedKFold()).mean()
    pred = reg.predict(X_cv)
    mse = mean_squared_error(y_cv, pred)
    print("SCORE:", np.sqrt(mse))
    return {'loss':mse, 'status': STATUS_OK }

lgbm_space = {
    'num_leaves': hp.quniform('num_leaves', 8, 128, 2),
    'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),
}

lgbm_best = fmin(fn=lgbm_objective,
            space=lgbm_space,
            algo=tpe.suggest,
            max_evals=10)

[1]	valid_0's l2: 1.24907
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 1.21165
[3]	valid_0's l2: 1.17796
[4]	valid_0's l2: 1.14689
[5]	valid_0's l2: 1.11914
[6]	valid_0's l2: 1.09515
[7]	valid_0's l2: 1.07395
[8]	valid_0's l2: 1.05354
[9]	valid_0's l2: 1.03734
[10]	valid_0's l2: 1.02249
[11]	valid_0's l2: 1.00903
[12]	valid_0's l2: 0.995803
[13]	valid_0's l2: 0.984502
[14]	valid_0's l2: 0.976471
[15]	valid_0's l2: 0.968689
[16]	valid_0's l2: 0.961637
[17]	valid_0's l2: 0.954744
[18]	valid_0's l2: 0.947841
[19]	valid_0's l2: 0.942435
[20]	valid_0's l2: 0.937997
[21]	valid_0's l2: 0.934547
[22]	valid_0's l2: 0.930471
[23]	valid_0's l2: 0.929449
[24]	valid_0's l2: 0.925946
[25]	valid_0's l2: 0.922238
[26]	valid_0's l2: 0.918756
[27]	valid_0's l2: 0.915443
[28]	valid_0's l2: 0.912678
[29]	valid_0's l2: 0.909401
[30]	valid_0's l2: 0.907716
[31]	valid_0's l2: 0.906702
[32]	valid_0's l2: 0.905886
[33]	valid_0's l2: 0.906035
[34]	valid_0's l2: 0.90419
[35]	val

[97]	valid_0's l2: 0.869157
[98]	valid_0's l2: 0.869023
[99]	valid_0's l2: 0.868837
[100]	valid_0's l2: 0.868546
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.868546
SCORE: 0.931958016583
[1]	valid_0's l2: 1.25359
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 1.22756
[3]	valid_0's l2: 1.20621
[4]	valid_0's l2: 1.18783
[5]	valid_0's l2: 1.16629
[6]	valid_0's l2: 1.14888
[7]	valid_0's l2: 1.12385
[8]	valid_0's l2: 1.11079
[9]	valid_0's l2: 1.09031
[10]	valid_0's l2: 1.08135
[11]	valid_0's l2: 1.06376
[12]	valid_0's l2: 1.04748
[13]	valid_0's l2: 1.03757
[14]	valid_0's l2: 1.02342
[15]	valid_0's l2: 1.01718
[16]	valid_0's l2: 1.01057
[17]	valid_0's l2: 1.00518
[18]	valid_0's l2: 0.992334
[19]	valid_0's l2: 0.97621
[20]	valid_0's l2: 0.966633
[21]	valid_0's l2: 0.958545
[22]	valid_0's l2: 0.953885
[23]	valid_0's l2: 0.946613
[24]	valid_0's l2: 0.942466
[25]	valid_0's l2: 0.93841
[26]	valid_0's l2: 0.931566
[27]	valid_0's l2: 0.925946

[48]	valid_0's l2: 0.891163
[49]	valid_0's l2: 0.890308
[50]	valid_0's l2: 0.889837
[51]	valid_0's l2: 0.889067
[52]	valid_0's l2: 0.88886
[53]	valid_0's l2: 0.888197
[54]	valid_0's l2: 0.88794
[55]	valid_0's l2: 0.886987
[56]	valid_0's l2: 0.886081
[57]	valid_0's l2: 0.885897
[58]	valid_0's l2: 0.885623
[59]	valid_0's l2: 0.884705
[60]	valid_0's l2: 0.884199
[61]	valid_0's l2: 0.883605
[62]	valid_0's l2: 0.882761
[63]	valid_0's l2: 0.882867
[64]	valid_0's l2: 0.882519
[65]	valid_0's l2: 0.882576
[66]	valid_0's l2: 0.882553
[67]	valid_0's l2: 0.881349
[68]	valid_0's l2: 0.881289
[69]	valid_0's l2: 0.880739
[70]	valid_0's l2: 0.880473
[71]	valid_0's l2: 0.880144
[72]	valid_0's l2: 0.879695
[73]	valid_0's l2: 0.879387
[74]	valid_0's l2: 0.878999
[75]	valid_0's l2: 0.878929
[76]	valid_0's l2: 0.878488
[77]	valid_0's l2: 0.877942
[78]	valid_0's l2: 0.877616
[79]	valid_0's l2: 0.877533
[80]	valid_0's l2: 0.876972
[81]	valid_0's l2: 0.876519
[82]	valid_0's l2: 0.876523
[83]	valid_0's l2: 0.8

[46]	valid_0's l2: 0.874602
[47]	valid_0's l2: 0.875455
[48]	valid_0's l2: 0.875414
[49]	valid_0's l2: 0.874506
[50]	valid_0's l2: 0.874338
Early stopping, best iteration is:
[45]	valid_0's l2: 0.871753
SCORE: 0.93367732738


In [77]:
lgbm_best

{'feature_fraction': 0.9711229402003267, 'num_leaves': 52.0}

In [7]:
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import numpy as np

def objective(space):
    print(space)
    clf = xgb.XGBRegressor(n_estimators =1000,colsample_bytree=space['colsample_bytree'],
                           learning_rate = .3,
                            max_depth = int(space['max_depth']),
                            min_child_weight = space['min_child_weight'],
                            subsample = space['subsample'],
                           gamma = space['gamma'],
                           reg_lambda = space['reg_lambda'],)

    eval_set  = [( X, y), ( Xcv, ycv)]

    clf.fit(X, y,
            eval_set=eval_set, eval_metric="rmse",
            early_stopping_rounds=10,verbose=False)

    pred = clf.predict(Xcv)
    mse_scr = mean_squared_error(ycv, pred)
    print "SCORE:", np.sqrt(mse_scr)
    #change the metric if you like
    return {'loss':mse_scr, 'status': STATUS_OK }


space ={'max_depth': hp.quniform("x_max_depth", 4, 16, 1),
        'min_child_weight': hp.quniform ('x_min_child', 1, 10, 1),
        'subsample': hp.uniform ('x_subsample', 0.7, 1),
        'gamma' : hp.uniform ('x_gamma', 0.1,0.5),
        'colsample_bytree' : hp.uniform ('x_colsample_bytree', 0.7,1),
        'reg_lambda' : hp.uniform ('x_reg_lambda', 0,1)
    }


trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print best

<hyperopt.pyll.base.Apply at 0x9cd84c2080>