# LightGBM model

In [8]:
import pandas as pd 
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import gc

In [3]:
# Load the pre-built train / validation / test frames from the HDF5 feature store.
# mode='r' opens the file read-only: HDFStore's default mode ('a') would silently
# create an empty file when the path is wrong and then fail with a confusing
# KeyError; read-only mode fails immediately and can never modify the features.
with pd.HDFStore('../data/feat/data.h5', mode='r') as store:
    print(store.keys())
    X_train = store['X_train']
    X_cv = store['X_cv']
    y_train = store['y_train']
    y_cv = store['y_cv']
    X_test = store['X_test']

['/X_cv', '/X_test', '/X_train', '/y_cv', '/y_train']


# Clip the target
Candidate ranges for clipping the target values:
* `to [0,20]` (the range applied in the cell below)
* `to [0,40]`

In [10]:
# Cap the training and validation targets to the [0, 20] range.
y_train, y_cv = y_train.clip(lower=0, upper=20), y_cv.clip(lower=0, upper=20)


# training

In [12]:
# Wrap the frames in LightGBM datasets; the validation set references the
# training set so that both share the same feature binning.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_cv = lgb.Dataset(data=X_cv, label=y_cv, reference=lgb_train)

In [13]:
# LightGBM training configuration, passed to lgb.train().
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # 'mse' is only an alias of 'l2'; listing both evaluates and logs the same
    # metric twice per round, so a single name is used here.
    'metric': 'l2',
    'num_leaves': 31,
    'learning_rate': 0.05,
    # Each tree sees 90% of the features ...
    'feature_fraction': 0.9,
    # ... and 80% of the rows, re-sampled every 5 iterations.
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [14]:
print('Start training...')
# Fit the booster on lgb_train for at most 20 rounds, scoring lgb_cv after each
# round; training stops early once the validation score has not improved for
# 5 consecutive rounds.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=20,
    valid_sets=lgb_cv,
    early_stopping_rounds=5,
)
print('end training')
# The trained model is written to disk by a separate gbm.save_model(...) cell.

Start training...
[1]	valid_0's l2: 1.25112	valid_0's l2: 1.25112
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 1.21476	valid_0's l2: 1.21476
[3]	valid_0's l2: 1.18125	valid_0's l2: 1.18125
[4]	valid_0's l2: 1.15188	valid_0's l2: 1.15188
[5]	valid_0's l2: 1.1268	valid_0's l2: 1.1268
[6]	valid_0's l2: 1.10553	valid_0's l2: 1.10553
[7]	valid_0's l2: 1.0829	valid_0's l2: 1.0829
[8]	valid_0's l2: 1.06514	valid_0's l2: 1.06514
[9]	valid_0's l2: 1.04681	valid_0's l2: 1.04681
[10]	valid_0's l2: 1.03218	valid_0's l2: 1.03218
[11]	valid_0's l2: 1.01917	valid_0's l2: 1.01917
[12]	valid_0's l2: 1.0073	valid_0's l2: 1.0073
[13]	valid_0's l2: 0.997146	valid_0's l2: 0.997146
[14]	valid_0's l2: 0.987576	valid_0's l2: 0.987576
[15]	valid_0's l2: 0.979684	valid_0's l2: 0.979684
[16]	valid_0's l2: 0.971877	valid_0's l2: 0.971877
[17]	valid_0's l2: 0.964579	valid_0's l2: 0.964579
[18]	valid_0's l2: 0.958288	valid_0's l2: 0.958288
[19]	valid_0's l2: 0.952578	valid_0's l2: 

In [15]:
# Run a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

68

In [16]:
# Persist the trained booster `gbm` to disk.
gbm.save_model('../data/model/lgb_model1.txt')

In [17]:
print('Start predicting...')
# Score the validation features with the booster, limited to its best iteration.
y_pred = gbm.predict(X_cv, num_iteration=gbm.best_iteration)
# Compare the predictions against the validation targets.
cv_mse = mean_squared_error(y_cv, y_pred)
print('The mse of prediction is:', cv_mse)

Start predicting...
The mse of prediction is: 0.946658903874


# Prediction on the test set

In [32]:
# Read the test CSV into a DataFrame (the preview below shows the columns
# ID, shop_id and item_id).
tests = pd.read_csv('../data/test.csv')

In [33]:
# Preview the first five rows of the test frame.
tests.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [41]:
# Keep only the feature rows with date_block_num == 34, then attach the columns
# of `tests` (including ID) through an inner join on (shop_id, item_id).
# NOTE: this overwrites X_test, so the cell is not safe to run twice.
X_test = (
    X_test.loc[X_test.date_block_num == 34]
    .merge(tests, how='inner', on=['shop_id', 'item_id'])
)


In [45]:
# Sanity-check the join: show the ID next to its (shop_id, item_id) key.
X_test[['ID', 'shop_id', 'item_id']].head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [46]:
# Remove the ID column so it is not passed to the model as a feature.
# NOTE: this overwrites X_test; re-running the cell raises KeyError because
# the column is already gone.
X_test = X_test.drop('ID',axis=1)

In [47]:
# Predict on the test features, limited to the booster's best iteration.
y_test = gbm.predict(X_test,num_iteration=gbm.best_iteration)

In [48]:
# Build the submission frame with the real test IDs.
# The previous version used range(len(y_test)) as the ID, which is only correct
# when the merged X_test happens to be ordered by ID with no rows dropped by the
# inner join. The ID column was removed from X_test before prediction, so it is
# recovered here by joining each prediction row's (shop_id, item_id) back onto
# `tests`; a left join keeps the row order of X_test, i.e. the order of y_test.
submit_ids = (
    X_test[['shop_id', 'item_id']]
    .merge(tests[['ID', 'shop_id', 'item_id']], on=['shop_id', 'item_id'], how='left')['ID']
)
# Guard against duplicated or unmatched keys, which would misalign IDs and predictions.
assert len(submit_ids) == len(y_test), 'every prediction row must map to exactly one test ID'
assert submit_ids.notnull().all(), 'found prediction rows without a matching test ID'
# NOTE(review): the training target was clipped to [0, 20]; consider clipping
# y_test to the same range before submitting.
y_submit = pd.DataFrame({'ID': submit_ids.values, 'item_cnt_month': y_test})

In [51]:
# Write the submission to CSV without the DataFrame index column.
y_submit.to_csv('../data/output/submission_lgbm1.csv',index=False)