Create every (shop_id, item_id) pair for each date_block_num. If a pair has no item count in the original sales data for that month, its item count is filled with 0.

In [1]:
# coding: utf-8
import os
from itertools import product

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from IPython.display import display

from mydatools.features_generate import features_read

% matplotlib inline

## Config

In [2]:
# Names of the identifier column and the target column in the feature table.
id_col, label_col = 'ID', 'item_cnt_month'

# The submission file reuses the same column names as the input.
output_id_col, output_label_col = id_col, label_col

# Where the submission CSV is written.
submission_path = './data/output/submission/lightgbm.csv'

## Load Data

In [3]:
# Load the prepared feature table together with the list of columns used as
# model inputs.
# NOTE(review): `features_read` is a project helper from `mydatools`; its exact
# return schema is not visible in this notebook. Later cells assume `full_df`
# has a 'date_block_num' column plus the id and label columns.
full_df, feature_columns = features_read()

  mask |= (ar1 == a)


## LightGBM

**Validation**

In [4]:
# Month index of every row; it drives the time-based split below.
dates = full_df['date_block_num'].copy()

# Rows with date_block_num <= 33 are used for training, rows with
# date_block_num == 34 are the ones to predict.
trn_df = full_df.loc[dates <= 33]
tst_df = full_df.loc[dates == 34]

# Month index restricted to each partition.
dates_trn = dates.loc[dates <= 33]
dates_tst = dates.loc[dates == 34]

# Model inputs / target for the full training set, and inputs for the test set.
X_trn, y_trn = trn_df[feature_columns], trn_df[label_col]
X_tst = tst_df[feature_columns]

**Score**

In [5]:
# score
def rmse(y, y_pred):
    """Return the root mean squared error between `y` and `y_pred`.

    The mean squared error is computed by `sklearn.metrics.mean_squared_error`
    and its square root is returned.
    """
    mse = metrics.mean_squared_error(y, y_pred)
    return np.sqrt(mse)

**LightGBM parameter tuning**

In [6]:
# Hold-out split for tuning: fit on months <= 32, validate on month 33.
# Rows and columns are selected in a single `.loc` call per frame.
XX_trn = full_df.loc[dates <= 32, feature_columns]
yy_trn = full_df.loc[dates <= 32, label_col]
XX_val = full_df.loc[dates == 33, feature_columns]
yy_val = full_df.loc[dates == 33, label_col]

In [7]:
# Wrap the tuning split (XX_*: features, yy_*: labels) as LightGBM datasets.
trn_lgb = lgb.Dataset(XX_trn, yy_trn)
val_lgb = lgb.Dataset(XX_val, yy_val)

# Booster parameters; the same dict is reused for the final fit.
lgb_params = {
    'application': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.1,
    'max_depth': 5,
    'num_leaves': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'seed': 0,
}
# Upper bound on the number of boosting rounds.
train_round = 150

# Early stopping is requested through a callback: the `early_stopping_rounds`
# keyword of `lgb.train` was removed in LightGBM 4.0, whereas the callback form
# is accepted by older releases as well.
# NOTE(review): on LightGBM >= 4 per-iteration metrics are only printed when a
# logging callback (`lgb.log_evaluation`) is added too - add it if the log is wanted.
bst = lgb.train(
    lgb_params,
    trn_lgb,
    train_round,
    valid_sets=[trn_lgb, val_lgb],
    callbacks=[lgb.early_stopping(20)],
)

[1]	training's rmse: 1.13974	valid_1's rmse: 1.10691
Training until validation scores don't improve for 20 rounds.
[2]	training's rmse: 1.09969	valid_1's rmse: 1.08105
[3]	training's rmse: 1.06622	valid_1's rmse: 1.052
[4]	training's rmse: 1.03792	valid_1's rmse: 1.02607
[5]	training's rmse: 1.00609	valid_1's rmse: 1.0081
[6]	training's rmse: 0.983047	valid_1's rmse: 0.996264
[7]	training's rmse: 0.958717	valid_1's rmse: 0.984539
[8]	training's rmse: 0.940494	valid_1's rmse: 0.978195
[9]	training's rmse: 0.923412	valid_1's rmse: 0.97096
[10]	training's rmse: 0.908627	valid_1's rmse: 0.96556
[11]	training's rmse: 0.895897	valid_1's rmse: 0.960899
[12]	training's rmse: 0.885635	valid_1's rmse: 0.957885
[13]	training's rmse: 0.877481	valid_1's rmse: 0.956268
[14]	training's rmse: 0.870309	valid_1's rmse: 0.953756
[15]	training's rmse: 0.863621	valid_1's rmse: 0.948511
[16]	training's rmse: 0.857585	valid_1's rmse: 0.943964
[17]	training's rmse: 0.85216	valid_1's rmse: 0.940886
[18]	traini

[147]	training's rmse: 0.760378	valid_1's rmse: 0.882362
[148]	training's rmse: 0.760221	valid_1's rmse: 0.882432
[149]	training's rmse: 0.760136	valid_1's rmse: 0.882446
[150]	training's rmse: 0.760025	valid_1's rmse: 0.882416
Did not meet early stopping. Best iteration is:
[150]	training's rmse: 0.760025	valid_1's rmse: 0.882416


**Train on all data**

In [8]:
# Use the number of rounds selected by the validation run (`bst`) instead of
# always training the full `train_round`; fall back to `train_round` when no
# best iteration was recorded.
best_round = bst.best_iteration
final_round = best_round if best_round and best_round > 0 else train_round

# Refit on the whole training set and predict the test rows.
lgb_model = lgb.train(lgb_params, lgb.Dataset(X_trn, y_trn), final_round)
predictions = lgb_model.predict(X_tst)

## Predict

In [9]:
# Build the submission frame: predictions clipped to [0, 20], paired with the
# integer ids of the test rows (row order of `predictions` matches `tst_df`).
res_df = pd.DataFrame(predictions.clip(0, 20), columns=[output_label_col])
res_df[output_id_col] = tst_df[output_id_col].astype(int).values

# Create the destination folder if needed so that `to_csv` does not fail on a
# fresh checkout.
submission_dir = os.path.dirname(submission_path)
if submission_dir and not os.path.isdir(submission_dir):
    os.makedirs(submission_dir)

# Sort by the configured id column (previously hard-coded as 'ID') and write
# only the id and label columns.
res_df.sort_values(output_id_col)[[output_id_col, output_label_col]].to_csv(submission_path, index=False)