Create every (shop_id, item_id) pair for each date_block_num. If a pair has no item count in the original sales data, its item count is filled with 0.

In [1]:
# coding: utf-8
import os
from itertools import product

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from IPython.display import display

from mydatools.features_generate import features_read

% matplotlib inline

## Config

In [2]:
# Column names used throughout the notebook: row identifier and target.
id_col, label_col = 'ID', 'item_cnt_month'

# Destination of the submission CSV.
submission_path = './data/output/submission/lightgbm.csv'

# The submission file uses the same column names as the data.
output_id_col, output_label_col = id_col, label_col

## Load Data

In [3]:
# Load the prepared dataset through the project helper `features_read`
# (imported from mydatools.features_generate). It returns two objects: the
# table itself and the feature column names used later to select model inputs.
# NOTE(review): the table presumably holds both the training months and the
# test month, since later cells split it on `date_block_num` — confirm.
full_df, feature_columns = features_read()

  mask |= (ar1 == a)


## LightGBM

**Validation**

In [4]:
# Month index of every row. Rows with block <= 33 form the training set;
# rows with block == 34 are the ones to predict.
dates = full_df['date_block_num'].copy()

is_trn = dates <= 33
is_tst = dates == 34

dates_trn = dates[is_trn]
dates_tst = dates[is_tst]

trn_df = full_df[is_trn]
tst_df = full_df[is_tst]

# Model inputs and target for training; inputs only for the test rows.
X_trn, y_trn = trn_df[feature_columns], trn_df[label_col]
X_tst = tst_df[feature_columns]

**Score**

In [5]:
# score
def rmse(y, y_pred):
    """Return the root-mean-squared error between `y` and `y_pred`."""
    mse = metrics.mean_squared_error(y, y_pred)
    return np.sqrt(mse)

**LightGBM parameter tuning**

In [6]:
# Tuning split: fit on blocks <= 32 and validate on block 33.
# Each subset is filtered once, then split into inputs and target.
fit_df = full_df[dates <= 32]
val_df = full_df[dates == 33]

XX_trn, yy_trn = fit_df[feature_columns], fit_df[label_col]
XX_val, yy_val = val_df[feature_columns], val_df[label_col]

In [7]:
# Number of boosting rounds; also reused when refitting on all training data.
train_round = 150

# Booster configuration shared by the tuning run and the final fit.
lgb_params = {
    'application': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.1,
    'max_depth': 5,
    'num_leaves': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'seed': 0,
}

# Wrap the tuning split in LightGBM datasets.
trn_lgb = lgb.Dataset(XX_trn, label=yy_trn)
val_lgb = lgb.Dataset(XX_val, label=yy_val)

# Train while reporting RMSE on both sets; stop early if the validation score
# has not improved for 20 rounds.
# NOTE(review): LightGBM 4.x drops the `early_stopping_rounds` keyword in
# favour of `callbacks=[lgb.early_stopping(20)]` — confirm the installed
# version before upgrading.
bst = lgb.train(
    params=lgb_params,
    train_set=trn_lgb,
    num_boost_round=train_round,
    valid_sets=[trn_lgb, val_lgb],
    early_stopping_rounds=20,
)

[1]	training's rmse: 1.13606	valid_1's rmse: 1.10224
Training until validation scores don't improve for 20 rounds.
[2]	training's rmse: 1.09583	valid_1's rmse: 1.07735
[3]	training's rmse: 1.05655	valid_1's rmse: 1.05479
[4]	training's rmse: 1.02382	valid_1's rmse: 1.02619
[5]	training's rmse: 0.995031	valid_1's rmse: 1.01123
[6]	training's rmse: 0.970421	valid_1's rmse: 0.999928
[7]	training's rmse: 0.949391	valid_1's rmse: 0.991009
[8]	training's rmse: 0.93481	valid_1's rmse: 0.98277
[9]	training's rmse: 0.919357	valid_1's rmse: 0.976922
[10]	training's rmse: 0.905905	valid_1's rmse: 0.973544
[11]	training's rmse: 0.89467	valid_1's rmse: 0.970501
[12]	training's rmse: 0.885167	valid_1's rmse: 0.968221
[13]	training's rmse: 0.876258	valid_1's rmse: 0.963248
[14]	training's rmse: 0.868661	valid_1's rmse: 0.9604
[15]	training's rmse: 0.862444	valid_1's rmse: 0.958847
[16]	training's rmse: 0.856462	valid_1's rmse: 0.955534
[17]	training's rmse: 0.851652	valid_1's rmse: 0.953287
[18]	trai

[147]	training's rmse: 0.759224	valid_1's rmse: 0.885652
[148]	training's rmse: 0.759014	valid_1's rmse: 0.885587
[149]	training's rmse: 0.75872	valid_1's rmse: 0.885298
[150]	training's rmse: 0.758563	valid_1's rmse: 0.885263
Did not meet early stopping. Best iteration is:
[150]	training's rmse: 0.758563	valid_1's rmse: 0.885263


**Train using all data**

In [8]:
# Refit on the full training set (X_trn / y_trn) with the same parameters and
# round count as the tuning run, then predict the test rows.
full_trn_lgb = lgb.Dataset(X_trn, label=y_trn)
lgb_model = lgb.train(lgb_params, full_trn_lgb, num_boost_round=train_round)
predictions = lgb_model.predict(X_tst)

## Predict

In [9]:
# Build the submission frame: predictions clipped to [0, 20] plus the integer
# row IDs of the test rows (taken positionally via `.values`).
res_df = pd.DataFrame(predictions.clip(0, 20), columns=[output_label_col])
res_df[output_id_col] = tst_df[output_id_col].astype(int).values

# Sort by the configured ID column (not a hard-coded 'ID') so the cell keeps
# working if `id_col` / `output_id_col` is changed in the config cell, then
# write the ID and label columns without the index.
submission = res_df.sort_values(output_id_col)[[output_id_col, output_label_col]]
submission.to_csv(submission_path, index=False)