Create every (shop_id, item_id) pair for each date_block_num. If a pair has no item count in the original sales data, its item count is filled with 0.

In [1]:
# coding: utf-8
import os
from itertools import product

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from IPython.display import display

from mydatools.features_generate import features_read

% matplotlib inline

## Config

In [2]:
# Column names of the row identifier and of the prediction target.
id_col, label_col = 'ID', 'item_cnt_month'

# The submission file reuses the same column names.
output_id_col, output_label_col = id_col, label_col

# Destination of the LightGBM submission CSV.
submission_path = './data/output/submission/lightgbm.csv'

## Load Data

In [3]:
# Load the prepared data via the project helper `features_read`. It returns two
# values: the full frame (train and test rows together, split later by
# `date_block_num`) and the column names used to select model features.
full_df, feature_columns = features_read()

  mask |= (ar1 == a)


## LightGBM

**Validation**

In [4]:
# Month index of every row. Rows with date_block_num <= 33 form the training
# set; rows with date_block_num == 34 are the ones to predict.
dates = full_df['date_block_num'].copy()

is_trn = dates <= 33
is_tst = dates == 34

dates_trn, dates_tst = dates[is_trn], dates[is_tst]
trn_df, tst_df = full_df[is_trn], full_df[is_tst]

# Feature matrix / target for training, feature matrix only for the test rows.
X_trn, y_trn = trn_df[feature_columns], trn_df[label_col]
X_tst = tst_df[feature_columns]

**Score**

In [5]:
# score
def rmse(y, y_pred):
    """Return the root mean squared error between `y` and `y_pred`.

    Computed as the square root of sklearn's `mean_squared_error(y, y_pred)`.
    """
    mse = metrics.mean_squared_error(y, y_pred)
    return np.sqrt(mse)

**LightGBM parameter tuning**

In [6]:
# Hold out month 33 for validation and fit on the months up to and including 32.
fit_rows = full_df[dates <= 32]
val_rows = full_df[dates == 33]

XX_trn, yy_trn = fit_rows[feature_columns], fit_rows[label_col]
XX_val, yy_val = val_rows[feature_columns], val_rows[label_col]

In [7]:
# Maximum number of boosting rounds; early stopping below may end this run sooner.
train_round = 100

# LightGBM hyper-parameters shared by the validation run and the final fit.
lgb_params = dict(
    application='regression',
    metric='rmse',
    learning_rate=0.1,
    max_depth=5,
    num_leaves=20,
    feature_fraction=0.8,
    bagging_fraction=0.5,
    bagging_freq=1,
    seed=0,
)

trn_lgb = lgb.Dataset(XX_trn, label=yy_trn)
val_lgb = lgb.Dataset(XX_val, label=yy_val)

# Report RMSE on both sets each round; stop once the validation score has not
# improved for 20 consecutive rounds.
bst = lgb.train(
    params=lgb_params,
    train_set=trn_lgb,
    num_boost_round=train_round,
    valid_sets=[trn_lgb, val_lgb],
    early_stopping_rounds=20,
)

[1]	training's rmse: 1.13594	valid_1's rmse: 1.10206
Training until validation scores don't improve for 20 rounds.
[2]	training's rmse: 1.09227	valid_1's rmse: 1.07449
[3]	training's rmse: 1.05461	valid_1's rmse: 1.05277
[4]	training's rmse: 1.02425	valid_1's rmse: 1.03684
[5]	training's rmse: 0.995863	valid_1's rmse: 1.02167
[6]	training's rmse: 0.971607	valid_1's rmse: 1.01151
[7]	training's rmse: 0.950417	valid_1's rmse: 0.995365
[8]	training's rmse: 0.932646	valid_1's rmse: 0.987992
[9]	training's rmse: 0.918201	valid_1's rmse: 0.982883
[10]	training's rmse: 0.905477	valid_1's rmse: 0.978697
[11]	training's rmse: 0.89423	valid_1's rmse: 0.975032
[12]	training's rmse: 0.884321	valid_1's rmse: 0.969497
[13]	training's rmse: 0.874834	valid_1's rmse: 0.962554
[14]	training's rmse: 0.868036	valid_1's rmse: 0.960084
[15]	training's rmse: 0.861739	valid_1's rmse: 0.958868
[16]	training's rmse: 0.856649	valid_1's rmse: 0.958455
[17]	training's rmse: 0.851659	valid_1's rmse: 0.950863
[18]	t

**Train using all data**

In [8]:
# Refit with the same parameters on the full training set (no validation set,
# hence a fixed number of rounds), then predict the test rows.
lgb_model = lgb.train(
    params=lgb_params,
    train_set=lgb.Dataset(X_trn, label=y_trn),
    num_boost_round=train_round,
)
predictions = lgb_model.predict(X_tst)

## Predict

In [9]:
# Build the submission frame: predictions clipped to [0, 20], paired with the
# IDs of the test rows (same row order as `X_tst`).
res_df = pd.DataFrame(predictions.clip(0, 20), columns=[output_label_col])
res_df[output_id_col] = tst_df[output_id_col].astype(int).values

# Create the output directory if it is missing so `to_csv` does not fail on a
# fresh checkout. Skipped when the path has no directory component.
submission_dir = os.path.dirname(submission_path)
if submission_dir:
    os.makedirs(submission_dir, exist_ok=True)

# Sort by the configured ID column rather than a hard-coded 'ID', so this cell
# keeps working if the config cell's column name changes.
res_df.sort_values(output_id_col)[[output_id_col, output_label_col]].to_csv(submission_path, index=False)