Create all (shop_id, item_id) pairs for each date_block_num. If a pair has no item count in the original sales data, its item count is filled with 0.

In [1]:
# coding: utf-8
import os
from itertools import product

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

from mydatools.features_generate import features_read

% matplotlib inline

## Config

In [2]:
# Names of the identifier and target columns.
id_col, label_col = 'ID', 'item_cnt_month'

# The submission file reuses the same identifier/target column names.
output_id_col, output_label_col = id_col, label_col

# Destination of the stacked-model submission CSV.
submission_path = './data/output/submission/stacking.csv'

## Load Data

In [3]:
# Load the full feature table and the list of feature column names through the
# project helper.
# NOTE(review): features_read() lives in mydatools and is not visible here; the
# cells below assume full_df carries 'date_block_num', the label column and the
# ID column — confirm against that module.
full_df, feature_columns = features_read()

  mask |= (ar1 == a)


## Level1 Ensemble

**Validation**

In [4]:
# Split by month block: blocks up to and including 33 are used for fitting,
# block 34 is the set we predict on.
dates = full_df['date_block_num'].copy()

trn_mask = dates <= 33
tst_mask = dates == 34

dates_trn, dates_tst = dates[trn_mask], dates[tst_mask]
trn_df, tst_df = full_df[trn_mask], full_df[tst_mask]

# Feature matrices and the training target.
X_trn, y_trn = trn_df[feature_columns], trn_df[label_col]
X_tst = tst_df[feature_columns]

**Score**

In [5]:
# score
def rmse(y, y_pred):
    """Return the root-mean-squared error between targets `y` and predictions `y_pred`."""
    mse = metrics.mean_squared_error(y, y_pred)
    return np.sqrt(mse)

**ElasticNet parameters tuning**

In [6]:
# Hyper-parameters intended for an ElasticNet model (expanded as keyword arguments).
en_params = dict(alpha=0.01, l1_ratio=0.5)

**LightGBM parameters tuning**

In [7]:
# LightGBM training parameters passed to lgb.train().
lgb_params = dict(
    # Task and evaluation metric.
    application='regression',
    metric='rmse',
    # Boosting step size and tree shape.
    learning_rate=0.1,
    max_depth=5,
    num_leaves=20,
    # Column / row subsampling.
    feature_fraction=0.8,
    bagging_fraction=0.5,
    bagging_freq=1,
    # Fixed seed for the subsampling above.
    seed=0,
)

In [8]:
# Keyword arguments for RandomForestRegressor.
rfr_params = {
    'n_estimators': 10,
    'max_depth': 6,
    # Pin the forest's randomness so a Restart & Run All reproduces the same
    # level-1 predictions (mirrors the fixed 'seed' used for LightGBM).
    'random_state': 0,
}

**Get level2 train data**

In [9]:
# The level-2 (stacking) target is the same label series used for the level-1 models.
y_trn_level2 = y_trn

In [10]:
# Build the level-2 design matrices: one column per level-1 model, holding that
# model's predictions on the training rows (X_trn_level2) and on the test rows
# (X_tst_level2).
#
# NOTE(review): each level-1 model is fit on X_trn and then predicts the same
# X_trn, so the level-2 training features are in-sample predictions.
# Out-of-fold (or time-based hold-out) predictions would be a sounder input
# for stacking.

# Must equal the number of models fitted below. A larger value leaves all-zero
# columns in the matrices and breaks code that names exactly three columns.
model_num = 3
X_trn_level2 = np.zeros([X_trn.shape[0], model_num])
X_tst_level2 = np.zeros([X_tst.shape[0], model_num])

ii = 0  # index of the next level-2 column to fill

# An ElasticNet(**en_params) learner and a normalized LinearRegression were
# tried earlier and are disabled; re-enabling either one requires raising
# model_num accordingly.

print(ii, 'LigbtGBM')
lgb_model = lgb.train(lgb_params, lgb.Dataset(X_trn, label=y_trn), num_boost_round=100)
X_trn_level2[:, ii] = lgb_model.predict(X_trn)
X_tst_level2[:, ii] = lgb_model.predict(X_tst)
ii += 1

print(ii, 'RandomForestRegressor')
rfr = RandomForestRegressor(**rfr_params)
rfr.fit(X_trn, y_trn)
X_trn_level2[:, ii] = rfr.predict(X_trn)
X_tst_level2[:, ii] = rfr.predict(X_tst)
ii += 1

print(ii, 'LinearRegression')
# `normalize=False` was the default and the keyword has been removed from newer
# scikit-learn releases, so the plain constructor is equivalent and portable.
lr1 = LinearRegression()
lr1.fit(X_trn, y_trn)
X_trn_level2[:, ii] = lr1.predict(X_trn)
X_tst_level2[:, ii] = lr1.predict(X_tst)
ii += 1

# Guard against model_num drifting out of sync with the models actually fitted.
assert ii == model_num, 'model_num does not match the number of level-1 models'


0 LigbtGBM
1 RandomForestRegressor
2 LinearRegression


  linalg.lstsq(X, y)


3 LinearRegression normalized


In [20]:
# Pairwise correlation between the level-1 models' training-set predictions.
level1_trn_preds = pd.DataFrame(X_trn_level2, columns=['lgb', 'rfr', 'lr1'])
level1_trn_preds.corr()

Unnamed: 0,lgb,rfr,lr1
lgb,1.0,0.949793,0.674826
rfr,0.949793,1.0,0.665645
lr1,0.674826,0.665645,1.0


## Level2 Ensemble: Stacking

In [25]:
# Level-2 meta-model: a linear blend of the level-1 prediction columns,
# fitted on the training rows and applied to the test rows.
lr = LinearRegression().fit(X_trn_level2, y_trn_level2)
predictions = lr.predict(X_tst_level2)

In [26]:
# RMSE of the clipped level-2 predictions, measured on the same rows the
# meta-model was fitted on (i.e. a training score, not a hold-out score).
trn_pred_clipped = lr.predict(X_trn_level2).clip(0, 20)
rmse(y_trn_level2, trn_pred_clipped)

0.7719416633608611

## Predict

In [27]:
# Assemble the submission: predictions clipped to [0, 20], paired with the
# integer IDs of the test rows.
res_df = pd.DataFrame(predictions.clip(0, 20), columns=[output_label_col])
res_df[output_id_col] = tst_df[output_id_col].astype(int).values

# Make sure the target directory exists so to_csv does not fail on a fresh checkout.
submission_dir = os.path.dirname(submission_path)
if submission_dir:
    os.makedirs(submission_dir, exist_ok=True)

# Sort by the configured ID column (not a hard-coded name) and write id + label only.
res_df.sort_values(output_id_col)[[output_id_col, output_label_col]].to_csv(submission_path, index=False)