Create every (shop_id, item_id) pair for each date_block_num. If a pair has no item count in the original sales data, its item count is filled with 0.

In [1]:
# coding: utf-8
import os
from itertools import product

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

from mydatools.features_generate import features_read

% matplotlib inline

## Config

In [2]:
# Column names in the input data: row identifier and regression target.
id_col, label_col = 'ID', 'item_cnt_month'

# The submission file reuses the input column names.
output_id_col, output_label_col = id_col, label_col

# Where the final predictions are written.
submission_path = './data/output/submission/stacking_val_scheme.csv'

## Load Data

In [3]:
# features_read() is the project helper imported from mydatools.features_generate.
# It returns the full feature table (indexed below by 'date_block_num' to separate
# train and test months) and the feature column names used to build X_trn / X_tst.
full_df, feature_columns = features_read()

  mask |= (ar1 == a)


## Level1 Ensemble

**Validation**

In [4]:
# Month index of every row: months up to 33 are training data, month 34 is the test month.
dates = full_df['date_block_num'].copy()
is_trn = dates <= 33
is_tst = dates == 34

dates_trn, dates_tst = dates[is_trn], dates[is_tst]
trn_df, tst_df = full_df[is_trn], full_df[is_tst]

# Feature matrices and the training target.
X_trn, X_tst = trn_df[feature_columns], tst_df[feature_columns]
y_trn = trn_df[label_col]

**Score**

In [5]:
# score
def rmse(y, y_pred):
    """Return the root mean squared error between targets `y` and predictions `y_pred`."""
    mse = metrics.mean_squared_error(y, y_pred)
    return np.sqrt(mse)

**ElasticNet parameters tuning**

In [6]:
# Keyword arguments passed to ElasticNet(**en_params).
en_params = dict(alpha=0.01, l1_ratio=0.5)

**LightGBM parameters tuning**

In [7]:
# Parameters handed to lgb.train() for every LightGBM model in this notebook.
lgb_params = dict(
    # task and evaluation metric
    application='regression',
    metric='rmse',
    # tree shape and learning speed
    learning_rate=0.1,
    max_depth=5,
    num_leaves=20,
    # column / row subsampling
    feature_fraction=0.8,
    bagging_fraction=0.5,
    bagging_freq=1,
    # fixed seed so reruns give the same model
    seed=0,
)

In [8]:
# Candidate keyword arguments for RandomForestRegressor(**rfr_params).
rfr_params = dict(n_estimators=100, max_depth=6)

**Get level2 train data**

In [None]:
# Months 28..33 (inclusive) supply the level-2 training rows.
level2_block_num = list(range(28, 34))

# Same boolean mask selects the month index and the target for those rows.
in_level2 = dates_trn.isin(level2_block_num)
dates_trn_level2 = dates_trn[in_level2]
y_trn_level2 = y_trn[in_level2]

In [None]:
# Out-of-fold level-1 predictions used as level-2 training features.
# One row per level-2 sample, one column per level-1 model, in this order:
# LightGBM, RandomForestRegressor, LinearRegression, LinearRegression(normalize=True).
X_trn_level2 = np.zeros([y_trn_level2.shape[0], 4])

for cur_block_num in level2_block_num:
    print(cur_block_num)

    # Fit on months strictly before the current one, then predict the current month,
    # so each level-2 row is predicted by models that never saw it.
    X_trn_i = X_trn[dates_trn < cur_block_num]
    y_trn_i = y_trn[dates_trn < cur_block_num]
    X_tst_i = X_trn[dates_trn == cur_block_num]

    # Rows of X_trn_level2 that belong to the current month.
    cur_rows = (dates_trn_level2 == cur_block_num).values

    ii = 0

    # ElasticNet is currently disabled. Re-enabling it needs a fifth column in
    # X_trn_level2 and the matching column in the level-2 test matrix.
#     en = ElasticNet(**en_params)
#     en.fit(X_trn_i, y_trn_i)
#     X_trn_level2[cur_rows, ii] = en.predict(X_tst_i)
#     ii += 1

    print(ii, 'LigbtGBM')
    lgb_model = lgb.train(lgb_params, lgb.Dataset(X_trn_i, label=y_trn_i), 100)
    X_trn_level2[cur_rows, ii] = lgb_model.predict(X_tst_i)
    ii += 1

    print(ii, 'RandomForestRegressor')
    # NOTE(review): rfr_params is defined in the notebook but not passed here;
    # the default forest is kept to preserve the existing behaviour.
    rfr = RandomForestRegressor()
    rfr.fit(X_trn_i, y_trn_i)
    X_trn_level2[cur_rows, ii] = rfr.predict(X_tst_i)
    ii += 1

    print(ii, 'LinearRegression')
    lr1 = LinearRegression(normalize=False)
    # Fixed: the original fitted on the undefined name `X_trn_id`.
    lr1.fit(X_trn_i, y_trn_i)
    X_trn_level2[cur_rows, ii] = lr1.predict(X_tst_i)
    ii += 1

    print(ii, 'LinearRegression normalized')
    # Fixed: the original rebound `lr1` here and then predicted with an undefined `lr2`.
    # NOTE(review): `normalize` was removed from LinearRegression in scikit-learn 1.2;
    # on newer versions scale the features explicitly instead.
    lr2 = LinearRegression(normalize=True)
    lr2.fit(X_trn_i, y_trn_i)
    X_trn_level2[cur_rows, ii] = lr2.predict(X_tst_i)
    ii += 1
    

28
0 LigbtGBM
1 RandomForestRegressor


**Get level2 test data**

In [None]:
# Level-1 models refit on all training months, used to build the level-2 test features.
# The columns must match X_trn_level2 exactly, in the same order:
# LightGBM, RandomForestRegressor, LinearRegression, LinearRegression(normalize=True).
# ElasticNet is left out because it is disabled in the level-2 training loop as well;
# the original stacked (ElasticNet, LightGBM) here, which did not match the 4-column
# training matrix and made lr.predict(X_tst_level2) fail.

# Keep the booster under its own name: the original `lgb = lgb.train(...)` rebound the
# lightgbm module alias, so any later lgb.train / lgb.Dataset call would break.
lgb_full = lgb.train(lgb_params, lgb.Dataset(X_trn, label=y_trn), 100)
X_tst_level2_lgb = lgb_full.predict(X_tst)

rfr_full = RandomForestRegressor()
rfr_full.fit(X_trn, y_trn)
X_tst_level2_rfr = rfr_full.predict(X_tst)

lr1_full = LinearRegression(normalize=False)
lr1_full.fit(X_trn, y_trn)
X_tst_level2_lr1 = lr1_full.predict(X_tst)

lr2_full = LinearRegression(normalize=True)
lr2_full.fit(X_trn, y_trn)
X_tst_level2_lr2 = lr2_full.predict(X_tst)

X_tst_level2 = np.c_[
    X_tst_level2_lgb,
    X_tst_level2_rfr,
    X_tst_level2_lr1,
    X_tst_level2_lr2,
]

## Level2 Ensemble: Stacking

In [None]:
# XX_trn_level2 = X_trn_level2[dates_trn_level2 < 33]
# yy_trn_level2 = y_trn_level2[dates_trn_level2 < 33]
# XX_val_level2 = X_trn_level2[dates_trn_level2 == 33]
# yy_val_level2 = y_trn_level2[dates_trn_level2 == 33]

# lr = LinearRegression()
# lr.fit(XX_trn_level2, yy_trn_level2)

# rmse(yy_val_level2, lr.predict(XX_val_level2))

In [None]:
# Level-2 meta-model: a linear regression fitted on the level-1 predictions,
# then applied to the level-2 test features.
lr = LinearRegression().fit(X_trn_level2, y_trn_level2)
predictions = lr.predict(X_tst_level2)

In [None]:
# In-sample RMSE of the level-2 model on its own training rows, with predictions
# clipped to [0, 20] exactly as in the submission.
trn_level2_pred = np.clip(lr.predict(X_trn_level2), 0, 20)
rmse(y_trn_level2, trn_level2_pred)

## Predict

In [None]:
# Build the submission table: clipped predictions plus the integer row ids of the test rows.
res_df = pd.DataFrame(predictions.clip(0, 20), columns=[output_label_col])
res_df[output_id_col] = tst_df[output_id_col].astype(int).values

# Create the output directory if needed so to_csv does not fail on a fresh checkout.
submission_dir = os.path.dirname(submission_path)
if submission_dir:
    os.makedirs(submission_dir, exist_ok=True)

# Sort by the configured id column (the original hard-coded 'ID' here) and write
# the columns in (id, label) order.
(
    res_df
    .sort_values(output_id_col)[[output_id_col, output_label_col]]
    .to_csv(submission_path, index=False)
)