Create every (shop_id, item_id) pair for each date_block_num. If a pair has no item count in the original sales data, its item count is filled with 0.

In [1]:
# coding: utf-8
import os
from itertools import product

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from IPython.display import display

from mydatools.features_generate import features_read

% matplotlib inline

## Config

In [2]:
# Column names in the loaded data: row identifier and regression target.
id_col, label_col = 'ID', 'item_cnt_month'

# The submission file uses the same column names as the input data.
output_id_col, output_label_col = id_col, label_col

# Destination of the submission CSV written by the last cell.
submission_path = './data/output/submission/elastic_net.csv'

## Load Data

In [3]:
# Load the full data frame and the names of the columns used as model features.
# NOTE(review): features_read comes from the project-local mydatools package;
# what it reads and returns is not visible here — confirm against that module.
full_df, feature_columns = features_read()

KeyboardInterrupt: 

## ElasticNet

**Validation**

In [None]:
# Month index of every row; it drives the chronological split below.
dates = full_df['date_block_num'].copy()

# Blocks up to 33 are the training period; block 34 is the period to predict
# (no label is read for it).
dates_trn, dates_tst = dates[dates <= 33], dates[dates == 34]
trn_df, tst_df = full_df.loc[dates <= 33], full_df.loc[dates == 34]

# Feature matrices and training target.
X_trn, y_trn = trn_df[feature_columns], trn_df[label_col]
X_tst = tst_df[feature_columns]
# y_tst = tst_df[label_col]

# Optional standardisation, currently disabled. Note that X_val is not defined
# in this cell; enabling this would require transforming X_tst instead.
# scaler = preprocessing.StandardScaler()
# X_trn = scaler.fit_transform(X_trn)
# X_val = scaler.transform(X_val)

**Score**

In [None]:
# Evaluation metric
def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Computed as the square root of ``metrics.mean_squared_error(y, y_pred)``.
    """
    mse = metrics.mean_squared_error(y, y_pred)
    return np.sqrt(mse)

**ElasticNet parameters tuning**

In [None]:
# Hold-out split for hyper-parameter tuning:
# fit on blocks <= 32, validate on block 33.
XX_trn = full_df.loc[dates <= 32, feature_columns]
yy_trn = full_df.loc[dates <= 32, label_col]
XX_val = full_df.loc[dates == 33, feature_columns]
yy_val = full_df.loc[dates == 33, label_col]

In [None]:
# Candidate hyper-parameters; every combination is evaluated on the hold-out
# block. Shrink a list to a single value for a quick run.
param_grid = {
    'alpha': [0.001, 0.01, 0.1],
    'l1_ratio': [0.25, 0.5, 0.75],
}

# Start from +inf so any finite score becomes the first best, whatever the
# scale of the target.
best_score = np.inf
best_param = None
best_pred = None
for param in model_selection.ParameterGrid(param_grid):
    print(param)
    en_model = ElasticNet(**param)
    en_model.fit(XX_trn, yy_trn)
    candidate_pred = en_model.predict(XX_val)
    s = rmse(yy_val, candidate_pred)
    if s < best_score:
        best_score = s
        best_param = param
        # Keep the winner's predictions; otherwise the predictions left after
        # the loop would be those of the last candidate, not the best one.
        best_pred = candidate_pred
    print(s)

if best_param is None:
    # Without this guard, ElasticNet(**None) below raises an opaque TypeError.
    raise ValueError('no parameter combination produced a finite validation RMSE')

# Validation predictions of the selected parameters, for follow-up scoring.
yy_pred = best_pred

# Fresh, unfitted estimator configured with the selected parameters.
en_model = ElasticNet(**best_param)

In [None]:
# Validation RMSE with labels and predictions both clipped to [0, 20],
# the same range the final predictions are clipped to before being written out.
rmse(yy_val.clip(0,20), yy_pred.clip(0,20))

In [None]:
# Display the hyper-parameter combination with the lowest validation RMSE.
best_param

In [None]:
# Fit the estimator configured with best_param on all training rows
# (blocks <= 33), then predict the target block. fit() returns the estimator
# itself, so the two calls can be chained.
predictions = en_model.fit(X_trn, y_trn).predict(X_tst)

## Predict

In [None]:
# Assemble the submission frame: predictions clipped to [0, 20] plus the row IDs
# of the predicted block, in the same row order as the predictions.
res_df = pd.DataFrame(predictions.clip(0, 20), columns=[output_label_col])
res_df[output_id_col] = tst_df[output_id_col].astype(int).values

# Make sure the destination folder exists; to_csv does not create it.
submission_dir = os.path.dirname(submission_path)
if submission_dir and not os.path.isdir(submission_dir):
    os.makedirs(submission_dir)

# Sort by the configured ID column (not a hard-coded 'ID') so that changing
# output_id_col in the config cell keeps this cell working.
res_df.sort_values(output_id_col)[[output_id_col, output_label_col]].to_csv(submission_path, index=False)