Creating all pair in (shop_id, item_id) for each date_block_num. And if row has not item_count from origin sales data, item_count will fill 0.

In [1]:
# coding: utf-8
import os
from itertools import product

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from IPython.display import display

from mydatools.features_generate import features_save

% matplotlib inline

## Config

In [2]:
# trn_path = './data/input/train.csv'
tst_path = './data/input/test.csv'
id_col = 'ID'
label_col = 'item_cnt_month'

submission_path = './data/output/submission/stacking_simple.csv'
output_id_col = id_col
output_label_col = label_col

## Load Data

In [3]:
item_cate_df = pd.read_csv('./data/input/item_categories.csv')
item_df = pd.read_csv('./data/input/items.csv')
sales_df = pd.read_csv('./data/input/sales_train.csv')
shop_df = pd.read_csv('./data/input/shops.csv')

In [4]:
sales_df['revenue'] = sales_df['item_price'] * sales_df['item_cnt_day']
trn_df = sales_df.groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day'].sum().reset_index()
trn_df = trn_df.rename(columns={'item_cnt_day': 'item_cnt_month'})
trn_df['ID'] = 0
# trn_df['ds_type'] = 'trn'

tst_df = pd.read_csv(tst_path)
tst_df['date_block_num'] = 34
tst_df['item_cnt_month'] = 0
# tst_df['ds_type'] = 'tst'

full_df = pd.concat([trn_df, tst_df])
full_df.head()

Unnamed: 0,ID,date_block_num,item_cnt_month,item_id,shop_id
0,0,1,31.0,30,0
1,0,1,11.0,31,0
2,0,0,6.0,32,0
3,0,1,10.0,32,0
4,0,0,3.0,33,0


## Get grid

(shop_id, item_id) for each date_block_num

In [5]:
index_cols = ['shop_id', 'item_id', 'date_block_num']
grid = []
for block_num in full_df['date_block_num'].unique():
    cur_shops = full_df.loc[full_df['date_block_num'] == block_num, 'shop_id'].unique()
    cur_items = full_df.loc[full_df['date_block_num'] == block_num, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype='int32'))

grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

full_df = grid.merge(full_df, how='left', on=index_cols).fillna(0)

In [6]:
# for dbn in range(0, 34):
#     # create pair in (shop_id, item_id)
#     unique_shop_id = full_df[full_df['date_block_num'] == dbn]['shop_id'].copy().drop_duplicates()
#     unique_item_id = full_df[full_df['date_block_num'] == dbn]['item_id'].copy().drop_duplicates()
#     m_index = pd.MultiIndex.from_product([unique_shop_id, unique_item_id], names=['shop_id', 'item_id'])
#     val_df = pd.DataFrame([], index=m_index).reset_index()
#     val_df['ID'] = 0
#     val_df['date_block_num'] = dbn
# #     # distinguish origin train data
# #     val_df['ds_type'] = 'add_val'
#     # remove pair (shop_id, item_id) exists in full_df
#     origin_33_df = full_df[full_df['date_block_num'] == dbn][['item_id', 'shop_id', 'item_cnt_month']]
#     val_df = val_df.merge(origin_33_df, how='left', on=['item_id', 'shop_id'])
#     val_df = val_df[val_df.item_cnt_month.isnull()]
#     # no item_count then fill 0
#     val_df['item_cnt_month'] = 0

#     # merge to full_df
#     full_df = pd.concat([full_df, val_df])

In [7]:
full_df = full_df.sort_values(['date_block_num', 'shop_id', 'item_id'])

## Features

In [8]:
feature_columns = []

def add_features(features):
    if not isinstance(features, list):
        features = [features]
    global feature_columns
    feature_columns.extend([f for f in features if f not in feature_columns])
    
def remove_features(features):
    if not isinstance(features, list):
        features = [features]
    global feature_columns
    feature_columns = [f for f in feature_columns if f not in features]

**shop_id, item_id**

In [9]:
# add_features(['shop_id', 'item_id'])

**datetime info**

In [10]:
full_df['dt_year'] = full_df['date_block_num'] // 12 + 2013
full_df['dt_month'] = full_df['date_block_num'] % 12 + 1
add_features(['dt_year', 'dt_month'])

**item_df**

In [11]:
# tfidf item_name
tfidf_model = TfidfVectorizer(max_df=0.95, min_df=3)
tfidf_res = tfidf_model.fit_transform(item_df['item_name'])

# decomposition
dcp_num = 5
dcp_model = TruncatedSVD(dcp_num)
dcp_tfidf_res = dcp_model.fit_transform(tfidf_res)

# add to item_df
item_name_tfidf_dcp_feats = ['item_name_tfidf_dcp_%d'%i for i in range(dcp_num)]
item_df[item_name_tfidf_dcp_feats] = pd.DataFrame(dcp_tfidf_res)

In [12]:
# item_name's words count
item_df['item_name_words_count'] = item_df['item_name'].map(lambda x: len(x.split(' ')))

In [13]:
# add to full_df
full_df = full_df.merge(item_df, how='left', on='item_id')

# add feautures
add_features(item_name_tfidf_dcp_feats + ['item_category_id', 'item_name_words_count'])

In [14]:
sales_df = sales_df.merge(item_df, how='left', on='item_id')

**mean encoding**

In [15]:
# global_mean = full_df[full_df['date_block_num'] < 34]['item_cnt_month'].mean()

# def get_aggragation_feature(df, groupby_cols, agg_col, fillna_value):
#     gb = df.groupby(groupby_cols)[agg_col]
#     fname_fmt = '-'.join(groupby_cols+[agg_col]) + ':%s'
#     agg_df = pd.DataFrame({
#             fname_fmt%'mean': gb.mean(),
# #             fname_fmt%'median': gb.median(),
# #             fname_fmt%'max': gb.max(),
# #             fname_fmt%'min': gb.min(),
#         })
    
#     new_df = df.join(agg_df, on=groupby_cols).fillna(fillna_value)
#     return new_df, agg_df.columns.tolist()

# groupby_cols_list = [
#     ['shop_id', 'item_id'],
#     ['shop_id'],
#     ['item_id'],
# #     ['date_block_num'],
# #     ['dt_year'],
# #     ['dt_month'],
# #     ['item_category_id'],
# ]
# for groupby_cols in groupby_cols_list:
#     full_df, new_feats = get_aggragation_feature(full_df, groupby_cols, 'item_cnt_month', global_mean)
#     add_features(new_feats)

In [16]:
global_mean = full_df[full_df['date_block_num'] < 34]['item_cnt_month'].mean()

mean_encoding_cols_list = [
    ['shop_id', 'item_id'],
    ['shop_id'],
    ['item_id'],
]
get_mean_encoding_feat_name = lambda cols: 'mean_encoding-' + '-'.join(cols)

for cols in mean_encoding_cols_list:
    full_df[get_mean_encoding_feat_name(cols)] = np.nan
    add_features(get_mean_encoding_feat_name(cols))

kf = KFold(5, shuffle=True, random_state=20180717)
for trn_idx, val_idx in kf.split(full_df['item_cnt_month'].values):
    trn_df, val_df = full_df.iloc[trn_idx].copy(), full_df.iloc[val_idx].copy()
    for cols in mean_encoding_cols_list:
        feat_name = get_mean_encoding_feat_name(cols)
        mean_map = trn_df.groupby(cols)['item_cnt_month'].mean()
        val_df[feat_name] = val_df[cols].join(mean_map, how='left', on=cols)['item_cnt_month']
    full_df.iloc[val_idx] = val_df
full_df.fillna(global_mean, inplace=True)

**lag features**

In [17]:
# 获取当月的一些数据（然后进行lag）
cur_month_cols_list = [
    ['shop_id'],
    ['item_id'],
    ['shop_id', 'item_id'],
    ['item_category_id'],
    ['shop_id', 'item_category_id'],
]
cur_month_feats = []
for cols in cur_month_cols_list:
    print(cols)
    gb = sales_df.groupby(['date_block_num'] + cols) \
        .agg({'item_cnt_day': 'sum', 'item_price': 'mean'}) \
        .rename(columns={'item_cnt_day': 'target-' + '-'.join(cols),
                         'item_price': 'item_price-' + '-'.join(cols)})
    cur_month_feats += ['target-' + '-'.join(cols), 'item_price-' + '-'.join(cols)]
    full_df = full_df.join(gb, how='left', on=['date_block_num'] + cols).fillna(0)

['shop_id']
['item_id']
['shop_id', 'item_id']
['item_category_id']
['shop_id', 'item_category_id']


In [18]:
shift_range = [1,2,3,6,12]
shift_features = cur_month_feats

for shift_month in shift_range:
    print(shift_month)
    tmp_df = full_df[['shop_id', 'item_id', 'date_block_num'] + shift_features].copy()
    tmp_df['date_block_num'] = tmp_df['date_block_num'] + shift_month
    for f in shift_features:
        new_f = f + '_lag_' + str(shift_month)
        tmp_df = tmp_df.rename(columns={f: new_f})
        add_features(new_f)
    full_df = full_df.merge(tmp_df, how='left', on=['shop_id', 'item_id', 'date_block_num']).fillna(0)

1
2
3
6
12


In [19]:
# remove 2013
full_df = full_df[full_df['date_block_num'] > 12]

**show all features**

In [20]:
feature_columns

['dt_year',
 'dt_month',
 'item_name_tfidf_dcp_0',
 'item_name_tfidf_dcp_1',
 'item_name_tfidf_dcp_2',
 'item_name_tfidf_dcp_3',
 'item_name_tfidf_dcp_4',
 'item_category_id',
 'item_name_words_count',
 'mean_encoding-shop_id-item_id',
 'mean_encoding-shop_id',
 'mean_encoding-item_id',
 'target-shop_id_lag_1',
 'item_price-shop_id_lag_1',
 'target-item_id_lag_1',
 'item_price-item_id_lag_1',
 'target-shop_id-item_id_lag_1',
 'item_price-shop_id-item_id_lag_1',
 'target-item_category_id_lag_1',
 'item_price-item_category_id_lag_1',
 'target-shop_id-item_category_id_lag_1',
 'item_price-shop_id-item_category_id_lag_1',
 'target-shop_id_lag_2',
 'item_price-shop_id_lag_2',
 'target-item_id_lag_2',
 'item_price-item_id_lag_2',
 'target-shop_id-item_id_lag_2',
 'item_price-shop_id-item_id_lag_2',
 'target-item_category_id_lag_2',
 'item_price-item_category_id_lag_2',
 'target-shop_id-item_category_id_lag_2',
 'item_price-shop_id-item_category_id_lag_2',
 'target-shop_id_lag_3',
 'item_pric

In [21]:
full_df

Unnamed: 0,shop_id,item_id,date_block_num,ID,item_cnt_month,dt_year,dt_month,item_name,item_category_id,item_name_tfidf_dcp_0,...,target-shop_id_lag_12,item_price-shop_id_lag_12,target-item_id_lag_12,item_price-item_id_lag_12,target-shop_id-item_id_lag_12,item_price-shop_id-item_id_lag_12,target-item_category_id_lag_12,item_price-item_category_id_lag_12,target-shop_id-item_category_id_lag_12,item_price-shop_id-item_category_id_lag_12
4836102,2,30,13,0.0,0.0,2014,2,007: КООРДИНАТЫ «СКАЙФОЛЛ»,40,0.001040,...,488.0,1062.437181,861.0,383.921877,0.0,0.0,31649.0,252.174936,40.0,260.250000
4836103,2,31,13,0.0,0.0,2014,2,007: КООРДИНАТЫ «СКАЙФОЛЛ» (BD),37,0.243195,...,488.0,1062.437181,628.0,666.991044,4.0,699.0,6307.0,494.550783,21.0,577.333333
4836104,2,32,13,0.0,0.0,2014,2,1+1,40,-0.000000,...,488.0,1062.437181,208.0,337.771930,0.0,0.0,31649.0,252.174936,40.0,260.250000
4836105,2,33,13,0.0,0.0,2014,2,1+1 (BD),37,0.976919,...,488.0,1062.437181,39.0,484.170732,0.0,0.0,6307.0,494.550783,21.0,577.333333
4836106,2,34,13,0.0,0.0,2014,2,10 000 ЛЕТ ДО НАШЕЙ ЭРЫ WB (регион),40,0.016747,...,488.0,1062.437181,11.0,149.000000,0.0,0.0,31649.0,252.174936,40.0,260.250000
4836107,2,36,13,0.0,0.0,2014,2,10 ЛЕТ СПУСТЯ (BD),37,0.251092,...,488.0,1062.437181,8.0,525.000000,0.0,0.0,6307.0,494.550783,21.0,577.333333
4836108,2,37,13,0.0,0.0,2014,2,10 ЛЕТ СПУСТЯ (регион),40,0.009630,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
4836109,2,40,13,0.0,0.0,2014,2,100 Best classical melodies (mp3-CD) (Digipack),57,0.005770,...,488.0,1062.437181,5.0,224.600000,0.0,0.0,1540.0,272.022084,0.0,0.000000
4836110,2,42,13,0.0,0.0,2014,2,100 Best romantic melodies (mp3-CD) (Digipack),57,0.006035,...,488.0,1062.437181,6.0,208.333333,0.0,0.0,1540.0,272.022084,0.0,0.000000
4836111,2,44,13,0.0,0.0,2014,2,100 лучших мелодий против стресса (mp3-CD) (CD...,57,0.008027,...,488.0,1062.437181,5.0,249.000000,0.0,0.0,1540.0,272.022084,0.0,0.000000


**clip target**

In [22]:
full_df['item_cnt_month'] = full_df['item_cnt_month'].clip(0, 20)

## Test features use lightgbm

In [23]:
dates = full_df['date_block_num'].copy()
XX_trn = full_df[dates <= 32][feature_columns]
yy_trn = full_df[dates <= 32][label_col]
XX_val = full_df[dates == 33][feature_columns]
yy_val = full_df[dates == 33][label_col]

In [24]:
trn_lgb = lgb.Dataset(XX_trn, yy_trn)
val_lgb = lgb.Dataset(XX_val, yy_val)

lgb_params = {
    'application': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.1,
    'max_depth': 5,
    'num_leaves': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'seed': 0,
}
train_round = 150

bst = lgb.train(lgb_params, trn_lgb, train_round, valid_sets=[trn_lgb, val_lgb], early_stopping_rounds=20)

[1]	training's rmse: 1.13974	valid_1's rmse: 1.10691
Training until validation scores don't improve for 20 rounds.
[2]	training's rmse: 1.09969	valid_1's rmse: 1.08105
[3]	training's rmse: 1.06622	valid_1's rmse: 1.052
[4]	training's rmse: 1.03792	valid_1's rmse: 1.02607
[5]	training's rmse: 1.00609	valid_1's rmse: 1.0081
[6]	training's rmse: 0.983047	valid_1's rmse: 0.996264
[7]	training's rmse: 0.958717	valid_1's rmse: 0.984539
[8]	training's rmse: 0.940494	valid_1's rmse: 0.978195
[9]	training's rmse: 0.923412	valid_1's rmse: 0.97096
[10]	training's rmse: 0.908627	valid_1's rmse: 0.96556
[11]	training's rmse: 0.895897	valid_1's rmse: 0.960899
[12]	training's rmse: 0.885635	valid_1's rmse: 0.957885
[13]	training's rmse: 0.877481	valid_1's rmse: 0.956268
[14]	training's rmse: 0.870309	valid_1's rmse: 0.953756
[15]	training's rmse: 0.863621	valid_1's rmse: 0.948511
[16]	training's rmse: 0.857585	valid_1's rmse: 0.943964
[17]	training's rmse: 0.85216	valid_1's rmse: 0.940886
[18]	traini

In [25]:
imp_df = pd.DataFrame([bst.feature_importance()], columns=feature_columns, index=['importance']).T.sort_values(by='importance', ascending=False)
imp_df

Unnamed: 0,importance
target-item_id_lag_1,192
mean_encoding-shop_id-item_id,161
mean_encoding-item_id,154
target-shop_id-item_id_lag_1,109
dt_month,93
target-item_id_lag_2,72
mean_encoding-shop_id,61
item_name_tfidf_dcp_4,56
target-item_category_id_lag_2,51
target-item_category_id_lag_1,49


## Save

In [26]:
%%time
features_save(full_df, feature_columns)

CPU times: user 10min 6s, sys: 26 s, total: 10min 32s
Wall time: 10min 41s
