将训练集扩展为(shop_id, item_id, date_block_num)的所有情况，没有销量的补0

In [1]:
# coding: utf-8
import os

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet, ElasticNetCV

from mydatools.plot import plot_grid_search_result

% matplotlib inline

## Config

In [20]:
# trn_path = './data/input/train.csv'
# Test-set CSV (read with pd.read_csv when the full dataset is assembled).
tst_path = './data/input/test.csv'
# Row-identifier column and prediction-target column.
id_col = 'ID'
label_col = 'item_cnt_month'

# Destination of the submission file and the column names written to it.
submission_path = './data/output/submission/extend_all_train_datasets.csv'
output_id_col = id_col
output_label_col = label_col

## Load Data

In [3]:
# Read the raw input tables from ./data/input/ into DataFrames.
item_cate_df = pd.read_csv('./data/input/item_categories.csv')
item_df = pd.read_csv('./data/input/items.csv')
sales_df = pd.read_csv('./data/input/sales_train.csv')
shop_df = pd.read_csv('./data/input/shops.csv')

In [4]:
# Revenue of each sales record = unit price * units sold.
sales_df['revenue'] = sales_df['item_price'] * sales_df['item_cnt_day']

# Collapse the daily records into one row per (shop, item, month) holding the
# summed count, renamed to the label column used by the test set.
trn_df = (
    sales_df
    .groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day']
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
trn_df['ID'] = 0
trn_df['ds_type'] = 'trn'

# The test rows are tagged as month 34 with a placeholder label of 0.
tst_df = pd.read_csv(tst_path)
tst_df['date_block_num'] = 34
tst_df['item_cnt_month'] = 0
tst_df['ds_type'] = 'tst'

# Stack train and test so features can be computed on a single frame.
full_df = pd.concat([trn_df, tst_df])
full_df.head()

Unnamed: 0,ID,date_block_num,ds_type,item_cnt_month,item_id,shop_id
0,0,1,trn,31.0,30,0
1,0,1,trn,11.0,31,0
2,0,0,trn,6.0,32,0
3,0,1,trn,10.0,32,0
4,0,0,trn,3.0,33,0


## Add Validation

验证集策略：

测试集是2015.11这个月的数据，需要预测所有店铺所有商品的销量

那么验证集可以为2015.10这个月，所有店铺所有商品的数据，如果没有记录就补0

In [5]:
# For every month 0..33, add a zero-sales row for each (shop_id, item_id) pair
# that is possible in that month but has no record. The new rows are collected
# in a list and concatenated once at the end: the per-month lookups only read
# rows of their own month, so the result is the same as appending inside the
# loop, without re-copying the ever-growing frame on every iteration.
added_frames = []
for dbn in range(0, 34):
    month_df = full_df[full_df['date_block_num'] == dbn]

    # All possible (shop_id, item_id) pairs: shops seen this month x items seen this month.
    unique_shop_id = month_df['shop_id'].drop_duplicates()
    unique_item_id = month_df['item_id'].drop_duplicates()
    m_index = pd.MultiIndex.from_product([unique_shop_id, unique_item_id], names=['shop_id', 'item_id'])
    val_df = pd.DataFrame([], index=m_index).reset_index()
    val_df['ID'] = 0
    val_df['date_block_num'] = dbn
    # Tag the generated rows so they can be told apart from the original 'trn' rows.
    val_df['ds_type'] = 'add_val'

    # Drop the pairs that already have a record in this month: after the left
    # merge only the missing pairs have a null item_cnt_month.
    existing_df = month_df[['item_id', 'shop_id', 'item_cnt_month']]
    val_df = val_df.merge(existing_df, how='left', on=['item_id', 'shop_id'])
    # .copy() so the assignment below writes to an independent frame, not a slice.
    val_df = val_df[val_df['item_cnt_month'].isnull()].copy()
    # A missing record means nothing was sold.
    val_df['item_cnt_month'] = 0

    added_frames.append(val_df)

# Merge all generated rows into full_df in a single pass.
full_df = pd.concat([full_df] + added_frames)

## Features

In [6]:
# Ordered registry of the column names fed to the model.
feature_columns = []

def _as_feature_list(features):
    """Return `features` unchanged if it is a list, otherwise wrap it in a one-element list."""
    return features if isinstance(features, list) else [features]

def add_features(features):
    """Register one feature name or a list of names in `feature_columns`.

    Names are appended in the order given. A name is skipped when it is
    already registered, including when it is repeated inside the same call,
    so the registry never holds duplicates.
    """
    for name in _as_feature_list(features):
        if name not in feature_columns:
            feature_columns.append(name)

def remove_features(features):
    """Unregister one feature name or a list of names from `feature_columns`.

    Names that are not registered are ignored. The list is updated in place
    (like `add_features` does), so every reference to it stays in sync.
    """
    unwanted = _as_feature_list(features)
    feature_columns[:] = [name for name in feature_columns if name not in unwanted]

**shop_id, item_id**

In [7]:
# add_features(['shop_id', 'item_id'])

**datetime info**

In [8]:
# date_block_num counts months from block 0, which is mapped here to 2013-01:
# the quotient by 12 gives the year offset and the remainder the zero-based month.
full_df['dt_year'] = 2013 + full_df['date_block_num'].floordiv(12)
full_df['dt_month'] = 1 + full_df['date_block_num'].mod(12)
add_features('dt_year')
add_features('dt_month')

**aggregation data**

In [9]:
def get_aggragation_feature(df, groupby_cols, agg_col):
    """Attach the per-group mean of `agg_col` to `df` as a new feature column.

    The mean is computed only from rows whose `ds_type` is 'trn', grouped by
    `groupby_cols`, and joined back onto every row of `df`. Rows whose group
    has no 'trn' record get 0 for the new feature.

    Parameters
    ----------
    df : DataFrame containing `ds_type`, every column in `groupby_cols`, and `agg_col`.
    groupby_cols : list of column names to group by.
    agg_col : name of the column to aggregate.

    Returns
    -------
    (new_df, new_cols) : a copy of `df` with the feature column(s) added, and
        the list of added column names ('<groupby cols>-<agg_col>:mean').

    Only the newly added columns are zero-filled; missing values in the other
    columns of `df` are left untouched. If `df` already carries a column with
    the same feature name (e.g. the cell is re-run), that column is recomputed
    instead of raising on the overlapping name.

    NOTE(review): the statistic uses every 'trn' row regardless of month, so
    rows later held out for validation contribute to their own feature values.
    """
    gb = df[df['ds_type'] == 'trn'].groupby(groupby_cols)[agg_col]
    fname_fmt = '-'.join(groupby_cols + [agg_col]) + ':%s'
    agg_df = pd.DataFrame({
            fname_fmt % 'mean': gb.mean(),
#             fname_fmt%'median': gb.median(),
#             fname_fmt%'max': gb.max(),
#             fname_fmt%'min': gb.min(),
        })
    new_cols = agg_df.columns.tolist()

    # Drop stale copies of the feature so that the join cannot hit overlapping names.
    stale_cols = [c for c in new_cols if c in df.columns]
    new_df = df.drop(stale_cols, axis=1).join(agg_df, on=groupby_cols)
    # Groups without any 'trn' row have no statistic; use 0 for the new columns only.
    new_df[new_cols] = new_df[new_cols].fillna(0)
    return new_df, new_cols

In [10]:
# Key combinations for which a mean-of-label feature is generated.
groupby_cols_list = [
    ['shop_id', 'item_id'],
    ['shop_id'],
    ['item_id'],
    ['date_block_num'],
    ['dt_month'],
]
for groupby_cols in groupby_cols_list:
    # Each call returns the augmented frame plus the names of the columns it added.
    full_df, new_feats = get_aggragation_feature(
        df=full_df,
        groupby_cols=groupby_cols,
        agg_col='item_cnt_month',
    )
    add_features(new_feats)

## Validation

In [11]:
# Hold out month 33 for validation; train on months 0..32.
trn_df = full_df.loc[full_df['date_block_num'] <= 32]
val_df = full_df.loc[full_df['date_block_num'] == 33].copy()

X_trn, y_trn = trn_df[feature_columns], trn_df[label_col]
X_val, y_val = val_df[feature_columns], val_df[label_col]

# Standardise the features with statistics learned on the training split only.
scaler = preprocessing.StandardScaler().fit(X_trn)
X_trn = scaler.transform(X_trn)
X_val = scaler.transform(X_val)

## ElasticNet

In [12]:
# score
def rmse(y, y_pred):
    """Return the root mean squared error between targets `y` and predictions `y_pred`."""
    mse = metrics.mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [13]:
# Hyper-parameter candidates for ElasticNet.
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10],
#     'l1_ratio': [0, 0.25, 0.5, 0.75, 1],
    'l1_ratio': [0.5],
}

# Start from +inf so that any finite score is accepted as the first best.
best_score = float('inf')
best_param = None
for param in model_selection.ParameterGrid(param_grid):
    print(param)
    clf = ElasticNet(**param)
    clf.fit(X_trn, y_trn)
    # Predict once per candidate; predictions are clipped to [0, 20] before scoring.
    val_pred = clf.predict(X_val).clip(0, 20)
    s = rmse(y_val, val_pred)
    if s < best_score:
        best_score = s
        best_param = param
        # Keep the predictions of the best candidate for later inspection.
        val_df['pred'] = val_pred
    print(s)

{'l1_ratio': 0.5, 'alpha': 0.001}
5.182724165208148
{'l1_ratio': 0.5, 'alpha': 0.01}
5.1823909616115165
{'l1_ratio': 0.5, 'alpha': 0.1}
5.1788754622253474
{'l1_ratio': 0.5, 'alpha': 1}
5.156887222716557
{'l1_ratio': 0.5, 'alpha': 10}
5.3428664013614515


In [14]:
# Preview only the first rows; rendering the whole validation frame is not useful.
val_df.head(10)

Unnamed: 0,ID,date_block_num,ds_type,item_cnt_month,item_id,shop_id,dt_year,dt_month,shop_id-item_id-item_cnt_month:mean,shop_id-item_cnt_month:mean,item_id-item_cnt_month:mean,date_block_num-item_cnt_month:mean,dt_month-item_cnt_month:mean,pred
8113,0,33,trn,1.0,31,2,2015,10,1.600000,1.968752,2.841584,2.253528,2.443929,0.599023
8205,0,33,trn,3.0,486,2,2015,10,1.777778,1.968752,4.214976,2.253528,2.443929,0.668214
8254,0,33,trn,1.0,787,2,2015,10,1.000000,1.968752,2.692308,2.253528,2.443929,0.365500
8284,0,33,trn,1.0,794,2,2015,10,1.000000,1.968752,1.642857,2.253528,2.443929,0.365500
8367,0,33,trn,1.0,968,2,2015,10,1.000000,1.968752,2.701613,2.253528,2.443929,0.365500
8378,0,33,trn,1.0,988,2,2015,10,1.000000,1.968752,1.761905,2.253528,2.443929,0.365500
8422,0,33,trn,1.0,1075,2,2015,10,1.500000,1.968752,2.940653,2.253528,2.443929,0.560102
8436,0,33,trn,1.0,1121,2,2015,10,1.000000,1.968752,3.102041,2.253528,2.443929,0.365500
8467,0,33,trn,1.0,1377,2,2015,10,1.000000,1.968752,1.607143,2.253528,2.443929,0.365500
8476,0,33,trn,1.0,1387,2,2015,10,3.333333,1.968752,3.445545,2.253528,2.443929,1.273643


In [15]:
# RMSE on the validation rows that come from the original training records only.
# The transposed value array unpacks into the label column and the prediction column.
rmse(*val_df.loc[val_df['ds_type'] == 'trn', ['item_cnt_month', 'pred']].values.T)

14.085175072861256

In [16]:
# RMSE on every validation row (original records plus the added zero-sales rows).
rmse(val_df['item_cnt_month'], val_df['pred'])

5.156887222716557

## Retrain(All data)

In [17]:
# Train on all labelled months (0..33); month 34 holds the test rows.
trn_df = full_df.loc[full_df['date_block_num'] <= 33]
tst_df = full_df.loc[full_df['date_block_num'] == 34]

X_trn, y_trn = trn_df[feature_columns], trn_df[label_col]
X_tst = tst_df[feature_columns]

# Refit the scaler on the enlarged training set and apply it to both splits.
scaler = preprocessing.StandardScaler().fit(X_trn)
X_trn = scaler.transform(X_trn)
X_tst = scaler.transform(X_tst)

In [18]:
# Fit the final ElasticNet with the best hyper-parameters found in the grid search.
model = ElasticNet(**best_param)
# Kept as the last expression so the fitted estimator is displayed as the cell output.
model.fit(X_trn, y_trn)

ElasticNet(alpha=1, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

## Predict

In [21]:
# Predict the test month, clipping predictions to [0, 20] as was done for validation.
res_df = pd.DataFrame(model.predict(X_tst).clip(0,20), columns=[output_label_col])
# .values drops the index so the IDs are assigned positionally, matching the prediction order.
res_df[output_id_col] = tst_df[output_id_col].astype(int).values

# Make sure the output directory exists; to_csv does not create missing directories.
submission_dir = os.path.dirname(submission_path)
if submission_dir:
    os.makedirs(submission_dir, exist_ok=True)
res_df[[output_id_col, output_label_col]].to_csv(submission_path, index=False)