In [1]:
# coding: utf-8
import os

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet, ElasticNetCV

from mydatools.plot import plot_grid_search_result

% matplotlib inline

## Config

In [2]:
# File locations. No train.csv path is configured; the line below is kept
# only as a disabled placeholder.
# trn_path = './data/input/train.csv'
tst_path = './data/input/test.csv'
submission_path = './data/output/submission/submission.csv'

# Column names for the row identifier and the prediction target.
id_col, label_col = 'ID', 'item_cnt_month'

# The submission file reuses the same two column names.
output_id_col, output_label_col = id_col, label_col

## Load Data

In [3]:
# Read the four raw input tables, in the same order as the names on the left.
item_cate_df, item_df, sales_df, shop_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        './data/input/item_categories.csv',
        './data/input/items.csv',
        './data/input/sales_train.csv',
        './data/input/shops.csv',
    )
]

In [4]:
# Per-row revenue is stored on the raw sales table.
sales_df['revenue'] = sales_df['item_price'] * sales_df['item_cnt_day']

# Training rows: daily counts summed per (shop, item, month block).
# Training rows have no real ID, so a constant 0 is used.
trn_df = (
    sales_df
    .groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day']
    .sum()
    .rename('item_cnt_month')
    .reset_index()
    .assign(ID=0, ds_type='trn')
)

# Test rows: tagged with month block 34 and a placeholder target of 0.
tst_df = pd.read_csv(tst_path).assign(
    date_block_num=34,
    item_cnt_month=0,
    ds_type='tst',
)

# Stack both parts; 'ds_type' tells them apart afterwards.
full_df = pd.concat([trn_df, tst_df])
full_df.head()

Unnamed: 0,ID,date_block_num,ds_type,item_cnt_month,item_id,shop_id
0,0,1,trn,31.0,30,0
1,0,1,trn,11.0,31,0
2,0,0,trn,6.0,32,0
3,0,1,trn,10.0,32,0
4,0,0,trn,3.0,33,0


## Add Validation

验证集策略：

测试集是2015.11这个月的数据，需要预测所有店铺所有商品的销量

那么验证集可以为2015.10这个月，所有店铺所有商品的数据，如果没有记录就补0

In [5]:
# Every row present so far is flagged as not being a synthetic validation row.
full_df = full_df.assign(is_val=False)

In [6]:
# Build every possible (shop_id, item_id) pair from the shops and items that
# have at least one row in month block 33.
unique_shop_id = full_df[full_df['date_block_num'] == 33]['shop_id'].copy().drop_duplicates()
unique_item_id = full_df[full_df['date_block_num'] == 33]['item_id'].copy().drop_duplicates()
m_index = pd.MultiIndex.from_product([unique_shop_id, unique_item_id], names=['shop_id', 'item_id'])
val_df = pd.DataFrame([], index=m_index).reset_index()
val_df['date_block_num'] = 33
# Use a distinct ds_type so these rows can be told apart from the original 'trn' rows.
val_df['ds_type'] = 'add_val'

# Drop the pairs that already have a row in month block 33: after the left
# merge, only pairs with no existing record have a null item_cnt_month.
origin_33_df = full_df[full_df['date_block_num'] == 33][['item_id', 'shop_id', 'item_cnt_month']]
val_df = val_df.merge(origin_33_df, how='left', on=['item_id', 'shop_id'])
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not write into a slice (no SettingWithCopyWarning).
val_df = val_df[val_df['item_cnt_month'].isnull()].copy()
# A missing record is treated as zero sales for that month.
val_df['item_cnt_month'] = 0

# Append the synthetic rows to full_df, flagged as validation-only.
# These rows carry no 'ID' column, so 'ID' is NaN for them after the concat.
val_df['is_val'] = True
full_df = pd.concat([full_df, val_df])

## Features

In [7]:
# Ordered list of column names used as model inputs.
feature_columns = []


def _as_feature_list(features):
    """Return `features` unchanged if it is a list, else wrap it in a one-element list."""
    return features if isinstance(features, list) else [features]


def add_features(features):
    """Register one feature name or a list of names in `feature_columns`.

    Names are appended in the order given. A name is skipped if it is already
    registered, including when it appears more than once in the same call.

    Args:
        features: a single feature name, or a list of feature names.
    """
    global feature_columns
    for name in _as_feature_list(features):
        # Check membership per item so duplicates inside `features` are
        # caught too, not only names registered by earlier calls.
        if name not in feature_columns:
            feature_columns.append(name)


def remove_features(features):
    """Unregister one feature name or a list of names from `feature_columns`.

    Names that are not registered are ignored. The global list is rebound to
    a new list object holding the remaining names in their original order.

    Args:
        features: a single feature name, or a list of feature names.
    """
    global feature_columns
    to_drop = _as_feature_list(features)
    feature_columns = [name for name in feature_columns if name not in to_drop]

**shop_id, item_id**

In [8]:
# add_features(['shop_id', 'item_id'])

**datetime info**

In [9]:
# Turn the running month index into calendar parts: the formulas map block 0
# to year 2013 / month 1 and advance one month per block.
full_df['dt_year'] = 2013 + full_df['date_block_num'] // 12
full_df['dt_month'] = 1 + full_df['date_block_num'] % 12

# Register both derived columns as model features, year first.
add_features('dt_year')
add_features('dt_month')

**aggregation data**

In [10]:
def get_aggragation_feature(df, groupby_cols, agg_col):
    """Join group-level statistics of `agg_col` onto `df`.

    The mean, median, max and min of `agg_col` are computed per
    `groupby_cols` group, using only rows whose 'ds_type' is 'trn', and then
    joined onto every row of `df` by the group keys. Rows whose key has no
    'trn' statistics get 0 in the new columns.

    New columns are named '<groupby cols joined by "-">-<agg_col>:<stat>'.

    Args:
        df: DataFrame with a 'ds_type' column, the `groupby_cols` columns and
            `agg_col`.
        groupby_cols: list of column names to group by.
        agg_col: name of the column to aggregate.

    Returns:
        (new_df, new_columns): a new DataFrame with the statistic columns
        added, and the list of added column names.
    """
    gb = df[df['ds_type'] == 'trn'].groupby(groupby_cols)[agg_col]
    fname_fmt = '-'.join(groupby_cols+[agg_col]) + ':%s'
    agg_df = pd.DataFrame({
            fname_fmt%'mean': gb.mean(),
            fname_fmt%'median': gb.median(),
            fname_fmt%'max': gb.max(),
            fname_fmt%'min': gb.min(),
        })
    new_cols = agg_df.columns.tolist()
    new_df = df.join(agg_df, on=groupby_cols)
    # Fill only the freshly joined statistics. A frame-wide fillna(0) would
    # also overwrite missing values in unrelated columns of `df`.
    new_df[new_cols] = new_df[new_cols].fillna(0)
    return new_df, new_cols

In [11]:
# Key sets for which target statistics are added as features.
groupby_cols_list = [
    ['shop_id', 'item_id'],
    ['shop_id'],
    ['item_id'],
    ['date_block_num'],
    ['dt_month'],
]

# NOTE(review): the statistics are taken over all 'trn' rows, which includes
# month block 33 used for validation — confirm this leakage is acceptable.
for groupby_cols in groupby_cols_list:
    aggregated = get_aggragation_feature(full_df, groupby_cols, 'item_cnt_month')
    full_df, new_feats = aggregated
    add_features(new_feats)

## Validation

In [12]:
# Time-based split: month blocks up to 32 train the model, block 33 validates it.
# The validation frame is copied because a 'pred' column is added to it later.
trn_df = full_df.loc[full_df['date_block_num'] <= 32]
val_df = full_df.loc[full_df['date_block_num'] == 33].copy()

X_trn, y_trn = trn_df[feature_columns], trn_df[label_col]
X_val, y_val = val_df[feature_columns], val_df[label_col]

# Standardize features; the scaler is fitted on the training part only.
scaler = preprocessing.StandardScaler().fit(X_trn)
X_trn = scaler.transform(X_trn)
X_val = scaler.transform(X_val)

## ElasticNet

In [13]:
# Scoring metric
def rmse(y, y_pred):
    """Return the root of the mean squared error between `y` and `y_pred`."""
    mse = metrics.mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [14]:
# Hyper-parameter grid: only alpha is searched, l1_ratio is fixed at 0.5.
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10],
#     'l1_ratio': [0, 0.25, 0.5, 0.75, 1],
    'l1_ratio': [0.5],
}

# Start from +inf so that any finite score is accepted as the first best,
# whatever its magnitude.
best_score = np.inf
best_param = None
for param in model_selection.ParameterGrid(param_grid):
    print(param)
    clf = ElasticNet(**param)
    clf.fit(X_trn, y_trn)
    # Predict once per candidate; the clipped prediction is reused for both
    # the score and the stored 'pred' column.
    # NOTE(review): predictions are clipped to [0, 20] but y_val is not —
    # confirm whether the targets should be clipped the same way.
    val_pred = clf.predict(X_val).clip(0, 20)
    s = rmse(y_val, val_pred)
    if s < best_score:
        best_score = s
        best_param = param
        # Keep the predictions of the best candidate so far for inspection.
        val_df['pred'] = val_pred
    print(s)

{'alpha': 0.001, 'l1_ratio': 0.5}
5.233957267325411
{'alpha': 0.01, 'l1_ratio': 0.5}
5.236928822629936
{'alpha': 0.1, 'l1_ratio': 0.5}
5.265577813250332
{'alpha': 1, 'l1_ratio': 0.5}
5.3543749832749805
{'alpha': 10, 'l1_ratio': 0.5}
5.589103921806671


In [15]:
# Show a small sample of validation rows with their predictions instead of
# dumping the whole frame.
val_df.head(10)

Unnamed: 0,ID,date_block_num,ds_type,is_val,item_cnt_month,item_id,shop_id,dt_year,dt_month,shop_id-item_id-item_cnt_month:max,...,item_id-item_cnt_month:min,date_block_num-item_cnt_month:max,date_block_num-item_cnt_month:mean,date_block_num-item_cnt_month:median,date_block_num-item_cnt_month:min,dt_month-item_cnt_month:max,dt_month-item_cnt_month:mean,dt_month-item_cnt_month:median,dt_month-item_cnt_month:min,pred
8113,0.0,33,trn,False,1.0,31,2,2015,10,4.0,...,-1.0,2253.0,2.253528,1.0,-1.0,2253.0,2.443929,1.0,-1.0,0.847550
8205,0.0,33,trn,False,3.0,486,2,2015,10,3.0,...,1.0,2253.0,2.253528,1.0,-1.0,2253.0,2.443929,1.0,-1.0,0.902376
8254,0.0,33,trn,False,1.0,787,2,2015,10,1.0,...,1.0,2253.0,2.253528,1.0,-1.0,2253.0,2.443929,1.0,-1.0,0.122541
8284,0.0,33,trn,False,1.0,794,2,2015,10,1.0,...,1.0,2253.0,2.253528,1.0,-1.0,2253.0,2.443929,1.0,-1.0,0.138959
8367,0.0,33,trn,False,1.0,968,2,2015,10,1.0,...,1.0,2253.0,2.253528,1.0,-1.0,2253.0,2.443929,1.0,-1.0,0.167705
8378,0.0,33,trn,False,1.0,988,2,2015,10,1.0,...,1.0,2253.0,2.253528,1.0,-1.0,2253.0,2.443929,1.0,-1.0,0.144606
8422,0.0,33,trn,False,1.0,1075,2,2015,10,3.0,...,1.0,2253.0,2.253528,1.0,-1.0,2253.0,2.443929,1.0,-1.0,0.722513
8436,0.0,33,trn,False,1.0,1121,2,2015,10,1.0,...,1.0,2253.0,2.253528,1.0,-1.0,2253.0,2.443929,1.0,-1.0,0.150977
8467,0.0,33,trn,False,1.0,1377,2,2015,10,1.0,...,1.0,2253.0,2.253528,1.0,-1.0,2253.0,2.443929,1.0,-1.0,0.164057
8476,0.0,33,trn,False,1.0,1387,2,2015,10,8.0,...,1.0,2253.0,2.253528,1.0,-1.0,2253.0,2.443929,1.0,-1.0,2.753188


## Retrain(All data)

In [16]:
# Final fit: select rows by ds_type. Only 'trn' rows are used for training
# and 'tst' rows for prediction.
trn_df = full_df.loc[full_df['ds_type'] == 'trn']
tst_df = full_df.loc[full_df['ds_type'] == 'tst']

X_trn, y_trn = trn_df[feature_columns], trn_df[label_col]
X_tst = tst_df[feature_columns]

# Standardize with a fresh scaler fitted on the training rows.
scaler = preprocessing.StandardScaler().fit(X_trn)
X_trn = scaler.transform(X_trn)
X_tst = scaler.transform(X_tst)

In [17]:
# Refit ElasticNet with the best parameters found during validation.
# fit() returns the estimator itself, so the cell still displays its repr.
model = ElasticNet(**best_param).fit(X_trn, y_trn)
model

ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

## Predict

In [18]:
# Predict the test rows, clipping to the same [0, 20] range used in validation.
tst_pred = model.predict(X_tst).clip(0, 20)
res_df = pd.DataFrame(tst_pred, columns=[output_label_col])
# .values aligns the IDs to the predictions by position, not by index label.
res_df[output_id_col] = tst_df[output_id_col].astype(int).values

# Create the output directory if needed so to_csv does not fail on a fresh checkout.
submission_dir = os.path.dirname(submission_path)
if submission_dir:
    os.makedirs(submission_dir, exist_ok=True)
res_df[[output_id_col, output_label_col]].to_csv(submission_path, index=False)