In [1]:
# coding: utf-8
import os

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet, ElasticNetCV

from mydatools.plot import plot_grid_search_result

% matplotlib inline

## Config

In [2]:
# Input files. No standalone train file is read, hence the disabled path:
# trn_path = './data/input/train.csv'
tst_path = './data/input/test.csv'

# Names of the row-identifier column and of the prediction target column.
id_col, label_col = 'ID', 'item_cnt_month'

# Submission file; its columns reuse the input column names.
submission_path = './data/output/submission/submission_weights_mean.csv'
output_id_col, output_label_col = id_col, label_col

## Load Data

In [3]:
# Read the four raw input tables, in the same order as the names on the left.
item_cate_df, item_df, sales_df, shop_df = map(pd.read_csv, (
    './data/input/item_categories.csv',
    './data/input/items.csv',
    './data/input/sales_train.csv',
    './data/input/shops.csv',
))

In [4]:
# Revenue of every daily sales record (unit price times units sold).
sales_df['revenue'] = sales_df['item_price'].mul(sales_df['item_cnt_day'])

# Training rows: units sold per (shop, item, month), tagged as 'trn'.
# ID is set to a constant 0 on these rows.
trn_df = (
    sales_df
    .groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day']
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
    .assign(ID=0, ds_type='trn')
)

# Test rows: month index 34 with an unknown (NaN) target, tagged as 'tst'.
tst_df = pd.read_csv(tst_path)
tst_df['date_block_num'] = 34
tst_df['item_cnt_month'] = np.nan
tst_df['ds_type'] = 'tst'

# Stack both parts into one frame; each part keeps its own row labels.
full_df = pd.concat([trn_df, tst_df])
full_df.head(5)

Unnamed: 0,ID,date_block_num,ds_type,item_cnt_month,item_id,shop_id
0,0,1,trn,31.0,30,0
1,0,1,trn,11.0,31,0
2,0,0,trn,6.0,32,0
3,0,1,trn,10.0,32,0
4,0,0,trn,3.0,33,0


In [5]:
# Boolean row masks selecting the training part and the test part of full_df.
is_trn = full_df['ds_type'].eq('trn')
is_tst = full_df['ds_type'].eq('tst')

## Weighted mean

用历史的趋势来估计下一个月的销售情况

具体地，为每个月设置一个权重（越靠后的月份权重越大），再用加权平均来估计下一个月的销量

In [6]:
# Number of weighted months; the series below is indexed 0 .. N-1.
N = 34

# Weight of month m is ((m + 1) / N) ** 5, so later months weigh more and the
# last month has weight 1.
# Earlier experiment: exponent 2 instead of 5 was noted as "LB 1.63549".
# The weights are left unnormalised (dividing by weights.sum() was tried and
# disabled).
weights = pd.Series(
    (np.arange(1, N+1) / N) ** 5,
    index=range(N),
    name='weights',
)
weights

0     2.200926e-08
1     7.042963e-07
2     5.348250e-06
3     2.253748e-05
4     6.877893e-05
5     1.711440e-04
6     3.699096e-04
7     7.211994e-04
8     1.299625e-03
9     2.200926e-03
10    3.544613e-03
11    5.476608e-03
12    8.171884e-03
13    1.183711e-02
14    1.671328e-02
15    2.307838e-02
16    3.125000e-02
17    4.158799e-02
18    5.449710e-02
19    7.042963e-02
20    8.988804e-02
21    1.134276e-01
22    1.416591e-01
23    1.752515e-01
24    2.149342e-01
25    2.615003e-01
26    3.158088e-01
27    3.787874e-01
28    4.514352e-01
29    5.348250e-01
30    6.301064e-01
31    7.385082e-01
32    8.613410e-01
33    1.000000e+00
Name: weights, dtype: float64

In [7]:
# Attach each row's month weight by looking date_block_num up in `weights`.
# Months missing from the weights index get NaN.
# assign + map overwrites an existing 'weights' column, so the cell can be
# re-run safely; DataFrame.join would raise on the second run because the
# column already exists.
full_df = full_df.assign(weights=full_df['date_block_num'].map(weights))

In [8]:
def wmean(x):
    """Return the weighted mean of ``x.item_cnt_month`` weighted by ``x.weights``.

    ``x`` is any object exposing ``item_cnt_month`` and ``weights`` as
    array-like attributes (for example a DataFrame holding one group).
    ``fill_target`` computes the same quantity for all groups at once and does
    not call this helper; it is kept for ad-hoc use.
    """
    return (x.item_cnt_month * x.weights).sum() / x.weights.sum()


def fill_target(tst_df, trn_df, join_key):
    """Fill missing ``item_cnt_month`` values in ``tst_df`` from history.

    For every group of ``trn_df`` defined by ``join_key`` the weighted mean
    ``sum(item_cnt_month * weights) / sum(weights)`` is computed. Rows of
    ``tst_df`` whose ``item_cnt_month`` is null receive the value of their
    group; rows with no matching group stay null, and rows that already have
    a value are left untouched.

    Parameters
    ----------
    tst_df : DataFrame
        Rows to fill; must contain ``item_cnt_month`` and the key column(s).
    trn_df : DataFrame
        History; must contain ``item_cnt_month``, ``weights`` and the key
        column(s).
    join_key : str or list of str
        Column name(s) identifying a group in both frames.

    Returns
    -------
    DataFrame
        A new frame with the same columns and index as ``tst_df``; neither
        input is modified.
    """
    filled = tst_df.copy()
    if len(trn_df) == 0:
        # No history at all: nothing can be estimated.
        return filled

    keys = [join_key] if isinstance(join_key, str) else list(join_key)

    # Two vectorised grouped sums give every group's weighted mean in one
    # pass, instead of calling a Python function once per group via
    # groupby.apply.
    sums = (
        trn_df.assign(_weighted_cnt=trn_df['item_cnt_month'] * trn_df['weights'])
        .groupby(keys)[['_weighted_cnt', 'weights']]
        .sum()
    )
    estimate = (sums['_weighted_cnt'] / sums['weights']).rename('v')

    # Look the estimate up for each test row. Only the key columns are joined,
    # so no temporary column is ever added to the returned frame.
    looked_up = tst_df[keys].join(estimate, on=keys)['v'].values

    # Positional (numpy) mask and values: assignment does not depend on index
    # alignment, so a non-unique index in tst_df is handled too.
    is_missing = filled['item_cnt_month'].isnull().values
    if is_missing.any():
        filled.loc[is_missing, 'item_cnt_month'] = looked_up[is_missing]
    return filled

In [9]:
# Split the combined frame back into its test and training rows.
tst_df = full_df.loc[is_tst]
trn_df = full_df.loc[is_trn]

In [10]:
# Estimate each test row from the history of the same (shop, item) pair.
tst_df = fill_target(
    tst_df=tst_df,
    trn_df=trn_df,
    join_key=['shop_id', 'item_id'],
)

In [11]:
# Number of test rows whose target is still missing after the fill.
tst_df['item_cnt_month'].isna().sum()

102796

In [12]:
# Rows still without an estimate get a prediction of 0.
# fillna acts on the whole frame, so any other NaN cell becomes 0 as well.
tst_df = tst_df.fillna(value=0)

## Save to file

In [13]:
# Submission frame: integer IDs plus predictions limited to the range [0, 20],
# written without the row index.
res_df = tst_df[['ID', 'item_cnt_month']].assign(
    ID=lambda frame: frame['ID'].astype('int'),
    item_cnt_month=lambda frame: frame['item_cnt_month'].clip(0, 20),
)
res_df.to_csv(submission_path, index=False)