In [1]:
# coding: utf-8
import os

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet, ElasticNetCV

from mydatools.plot import plot_grid_search_result

% matplotlib inline

## Config

In [63]:
# Input location and the column names used throughout the notebook.
# The train.csv path stays disabled: the training frame is aggregated from
# sales_train.csv in the "Load Data" section instead.
# trn_path = './data/input/train.csv'
tst_path = './data/input/test.csv'
id_col, label_col = 'ID', 'item_cnt_month'

# Where the submission is written, and the column names it uses
# (identical to the input column names).
submission_path = './data/output/submission/submission_weights_mean.csv'
output_id_col, output_label_col = id_col, label_col

## Load Data

In [3]:
# Read the four raw input tables, in this order: item categories, items,
# daily sales records, shops.
item_cate_df, item_df, sales_df, shop_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/input/item_categories.csv',
        './data/input/items.csv',
        './data/input/sales_train.csv',
        './data/input/shops.csv',
    )
)

In [26]:
# Revenue of each sales record (price * units). It is added to sales_df but is
# not read again by the weighted-mean baseline in this notebook.
sales_df['revenue'] = sales_df['item_price'] * sales_df['item_cnt_day']

# Training target: total units sold per (shop, item, month block).
trn_df = (
    sales_df
    .groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day']
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
trn_df['ID'] = 0  # placeholder value; training rows have no ID of their own
trn_df['ds_type'] = 'trn'

# Test rows: the target is unknown, so it starts out as NaN.
tst_df = pd.read_csv(tst_path)
# NOTE(review): 34 is presumably the block right after the last training month
# (the weights below cover blocks 0..33) - confirm against the data.
tst_df['date_block_num'] = 34
tst_df['item_cnt_month'] = np.nan
tst_df['ds_type'] = 'tst'

# ignore_index=True gives every row of the stacked frame a unique label.
# Without it the 0..n-1 labels of trn_df and tst_df overlap, which makes any
# later label-based alignment on full_df ambiguous.
full_df = pd.concat([trn_df, tst_df], ignore_index=True)
full_df.head()

Unnamed: 0,ID,date_block_num,ds_type,item_cnt_month,item_id,shop_id
0,0,1,trn,31.0,30,0
1,0,1,trn,11.0,31,0
2,0,0,trn,6.0,32,0
3,0,1,trn,10.0,32,0
4,0,0,trn,3.0,33,0


In [5]:
# Boolean row selectors for the two dataset types stored in full_df['ds_type'].
is_tst = full_df['ds_type'].eq('tst')
is_trn = full_df['ds_type'].eq('trn')

## Weighted mean

用历史的趋势来估计下一个月的销售情况

具体地，为每个月设置权重（越靠后的月份权重越大：第 $t$ 个月的权重为 $((t+1)/N)^2$，$t = 0, \dots, N-1$），再用各月销量的加权平均来估计下一个月的销量

In [31]:
# Number of month blocks that receive a weight (date_block_num 0 .. N-1).
N = 34

# Quadratically growing weights: block t gets ((t + 1) / N) ** 2, so the most
# recent block has weight 1 and older blocks fade towards 0.
weights = pd.Series(
    data=(np.arange(1, N + 1) / N) ** 2,
    index=range(N),
    name='weights',
)
# The weights are deliberately left unnormalised: the weighted mean computed
# later divides by the sum of the weights it actually used.
weights

0     0.000865
1     0.003460
2     0.007785
3     0.013841
4     0.021626
5     0.031142
6     0.042388
7     0.055363
8     0.070069
9     0.086505
10    0.104671
11    0.124567
12    0.146194
13    0.169550
14    0.194637
15    0.221453
16    0.250000
17    0.280277
18    0.312284
19    0.346021
20    0.381488
21    0.418685
22    0.457612
23    0.498270
24    0.540657
25    0.584775
26    0.630623
27    0.678201
28    0.727509
29    0.778547
30    0.831315
31    0.885813
32    0.942042
33    1.000000
Name: weights, dtype: float64

In [33]:
# Attach each row's month weight by looking up its date_block_num in `weights`.
# Blocks that are not in the weights index (e.g. the test block) get NaN.
# A 'weights' column left over from a previous run is dropped first so the cell
# can be re-run: join() raises when the column names overlap.
full_df = (
    full_df
    .drop('weights', axis=1, errors='ignore')
    .join(weights, on='date_block_num')
)

In [57]:
def wmean(x):
    """Return the weighted mean of ``x.item_cnt_month`` using ``x.weights``.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows with an ``item_cnt_month`` and a ``weights`` column.

    Returns
    -------
    float
        ``sum(item_cnt_month * weights) / sum(weights)``. NaN products are
        skipped by ``sum``; a zero weight total yields NaN or inf.
    """
    return (x.item_cnt_month * x.weights).sum() / x.weights.sum()


def fill_target(tst_df, trn_df, join_key):
    """Fill missing ``item_cnt_month`` values in ``tst_df`` from ``trn_df``.

    For every group of ``trn_df`` rows sharing the same ``join_key`` values the
    weighted mean of ``item_cnt_month`` (weights taken from the ``weights``
    column, same formula as ``wmean``) is computed. Test rows whose target is
    missing receive the estimate of their group; rows that already have a value
    are left alone, and rows whose group does not occur in ``trn_df`` stay NaN.

    Parameters
    ----------
    tst_df : pandas.DataFrame
        Frame to fill; must contain ``item_cnt_month`` and the key column(s).
    trn_df : pandas.DataFrame
        Frame with ``item_cnt_month``, ``weights`` and the key column(s).
    join_key : str or list of str
        Column name(s) identifying a group.

    Returns
    -------
    pandas.DataFrame
        A copy of ``tst_df`` (same columns, index and row order) with the
        target filled where possible. ``tst_df`` itself is not modified.
    """
    keys = [join_key] if isinstance(join_key, str) else list(join_key)

    # Vectorized weighted mean: sum numerator and denominator per group instead
    # of calling a Python function once per group.
    parts = trn_df[keys + ['weights']].copy()
    parts['_weighted_cnt'] = trn_df['item_cnt_month'] * trn_df['weights']
    sums = parts.groupby(keys)[['_weighted_cnt', 'weights']].sum()
    estimate = (sums['_weighted_cnt'] / sums['weights']).rename('_estimate')

    # Look the estimate up on a key-only view of tst_df, so that no helper
    # column can clash with a column that tst_df already has.
    looked_up = tst_df[keys].join(estimate, on=keys)['_estimate']

    filled = tst_df.copy()
    current = filled['item_cnt_month']
    # The group index is unique, so the join keeps tst_df's rows and order;
    # combining positionally therefore also works with duplicate index labels.
    filled['item_cnt_month'] = np.where(
        current.isnull().values, looked_up.values, current.values
    )
    return filled

In [58]:
# Split the weighted frame back into its training and test rows.
trn_df = full_df.loc[is_trn]
tst_df = full_df.loc[is_tst]

In [59]:
# Estimate each test row from the history of the same (shop, item) pair.
tst_df = fill_target(
    tst_df=tst_df,
    trn_df=trn_df,
    join_key=['shop_id', 'item_id'],
)

In [61]:
# Number of test rows whose target is still missing after the fill.
pd.isnull(tst_df['item_cnt_month']).sum()

102796

In [65]:
# Test rows with no estimate are predicted as 0 sales.
# Only the target column is filled: a blanket fillna(0) would also overwrite
# NaNs in unrelated columns (such as the NaN weights of the test rows) and
# could hide genuinely missing data.
tst_df = tst_df.fillna({'item_cnt_month': 0})

## Save to file

In [None]:
# Build the submission from the id and target columns named in the config cell
# instead of repeating the literal column names here.
res_df = tst_df[[id_col, label_col]].copy()
res_df[id_col] = res_df[id_col].astype('int')
# NOTE(review): [0, 20] is presumably the target range the competition scores
# on - confirm before changing the bounds.
res_df[label_col] = res_df[label_col].clip(0, 20)
res_df = res_df.rename(columns={id_col: output_id_col,
                                label_col: output_label_col})

# to_csv does not create missing folders, so make sure the target one exists.
out_dir = os.path.dirname(submission_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
res_df.to_csv(submission_path, index=False)