In [1]:
# coding: utf-8
import os
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from common import read_features, save_features

% matplotlib inline

In [2]:
# Column names of the submission file: row identifier and prediction target.
id_col, label_col = 'id', 'visitors'

# Where the final submission CSV is written.
submission_path = './data/output/submission/weighted_mean_visitors.csv'

# Output column names mirror the input ones.
output_id_col, output_label_col = id_col, label_col

## Load Data

In [3]:
# Read every raw input table into its own DataFrame (same files, same order).
input_paths = (
    './data/input/air_reserve.csv',
    './data/input/air_store_info.csv',
    './data/input/air_visit_data.csv',
    './data/input/date_info.csv',
    './data/input/hpg_reserve.csv',
    './data/input/hpg_store_info.csv',
    './data/input/sample_submission.csv',
    './data/input/store_id_relation.csv',
)
(
    air_reserve,
    air_store_info,
    air_visit_data,
    date_info,
    hpg_reserve,
    hpg_store_info,
    sample_submission,
    store_id_relation,
) = (pd.read_csv(path) for path in input_paths)

## Data Wrangling

In [4]:
# Weekends do not count as holidays: clear holiday_flg on every Saturday or
# Sunday that is flagged as a holiday, so the flag only remains set on
# non-weekend days.
# Built as a vectorized boolean mask instead of a row-wise apply(axis=1).
is_weekend = date_info['day_of_week'].isin(['Saturday', 'Sunday'])
wkend_holidays = is_weekend & (date_info['holiday_flg'] == 1)
date_info.loc[wkend_holidays, 'holiday_flg'] = 0

In [5]:
# Recency weight per calendar row: (row position / number of rows) ** exponent.
# The last row gets weight 1 and earlier rows shrink towards 0, faster for
# larger exponents.
# NOTE(review): this assumes date_info rows are in chronological order - confirm.
#
# LB scores recorded for each exponent tried:
#   1 -> 0.509, 2 -> 0.503, 3 -> 0.500, 4 -> 0.498, 5 -> 0.497
WEIGHT_EXPONENT = 5

# 1-based row position; uses np.arange so the result does not depend on the
# index labels being the default 0..n-1.
day_rank = np.arange(1, len(date_info) + 1)
date_info['weight'] = (day_rank / len(date_info)) ** WEIGHT_EXPONENT

In [6]:
# Give the calendar's date column the name 'visit_date' so later cells can
# merge on that key.
date_info = date_info.rename({'calendar_date': 'visit_date'}, axis='columns')

In [7]:
# Preview the first rows of the calendar table (renamed date column plus weight).
date_info.head()

Unnamed: 0,visit_date,day_of_week,holiday_flg,weight
0,2016-01-01,Friday,1,2.707368e-14
1,2016-01-02,Saturday,0,8.663577e-13
2,2016-01-03,Sunday,0,6.578904e-12
3,2016-01-04,Monday,0,2.772345e-11
4,2016-01-05,Tuesday,0,8.460525e-11


* 设置权重

In [8]:
# Attach the calendar columns (day_of_week, holiday_flg, weight) to every
# historical visit row.
visit_data = air_visit_data.merge(date_info, how='left', on='visit_date')

# Move the visitor counts to log1p space. np.log1p is applied to the whole
# column at once rather than through a per-element lambda.
visit_data['visitors'] = np.log1p(visit_data['visitors'])

In [9]:
# Preview the merged visit history (log1p visitors plus calendar columns).
visit_data.head()

Unnamed: 0,air_store_id,visit_date,visitors,day_of_week,holiday_flg,weight
0,air_ba937bf13d40fb24,2016-01-13,3.258097,Wednesday,0,1.005227e-08
1,air_ba937bf13d40fb24,2016-01-14,3.496508,Thursday,0,1.456087e-08
2,air_ba937bf13d40fb24,2016-01-15,3.401197,Friday,0,2.055908e-08
3,air_ba937bf13d40fb24,2016-01-16,3.135494,Saturday,0,2.838881e-08
4,air_ba937bf13d40fb24,2016-01-18,1.94591,Monday,0,5.115756e-08


* 将权重应用到测试集上

In [10]:
# Start from the two raw submission columns so that re-running this cell does
# not merge date_info a second time (which would produce *_x / *_y columns).
sample_submission = sample_submission[['id', 'visitors']].copy()

# The id is sliced by position: the last 10 characters become visit_date and
# everything before the last 11 characters (date plus one separator character)
# becomes air_store_id. Vectorized .str slicing replaces the per-element lambdas.
sample_submission['air_store_id'] = sample_submission['id'].str[:-11]
sample_submission['visit_date'] = sample_submission['id'].str[-10:]

# Bring in day_of_week, holiday_flg and weight for each date to predict.
sample_submission = sample_submission.merge(date_info, how='left', on='visit_date')
sample_submission.head()

Unnamed: 0,id,visitors,air_store_id,visit_date,day_of_week,holiday_flg,weight
0,air_00a91d42b08b08d9_2017-04-23,0,air_00a91d42b08b08d9,2017-04-23,Sunday,0,0.682692
1,air_00a91d42b08b08d9_2017-04-24,0,air_00a91d42b08b08d9,2017-04-24,Monday,0,0.689848
2,air_00a91d42b08b08d9_2017-04-25,0,air_00a91d42b08b08d9,2017-04-25,Tuesday,0,0.697064
3,air_00a91d42b08b08d9_2017-04-26,0,air_00a91d42b08b08d9,2017-04-26,Wednesday,0,0.70434
4,air_00a91d42b08b08d9_2017-04-27,0,air_00a91d42b08b08d9,2017-04-27,Thursday,0,0.711677


In [11]:
def wmean(x):
    """Return the weighted mean of ``x.visitors`` using ``x.weight`` as weights.

    ``x`` is any object exposing ``weight`` and ``visitors`` attributes that
    support element-wise multiplication and ``.sum()`` (e.g. a DataFrame).
    """
    return (x.weight * x.visitors).sum() / x.weight.sum()


def fill_visitors(df, visit_df, join_key):
    """Fill missing ``df['visitors']`` with a weighted historical mean.

    The weighted mean of ``visit_df['visitors']`` (weights taken from
    ``visit_df['weight']``) is computed for every group of ``join_key`` and
    used to fill the rows of ``df`` whose ``visitors`` value is null. Rows that
    already have a value are left untouched, and rows whose key has no history
    in ``visit_df`` stay null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; must contain ``visitors`` and the ``join_key`` column(s).
    visit_df : pandas.DataFrame
        Historical data; must contain ``visitors``, ``weight`` and the
        ``join_key`` column(s).
    join_key : str or list of str
        Column name(s) used both to group ``visit_df`` and to join onto ``df``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``; ``df`` is not modified.
    """
    # Temporary column names chosen so they cannot collide with real columns.
    product_col = '__weighted_visitors__'
    estimate_col = '__visitors_estimate__'

    # Same formula as wmean, but as two grouped sums instead of calling a
    # Python function once per group.
    weighted = visit_df.assign(**{product_col: visit_df['weight'] * visit_df['visitors']})
    totals = weighted.groupby(join_key)[[product_col, 'weight']].sum()
    estimate = (totals[product_col] / totals['weight']).rename(estimate_col)

    filled = df.join(estimate, how='left', on=join_key)
    is_missing = filled['visitors'].isnull()
    filled.loc[is_missing, 'visitors'] = filled.loc[is_missing, estimate_col]
    return filled.drop(columns=estimate_col)

In [12]:
# Fill predictions from the most specific history first:
# ['air_store_id', 'day_of_week', 'holiday_flg'], then fall back to
# ['air_store_id', 'day_of_week'] and finally ['air_store_id'] for rows that
# are still missing.
# np.nan (not None) keeps the column float64 instead of object dtype.
sample_submission['visitors'] = np.nan
for fallback_key in (
    ['air_store_id', 'day_of_week', 'holiday_flg'],
    ['air_store_id', 'day_of_week'],
    ['air_store_id'],
):
    sample_submission = fill_visitors(sample_submission, visit_data, fallback_key)

In [13]:
# The estimates were averaged in log1p space; expm1 maps them back to visitor
# counts. The explicit float cast lets the vectorized ufunc work even when the
# column is stored with object dtype.
sample_submission['visitors'] = np.expm1(sample_submission['visitors'].astype(float))

In [14]:
# to_csv does not create missing folders, so make sure the target directory
# exists before writing the two submission columns.
submission_dir = os.path.dirname(submission_path)
if submission_dir:
    os.makedirs(submission_dir, exist_ok=True)
sample_submission[[id_col, label_col]].to_csv(submission_path, index=False)