In [2]:
import numpy as np
import pandas as pd
from datetime import date, timedelta
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [3]:
# Read every input CSV once, in a fixed order, into a dict of DataFrames
# keyed by a short alias that the rest of the notebook indexes with.
data = {
    alias: pd.read_csv(csv_name)
    for alias, csv_name in [
        ('tra', 'air_visit_data.csv'),
        ('as', 'air_store_info.csv'),
        ('hs', 'hpg_store_info.csv'),
        ('ar', 'air_reserve.csv'),
        ('hr', 'hpg_reserve.csv'),
        ('id', 'store_id_relation.csv'),
        ('tes', 'sample_submission.csv'),
        ('hol', 'date_info.csv'),
        ('before', 'before_42D_data.csv'),
    ]
}
# The calendar table calls its date column 'calendar_date'; rename it to the
# 'visit_date' key that the later merges join on.
data['hol'] = data['hol'].rename(columns={'calendar_date': 'visit_date'})

In [161]:
# Inner join: only 'hr' rows whose hpg_store_id appears in the id-relation
# table survive, and they gain the matching air_store_id.
hr_with_air_id = data['hr'].merge(data['id'], how='inner', on=['hpg_store_id'])
# Left joins: attach the columns of the 'as' store table to both reservation
# tables, keyed on air_store_id.
data['hr'] = hr_with_air_id.merge(data['as'], how='left', on=['air_store_id'])
data['ar'] = data['ar'].merge(data['as'], how='left', on=['air_store_id'])

Preprocessing for reservation data

In [162]:
# Per reservation source ('ar' / 'hr'): median reservation size per
# (area, visit day) and per (genre, visit day). Combined in later cells.
tmp_area = {}
tmp_genre = {}

for source in ['ar', 'hr']:
    reservations = data[source]
    visit_ts = pd.to_datetime(reservations['visit_datetime'])
    reserve_ts = pd.to_datetime(reservations['reserve_datetime'])

    # Keep only the calendar date of both timestamps (time of day is dropped).
    reservations['visit_datetime'] = visit_ts.dt.date
    reservations['reserve_datetime'] = reserve_ts.dt.date

    # Whole calendar days between making the reservation and the visit.
    # Normalising to midnight first gives the same value as subtracting the
    # two dates, but runs vectorised instead of one Python call per row.
    reservations['v_r_diff'] = (
        visit_ts.dt.normalize() - reserve_ts.dt.normalize()).dt.days

    # Median reservation size per area and visit day.
    tmp_area[source] = reservations.groupby(
        ['air_area_name', 'visit_datetime'], as_index=False)['reserve_visitors'].median().rename(
        columns={'visit_datetime': 'visit_date', 'reserve_visitors': 'area_reserve_visitors'})

    # Median reservation size per genre and visit day.
    tmp_genre[source] = reservations.groupby(
        ['air_genre_name', 'visit_datetime'], as_index=False)['reserve_visitors'].median().rename(
        columns={'visit_datetime': 'visit_date', 'reserve_visitors': 'genre_reserve_visitors'})

    # Collapse the table itself to total reserved visitors per store and day;
    # every other column (including v_r_diff) is discarded here.
    data[source] = reservations.groupby(
        ['air_store_id', 'visit_datetime'],
        as_index=False)['reserve_visitors'].sum().rename(columns={'visit_datetime': 'visit_date'})

In [164]:
# Derive calendar features for the training rows from visit_date.
visit_ts = pd.to_datetime(data['tra']['visit_date'])
data['tra']['year'] = visit_ts.dt.year
data['tra']['month'] = visit_ts.dt.month
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` yields the same ISO week number. It comes back as
# nullable UInt32, so cast to a plain integer column.
data['tra']['week'] = visit_ts.dt.isocalendar().week.astype('int64')
data['tra']['dow'] = visit_ts.dt.dayofweek
# Store visit_date as datetime.date objects, the form the later merges on
# 'visit_date' use for their keys.
data['tra']['visit_date'] = visit_ts.dt.date

In [165]:
# Submission ids are split on '_': the first two pieces joined back together
# form the air_store_id, the third piece is the visit date.
id_parts = data['tes']['id'].astype(str).str.split('_')
visit_ts = pd.to_datetime(id_parts.str[2])

# New columns are added in the same order as before: visit_date, air_store_id,
# year, month, week, dow.
data['tes']['visit_date'] = visit_ts
data['tes']['air_store_id'] = id_parts.str[:2].str.join('_')
data['tes']['year'] = visit_ts.dt.year
data['tes']['month'] = visit_ts.dt.month
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` yields the same ISO week number. It comes back as
# nullable UInt32, so cast to a plain integer column.
data['tes']['week'] = visit_ts.dt.isocalendar().week.astype('int64')
data['tes']['dow'] = visit_ts.dt.dayofweek
# Store visit_date as datetime.date objects, the form the later merges on
# 'visit_date' use for their keys.
data['tes']['visit_date'] = visit_ts.dt.date

In [167]:
# Lowest visitor count seen in the training rows for each
# (store, day-of-week) pair.
dow_min = (
    data['tra']
    .groupby(['air_store_id', 'dow'])['visitors']
    .min()
    .rename('min_visitors')
    .reset_index()
)
# Attach the statistic to both the training and the test rows.
for part in ('tra', 'tes'):
    data[part] = data[part].merge(dow_min, how='left', on=['air_store_id', 'dow'])

In [168]:
# Highest visitor count seen in the training rows for each
# (store, day-of-week) pair.
dow_max = (
    data['tra']
    .groupby(['air_store_id', 'dow'])['visitors']
    .max()
    .rename('max_visitors')
    .reset_index()
)
# Attach the statistic to both the training and the test rows.
for part in ('tra', 'tes'):
    data[part] = data[part].merge(dow_max, how='left', on=['air_store_id', 'dow'])

In [169]:
# Average visitor count in the training rows for each
# (store, day-of-week) pair.
dow_mean = (
    data['tra']
    .groupby(['air_store_id', 'dow'])['visitors']
    .mean()
    .rename('mean_visitors')
    .reset_index()
)
# Attach the statistic to both the training and the test rows.
for part in ('tra', 'tes'):
    data[part] = data[part].merge(dow_mean, how='left', on=['air_store_id', 'dow'])

In [170]:
# Median visitor count in the training rows for each
# (store, day-of-week) pair.
dow_median = (
    data['tra']
    .groupby(['air_store_id', 'dow'])['visitors']
    .median()
    .rename('median_visitors')
    .reset_index()
)
# Attach the statistic to both the training and the test rows.
for part in ('tra', 'tes'):
    data[part] = data[part].merge(dow_median, how='left', on=['air_store_id', 'dow'])

In [171]:
# Number of non-null 'visitors' observations in the training rows for each
# (store, day-of-week) pair.
dow_count = (
    data['tra']
    .groupby(['air_store_id', 'dow'])['visitors']
    .count()
    .rename('count_visitors')
    .reset_index()
)
# Attach the statistic to both the training and the test rows.
for part in ('tra', 'tes'):
    data[part] = data[part].merge(dow_count, how='left', on=['air_store_id', 'dow'])

In [172]:
# Convert the key column of the 'before' table to datetime.date objects so it
# matches the visit_date representation used in 'tra' and 'tes'.
# NOTE(review): presumably this table holds the precomputed before_*_visit
# statistics referenced in the log-transform cell; confirm against the CSV.
data['before']['visit_date'] = pd.to_datetime(data['before']['visit_date']).dt.date

# Left-join its columns onto the training and test rows by store and date.
for part in ('tra', 'tes'):
    data[part] = data[part].merge(
        data['before'], how='left', on=['air_store_id', 'visit_date'])

In [173]:
# Add the store attribute columns from 'as' to the training and test rows.
for part in ('tra', 'tes'):
    data[part] = data[part].merge(data['as'], how='left', on=['air_store_id'])

In [174]:
# Remove the 'day_of_week' column from the calendar table in place.
# Not re-runnable: a second execution raises KeyError because the column is gone.
data['hol'].drop('day_of_week', axis=1, inplace=True)

In [175]:
# Parse the calendar dates and keep them as datetime.date objects, the same
# representation 'tra' and 'tes' use for visit_date.
data['hol']['visit_date'] = pd.to_datetime(data['hol']['visit_date']).dt.date

In [176]:
# Add the remaining calendar columns to the training and test rows by date.
for part in ('tra', 'tes'):
    data[part] = data[part].merge(data['hol'], how='left', on=['visit_date'])

In [177]:
# Outer join of the two per-store/day reservation totals: a (store, date)
# present in only one table is kept, with the missing side filled with 0.
# The shared 'reserve_visitors' column gets pandas' default suffixes:
# _x comes from 'ar', _y from 'hr'.
train_r = data['ar'].merge(
    data['hr'], how='outer', on=['air_store_id', 'visit_date']
).fillna(0)
# Combined reservation total across both tables.
train_r['reserve_visitor'] = train_r['reserve_visitors_x'].add(train_r['reserve_visitors_y'])

In [179]:
# Training rows with any missing value are discarded; test rows cannot be
# dropped, so their missing values are replaced with 0 instead.
data['tra'] = data['tra'].dropna()
data['tes'] = data['tes'].fillna(value=0)

In [181]:
# Attach the reservation totals to the training rows; store/days without any
# reservation record get 0 in the new columns.
data['tra'] = (
    data['tra']
    .merge(train_r, how='left', on=['air_store_id', 'visit_date'])
    .fillna(0)
)

In [185]:
# Attach the reservation totals to the test rows; store/days without any
# reservation record get 0 in the new columns.
data['tes'] = (
    data['tes']
    .merge(train_r, how='left', on=['air_store_id', 'visit_date'])
    .fillna(0)
)

In [186]:
# Combine the per-source area medians into one frame. This rebinds `tmp_area`
# from a dict of frames to a single DataFrame, so the cell cannot be re-run
# without first re-running the cell that builds the dict.
area_from_ar, area_from_hr = tmp_area['ar'], tmp_area['hr']
tmp_area = area_from_ar.merge(
    area_from_hr, how='outer', on=['air_area_name', 'visit_date']
).fillna(0)
# Sum of the two medians (_x from 'ar', _y from 'hr'); a missing side counts as 0.
tmp_area['area_reserve_visitors'] = tmp_area['area_reserve_visitors_x'].add(
    tmp_area['area_reserve_visitors_y'])

In [187]:
# Discard the two per-source columns now that their sum is stored.
tmp_area = tmp_area.drop(
    ['area_reserve_visitors_x', 'area_reserve_visitors_y'], axis=1)

In [188]:
# Combine the per-source genre medians into one frame. This rebinds
# `tmp_genre` from a dict of frames to a single DataFrame, so the cell cannot
# be re-run without first re-running the cell that builds the dict.
genre_from_ar, genre_from_hr = tmp_genre['ar'], tmp_genre['hr']
tmp_genre = genre_from_ar.merge(
    genre_from_hr, how='outer', on=['air_genre_name', 'visit_date']
).fillna(0)

In [189]:
# Sum of the two per-source medians (_x from 'ar', _y from 'hr').
tmp_genre['genre_reserve_visitors'] = tmp_genre['genre_reserve_visitors_x'].add(
    tmp_genre['genre_reserve_visitors_y'])
# Discard the per-source columns now that their sum is stored.
tmp_genre = tmp_genre.drop(
    ['genre_reserve_visitors_x', 'genre_reserve_visitors_y'], axis=1)

In [195]:
# Attach the area-level and then the genre-level reservation medians to the
# training and test rows. After each join, missing values are filled with 0.
for part in ('tra', 'tes'):
    with_area = data[part].merge(
        tmp_area, how='left', on=['visit_date', 'air_area_name']).fillna(0)
    data[part] = with_area.merge(
        tmp_genre, how='left', on=['visit_date', 'air_genre_name']).fillna(0)

In [200]:
# Integer codes for the two string columns: labels are taken from the store
# table in np.unique's sorted order and numbered from 1.
genre_mapping = {
    label: code
    for code, label in enumerate(np.unique(data['as']['air_genre_name']), start=1)
}
area_mapping = {
    label: code
    for code, label in enumerate(np.unique(data['as']['air_area_name']), start=1)
}

# Replace genre and area names with their codes in both the training and the
# test rows. A value that is not a key of the mapping becomes NaN.
for part in ('tra', 'tes'):
    data[part]['air_genre_name'] = data[part]['air_genre_name'].map(genre_mapping)
    data[part]['air_area_name'] = data[part]['air_area_name'].map(area_mapping)

In [203]:
# Work on independent copies so the transforms below leave data['tra'] and
# data['tes'] untouched.
train_all, test_all = data['tra'].copy(), data['tes'].copy()

In [206]:
# Count-like columns that are moved to log1p scale; this includes the
# 'visitors' target. Predictions are mapped back with expm1 before the
# submission is written.
change_value = ['visitors', 'min_visitors', 'max_visitors',
                'mean_visitors', 'median_visitors', 'count_visitors',
                'before_mean_visit', 'before_min_visit', 'before_max_visit',
                'before_median_visit', 'reserve_visitor', 'reserve_visitors_x',
                'reserve_visitors_y', 'area_reserve_visitors', 'genre_reserve_visitors']

for name in change_value:
    # One vectorised log1p per column instead of a Python lambda per cell;
    # the float cast mirrors the float(r) conversion the lambda performed.
    train_all[name] = np.log1p(train_all[name].astype(float))
    test_all[name] = np.log1p(test_all[name].astype(float))

In [270]:
# Rows before 2017-03-01 always go to training. Rows on or after that date
# are split at random: 40% join the training set, 60% form the validation set.
train_all['visit_date'] = pd.to_datetime(train_all['visit_date'])
split_date = pd.to_datetime(date(2017, 3, 1))
split_data = train_all[train_all['visit_date'] >= split_date]
train_part1 = train_all[train_all['visit_date'] < split_date]
# A fixed random_state makes the split, and therefore the trained model and
# its validation scores, reproducible across kernel restarts.
train_part2, val = train_test_split(split_data, test_size=0.6, random_state=42)
train = pd.concat([train_part1, train_part2])

In [272]:
# Target: the 'visitors' column as a NumPy array.
train_y = train['visitors'].values
# Features: every column except the identifiers and the target.
train_x = train.drop(columns=['air_store_id', 'visit_date', 'visitors'])

In [273]:
# Validation target and features, built with the same column selection as the
# training set.
val_y = val['visitors'].values
val_x = val.drop(columns=['air_store_id', 'visit_date', 'visitors'])

In [274]:
# LightGBM training parameters. 'l2_root' is reported as 'rmse' in the
# training log.
params = dict(
    num_leaves=31,
    objective='regression',
    min_data_in_leaf=180,
    learning_rate=0.02,
    feature_fraction=0.6,
    metric='l2_root',
)

# Upper bound on boosting rounds.
MAX_ROUNDS = 8000

# Columns handed to LightGBM as categorical features.
cate_vars = [
    'air_genre_name', 'air_area_name',
    'holiday_flg', 'dow', 'year', 'month', 'week',
]

In [275]:
# Wrap the feature frames for LightGBM. The validation set references the
# training Dataset.
dtrain = lgb.Dataset(
    train_x, label=train_y,
    categorical_feature=cate_vars)
dval = lgb.Dataset(
    val_x, label=val_y, reference=dtrain,
    categorical_feature=cate_vars)

# LightGBM 4.0 removed the `early_stopping_rounds` and `verbose_eval` keyword
# arguments of lgb.train(); the callback API gives the same behaviour.
# `log_evaluation` was called `print_evaluation` before LightGBM 3.3, so fall
# back to the old name when the new one is missing.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

bst = lgb.train(
    params, dtrain, num_boost_round=MAX_ROUNDS,
    valid_sets=[dtrain, dval],
    callbacks=[
        # Stop once the validation metric has not improved for 500 rounds.
        lgb.early_stopping(500),
        # Print the metrics every 100 rounds.
        log_evaluation(100),
    ],
)

# List every feature with its gain-based importance, largest first.
# Iterating over train_x yields its column names.
gain_by_feature = zip(train_x, bst.feature_importance("gain"))
ranked = sorted(gain_by_feature, key=lambda pair: pair[1], reverse=True)
print("\n".join("%s: %.2f" % pair for pair in ranked))



Training until validation scores don't improve for 500 rounds.
[100]	training's rmse: 0.51053	valid_1's rmse: 0.509909
[200]	training's rmse: 0.494041	valid_1's rmse: 0.494398
[300]	training's rmse: 0.488027	valid_1's rmse: 0.490602
[400]	training's rmse: 0.484297	valid_1's rmse: 0.488602
[500]	training's rmse: 0.481565	valid_1's rmse: 0.487571
[600]	training's rmse: 0.479383	valid_1's rmse: 0.486912
[700]	training's rmse: 0.477343	valid_1's rmse: 0.486278
[800]	training's rmse: 0.475615	valid_1's rmse: 0.485602
[900]	training's rmse: 0.474018	valid_1's rmse: 0.485163
[1000]	training's rmse: 0.472481	valid_1's rmse: 0.484837
[1100]	training's rmse: 0.471032	valid_1's rmse: 0.484371
[1200]	training's rmse: 0.469684	valid_1's rmse: 0.483968
[1300]	training's rmse: 0.468383	valid_1's rmse: 0.483619
[1400]	training's rmse: 0.467101	valid_1's rmse: 0.483347
[1500]	training's rmse: 0.465849	valid_1's rmse: 0.483083
[1600]	training's rmse: 0.464632	valid_1's rmse: 0.482862
[1700]	training's r

In [276]:
# Test features: drop the submission id, the 'visitors' column and the
# identifier columns, leaving only the columns the model is given.
test_x = test_all.drop(columns=['id', 'visitors', 'visit_date', 'air_store_id'])

In [277]:
# Predictions are on the log1p scale, because the 'visitors' target was
# log1p-transformed before training.
result = bst.predict(data=test_x)

In [278]:
# Undo the log1p transform so 'visitors' is back on the raw count scale.
predicted_visitors = np.expm1(result)
test_all['visitors'] = predicted_visitors

In [279]:
# Write the two submission columns, without the index, with floats formatted
# to three decimals.
submission = test_all[['id', 'visitors']]
submission.to_csv('try14.csv', index=False, float_format='%.3f')