# Preparing Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np

full = pd.read_csv('scratch/full_engineering.csv')

In [2]:
# train, test 데이터셋 분리
past = full.query('dataset=="past"')
future  = full.query('dataset=="future"')

print('train data set:')
display(past.shape)
display(past.head())

print('test data set:')
display(future.shape)
display(future.head())

train data set:


(252108, 24)

Unnamed: 0,dataset,visitors,id,weekday,year,month,day_of_year,is_month_end,holiday_flg,tomorrow_is_holiday,...,Todofuken,city,street,n_stores_same_street,n_stores_same_city,n_stores_same_Todofuken,n_reserves,n_reserve_visitors,reserve_avg_hours_ahead,max_reserve_visitors
0,past,25,air_ba937bf13d40fb24_2016-01-13,2,2016,1,13,0,0,0,...,7,29,77,51,61,444,,,,
1,past,32,air_ba937bf13d40fb24_2016-01-14,3,2016,1,14,0,0,0,...,7,29,77,51,61,444,,,,
2,past,29,air_ba937bf13d40fb24_2016-01-15,4,2016,1,15,0,0,0,...,7,29,77,51,61,444,,,,
3,past,22,air_ba937bf13d40fb24_2016-01-16,5,2016,1,16,0,0,0,...,7,29,77,51,61,444,,,,
4,past,6,air_ba937bf13d40fb24_2016-01-18,0,2016,1,18,0,0,0,...,7,29,77,51,61,444,,,,


test data set:


(32019, 24)

Unnamed: 0,dataset,visitors,id,weekday,year,month,day_of_year,is_month_end,holiday_flg,tomorrow_is_holiday,...,Todofuken,city,street,n_stores_same_street,n_stores_same_city,n_stores_same_Todofuken,n_reserves,n_reserve_visitors,reserve_avg_hours_ahead,max_reserve_visitors
252108,future,0,air_00a91d42b08b08d9_2017-04-23,6,2017,4,113,0,0,0,...,7,5,45,20,22,444,,,,
252109,future,0,air_00a91d42b08b08d9_2017-04-24,0,2017,4,114,0,0,0,...,7,5,45,20,22,444,,,,
252110,future,0,air_00a91d42b08b08d9_2017-04-25,1,2017,4,115,0,0,0,...,7,5,45,20,22,444,,,,
252111,future,0,air_00a91d42b08b08d9_2017-04-26,2,2017,4,116,0,0,0,...,7,5,45,20,22,444,,,,
252112,future,0,air_00a91d42b08b08d9_2017-04-27,3,2017,4,117,0,0,0,...,7,5,45,20,22,444,,,,


In [3]:
# 불필요한 변수 제거
X = past.drop(columns=['id','dataset','visitors'])

# target 로그 스케일링
y = past['visitors'].apply(np.log1p)

# test 데이터셋
id_col = future['id']
X_test = future.drop(columns=['id','dataset','visitors'])

In [4]:
# train, val 데이터셋 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)

In [5]:
# DMatrix
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
val_dmatrix = xgb.DMatrix(X_val, label=y_val)
X_test = xgb.DMatrix(X_test)

eval_setting = [ (train_dmatrix, 'train'), (val_dmatrix, 'eval') ]

In [6]:
%%time

try_xgb_params = {'eta': 0.1,
                  'colsample_bytree': 0.8,
                  'gamma': 0.5,
                  'max_depth': 8,
                  'min_child_weight': 100,
                  'objective': 'reg:squarederror',
                  'seed': 2018,
                  'subsample': 1}

model = xgb.train(params=try_xgb_params,
                  dtrain=train_dmatrix, 
                  num_boost_round=100000, 
                  evals=eval_setting,
                  early_stopping_rounds=50,
                  verbose_eval=100
                 )

best_iteration = model.best_iteration
best_score = model.best_score

print ('best_score: {}, best_iteration: {}'.format(best_score, best_iteration))


[0]	train-rmse:2.32902	eval-rmse:2.33304
[100]	train-rmse:0.68549	eval-rmse:0.69736
[200]	train-rmse:0.66496	eval-rmse:0.67965
[300]	train-rmse:0.65723	eval-rmse:0.67372
[400]	train-rmse:0.65273	eval-rmse:0.67074
[500]	train-rmse:0.64856	eval-rmse:0.66824
[600]	train-rmse:0.64557	eval-rmse:0.66639
[700]	train-rmse:0.64411	eval-rmse:0.66601
[800]	train-rmse:0.64250	eval-rmse:0.66529
[900]	train-rmse:0.64121	eval-rmse:0.66455
[956]	train-rmse:0.64105	eval-rmse:0.66449
best_score: 0.664482, best_iteration: 907
CPU times: user 7min 30s, sys: 23.5 s, total: 7min 53s
Wall time: 1min 8s


In [7]:
future['visitors'] = model.predict(X_test)
future['visitors'] = np.expm1(future['visitors'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future['visitors'] = model.predict(X_test)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future['visitors'] = np.expm1(future['visitors'])


In [8]:
sub = future[['id','visitors']].copy()
sub.to_csv('sub_xgb_try.csv', index=False)

In [52]:
sub

Unnamed: 0,id,visitors
252108,air_00a91d42b08b08d9_2017-04-23,12.417237
252109,air_00a91d42b08b08d9_2017-04-24,19.735861
252110,air_00a91d42b08b08d9_2017-04-25,22.724957
252111,air_00a91d42b08b08d9_2017-04-26,26.968084
252112,air_00a91d42b08b08d9_2017-04-27,27.132521
...,...,...
284122,air_fff68b929994bfbd_2017-05-27,11.611891
284123,air_fff68b929994bfbd_2017-05-28,10.418058
284124,air_fff68b929994bfbd_2017-05-29,5.568339
284125,air_fff68b929994bfbd_2017-05-30,7.430390


# Model Tuning

In [8]:
basic_xgb_params = {
    'objective': 'reg:squarederror',
    'eta': 0.1,
    'seed': 2018
}

In [9]:

search_xgb_params = {
    'max_depth': [4, 8, 12, 16, 20, 24, 28, 32],
    'gamma': [0, 0.1, 0.2, 0.5, 1, 1.5, 2, 3],
    'colsample_bytree': [0.1, 0.2, 0.4, 0.6, 0.8, 1],
    'subsample': [0.1, 0.2, 0.4, 0.6, 0.8, 1],
    'min_child_weight': [0, 1, 3, 10, 20, 30, 50, 100]
}

In [10]:
%%time

best_params = basic_xgb_params.copy()

for p in search_xgb_params:  
    print('Tuning parameter {}, range: {}'.format(p, search_xgb_params[p]))
    test_params = best_params
    scores = []
    
    for v in search_xgb_params[p]:    
        test_params[p] = v
        test_model = xgb.train(params=test_params, 
                               dtrain=train_dmatrix, 
                               num_boost_round=100000, 
                               evals=eval_setting,
                               early_stopping_rounds=50,
                               verbose_eval=False
                              )
        best_iter = test_model.best_iteration
        best_score = test_model.best_score
        scores.append([v, best_score])
        print('    {p}={v}: best_score={s}, best_iter={it}'.format(p=p, v=v, s=best_score, it=best_iter))
        
    scores.sort(key=lambda x: x[1])
    best_value = scores[0][0]
    best_score = scores[0][1]
    best_params[p] = best_value
    print('  Best value for {p}: {v} (score {s})'.format(p=p, v=best_value, s=best_score))
        
print(best_params)

Tuning parameter max_depth, range: [4, 8, 12, 16, 20, 24, 28, 32]
    max_depth=4: best_score=0.667849, best_iter=2857
    max_depth=8: best_score=0.665771, best_iter=280
    max_depth=12: best_score=0.671893, best_iter=53
    max_depth=16: best_score=0.690279, best_iter=34
    max_depth=20: best_score=0.718402, best_iter=29
    max_depth=24: best_score=0.741959, best_iter=26
    max_depth=28: best_score=0.757558, best_iter=25
    max_depth=32: best_score=0.762604, best_iter=24
  Best value for max_depth: 8 (score 0.665771)
Tuning parameter gamma, range: [0, 0.1, 0.2, 0.5, 1, 1.5, 2, 3]
    gamma=0: best_score=0.665771, best_iter=280
    gamma=0.1: best_score=0.665249, best_iter=267
    gamma=0.2: best_score=0.665451, best_iter=298
    gamma=0.5: best_score=0.665044, best_iter=232
    gamma=1: best_score=0.671152, best_iter=139
    gamma=1.5: best_score=0.672393, best_iter=130
    gamma=2: best_score=0.670254, best_iter=147
    gamma=3: best_score=0.671619, best_iter=128
  Best value f