In [1]:
# coding: utf-8
import os
import datetime

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import xgboost as xgb
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

from mydatools.features_analyze import get_top_k_corr
from mydatools.plot import plot_grid_search_result

from common import read_features, save_features

% matplotlib inline

## Config

In [2]:
# Raw CSV locations, kept for reference only (not used by the cells below).
# trn_path = './data/input/train.csv'
# tst_path = './data/input/test.csv'

# Column holding the row identifier and column holding the regression target.
id_col, label_col = 'id', 'visitors'

# Destination of the submission file written by the last cell.
submission_path = './data/output/submission/lightgbm.csv'

# The submission reuses the same id / label column names as the input frame.
output_id_col, output_label_col = id_col, label_col

## Load Data

In [3]:
# Load the combined data frame and the list of model feature column names.
# `read_features` comes from the project-local `common` module; presumably it
# returns what `save_features` persisted in an earlier notebook (TODO confirm).
full_df, feature_columns = read_features()

In [4]:
# dataset type
is_trn = full_df['ds_type'] == 'trn'
is_tst = full_df['ds_type'] == 'tst'

In [5]:
# Preview the first rows of the combined frame as a sanity check on the load.
full_df.head()

Unnamed: 0,air_store_id,ds_type,id,visit_date,visitors,visitors_o,air_genre_name,air_area_name,latitude,longitude,...,air_genre_name-visit_dayofweek-visitors_min,air_genre_name-visit_dayofweek-visitors_q25,air_genre_name-visit_dayofweek-visitors_q50,air_genre_name-visit_dayofweek-visitors_q75,air_area_name-visit_dayofweek-visitors_avg,air_area_name-visit_dayofweek-visitors_max,air_area_name-visit_dayofweek-visitors_min,air_area_name-visit_dayofweek-visitors_q25,air_area_name-visit_dayofweek-visitors_q50,air_area_name-visit_dayofweek-visitors_q75
0,air_ba937bf13d40fb24,trn,air_ba937bf13d40fb24_2016-01-13,2016-01-13,3.258097,25,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,...,0.693147,1.94591,2.639057,3.218876,2.777286,4.976734,0.693147,2.197225,2.890372,3.401197
1,air_ba937bf13d40fb24,trn,air_ba937bf13d40fb24_2016-01-14,2016-01-14,3.496508,32,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,...,0.693147,2.079442,2.639057,3.218876,2.75431,4.70048,0.693147,2.197225,2.833213,3.367296
2,air_ba937bf13d40fb24,trn,air_ba937bf13d40fb24_2016-01-15,2016-01-15,3.401197,29,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,...,0.693147,2.302585,2.944439,3.465736,2.925904,4.812184,0.693147,2.397895,3.044522,3.555348
3,air_ba937bf13d40fb24,trn,air_ba937bf13d40fb24_2016-01-16,2016-01-16,3.135494,22,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,...,0.693147,2.484907,3.091042,3.583519,2.730508,4.867534,0.693147,2.079442,2.772589,3.367296
4,air_ba937bf13d40fb24,trn,air_ba937bf13d40fb24_2016-01-18,2016-01-18,1.94591,6,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,...,0.693147,1.791759,2.484907,3.044522,2.450613,5.723585,0.693147,1.791759,2.484907,3.091042


In [6]:
# Show the feature column names that will be fed to the model.
feature_columns

['air_genre_name_Asian',
 'air_genre_name_Bar/Cocktail',
 'air_genre_name_Cafe/Sweets',
 'air_genre_name_Creative cuisine',
 'air_genre_name_Dining bar',
 'air_genre_name_International cuisine',
 'air_genre_name_Italian/French',
 'air_genre_name_Izakaya',
 'air_genre_name_Japanese food',
 'air_genre_name_Karaoke/Party',
 'air_genre_name_Okonomiyaki/Monja/Teppanyaki',
 'air_genre_name_Other',
 'air_genre_name_Western food',
 'air_genre_name_Yakiniku/Korean food',
 'latitude',
 'longitude',
 'holiday_flg',
 'visit_month',
 'visit_dayofweek',
 'visit_days_in_month',
 'reserve_count',
 'total_reserve_visitors',
 'ahead_reserve_hours_median',
 'reserve_in_1day_count',
 'reserve_in_2day_count',
 'reserve_in_3day_count',
 'reserve_in_7day_count',
 'reserve_in_30day_count',
 'reserve_in_1day_visitors',
 'reserve_in_2day_visitors',
 'reserve_in_3day_visitors',
 'reserve_in_7day_visitors',
 'reserve_in_30day_visitors',
 'air_store_id-visitors_avg',
 'air_store_id-visitors_max',
 'air_store_id-vi

## Preprocessing

In [7]:
# Split the combined frame into its train and test rows.
trn_df = full_df.loc[is_trn]
tst_df = full_df.loc[is_tst]

# Model inputs / target for the training rows, and inputs for the test rows.
X, y = trn_df[feature_columns], trn_df[label_col]
X_tst = tst_df[feature_columns]

# Hold out 20% of the training rows for validation (fixed seed, so the split
# is reproducible across runs).
X_trn, X_val, y_trn, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## LightGBM

In [8]:
# Wrap the train / validation splits as LightGBM datasets. The validation set
# references the training set so both share the same feature binning.
trn_lgb = lgb.Dataset(data=X_trn.values, label=y_trn)
val_lgb = lgb.Dataset(data=X_val.values, label=y_val, reference=trn_lgb)

In [9]:
# LightGBM hyper-parameters.
# 'application' is an alias of 'objective' ('mse' -> L2 regression) and
# 'reg_alpha' is an alias of 'lambda_l1'; the keys are kept as originally
# written so the trained model is unchanged.
params = {
    'boosting': 'gbdt',
    'application': 'mse',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'max_depth': 5,
    'num_leaves': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'seed': 0,
    'reg_alpha': 0.1,
}

# Train for at most 1000 rounds, stopping once the validation RMSE has not
# improved for 20 consecutive rounds.
# The `early_stopping_rounds=` keyword of lgb.train() was removed in
# LightGBM 4.0; the early_stopping callback is the portable equivalent.
lgbm = lgb.train(
    params,
    trn_lgb,
    num_boost_round=1000,
    valid_sets=[trn_lgb, val_lgb],
    callbacks=[lgb.early_stopping(stopping_rounds=20)],
)

[1]	training's rmse: 0.784733	valid_1's rmse: 0.781374
Training until validation scores don't improve for 20 rounds.
[2]	training's rmse: 0.762613	valid_1's rmse: 0.759405
[3]	training's rmse: 0.742228	valid_1's rmse: 0.739179
[4]	training's rmse: 0.723149	valid_1's rmse: 0.720248
[5]	training's rmse: 0.705663	valid_1's rmse: 0.702906
[6]	training's rmse: 0.68954	valid_1's rmse: 0.686949
[7]	training's rmse: 0.67455	valid_1's rmse: 0.672097
[8]	training's rmse: 0.660546	valid_1's rmse: 0.658234
[9]	training's rmse: 0.647575	valid_1's rmse: 0.645376
[10]	training's rmse: 0.635706	valid_1's rmse: 0.633616
[11]	training's rmse: 0.62488	valid_1's rmse: 0.622913
[12]	training's rmse: 0.614941	valid_1's rmse: 0.613087
[13]	training's rmse: 0.60567	valid_1's rmse: 0.603924
[14]	training's rmse: 0.597205	valid_1's rmse: 0.595563
[15]	training's rmse: 0.58948	valid_1's rmse: 0.587946
[16]	training's rmse: 0.582307	valid_1's rmse: 0.580877
[17]	training's rmse: 0.575864	valid_1's rmse: 0.574522


[148]	training's rmse: 0.5001	valid_1's rmse: 0.502001
[149]	training's rmse: 0.500072	valid_1's rmse: 0.501978
[150]	training's rmse: 0.50004	valid_1's rmse: 0.501971
[151]	training's rmse: 0.500018	valid_1's rmse: 0.501956
[152]	training's rmse: 0.499985	valid_1's rmse: 0.501952
[153]	training's rmse: 0.499959	valid_1's rmse: 0.501937
[154]	training's rmse: 0.499929	valid_1's rmse: 0.501917
[155]	training's rmse: 0.499906	valid_1's rmse: 0.501917
[156]	training's rmse: 0.499877	valid_1's rmse: 0.501913
[157]	training's rmse: 0.499846	valid_1's rmse: 0.501914
[158]	training's rmse: 0.499795	valid_1's rmse: 0.501862
[159]	training's rmse: 0.499783	valid_1's rmse: 0.501859
[160]	training's rmse: 0.499735	valid_1's rmse: 0.501819
[161]	training's rmse: 0.499698	valid_1's rmse: 0.501791
[162]	training's rmse: 0.499666	valid_1's rmse: 0.501771
[163]	training's rmse: 0.499607	valid_1's rmse: 0.501711
[164]	training's rmse: 0.499579	valid_1's rmse: 0.501692
[165]	training's rmse: 0.499552	va

[294]	training's rmse: 0.496083	valid_1's rmse: 0.499744
[295]	training's rmse: 0.496053	valid_1's rmse: 0.499726
[296]	training's rmse: 0.496037	valid_1's rmse: 0.499741
[297]	training's rmse: 0.496022	valid_1's rmse: 0.499749
[298]	training's rmse: 0.496011	valid_1's rmse: 0.499745
[299]	training's rmse: 0.495986	valid_1's rmse: 0.499728
[300]	training's rmse: 0.495954	valid_1's rmse: 0.499702
[301]	training's rmse: 0.495932	valid_1's rmse: 0.499687
[302]	training's rmse: 0.495897	valid_1's rmse: 0.499652
[303]	training's rmse: 0.495873	valid_1's rmse: 0.499638
[304]	training's rmse: 0.495861	valid_1's rmse: 0.499656
[305]	training's rmse: 0.495838	valid_1's rmse: 0.499644
[306]	training's rmse: 0.495807	valid_1's rmse: 0.499628
[307]	training's rmse: 0.495793	valid_1's rmse: 0.499636
[308]	training's rmse: 0.495784	valid_1's rmse: 0.499638
[309]	training's rmse: 0.495778	valid_1's rmse: 0.49964
[310]	training's rmse: 0.495765	valid_1's rmse: 0.49964
[311]	training's rmse: 0.495724	v

[441]	training's rmse: 0.493266	valid_1's rmse: 0.498613
[442]	training's rmse: 0.493249	valid_1's rmse: 0.498605
[443]	training's rmse: 0.493239	valid_1's rmse: 0.498599
[444]	training's rmse: 0.493236	valid_1's rmse: 0.498603
[445]	training's rmse: 0.493219	valid_1's rmse: 0.498627
[446]	training's rmse: 0.493199	valid_1's rmse: 0.498611
[447]	training's rmse: 0.493191	valid_1's rmse: 0.498605
[448]	training's rmse: 0.493166	valid_1's rmse: 0.498579
[449]	training's rmse: 0.493137	valid_1's rmse: 0.498598
[450]	training's rmse: 0.493118	valid_1's rmse: 0.498586
[451]	training's rmse: 0.4931	valid_1's rmse: 0.498571
[452]	training's rmse: 0.493078	valid_1's rmse: 0.498558
[453]	training's rmse: 0.493023	valid_1's rmse: 0.498515
[454]	training's rmse: 0.493013	valid_1's rmse: 0.498515
[455]	training's rmse: 0.493006	valid_1's rmse: 0.498516
[456]	training's rmse: 0.492984	valid_1's rmse: 0.498516
[457]	training's rmse: 0.49295	valid_1's rmse: 0.498499
[458]	training's rmse: 0.492938	va

[590]	training's rmse: 0.491138	valid_1's rmse: 0.498012
[591]	training's rmse: 0.491124	valid_1's rmse: 0.498008
[592]	training's rmse: 0.491104	valid_1's rmse: 0.498002
[593]	training's rmse: 0.491089	valid_1's rmse: 0.497996
[594]	training's rmse: 0.491064	valid_1's rmse: 0.497983
[595]	training's rmse: 0.49105	valid_1's rmse: 0.497986
[596]	training's rmse: 0.491034	valid_1's rmse: 0.49798
[597]	training's rmse: 0.491014	valid_1's rmse: 0.497978
[598]	training's rmse: 0.490994	valid_1's rmse: 0.49798
[599]	training's rmse: 0.490984	valid_1's rmse: 0.497982
[600]	training's rmse: 0.490958	valid_1's rmse: 0.497958
[601]	training's rmse: 0.490949	valid_1's rmse: 0.497955
[602]	training's rmse: 0.490935	valid_1's rmse: 0.497951
[603]	training's rmse: 0.490921	valid_1's rmse: 0.497962
[604]	training's rmse: 0.490892	valid_1's rmse: 0.497944
[605]	training's rmse: 0.490879	valid_1's rmse: 0.497938
[606]	training's rmse: 0.490863	valid_1's rmse: 0.49793
[607]	training's rmse: 0.49085	vali

[737]	training's rmse: 0.489172	valid_1's rmse: 0.497637
[738]	training's rmse: 0.489157	valid_1's rmse: 0.497638
[739]	training's rmse: 0.48915	valid_1's rmse: 0.497646
[740]	training's rmse: 0.489131	valid_1's rmse: 0.497636
[741]	training's rmse: 0.489128	valid_1's rmse: 0.497637
[742]	training's rmse: 0.489119	valid_1's rmse: 0.497638
[743]	training's rmse: 0.489113	valid_1's rmse: 0.497635
[744]	training's rmse: 0.489105	valid_1's rmse: 0.497636
[745]	training's rmse: 0.489076	valid_1's rmse: 0.497625
[746]	training's rmse: 0.489067	valid_1's rmse: 0.497621
[747]	training's rmse: 0.489052	valid_1's rmse: 0.497601
[748]	training's rmse: 0.489038	valid_1's rmse: 0.497602
[749]	training's rmse: 0.489035	valid_1's rmse: 0.497604
[750]	training's rmse: 0.489029	valid_1's rmse: 0.497614
[751]	training's rmse: 0.489015	valid_1's rmse: 0.49763
[752]	training's rmse: 0.489008	valid_1's rmse: 0.497623
[753]	training's rmse: 0.489004	valid_1's rmse: 0.497629
[754]	training's rmse: 0.488993	v

In [10]:
# Feature importances reported by the trained booster, one row per feature,
# ordered from most to least important.
imp_df = pd.DataFrame(
    {'importance': lgbm.feature_importance()},
    index=feature_columns,
)
imp_df = imp_df.sort_values(by='importance', ascending=False)
imp_df

Unnamed: 0,importance
visit_month,853
air_store_id-visit_dayofweek-visitors_avg,794
air_store_id-visit_dayofweek-visitors_max,710
visit_month-visitors_avg,698
air_store_id-visitors_max,644
air_store_id-visit_dayofweek-visitors_q25,637
visit_month-visitors_max,563
total_reserve_visitors,528
air_store_id-visit_dayofweek-visitors_q75,518
air_store_id-visitors_avg,428


## Predict

In [11]:
# Predict the test rows, one column per model (only LightGBM here).
# - Pass the raw ndarray, matching how the booster was trained (X_trn.values).
# - Request the early-stopped best iteration explicitly: older LightGBM
#   releases otherwise predict with every tree that was built. A value <= 0
#   (no best iteration recorded) means "use all iterations".
pred_df = pd.DataFrame({
    'lgbm': lgbm.predict(X_tst.values, num_iteration=lgbm.best_iteration),
})

In [12]:
# Display the raw per-model predictions; these are still on the model's
# training-target scale (a later cell applies np.expm1 to them).
pred_df

Unnamed: 0,lgbm
0,0.900769
1,3.024311
2,3.119990
3,3.322663
4,3.432096
5,3.624016
6,2.653887
7,0.900769
8,3.076241
9,3.176332


In [13]:
# Average the per-model predictions row-wise, then undo the log1p scale of
# the training target with expm1.
pred_res = np.expm1(pred_df.mean(axis=1))
# test: round the predicted visitor counts to one decimal place
pred_res = pred_res.round(1)

In [14]:
# Assemble the submission table: test-row ids next to the final predictions.
res_df = pd.DataFrame({
    output_id_col: tst_df[output_id_col].values,
    output_label_col: pred_res,
})

# Create the output directory on demand; to_csv() raises if it is missing
# (e.g. on a fresh checkout).
submission_dir = os.path.dirname(submission_path)
if submission_dir:
    os.makedirs(submission_dir, exist_ok=True)

# Fix the column order explicitly and omit the index from the CSV.
res_df[[output_id_col, output_label_col]].to_csv(submission_path, index=False)