<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Load-data-from-scratch-folder" data-toc-modified-id="Load-data-from-scratch-folder-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load data from <code>scratch</code> folder</a></span></li><li><span><a href="#Feature-Engineering" data-toc-modified-id="Feature-Engineering-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Feature Engineering</a></span><ul class="toc-item"><li><span><a href="#Datetime-features" data-toc-modified-id="Datetime-features-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Datetime features</a></span></li><li><span><a href="#Location-of-stores" data-toc-modified-id="Location-of-stores-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Location of <code>stores</code></a></span><ul class="toc-item"><li><span><a href="#Split-area_name-string-to-get-different-levels-of-location-names" data-toc-modified-id="Split-area_name-string-to-get-different-levels-of-location-names-2.2.1"><span class="toc-item-num">2.2.1&nbsp;&nbsp;</span>Split <code>area_name</code> string to get different levels of location names</a></span></li><li><span><a href="#Count-stores-in-the-same-location" data-toc-modified-id="Count-stores-in-the-same-location-2.2.2"><span class="toc-item-num">2.2.2&nbsp;&nbsp;</span>Count stores in the same location</a></span></li><li><span><a href="#Add-store-info-to-full" data-toc-modified-id="Add-store-info-to-full-2.2.3"><span class="toc-item-num">2.2.3&nbsp;&nbsp;</span>Add store info to <code>full</code></a></span></li></ul></li><li><span><a href="#Reservation-features" data-toc-modified-id="Reservation-features-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Reservation features</a></span></li></ul></li></ul></div>

In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn

## Load data from `scratch` folder

In [62]:
# Read the visits table from the scratch folder. The first two CSV columns
# become the row MultiIndex and `visit_date` is parsed as datetime.
full = pd.read_csv('scratch/full.csv', index_col=[0, 1], parse_dates=['visit_date'])
# Clear the names of both index levels.
full.index.names = [None, None]
display(full.head())
# DataFrame.info() prints its report and returns None, so it is called bare:
# wrapping it in display() would render an extra `None` below the report.
full.info()

Unnamed: 0,Unnamed: 1,air_store_id,id,visit_date,visitors
past,0,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-13,2016-01-13,25
past,1,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-14,2016-01-14,32
past,2,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-15,2016-01-15,29
past,3,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-16,2016-01-16,22
past,4,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-18,2016-01-18,6


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 284127 entries, (past, 0) to (future, 32018)
Data columns (total 4 columns):
air_store_id    284127 non-null object
id              284127 non-null object
visit_date      284127 non-null datetime64[ns]
visitors        284127 non-null int64
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 11.9+ MB


None

## Feature Engineering

In [74]:
# NOTE(review): this cell was executed out of order (In [74], while the cells
# below it are In [63] onwards) and its saved output already shows the
# datetime columns that are only created further down. On a fresh
# Restart & Run All it will show just the columns loaded from the CSV.
full.head()

Unnamed: 0,Unnamed: 1,air_store_id,id,visit_date,visitors,weekday,year,month,day_of_year,days_in_month,week_of_year,is_month_end
past,0,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-13,2016-01-13,25,2,2016,1,13,31,2,False
past,1,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-14,2016-01-14,32,3,2016,1,14,31,2,False
past,2,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-15,2016-01-15,29,4,2016,1,15,31,2,False
past,3,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-16,2016-01-16,22,5,2016,1,16,31,2,False
past,4,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-18,2016-01-18,6,0,2016,1,18,31,3,False


### Datetime features

In [63]:
# Calendar features derived from `visit_date`.
full['weekday'] = full.visit_date.dt.dayofweek  # Monday=0 ... Sunday=6
full['year'] = full.visit_date.dt.year
full['month'] = full.visit_date.dt.month
full['day_of_year'] = full.visit_date.dt.dayofyear
full['days_in_month'] = full.visit_date.dt.days_in_month
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
# `isocalendar().week` gives the same ISO-8601 week number; it comes back as
# UInt32, so it is cast to int64 to keep a plain integer column.
if hasattr(full.visit_date.dt, 'isocalendar'):
    full['week_of_year'] = full.visit_date.dt.isocalendar().week.astype('int64')
else:  # pandas < 1.1
    full['week_of_year'] = full.visit_date.dt.weekofyear
full['is_month_end'] = full.visit_date.dt.is_month_end

In [64]:
# Preview the first rows to check the newly added datetime columns.
full.head()

Unnamed: 0,Unnamed: 1,air_store_id,id,visit_date,visitors,weekday,year,month,day_of_year,days_in_month,week_of_year,is_month_end
past,0,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-13,2016-01-13,25,2,2016,1,13,31,2,False
past,1,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-14,2016-01-14,32,3,2016,1,14,31,2,False
past,2,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-15,2016-01-15,29,4,2016,1,15,31,2,False
past,3,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-16,2016-01-16,22,5,2016,1,16,31,2,False
past,4,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-18,2016-01-18,6,0,2016,1,18,31,3,False


### Location of `stores`

In [65]:
# Read the per-store table (one row per air store in the preview below)
# from the scratch folder.
stores = pd.read_csv('scratch/stores.csv')

#### Split `area_name` string to get different levels of location names

In [66]:
# Area names look like "<Todofuken> <city> <street ...>". Splitting at most
# twice leaves any further spaces inside the street part (e.g.
# "Hakata Ekimae"), so no row-wise Python apply is needed to glue the tail
# back together. The reindex guarantees that columns 0..2 exist even if no
# area name has three parts.
area_split = (
    stores.air_area_name
    .str.split(' ', n=2, expand=True)
    .reindex(columns=range(3))
)

stores['Todofuken'] = area_split[0]
stores['city'] = area_split[1]
# Names with fewer than three parts get an empty street string.
stores['street'] = area_split[2].fillna('')

In [67]:
# Preview the store table with the derived Todofuken / city / street columns.
stores.head()

Unnamed: 0,air_store_id,air_genre_name,air_area_name,latitude_air,longitude_air,hpg_store_id,hpg_genre_name,hpg_area_name,latitude_hpg,longitude_hpg,Todofuken,city,street
0,air_0f0cdeee6c9bf3d7,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,,,,,,Hyōgo-ken,Kōbe-shi,Kumoidōri
1,air_7cc17a324ae5c7dc,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,hpg_9b38b9e13da6da27,,,,,Hyōgo-ken,Kōbe-shi,Kumoidōri
2,air_fee8dcf4d619598e,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,,,,,,Hyōgo-ken,Kōbe-shi,Kumoidōri
3,air_a17f0778617c76e2,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,,,,,,Hyōgo-ken,Kōbe-shi,Kumoidōri
4,air_83db5aff8f50478e,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,,,,,Tōkyō-to,Minato-ku,Shibakōen


#### Count stores in the same location

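Count the stores that share the same street (full `air_area_name`), the same city, and the same Todofuken (prefecture):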

In [68]:
# Store counts at three levels of granularity. Each result is a flat frame:
# the grouping key column(s) plus a single count column.
# `air_area_name` is the full "Todofuken city street" string, so grouping on
# it counts the stores located on the same street.
n_stores_by_street = (
    stores
    .groupby(['air_area_name'])
    .size()
    .reset_index(name='n_stores_same_street')
)
# City names are grouped together with their Todofuken.
n_stores_by_city = (
    stores
    .groupby(['Todofuken', 'city'])
    .size()
    .reset_index(name='n_stores_same_city')
)
n_stores_by_Todofuken = (
    stores
    .groupby('Todofuken')
    .size()
    .reset_index(name='n_stores_same_Todofuken')
)

In [69]:
# Preview the store counts per full area name.
n_stores_by_street.head()

Unnamed: 0,air_area_name,n_stores_same_street
0,Fukuoka-ken Fukuoka-shi Daimyō,64
1,Fukuoka-ken Fukuoka-shi Hakata Ekimae,16
2,Fukuoka-ken Fukuoka-shi Imaizumi,2
3,Fukuoka-ken Fukuoka-shi Momochi,6
4,Fukuoka-ken Fukuoka-shi Shiobaru,7


In [70]:
# Preview the store counts per (Todofuken, city) pair.
n_stores_by_city.head()

Unnamed: 0,Todofuken,city,n_stores_same_city
0,Fukuoka-ken,Fukuoka-shi,103
1,Fukuoka-ken,Itoshima-shi,7
2,Fukuoka-ken,Kitakyūshū-shi,9
3,Fukuoka-ken,Kurume-shi,6
4,Fukuoka-ken,Yame-shi,2


In [71]:
# Show the store counts for every Todofuken (bare expression, so the whole
# frame is rendered rather than just the head).
n_stores_by_Todofuken

Unnamed: 0,Todofuken,n_stores_same_Todofuken
0,Fukuoka-ken,127
1,Hiroshima-ken,32
2,Hokkaidō,46
3,Hyōgo-ken,57
4,Miyagi-ken,17
5,Niigata-ken,14
6,Shizuoka-ken,18
7,Tōkyō-to,444
8,Ōsaka-fu,74


In [72]:
# Attach the three count columns to every store row. Left joins keep all
# stores; each right-hand frame has one row per grouping key.
stores = (
    stores
    .merge(n_stores_by_street, how='left', on='air_area_name')
    .merge(n_stores_by_city, how='left', on=['Todofuken', 'city'])
    .merge(n_stores_by_Todofuken, how='left', on='Todofuken')
)

In [73]:
# Preview the store table with the three store-count columns merged in.
stores.head()

Unnamed: 0,air_store_id,air_genre_name,air_area_name,latitude_air,longitude_air,hpg_store_id,hpg_genre_name,hpg_area_name,latitude_hpg,longitude_hpg,Todofuken,city,street,n_stores_same_street,n_stores_same_city,n_stores_same_Todofuken
0,air_0f0cdeee6c9bf3d7,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,,,,,,Hyōgo-ken,Kōbe-shi,Kumoidōri,17,25,57
1,air_7cc17a324ae5c7dc,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,hpg_9b38b9e13da6da27,,,,,Hyōgo-ken,Kōbe-shi,Kumoidōri,17,25,57
2,air_fee8dcf4d619598e,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,,,,,,Hyōgo-ken,Kōbe-shi,Kumoidōri,17,25,57
3,air_a17f0778617c76e2,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,,,,,,Hyōgo-ken,Kōbe-shi,Kumoidōri,17,25,57
4,air_83db5aff8f50478e,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,,,,,Tōkyō-to,Minato-ku,Shibakōen,51,61,444


#### Add store info to `full`

In [75]:
# Attach the store attributes to every visit row. `validate` makes pandas
# raise a MergeError if an `air_store_id` appears more than once in `stores`,
# instead of silently duplicating visit rows.
# NOTE(review): merging with `on=` gives the result a fresh integer index, so
# the two-level ('past'/'future', row) index that `full` was loaded with is
# dropped here (the preview below shows 0, 1, 2, ...). Confirm that nothing
# downstream needs that label, or restore it after the merge.
full = pd.merge(left=full, right=stores, how='left', on='air_store_id',
                validate='many_to_one')

In [76]:
# Preview the visits table with the store attributes attached.
full.head()

Unnamed: 0,air_store_id,id,visit_date,visitors,weekday,year,month,day_of_year,days_in_month,week_of_year,...,hpg_genre_name,hpg_area_name,latitude_hpg,longitude_hpg,Todofuken,city,street,n_stores_same_street,n_stores_same_city,n_stores_same_Todofuken
0,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-13,2016-01-13,25,2,2016,1,13,31,2,...,,,,,Tōkyō-to,Minato-ku,Shibakōen,51,61,444
1,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-14,2016-01-14,32,3,2016,1,14,31,2,...,,,,,Tōkyō-to,Minato-ku,Shibakōen,51,61,444
2,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-15,2016-01-15,29,4,2016,1,15,31,2,...,,,,,Tōkyō-to,Minato-ku,Shibakōen,51,61,444
3,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-16,2016-01-16,22,5,2016,1,16,31,2,...,,,,,Tōkyō-to,Minato-ku,Shibakōen,51,61,444
4,air_ba937bf13d40fb24,air_ba937bf13d40fb24_2016-01-18,2016-01-18,6,0,2016,1,18,31,3,...,,,,,Tōkyō-to,Minato-ku,Shibakōen,51,61,444


### Reservation features