### Pandas Lab -- Grouping & Merging

Welcome to today's lab!  It will come in two different parts:  

One section will be devoted to using the `groupby` method in order to answer different questions about our data.  

The second portion will be devoted towards combining grouping & merging to create summary statistics -- one of the more important features you can add to a dataset for statistical modeling.  

In [129]:
#import modules
import pandas as pd
import numpy as np
import plotly.express as px
import datetime as dt

In [130]:
#load in data sets
base_path = '/Users/hannah.westberg/Projects/GA-DS-Repo/ClassMaterial/Unit1/data/restaurant_data/'
df_reserve = pd.read_csv(base_path+'air_reserve.csv', parse_dates=True)
df_store = pd.read_csv(base_path+'air_store_info.csv')
df_visit = pd.read_csv(base_path+'air_visit_data.csv', parse_dates=True)
df_date= pd.read_csv(base_path+'date_info.csv', parse_dates=True)
df_master= pd.read_csv(base_path+'master.csv')

In [131]:
df_reserve

Unnamed: 0.1,Unnamed: 0,air_store_id,visit_date,reserve_visitors
0,0,air_00a91d42b08b08d9,2016-10-31,2
1,1,air_00a91d42b08b08d9,2016-12-05,9
2,2,air_00a91d42b08b08d9,2016-12-14,18
3,3,air_00a91d42b08b08d9,2016-12-17,2
4,4,air_00a91d42b08b08d9,2016-12-20,4
...,...,...,...,...
29825,29825,air_fea5dc9594450608,2017-04-22,2
29826,29826,air_fea5dc9594450608,2017-04-25,2
29827,29827,air_fea5dc9594450608,2017-04-28,3
29828,29828,air_fea5dc9594450608,2017-05-20,6


In [132]:
df_reserve.sort_values(by=['air_store_id', 'visit_date'])

Unnamed: 0.1,Unnamed: 0,air_store_id,visit_date,reserve_visitors
0,0,air_00a91d42b08b08d9,2016-10-31,2
1,1,air_00a91d42b08b08d9,2016-12-05,9
2,2,air_00a91d42b08b08d9,2016-12-14,18
3,3,air_00a91d42b08b08d9,2016-12-17,2
4,4,air_00a91d42b08b08d9,2016-12-20,4
...,...,...,...,...
29825,29825,air_fea5dc9594450608,2017-04-22,2
29826,29826,air_fea5dc9594450608,2017-04-25,2
29827,29827,air_fea5dc9594450608,2017-04-28,3
29828,29828,air_fea5dc9594450608,2017-05-20,6


In [133]:
# one row per store id and visit date
df_reserve[df_reserve.duplicated(subset=['air_store_id', 'visit_date'])]

Unnamed: 0.1,Unnamed: 0,air_store_id,visit_date,reserve_visitors


In [134]:
df_store

Unnamed: 0,air_store_id,air_genre_name,air_area_name,latitude,longitude
0,air_0f0cdeee6c9bf3d7,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197853
1,air_7cc17a324ae5c7dc,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197853
2,air_fee8dcf4d619598e,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197853
3,air_a17f0778617c76e2,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197853
4,air_83db5aff8f50478e,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599
...,...,...,...,...,...
824,air_9bf595ef095572fb,International cuisine,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051
825,air_764f71040a413d4d,Asian,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051
826,air_10bbe8acd943d8f6,Asian,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051
827,air_7514d90009613cd6,Karaoke/Party,Hokkaidō Sapporo-shi Minami 3 Jōnishi,43.055460,141.340956


In [135]:
# one row per store id
df_store[df_store.duplicated(subset='air_store_id')]

Unnamed: 0,air_store_id,air_genre_name,air_area_name,latitude,longitude


In [136]:
df_visit

Unnamed: 0,air_store_id,visit_date,visitors
0,air_ba937bf13d40fb24,2016-01-13,25
1,air_ba937bf13d40fb24,2016-01-14,32
2,air_ba937bf13d40fb24,2016-01-15,29
3,air_ba937bf13d40fb24,2016-01-16,22
4,air_ba937bf13d40fb24,2016-01-18,6
...,...,...,...
252103,air_24e8414b9b07decb,2017-04-18,6
252104,air_24e8414b9b07decb,2017-04-19,6
252105,air_24e8414b9b07decb,2017-04-20,7
252106,air_24e8414b9b07decb,2017-04-21,8


In [137]:
# one row per store and visit date
df_visit[df_visit.duplicated(subset=['air_store_id', 'visit_date'])]

Unnamed: 0,air_store_id,visit_date,visitors


In [138]:
df_date

Unnamed: 0,calendar_date,day_of_week,holiday_flg
0,2016-01-01,Friday,1
1,2016-01-02,Saturday,1
2,2016-01-03,Sunday,1
3,2016-01-04,Monday,0
4,2016-01-05,Tuesday,0
...,...,...,...
512,2017-05-27,Saturday,0
513,2017-05-28,Sunday,0
514,2017-05-29,Monday,0
515,2017-05-30,Tuesday,0


In [139]:
# one row per calendar date
df_date[df_date.duplicated(subset='calendar_date')]

Unnamed: 0,calendar_date,day_of_week,holiday_flg


# Join together to recreate the master data set!

always start with what level of detail you want the final data set to have
and always check the LOD of the data sets you're using

In [140]:
df_store_visits = df_visit.merge(df_store, left_on='air_store_id', right_on='air_store_id', how='left')


In [141]:
df_store_visits.head()

Unnamed: 0,air_store_id,visit_date,visitors,air_genre_name,air_area_name,latitude,longitude
0,air_ba937bf13d40fb24,2016-01-13,25,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599
1,air_ba937bf13d40fb24,2016-01-14,32,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599
2,air_ba937bf13d40fb24,2016-01-15,29,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599
3,air_ba937bf13d40fb24,2016-01-16,22,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599
4,air_ba937bf13d40fb24,2016-01-18,6,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599


In [142]:
df_store_visits_reserves = df_store_visits.merge(df_reserve, left_on=['air_store_id', 'visit_date'], right_on=['air_store_id', 'visit_date'], how='left')



In [143]:
df_final = df_store_visits_reserves.merge(df_date, left_on='visit_date', right_on='calendar_date', how='left')

In [144]:
#check for duplicates
df_final[df_final.duplicated(subset=['air_store_id', 'visit_date'])]

Unnamed: 0.1,air_store_id,visit_date,visitors,air_genre_name,air_area_name,latitude,longitude,Unnamed: 0,reserve_visitors,calendar_date,day_of_week,holiday_flg


In [145]:
df_final.drop(columns=['Unnamed: 0'], inplace=True)

In [146]:
df_final.head()

Unnamed: 0,air_store_id,visit_date,visitors,air_genre_name,air_area_name,latitude,longitude,reserve_visitors,calendar_date,day_of_week,holiday_flg
0,air_ba937bf13d40fb24,2016-01-13,25,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,2016-01-13,Wednesday,0
1,air_ba937bf13d40fb24,2016-01-14,32,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,2016-01-14,Thursday,0
2,air_ba937bf13d40fb24,2016-01-15,29,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,2016-01-15,Friday,0
3,air_ba937bf13d40fb24,2016-01-16,22,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,2016-01-16,Saturday,0
4,air_ba937bf13d40fb24,2016-01-18,6,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,2016-01-18,Monday,0


In [147]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 252108 entries, 0 to 252107
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   air_store_id      252108 non-null  object 
 1   visit_date        252108 non-null  object 
 2   visitors          252108 non-null  int64  
 3   air_genre_name    252108 non-null  object 
 4   air_area_name     252108 non-null  object 
 5   latitude          252108 non-null  float64
 6   longitude         252108 non-null  float64
 7   reserve_visitors  28064 non-null   float64
 8   calendar_date     252108 non-null  object 
 9   day_of_week       252108 non-null  object 
 10  holiday_flg       252108 non-null  int64  
dtypes: float64(3), int64(2), object(6)
memory usage: 23.1+ MB


In [148]:
# fix dates
df_final['visit_date'] = pd.to_datetime(df_final['visit_date'])

In [149]:
df_final['calendar'] = pd.to_datetime(df_final['calendar_date'])

In [150]:
df_final.describe()

Unnamed: 0,visitors,latitude,longitude,reserve_visitors,holiday_flg
count,252108.0,252108.0,252108.0,28064.0,252108.0
mean,20.973761,35.613121,137.357865,13.751283,0.050673
std,16.757007,2.044473,3.671577,17.284799,0.219329
min,1.0,33.211967,130.195555,1.0,0.0
25%,9.0,34.692337,135.341564,5.0,0.0
50%,17.0,35.658068,139.670038,10.0,0.0
75%,29.0,35.694003,139.751599,18.0,0.0
max,877.0,44.020632,144.273398,1633.0,1.0


In [151]:
df_final['air_area_name'].value_counts()

Fukuoka-ken Fukuoka-shi Daimyō           19775
Tōkyō-to Shibuya-ku Shibuya              17352
Tōkyō-to Minato-ku Shibakōen             14696
Tōkyō-to Shinjuku-ku Kabukichō           12517
Tōkyō-to Setagaya-ku Setagaya             8719
                                         ...  
Fukuoka-ken Kitakyūshū-shi Konyamachi      505
Tōkyō-to Shibuya-ku Higashi                485
Tōkyō-to Setagaya-ku Kitazawa              484
Fukuoka-ken Fukuoka-shi Imaizumi           480
Ōsaka-fu Suita-shi Izumichō                479
Name: air_area_name, Length: 103, dtype: int64

In [152]:
df_master['area'].value_counts()

Fukuoka-ken Fukuoka-shi Daimyō           19775
Tōkyō-to Shibuya-ku Shibuya              17352
Tōkyō-to Minato-ku Shibakōen             14696
Tōkyō-to Shinjuku-ku Kabukichō           12517
Tōkyō-to Setagaya-ku Setagaya             8719
                                         ...  
Fukuoka-ken Kitakyūshū-shi Konyamachi      505
Tōkyō-to Shibuya-ku Higashi                485
Tōkyō-to Setagaya-ku Kitazawa              484
Fukuoka-ken Fukuoka-shi Imaizumi           480
Ōsaka-fu Suita-shi Izumichō                479
Name: area, Length: 103, dtype: int64

In [153]:
df_master.head()

Unnamed: 0,id,visit_date,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
0,air_ba937bf13d40fb24,2016-01-13,25,Wednesday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
1,air_ba937bf13d40fb24,2016-01-14,32,Thursday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
2,air_ba937bf13d40fb24,2016-01-15,29,Friday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
3,air_ba937bf13d40fb24,2016-01-16,22,Saturday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
4,air_ba937bf13d40fb24,2016-01-18,6,Monday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,


In [154]:
# final data set should have one row per restaurant per day
# reservations data- 

In [155]:
# example of only joining certain columns- you can select the columns in the dfs when you use
#use merge

### Section I - Grouping

In [156]:
#group by one column
df_final.groupby('air_store_id').mean()


Unnamed: 0_level_0,visitors,latitude,longitude,reserve_visitors,holiday_flg
air_store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
air_00a91d42b08b08d9,26.081897,35.694003,139.753595,5.833333,0.004310
air_0164b9927d20bcc3,9.248322,35.658068,139.751599,6.256410,0.033557
air_0241aa3964b7f861,9.896465,35.712607,139.779996,,0.042929
air_0328696196e46f18,7.939655,34.701279,135.528090,,0.051724
air_034a3d5b40d5b1b1,14.828685,34.692337,135.472229,7.153846,0.059761
...,...,...,...,...,...
air_fea5dc9594450608,14.485401,34.710895,137.725940,5.227273,0.036496
air_fee8dcf4d619598e,26.027778,34.695124,135.197853,5.000000,0.059028
air_fef9ccb3ba0da2f7,9.620408,34.815149,134.685353,,0.061224
air_ffcc2d5087e1b476,20.242798,35.658068,139.751599,,0.020576


In [157]:
#group by multiple columns
df_final.groupby(['air_store_id', 'day_of_week']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,visitors,latitude,longitude,reserve_visitors,holiday_flg
air_store_id,day_of_week,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
air_00a91d42b08b08d9,Friday,36.500000,35.694003,139.753595,2.0,0.000000
air_00a91d42b08b08d9,Monday,22.457143,35.694003,139.753595,5.5,0.028571
air_00a91d42b08b08d9,Saturday,14.973684,35.694003,139.753595,5.5,0.000000
air_00a91d42b08b08d9,Sunday,2.000000,35.694003,139.753595,,0.000000
air_00a91d42b08b08d9,Thursday,29.868421,35.694003,139.753595,12.0,0.000000
...,...,...,...,...,...,...
air_fff68b929994bfbd,Saturday,7.439024,35.708146,139.666288,,0.048780
air_fff68b929994bfbd,Sunday,4.000000,35.708146,139.666288,,0.000000
air_fff68b929994bfbd,Thursday,4.707317,35.708146,139.666288,,0.097561
air_fff68b929994bfbd,Tuesday,4.526316,35.708146,139.666288,,0.026316


In [158]:
# with group by- the group bys become the index
df_final.groupby(['air_store_id', 'day_of_week']).mean().index

MultiIndex([('air_00a91d42b08b08d9',    'Friday'),
            ('air_00a91d42b08b08d9',    'Monday'),
            ('air_00a91d42b08b08d9',  'Saturday'),
            ('air_00a91d42b08b08d9',    'Sunday'),
            ('air_00a91d42b08b08d9',  'Thursday'),
            ('air_00a91d42b08b08d9',   'Tuesday'),
            ('air_00a91d42b08b08d9', 'Wednesday'),
            ('air_0164b9927d20bcc3',    'Friday'),
            ('air_0164b9927d20bcc3',    'Monday'),
            ('air_0164b9927d20bcc3',  'Saturday'),
            ...
            ('air_ffcc2d5087e1b476',  'Thursday'),
            ('air_ffcc2d5087e1b476',   'Tuesday'),
            ('air_ffcc2d5087e1b476', 'Wednesday'),
            ('air_fff68b929994bfbd',    'Friday'),
            ('air_fff68b929994bfbd',    'Monday'),
            ('air_fff68b929994bfbd',  'Saturday'),
            ('air_fff68b929994bfbd',    'Sunday'),
            ('air_fff68b929994bfbd',  'Thursday'),
            ('air_fff68b929994bfbd',   'Tuesday'),
            ('a

In [159]:
# with group by- the group bys become the index- to sort by index do this
df_final.groupby(['air_store_id', 'day_of_week']).mean().sort_index(level = [0, 1])

Unnamed: 0_level_0,Unnamed: 1_level_0,visitors,latitude,longitude,reserve_visitors,holiday_flg
air_store_id,day_of_week,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
air_00a91d42b08b08d9,Friday,36.500000,35.694003,139.753595,2.0,0.000000
air_00a91d42b08b08d9,Monday,22.457143,35.694003,139.753595,5.5,0.028571
air_00a91d42b08b08d9,Saturday,14.973684,35.694003,139.753595,5.5,0.000000
air_00a91d42b08b08d9,Sunday,2.000000,35.694003,139.753595,,0.000000
air_00a91d42b08b08d9,Thursday,29.868421,35.694003,139.753595,12.0,0.000000
...,...,...,...,...,...,...
air_fff68b929994bfbd,Saturday,7.439024,35.708146,139.666288,,0.048780
air_fff68b929994bfbd,Sunday,4.000000,35.708146,139.666288,,0.000000
air_fff68b929994bfbd,Thursday,4.707317,35.708146,139.666288,,0.097561
air_fff68b929994bfbd,Tuesday,4.526316,35.708146,139.666288,,0.026316


In [160]:
# or just reset the index to make this back into a data frame
df_final.groupby(['air_store_id', 'day_of_week']).mean().reset_index()

Unnamed: 0,air_store_id,day_of_week,visitors,latitude,longitude,reserve_visitors,holiday_flg
0,air_00a91d42b08b08d9,Friday,36.500000,35.694003,139.753595,2.0,0.000000
1,air_00a91d42b08b08d9,Monday,22.457143,35.694003,139.753595,5.5,0.028571
2,air_00a91d42b08b08d9,Saturday,14.973684,35.694003,139.753595,5.5,0.000000
3,air_00a91d42b08b08d9,Sunday,2.000000,35.694003,139.753595,,0.000000
4,air_00a91d42b08b08d9,Thursday,29.868421,35.694003,139.753595,12.0,0.000000
...,...,...,...,...,...,...,...
5736,air_fff68b929994bfbd,Saturday,7.439024,35.708146,139.666288,,0.048780
5737,air_fff68b929994bfbd,Sunday,4.000000,35.708146,139.666288,,0.000000
5738,air_fff68b929994bfbd,Thursday,4.707317,35.708146,139.666288,,0.097561
5739,air_fff68b929994bfbd,Tuesday,4.526316,35.708146,139.666288,,0.026316


In [161]:
# some operations cannot be directly applied to a group by- isnull() is an example
df_final.isnull().sum()

air_store_id             0
visit_date               0
visitors                 0
air_genre_name           0
air_area_name            0
latitude                 0
longitude                0
reserve_visitors    224044
calendar_date            0
day_of_week              0
holiday_flg              0
calendar                 0
dtype: int64

In [162]:
# in these cases, you can use an apply statement:
# in this case, x refers to each of the unique groupings- or the storeid and day of week grouping
# think about the chain of operations you want to apply and then tack them on to the group by. 
# this allows you to apply complicated logic to different segments of data 

df_final.groupby(['air_store_id', 'day_of_week']).apply(lambda x: x.isnull().sum() )


Unnamed: 0_level_0,Unnamed: 1_level_0,air_store_id,visit_date,visitors,air_genre_name,air_area_name,latitude,longitude,reserve_visitors,calendar_date,day_of_week,holiday_flg,calendar
air_store_id,day_of_week,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
air_00a91d42b08b08d9,Friday,0,0,0,0,0,0,0,39,0,0,0,0
air_00a91d42b08b08d9,Monday,0,0,0,0,0,0,0,33,0,0,0,0
air_00a91d42b08b08d9,Saturday,0,0,0,0,0,0,0,36,0,0,0,0
air_00a91d42b08b08d9,Sunday,0,0,0,0,0,0,0,1,0,0,0,0
air_00a91d42b08b08d9,Thursday,0,0,0,0,0,0,0,37,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
air_fff68b929994bfbd,Saturday,0,0,0,0,0,0,0,41,0,0,0,0
air_fff68b929994bfbd,Sunday,0,0,0,0,0,0,0,30,0,0,0,0
air_fff68b929994bfbd,Thursday,0,0,0,0,0,0,0,41,0,0,0,0
air_fff68b929994bfbd,Tuesday,0,0,0,0,0,0,0,38,0,0,0,0


In [163]:
# sometimes, instead of a single stat, you might want to see multiple stats next to each other
# use agg and pass a list with functions
df_final.groupby('air_store_id')['reserve_visitors'].agg([np.mean, np.max, np.median])

Unnamed: 0_level_0,mean,amax,median
air_store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
air_00a91d42b08b08d9,5.833333,18.0,3.5
air_0164b9927d20bcc3,6.256410,26.0,5.0
air_0241aa3964b7f861,,,
air_0328696196e46f18,,,
air_034a3d5b40d5b1b1,7.153846,50.0,3.0
...,...,...,...
air_fea5dc9594450608,5.227273,15.0,4.0
air_fee8dcf4d619598e,5.000000,5.0,5.0
air_fef9ccb3ba0da2f7,,,
air_ffcc2d5087e1b476,,,


**Question 1:** What restaurant had the highest total amount of visitors throughout the dataset?

In [164]:
# your answer here

df_final.groupby('air_area_name')['visitors'].sum().sort_values(ascending=False)

# Fukuoka-ken Fukuoka-shi Daimyō had the most visitors

air_area_name
Fukuoka-ken Fukuoka-shi Daimyō           408708
Tōkyō-to Shibuya-ku Shibuya              366744
Tōkyō-to Minato-ku Shibakōen             288180
Tōkyō-to Shinjuku-ku Kabukichō           244996
Tōkyō-to Chūō-ku Tsukiji                 186263
                                          ...  
Ōsaka-fu Neyagawa-shi Honmachi             5424
Fukuoka-ken Kitakyūshū-shi Konyamachi      5254
Tōkyō-to Shibuya-ku Higashi                4133
Hokkaidō Katō-gun Motomachi                3731
Tōkyō-to Setagaya-ku Kitazawa              2166
Name: visitors, Length: 103, dtype: int64

**Question 2:** What was the average attendance for holidays & non-holidays for all restaurants?

In [165]:
# your answer here
df_final.groupby('holiday_flg')['visitors'].mean()

holiday_flg
0    20.828064
1    23.703327
Name: visitors, dtype: float64

**Question 3:** Can you grab the first 15 rows of dates for each restaurant?  The last 15 rows? (**Hint:** Use the `apply` method for this)

In [166]:
# your answer here
df_final.groupby('air_store_id').apply(lambda x: x.head(15)) 

# or this
df.groupby('id').apply(lambda x: x.iloc[:15])


NameError: name 'df' is not defined

**Question 4:** Grab the quarterly sales for each individual restaurant within our dataset

In [None]:
# your answer here
df_final['quarter']=df_final['visit_date'].dt.quarter

In [None]:
df_final.head()

In [None]:
df_final.groupby('quarter')['visitors'].sum()

**Question 5:** What restaurant had the highest amount of total reservations?

In [None]:
# your answer here
df_final.groupby('air_area_name')['reserve_visitors'].sum().sort_values(ascending=False)

**Question 6:** What is the total number of missing entries for each restaurant?  

In [None]:
# # your answer here

df_final.groupby('air_area_name').apply(lambda x: x.isnull()).sum()

**Question 7:**  Create two variables, `train` and `test`.  Make `train` a dataset that contains all but the **last 15 rows** for each restaurant, ordered chronologically.  Make `test` the last 15 rows for each restaurant.

In [None]:
# your answer here
# sort the values first

df_final.sort_values(by=['air_store_id', 'calendar_date'], inplace=True)

In [None]:
len(df_final) - 15

In [None]:
train = df_final.groupby('air_store_id').apply(lambda x: x.iloc[:-15])

test = df_final.groupby('air_store_id').apply(lambda x: x.iloc[-15:])

### Grouping & Merging

In this section of the lab, we are going to create different types of summary statistics -- where the rows for an individual sample can be compared with a larger group statistic.

**Bonus:** If you want to make this a little bit more effective, instead of using the entire `df`, try using a grouping from the `train` variable you just created, and use the grouping's values to populate both the training and test sets.

Use the technique discussed in class to create columns for the following stats:

**Question 1:** Create columns that list the average, median and standard deviation of visitors for each restaurant

In [173]:
# your answer here

df_final.groupby('air_store_id')['visitors'].agg([np.mean, np.median, np.std])


# .agg([np.mean, np.median, np.stdev])
# ({'A' : ['sum', 'min'], 'B' : ['min', 'max']})

Unnamed: 0_level_0,mean,median,std
air_store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
air_00a91d42b08b08d9,26.081897,26.0,12.435364
air_0164b9927d20bcc3,9.248322,8.0,6.348980
air_0241aa3964b7f861,9.896465,9.0,6.214877
air_0328696196e46f18,7.939655,6.0,6.733807
air_034a3d5b40d5b1b1,14.828685,12.0,13.154107
...,...,...,...
air_fea5dc9594450608,14.485401,13.0,8.528006
air_fee8dcf4d619598e,26.027778,27.0,9.799697
air_fef9ccb3ba0da2f7,9.620408,8.0,5.387333
air_ffcc2d5087e1b476,20.242798,19.0,10.917927


**Question 2:** Create a column that lists the average and median sales amount for each restaurant on a particular day of the week.

In [175]:
# your answer here
df_final.groupby(['air_store_id', 'day_of_week'])['visitors'].agg([np.mean, np.median, np.std])


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,std
air_store_id,day_of_week,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
air_00a91d42b08b08d9,Friday,36.500000,35.5,9.021342
air_00a91d42b08b08d9,Monday,22.457143,19.0,9.425693
air_00a91d42b08b08d9,Saturday,14.973684,11.0,16.793639
air_00a91d42b08b08d9,Sunday,2.000000,2.0,
air_00a91d42b08b08d9,Thursday,29.868421,30.0,7.143968
...,...,...,...,...
air_fff68b929994bfbd,Saturday,7.439024,7.0,3.521710
air_fff68b929994bfbd,Sunday,4.000000,3.5,2.477485
air_fff68b929994bfbd,Thursday,4.707317,4.0,2.786072
air_fff68b929994bfbd,Tuesday,4.526316,4.0,2.554665


**Question 3:** Create columns that display the average and median sales amount for each genre in each city on each day of the week.  Create a column called `city` that captures the first value of `area` in order to this.  Values should be `Tokyo`, `Hiroshima`, etc.  **Hint:** You should use the `str` attribute combined with `split` in order to do this.

In [186]:
# your answer here
df_final['City']= df_final['air_area_name'].str.split(expand=True)[0]

In [187]:
df_final.head()

Unnamed: 0,air_store_id,visit_date,visitors,air_genre_name,air_area_name,latitude,longitude,reserve_visitors,calendar_date,day_of_week,holiday_flg,calendar,City
0,air_ba937bf13d40fb24,2016-01-13,25,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,2016-01-13,Wednesday,0,2016-01-13,Tōkyō-to
1,air_ba937bf13d40fb24,2016-01-14,32,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,2016-01-14,Thursday,0,2016-01-14,Tōkyō-to
2,air_ba937bf13d40fb24,2016-01-15,29,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,2016-01-15,Friday,0,2016-01-15,Tōkyō-to
3,air_ba937bf13d40fb24,2016-01-16,22,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,2016-01-16,Saturday,0,2016-01-16,Tōkyō-to
4,air_ba937bf13d40fb24,2016-01-18,6,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,2016-01-18,Monday,0,2016-01-18,Tōkyō-to


In [188]:
df_final.groupby(['air_genre_name', 'City', 'day_of_week'])['visitors'].agg([np.mean, np.median, np.std])


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,median,std
air_genre_name,City,day_of_week,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Asian,Tōkyō-to,Friday,38.829545,38.0,14.999787
Asian,Tōkyō-to,Monday,33.964286,32.0,17.859860
Asian,Tōkyō-to,Saturday,44.684783,49.0,17.385057
Asian,Tōkyō-to,Sunday,47.533333,51.0,23.802181
Asian,Tōkyō-to,Thursday,33.357143,33.5,16.454266
...,...,...,...,...,...
Yakiniku/Korean food,Ōsaka-fu,Saturday,15.225275,14.0,9.543087
Yakiniku/Korean food,Ōsaka-fu,Sunday,19.165563,18.0,12.452272
Yakiniku/Korean food,Ōsaka-fu,Thursday,12.805556,10.5,8.244922
Yakiniku/Korean food,Ōsaka-fu,Tuesday,12.957746,10.0,10.621050
