# Aggregation and Grouping - Another Example
The VanderPlas book/notebook uses the planets data from Seaborn.  I'm going to do another example using the tips data (I've been a waiter at two restaurants, so this one was particularly interesting!).

In [1]:
# Setup: import seaborn and load its "tips" example dataset into `tips`
import seaborn as sns
tips = sns.load_dataset('tips')
# Sanity check: what kind of object came back, and its (rows, columns) shape
(type(tips), tips.shape)

(pandas.core.frame.DataFrame, (244, 7))

In [2]:
# Peek at the first five rows -- looks like a dataset of restaurant tips!
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Add a 'percent' column: the tip as a percentage of the total bill
# (tip divided by total_bill, then scaled by 100).
tips['percent'] = tips['tip'].div(tips['total_bill']).mul(100)
# Show the first rows again to confirm the new column is there
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,percent
0,16.99,1.01,Female,No,Sun,Dinner,2,5.944673
1,10.34,1.66,Male,No,Sun,Dinner,3,16.054159
2,21.01,3.5,Male,No,Sun,Dinner,3,16.658734
3,23.68,3.31,Male,No,Sun,Dinner,2,13.978041
4,24.59,3.61,Female,No,Sun,Dinner,4,14.680765


## Aggregate Functions

In [4]:
# Aggregate functions on the 'tip' column, returned as a (mean, median, total) tuple
(
    tips['tip'].mean(),
    tips['tip'].median(),
    tips['tip'].sum(),
)

(2.9982786885245902, 2.9, 731.5799999999999)

In [5]:
# Total number of people served: sum of the party-size column.
# The column is selected by label because `tips.size` is the DataFrame's own
# attribute (its element count), not the 'size' column.
tips.loc[:, 'size'].sum()

627

In [6]:
# Summary statistics for every numeric column of the whole dataset.
# The quartiles listed here are describe()'s defaults, written out explicitly.
tips.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,total_bill,tip,size,percent
count,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,16.080258
std,8.902412,1.383638,0.9511,6.10722
min,3.07,1.0,1.0,3.563814
25%,13.3475,2.0,2.0,12.912736
50%,17.795,2.9,2.0,15.476977
75%,24.1275,3.5625,3.0,19.147549
max,50.81,10.0,6.0,71.034483


In [7]:
# wow -- who left a 71% tip?
# Select the row(s) holding the column maximum instead of a hand-typed cutoff:
# `> 71` only worked because the maximum happens to be 71.03.
# Looks like a male smoker who ate dinner on a Sunday with one other person.
tips[tips['percent'] == tips['percent'].max()]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,percent
172,7.25,5.15,Male,Yes,Sun,Dinner,2,71.034483


In [8]:
# Who ran up a bill of more than $50?  (describe() showed a max of $50.81.)
tips.loc[tips['total_bill'].gt(50)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,percent
170,50.81,10.0,Male,Yes,Sat,Dinner,3,19.681165


In [9]:
# Which days of the week appear in the data?
set(tips['day'].unique())

{'Fri', 'Sat', 'Sun', 'Thur'}

In [10]:
# Best day to work?  One approach: filter the rows one day at a time,
# starting with Thursday (boolean mask -> row selection).
tips.loc[tips['day'].eq('Thur')]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,percent
77,27.20,4.00,Male,No,Thur,Lunch,4,14.705882
78,22.76,3.00,Male,No,Thur,Lunch,2,13.181019
79,17.29,2.71,Male,No,Thur,Lunch,2,15.673800
80,19.44,3.00,Male,Yes,Thur,Lunch,2,15.432099
81,16.66,3.40,Male,No,Thur,Lunch,2,20.408163
...,...,...,...,...,...,...,...,...
202,13.00,2.00,Female,Yes,Thur,Lunch,2,15.384615
203,16.40,2.50,Female,Yes,Thur,Lunch,2,15.243902
204,20.53,4.00,Male,Yes,Thur,Lunch,4,19.483682
205,16.47,3.23,Female,Yes,Thur,Lunch,3,19.611415


In [11]:
# Now pick off the Thursday tips and sum them ... and then repeat for Fri, Sat, Sun.
# A single .loc[rows, column] lookup replaces the chained tips[mask]['tip'],
# which first builds an intermediate filtered DataFrame and then indexes into it.
tips.loc[tips['day'] == 'Thur', 'tip'].sum()

171.82999999999996

## Group By

In [12]:
# Use groupby to grab them all at once ...  what days do I want to work?
# 'day' is a categorical column, so observed=False is spelled out: it pins the
# "keep every category" behaviour instead of relying on a default that newer
# pandas versions deprecate and change.
# Why double brackets?  [['tip']] selects a *list* of columns, so the result
# stays a DataFrame; a single ['tip'] would give a Series.
# https://stackoverflow.com/questions/33417991/pandas-why-are-double-brackets-needed-to-select-column-after-boolean-indexing
tips.groupby('day', observed=False)[['tip']].sum()

Unnamed: 0_level_0,tip
day,Unnamed: 1_level_1
Thur,171.83
Fri,51.96
Sat,260.4
Sun,247.39


In [13]:
# So, what is the object created by groupby()?  Nothing is computed yet -- it is
# a GroupBy object waiting for an aggregation to be applied.
# observed=False is explicit because 'day' is categorical and the implicit
# default is deprecated/changed in newer pandas.
tips.groupby('day', observed=False)[['tip']]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001CAA24F14F0>

In [14]:
# Add the total bill: per-day sums of both total_bill and tip.
# observed=False is explicit because 'day' is categorical and the implicit
# default is deprecated/changed in newer pandas.
tips.groupby('day', observed=False)[['total_bill', 'tip']].sum()

Unnamed: 0_level_0,total_bill,tip
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Thur,1096.33,171.83
Fri,325.88,51.96
Sat,1778.4,260.4
Sun,1627.16,247.39


In [15]:
# Look at some averages by day: mean bill, mean tip, and mean tip percentage.
# observed=False is explicit because 'day' is categorical and the implicit
# default is deprecated/changed in newer pandas.
tips.groupby('day', observed=False)[['total_bill', 'tip', 'percent']].mean()

Unnamed: 0_level_0,total_bill,tip,percent
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Thur,17.682742,2.771452,16.127563
Fri,17.151579,2.734737,16.991303
Sat,20.441379,2.993103,15.315172
Sun,21.41,3.255132,16.689729


In [16]:
# Aggregate - several summary stats at once.  Passing a list of function names
# produces a two-level column index: (column, statistic).
# observed=False is explicit because 'day' is categorical and the implicit
# default is deprecated/changed in newer pandas.
tips.groupby('day', observed=False)[['total_bill', 'tip']].agg(['sum', 'mean', 'median'])

Unnamed: 0_level_0,total_bill,total_bill,total_bill,tip,tip,tip
Unnamed: 0_level_1,sum,mean,median,sum,mean,median
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Thur,1096.33,17.682742,16.2,171.83,2.771452,2.305
Fri,325.88,17.151579,15.38,51.96,2.734737,3.0
Sat,1778.4,20.441379,18.24,260.4,2.993103,2.75
Sun,1627.16,21.41,19.63,247.39,3.255132,3.15


In [17]:
# Median values by time of day (lunch/dinner).
# observed=False is explicit because 'time' is categorical and the implicit
# default is deprecated/changed in newer pandas.
tips.groupby('time', observed=False)[['total_bill', 'tip', 'percent']].median()

Unnamed: 0_level_0,total_bill,tip,percent
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Lunch,15.965,2.25,15.408357
Dinner,18.39,3.0,15.540002


In [18]:
# Female/male?  Mean bill, tip, and tip percentage by the payer's sex.
# observed=False is explicit because 'sex' is categorical and the implicit
# default is deprecated/changed in newer pandas.
tips.groupby('sex', observed=False)[['total_bill', 'tip', 'percent']].mean()

Unnamed: 0_level_0,total_bill,tip,percent
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,20.744076,3.089618,15.765055
Female,18.056897,2.833448,16.649074


In [19]:
# Smoker/non-smoker?  Mean, median, and standard deviation of bill and tip.
# observed=False is explicit because 'smoker' is categorical and the implicit
# default is deprecated/changed in newer pandas.
tips.groupby('smoker', observed=False)[['total_bill', 'tip']].agg(['mean', 'median', 'std'])

Unnamed: 0_level_0,total_bill,total_bill,total_bill,tip,tip,tip
Unnamed: 0_level_1,mean,median,std,mean,median,std
smoker,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Yes,20.756344,17.92,9.832154,3.00871,3.0,1.401468
No,19.188278,17.59,8.255582,2.991854,2.74,1.37719


In [20]:
# Multiple levels -- grouping by a list of keys gives one row per (day, sex)
# combination, indexed by a two-level MultiIndex.
# observed=False is explicit: with several categorical keys it decides whether
# unobserved combinations get a row, and the implicit default is
# deprecated/changed in newer pandas.
tips.groupby(['day', 'sex'], observed=False)[['total_bill', 'tip', 'percent']].median()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,percent
day,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Thur,Male,16.975,2.53,15.552949
Thur,Female,13.785,2.005,15.314259
Fri,Male,17.215,2.6,13.373871
Fri,Female,15.38,3.0,19.505852
Sat,Male,18.24,3.0,15.183246
Sat,Female,18.36,2.625,15.10911
Sun,Male,20.725,3.085,15.78451
Sun,Female,17.41,3.5,16.939739


In [21]:
# Mean and median of bill, tip, and tip percentage for each (day, smoker) pair.
# observed=False is explicit: with several categorical keys it decides whether
# unobserved combinations get a row, and the implicit default is
# deprecated/changed in newer pandas.
tips.groupby(['day', 'smoker'], observed=False)[['total_bill', 'tip', 'percent']].agg(['mean', 'median'])

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,tip,tip,percent,percent
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median,mean,median
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Thur,Yes,19.190588,16.47,3.03,2.56,16.386327,15.384615
Thur,No,17.113111,15.95,2.673778,2.18,16.029808,15.349194
Fri,Yes,16.813333,13.42,2.714,2.5,17.478305,17.391304
Fri,No,18.42,19.235,2.8125,3.125,15.165044,14.924093
Sat,Yes,21.276667,20.39,2.875476,2.69,14.790607,15.362439
Sat,No,19.661778,17.82,3.102889,2.75,15.804766,15.015198
Sun,Yes,24.12,23.1,3.516842,3.5,18.725032,13.812155
Sun,No,20.506667,18.43,3.167895,3.02,16.011294,16.166505


In [22]:
# Three grouping levels: medians for each (day, sex, smoker) combination.
# observed=False is explicit: with several categorical keys it decides whether
# unobserved combinations get a row, and the implicit default is
# deprecated/changed in newer pandas.
tips.groupby(['day', 'sex', 'smoker'], observed=False)[['total_bill', 'tip', 'percent']].median()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,percent
day,sex,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Thur,Male,Yes,17.645,2.78,15.365988
Thur,Male,No,16.975,2.405,16.522346
Thur,Female,Yes,16.4,2.5,15.384615
Thur,Female,No,13.42,2.0,14.688602
Fri,Male,Yes,17.215,2.6,13.373871
Fri,Male,No,17.475,2.5,13.800498
Fri,Female,Yes,13.42,2.5,19.821606
Fri,Female,No,19.365,3.125,16.529591
Sat,Male,Yes,20.29,3.0,14.482259
Sat,Male,No,17.87,2.86,15.435249


## Dispatch Methods

In [23]:
# Use the describe() method that's defined for DataFrame and Series objects --
# essentially "passed through" the GroupBy and applied to each day's rows.
# observed=False is explicit because 'day' is categorical and the implicit
# default is deprecated/changed in newer pandas.
tips.groupby('day', observed=False)[['total_bill', 'tip']].describe()

Unnamed: 0_level_0,total_bill,total_bill,total_bill,total_bill,total_bill,total_bill,total_bill,total_bill,tip,tip,tip,tip,tip,tip,tip,tip
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Thur,62.0,17.682742,7.88617,7.51,12.4425,16.2,20.155,43.11,62.0,2.771452,1.240223,1.25,2.0,2.305,3.3625,6.7
Fri,19.0,17.151579,8.30266,5.75,12.095,15.38,21.75,40.17,19.0,2.734737,1.019577,1.0,1.96,3.0,3.365,4.73
Sat,87.0,20.441379,9.480419,3.07,13.905,18.24,24.74,50.81,87.0,2.993103,1.631014,1.0,2.0,2.75,3.37,10.0
Sun,76.0,21.41,8.832122,7.25,14.9875,19.63,25.5975,48.17,76.0,3.255132,1.23488,1.01,2.0375,3.15,4.0,6.5


## Group By Iterator -- Iterate through the resulting groups

In [24]:
# Loop through the day groups: iterating a GroupBy yields (key, sub-DataFrame)
# pairs.  For each day print its name, number of bills, and total billed amount.
# observed=False is explicit because 'day' is categorical and the implicit
# default is deprecated/changed in newer pandas.
for day, group in tips.groupby('day', observed=False):
    print("{:5s}  {:3d}  ${:.2f}".format(day, len(group), group['total_bill'].sum()))

Thur    62  $1096.33
Fri     19  $325.88
Sat     87  $1778.40
Sun     76  $1627.16
