## Setup

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

import utils_10 as utils

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
%%html
<style>
/* Render every HTML table in the notebook output in a monospace font. */
table { font-family: monospace; }
</style>

## 01 Data Loading and Inspection

In [14]:
# Instantiate the Tips helper from `utils_10` and preview the first rows of
# its `tips` table.
t = utils.Tips()
t.tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


## 02 Column-Wise and Multiple Function Application

In [15]:
# Group the tips table by day and smoker status, then select a single column
# from the grouped frame.
# NOTE(review): the name `grouped_pct` suggests the `tip_pct` column, but
# `t.TIP` is selected here and the aggregated results are labelled `tip` —
# confirm which column was intended.
grouped = t.tips.groupby([t.DAY, t.SMOKER])
grouped_pct = grouped[t.TIP]
grouped, grouped_pct

(<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11759d130>,
 <pandas.core.groupby.generic.SeriesGroupBy object at 0x114f5d6a0>)

In [16]:
# `agg` accepts the name of an aggregation as a string
grouped_pct.agg("mean")

day   smoker
Fri   No        2.812500
      Yes       2.714000
Sat   No        3.102889
      Yes       2.875476
Sun   No        3.167895
      Yes       3.516842
Thur  No        2.673778
      Yes       3.030000
Name: tip, dtype: float64

In [17]:
# Calling the aggregation method directly gives the same result as `agg("mean")`
grouped_pct.mean()

day   smoker
Fri   No        2.812500
      Yes       2.714000
Sat   No        3.102889
      Yes       2.875476
Sun   No        3.167895
      Yes       3.516842
Thur  No        2.673778
      Yes       3.030000
Name: tip, dtype: float64

In [18]:
# Passing a list of function names to `agg` computes them all at once,
# producing one result column per function
grouped_pct.agg(["mean", "std", "count"])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,count
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,2.8125,0.898494,4
Fri,Yes,2.714,1.077668,15
Sat,No,3.102889,1.642088,45
Sat,Yes,2.875476,1.63058,42
Sun,No,3.167895,1.224785,57
Sun,Yes,3.516842,1.261151,19
Thur,No,2.673778,1.282964,45
Thur,Yes,3.03,1.113491,17


In [19]:
# Named aggregation: each keyword is the output column name and its value is
# the function to apply. The mapping is unpacked from a dict because
# "Standard Deviation" is not a valid Python identifier.
grouped_pct.agg(
    **{
        "Average": "mean",
        "Standard Deviation": "std",
        "Count": "count",
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Average,Standard Deviation,Count
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,2.8125,0.898494,4
Fri,Yes,2.714,1.077668,15
Sat,No,3.102889,1.642088,45
Sat,Yes,2.875476,1.63058,42
Sun,No,3.167895,1.224785,57
Sun,Yes,3.516842,1.261151,19
Thur,No,2.673778,1.282964,45
Thur,Yes,3.03,1.113491,17


In [20]:
# `agg` also works across several columns at once: every function in the list
# is applied to each selected column, giving hierarchical (column, function)
# result columns. Only "mean" is listed here; add more names to the list to
# compute several statistics per column.
functions = ["mean"]
grouped[[t.TIP, t.TOTAL_BILL]].agg(functions)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2
Fri,No,2.8125,18.42
Fri,Yes,2.714,16.813333
Sat,No,3.102889,19.661778
Sat,Yes,2.875476,21.276667
Sun,No,3.167895,20.506667
Sun,Yes,3.516842,24.12
Thur,No,2.673778,17.113111
Thur,Yes,3.03,19.190588


In [21]:
# A dict maps each column to its own aggregation(s): either a single function
# name or a list of names
grouped.agg(
    {
        t.TIP: "mean",
        t.TOTAL_BILL: ["mean", "max"],
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Fri,No,2.8125,18.42,22.75
Fri,Yes,2.714,16.813333,40.17
Sat,No,3.102889,19.661778,48.33
Sat,Yes,2.875476,21.276667,50.81
Sun,No,3.167895,20.506667,48.17
Sun,Yes,3.516842,24.12,45.35
Thur,No,2.673778,17.113111,41.19
Thur,Yes,3.03,19.190588,43.11


## 03 Apply: General Split-Apply-Combine

In [22]:
# Select the top five tip_pct values by group
# NOTE(review): `t` is re-created here although it was already built earlier
# in the notebook — presumably to pick up autoreloaded edits to `utils_10`;
# confirm it is needed.
t = utils.Tips()
t.top_tips_by_group()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [27]:
# Apply `t.top` to each smoker group directly. With `include_groups=False` the
# grouping column is not passed to `t.top`, so `smoker` appears only in the
# result's index.
# NOTE(review): `t.top` is defined in `utils_10`; presumably it returns the
# rows with the largest tip_pct — confirm there.
t.tips.groupby(t.SMOKER).apply(t.top, include_groups=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
No,232,11.61,3.39,Sat,Dinner,2,0.29199
No,149,7.51,2.0,Thur,Lunch,2,0.266312
No,51,10.29,2.6,Sun,Dinner,2,0.252672
No,185,20.69,5.0,Sun,Dinner,5,0.241663
No,88,24.71,5.85,Thur,Lunch,2,0.236746
Yes,172,7.25,5.15,Sun,Dinner,2,0.710345
Yes,178,9.6,4.0,Sun,Dinner,2,0.416667
Yes,67,3.07,1.0,Sat,Dinner,1,0.325733
Yes,183,23.17,6.5,Sun,Dinner,4,0.280535
Yes,109,14.31,4.0,Sat,Dinner,2,0.279525


In [None]:
# Passing `group_keys=False` to groupby() keeps the group labels out of the
# result index, so apply() returns the original row labels instead of a
# MultiIndex
(
    t.tips
    .groupby(t.SMOKER, group_keys=False)
    .apply(t.top, include_groups=False)
)

Unnamed: 0,total_bill,tip,day,time,size,tip_pct
232,11.61,3.39,Sat,Dinner,2,0.29199
149,7.51,2.0,Thur,Lunch,2,0.266312
51,10.29,2.6,Sun,Dinner,2,0.252672
185,20.69,5.0,Sun,Dinner,5,0.241663
88,24.71,5.85,Thur,Lunch,2,0.236746
172,7.25,5.15,Sun,Dinner,2,0.710345
178,9.6,4.0,Sun,Dinner,2,0.416667
67,3.07,1.0,Sat,Dinner,1,0.325733
183,23.17,6.5,Sun,Dinner,4,0.280535
109,14.31,4.0,Sat,Dinner,2,0.279525
