# Chapter 10: Data Aggregation and Group Operations

In [1]:
import numpy as np
import pandas as pd

## 10.1: How to Think About Group Operations

In [2]:
df=pd.DataFrame({"key1":["a","a",None,"b","b","a",None],
                 "key2":pd.Series([1,2,1,2,1,None,1],dtype="Int64"),
                 "data1":np.random.standard_normal(7),
                 "data2":np.random.standard_normal(7)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.596161,0.396495
1,a,2.0,-1.481188,-0.755564
2,,1.0,-1.388448,-0.32682
3,b,2.0,1.552896,0.667659
4,b,1.0,-0.793779,-2.116557
5,a,,-0.354812,0.270046
6,,1.0,-0.215277,0.354816


In [3]:
grouped=df["data1"].groupby(df["key1"])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000159FF4287D0>

In [4]:
grouped.mean()

key1
a   -0.810720
b    0.379558
Name: data1, dtype: float64

In [5]:
grouped.value_counts()

key1  data1    
a     -1.481188    1
      -0.596161    1
      -0.354812    1
b     -0.793779    1
       1.552896    1
Name: count, dtype: int64

In [6]:
means=df["data1"].groupby([df["key1"],df["key2"]]).mean()
means

key1  key2
a     1      -0.596161
      2      -1.481188
b     1      -0.793779
      2       1.552896
Name: data1, dtype: float64

In [7]:
means.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.596161,-1.481188
b,-0.793779,1.552896


In [8]:
states=np.array(["OH","CA","CA","OH","OH","CA","OH"])
years=[2005,2005,2006,2005,2006,2005,2006]
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.596161,0.396495
1,a,2.0,-1.481188,-0.755564
2,,1.0,-1.388448,-0.32682
3,b,2.0,1.552896,0.667659
4,b,1.0,-0.793779,-2.116557
5,a,,-0.354812,0.270046
6,,1.0,-0.215277,0.354816


In [9]:
df["states"]=states
df["years"]=years
df

Unnamed: 0,key1,key2,data1,data2,states,years
0,a,1.0,-0.596161,0.396495,OH,2005
1,a,2.0,-1.481188,-0.755564,CA,2005
2,,1.0,-1.388448,-0.32682,CA,2006
3,b,2.0,1.552896,0.667659,OH,2005
4,b,1.0,-0.793779,-2.116557,OH,2006
5,a,,-0.354812,0.270046,CA,2005
6,,1.0,-0.215277,0.354816,OH,2006


In [10]:
df1=df["data1"]
df1

0   -0.596161
1   -1.481188
2   -1.388448
3    1.552896
4   -0.793779
5   -0.354812
6   -0.215277
Name: data1, dtype: float64

In [11]:
df1.groupby([states,years]).mean()

CA  2005   -0.918000
    2006   -1.388448
OH  2005    0.478367
    2006   -0.504528
Name: data1, dtype: float64

In [12]:
df=df.drop(columns=["states","years"])

In [13]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.596161,0.396495
1,a,2.0,-1.481188,-0.755564
2,,1.0,-1.388448,-0.32682
3,b,2.0,1.552896,0.667659
4,b,1.0,-0.793779,-2.116557
5,a,,-0.354812,0.270046
6,,1.0,-0.215277,0.354816


In [14]:
df.groupby("key1").mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,-0.81072,-0.029674
b,1.5,0.379558,-0.724449


In [15]:
df.groupby("key2").mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.748416,-0.423017
2,0.035854,-0.043952


In [16]:
df.groupby(["key1","key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,-0.596161,0.396495
a,2,-1.481188,-0.755564
b,1,-0.793779,-2.116557
b,2,1.552896,0.667659


In [17]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.596161,0.396495
1,a,2.0,-1.481188,-0.755564
2,,1.0,-1.388448,-0.32682
3,b,2.0,1.552896,0.667659
4,b,1.0,-0.793779,-2.116557
5,a,,-0.354812,0.270046
6,,1.0,-0.215277,0.354816


In [18]:
df.groupby(["key1","key2"]).size()

key1  key2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

In [19]:
df.groupby(["key1","key2"],dropna=False).size()

key1  key2
a     1       1
      2       1
      <NA>    1
b     1       1
      2       1
NaN   1       2
dtype: int64

In [20]:
df.groupby("key1").count()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,3,3
b,2,2,2


In [21]:
df.groupby("key1").size()

key1
a    3
b    2
dtype: int64

### Iterating over Groups

In [22]:
for name,group in df.groupby("key1"):
    print(name)
    print(group)

a
  key1  key2     data1     data2
0    a     1 -0.596161  0.396495
1    a     2 -1.481188 -0.755564
5    a  <NA> -0.354812  0.270046
b
  key1  key2     data1     data2
3    b     2  1.552896  0.667659
4    b     1 -0.793779 -2.116557


In [23]:
for (k1,k2),group in df.groupby(["key1","key2"]):
    print(k1)
    print(k2)
    print(group)

a
1
  key1  key2     data1     data2
0    a     1 -0.596161  0.396495
a
2
  key1  key2     data1     data2
1    a     2 -1.481188 -0.755564
b
1
  key1  key2     data1     data2
4    b     1 -0.793779 -2.116557
b
2
  key1  key2     data1     data2
3    b     2  1.552896  0.667659


In [24]:
pieces={name:group for name,group in df.groupby("key1")}
pieces

{'a':   key1  key2     data1     data2
 0    a     1 -0.596161  0.396495
 1    a     2 -1.481188 -0.755564
 5    a  <NA> -0.354812  0.270046,
 'b':   key1  key2     data1     data2
 3    b     2  1.552896  0.667659
 4    b     1 -0.793779 -2.116557}

In [38]:
pieces["b"]

Unnamed: 0,key1,key2,data1,data2
3,b,2,0.879961,0.501058
4,b,1,-0.85,-0.572475


In [25]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.596161,0.396495
1,a,2.0,-1.481188,-0.755564
2,,1.0,-1.388448,-0.32682
3,b,2.0,1.552896,0.667659
4,b,1.0,-0.793779,-2.116557
5,a,,-0.354812,0.270046
6,,1.0,-0.215277,0.354816


In [33]:
grouped=df.groupby({"key1":"key","key2":"key","data1":"data","data2":"data"},axis="columns")
grouped

  grouped=df.groupby({"key1":"key","key2":"key","data1":"data","data2":"data"},axis="columns")


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000159FEABBBD0>

In [34]:
df.T

Unnamed: 0,0,1,2,3,4,5,6
key1,a,a,,b,b,a,
key2,1,2,1.0,2,1,,1.0
data1,-0.596161,-1.481188,-1.388448,1.552896,-0.793779,-0.354812,-0.215277
data2,0.396495,-0.755564,-0.32682,0.667659,-2.116557,0.270046,0.354816


In [35]:
list(grouped)

[('data',
        data1     data2
  0 -0.596161  0.396495
  1 -1.481188 -0.755564
  2 -1.388448 -0.326820
  3  1.552896  0.667659
  4 -0.793779 -2.116557
  5 -0.354812  0.270046
  6 -0.215277  0.354816),
 ('key',
     key1  key2
  0     a     1
  1     a     2
  2  None     1
  3     b     2
  4     b     1
  5     a  <NA>
  6  None     1)]

In [37]:
for group_key,group_values in grouped:
    print(group_key)
    print(group_values)

data
      data1     data2
0 -0.596161  0.396495
1 -1.481188 -0.755564
2 -1.388448 -0.326820
3  1.552896  0.667659
4 -0.793779 -2.116557
5 -0.354812  0.270046
6 -0.215277  0.354816
key
   key1  key2
0     a     1
1     a     2
2  None     1
3     b     2
4     b     1
5     a  <NA>
6  None     1


### Selecting a Column or Subset of Columns

In [39]:
list(df.groupby("key1")["data1"])

[('a',
  0   -0.596161
  1   -1.481188
  5   -0.354812
  Name: data1, dtype: float64),
 ('b',
  3    1.552896
  4   -0.793779
  Name: data1, dtype: float64)]

In [41]:
list(df["data1"].groupby(df["key1"]))

[('a',
  0   -0.596161
  1   -1.481188
  5   -0.354812
  Name: data1, dtype: float64),
 ('b',
  3    1.552896
  4   -0.793779
  Name: data1, dtype: float64)]

In [43]:
df.groupby(["key1","key2"])[["data2"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,0.396495
a,2,-0.755564
b,1,-2.116557
b,2,0.667659


In [44]:
df.groupby(["key1", "key2"])["data2"].mean()

key1  key2
a     1       0.396495
      2      -0.755564
b     1      -2.116557
      2       0.667659
Name: data2, dtype: float64

### Grouping with Dictionaries and Series

In [45]:
people=pd.DataFrame(np.random.standard_normal((5,5)),
                    columns=["a","b","c","d","e"],
                    index=["Joe","Steve","Wanda","Jill","Trey"])
people

Unnamed: 0,a,b,c,d,e
Joe,-0.511938,-0.237204,-0.179721,-0.515863,-0.071186
Steve,-0.70804,0.021124,0.100103,-0.169757,-0.104073
Wanda,0.566411,-0.932951,0.243359,0.992808,0.084624
Jill,-0.118987,-0.301634,1.35123,-0.209966,0.804185
Trey,-1.815636,-1.048429,0.451249,0.768155,-0.929682


In [47]:
people.iloc[2,[1,2]]=np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-0.511938,-0.237204,-0.179721,-0.515863,-0.071186
Steve,-0.70804,0.021124,0.100103,-0.169757,-0.104073
Wanda,0.566411,,,0.992808,0.084624
Jill,-0.118987,-0.301634,1.35123,-0.209966,0.804185
Trey,-1.815636,-1.048429,0.451249,0.768155,-0.929682


In [48]:
mapping={"a":"red","b":"red","c":"blue","d":"blue","e":"red","f":"orange"}

In [50]:
by_column=people.groupby(mapping,axis="columns")
by_column.sum()

  by_column=people.groupby(mapping,axis="columns")


Unnamed: 0,blue,red
Joe,-0.695583,-0.820329
Steve,-0.069654,-0.790989
Wanda,0.992808,0.651036
Jill,1.141264,0.383564
Trey,1.219403,-3.793747


In [51]:
map_series=pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [52]:
people.groupby(map_series,axis="columns").sum()

  people.groupby(map_series,axis="columns").sum()


Unnamed: 0,blue,red
Joe,-0.695583,-0.820329
Steve,-0.069654,-0.790989
Wanda,0.992808,0.651036
Jill,1.141264,0.383564
Trey,1.219403,-3.793747


### Grouping with Functions

In [53]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.511938,-0.237204,-0.179721,-0.515863,-0.071186
4,-1.934623,-1.350063,1.802479,0.558188,-0.125497
5,-0.141629,0.021124,0.100103,0.823051,-0.019448


In [55]:
people

Unnamed: 0,a,b,c,d,e
Joe,-0.511938,-0.237204,-0.179721,-0.515863,-0.071186
Steve,-0.70804,0.021124,0.100103,-0.169757,-0.104073
Wanda,0.566411,,,0.992808,0.084624
Jill,-0.118987,-0.301634,1.35123,-0.209966,0.804185
Trey,-1.815636,-1.048429,0.451249,0.768155,-0.929682


In [54]:
key_list=["one","one","one","two","two"]
people.groupby([len,key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.511938,-0.237204,-0.179721,-0.515863,-0.071186
4,two,-1.815636,-1.048429,0.451249,-0.209966,-0.929682
5,one,-0.70804,0.021124,0.100103,-0.169757,-0.104073


In [56]:
list(people.groupby([len,key_list]))

[((3, 'one'),
              a         b         c         d         e
  Joe -0.511938 -0.237204 -0.179721 -0.515863 -0.071186),
 ((4, 'two'),
               a         b         c         d         e
  Jill -0.118987 -0.301634  1.351230 -0.209966  0.804185
  Trey -1.815636 -1.048429  0.451249  0.768155 -0.929682),
 ((5, 'one'),
                a         b         c         d         e
  Steve -0.708040  0.021124  0.100103 -0.169757 -0.104073
  Wanda  0.566411       NaN       NaN  0.992808  0.084624)]

### Grouping by Index Levels

In [58]:
columns=pd.MultiIndex.from_arrays([["US","US","US","JP","JP"],
                                   [1,3,5,1,3]],
                                   names=["cty","tenor"])
columns

MultiIndex([('US', 1),
            ('US', 3),
            ('US', 5),
            ('JP', 1),
            ('JP', 3)],
           names=['cty', 'tenor'])

In [59]:
hier_df=pd.DataFrame(np.random.standard_normal((4,5)),columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-1.664421,1.13526,-0.001298,0.195735,1.095595
1,-0.377991,0.941386,-0.236948,0.520007,-0.9817
2,-0.760076,0.366727,-0.890437,-0.852954,0.976746
3,1.699227,0.941977,-0.004966,0.28718,-0.084813


In [63]:
hier_df.groupby(level="cty",axis="columns").count()

  hier_df.groupby(level="cty",axis="columns").count()


cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


## 10.2: Data Aggregation

In [64]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.596161,0.396495
1,a,2.0,-1.481188,-0.755564
2,,1.0,-1.388448,-0.32682
3,b,2.0,1.552896,0.667659
4,b,1.0,-0.793779,-2.116557
5,a,,-0.354812,0.270046
6,,1.0,-0.215277,0.354816


In [65]:
grouped=df.groupby("key1")
grouped["data1"].nsmallest(2)

key1   
a     1   -1.481188
      0   -0.596161
b     4   -0.793779
      3    1.552896
Name: data1, dtype: float64

In [66]:
def peak_to_peak(arr):
    return arr.max()-arr.min()

In [67]:
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1.126376,1.15206
b,1,2.346674,2.784216


In [69]:
list(grouped)

[('a',
    key1  key2     data1     data2
  0    a     1 -0.596161  0.396495
  1    a     2 -1.481188 -0.755564
  5    a  <NA> -0.354812  0.270046),
 ('b',
    key1  key2     data1     data2
  3    b     2  1.552896  0.667659
  4    b     1 -0.793779 -2.116557)]

In [70]:
grouped.describe()

Unnamed: 0_level_0,key2,key2,key2,key2,key2,key2,key2,key2,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
a,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,3.0,-0.81072,...,-0.475487,-0.354812,3.0,-0.029674,0.63181,-0.755564,-0.242759,0.270046,0.333271,0.396495
b,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,2.0,0.379558,...,0.966227,1.552896,2.0,-0.724449,1.968738,-2.116557,-1.420503,-0.724449,-0.028395,0.667659


### Column-Wise and Multiple Function Application

In [71]:
tips=pd.read_csv("../../examples/tips.csv")
tips["tip_pct"]=tips["tip"]/tips["total_bill"]
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.50,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.139780
4,24.59,3.61,No,Sun,Dinner,4,0.146808
...,...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3,0.203927
240,27.18,2.00,Yes,Sat,Dinner,2,0.073584
241,22.67,2.00,Yes,Sat,Dinner,2,0.088222
242,17.82,1.75,No,Sat,Dinner,2,0.098204


In [72]:
grouped=tips.groupby(["day","smoker"])
list(grouped)

[(('Fri', 'No'),
       total_bill   tip smoker  day    time  size   tip_pct
  91        22.49  3.50     No  Fri  Dinner     2  0.155625
  94        22.75  3.25     No  Fri  Dinner     2  0.142857
  99        12.46  1.50     No  Fri  Dinner     2  0.120385
  223       15.98  3.00     No  Fri   Lunch     3  0.187735),
 (('Fri', 'Yes'),
       total_bill   tip smoker  day    time  size   tip_pct
  90        28.97  3.00    Yes  Fri  Dinner     2  0.103555
  92         5.75  1.00    Yes  Fri  Dinner     2  0.173913
  93        16.32  4.30    Yes  Fri  Dinner     2  0.263480
  95        40.17  4.73    Yes  Fri  Dinner     4  0.117750
  96        27.28  4.00    Yes  Fri  Dinner     2  0.146628
  97        12.03  1.50    Yes  Fri  Dinner     2  0.124688
  98        21.01  3.00    Yes  Fri  Dinner     2  0.142789
  100       11.35  2.50    Yes  Fri  Dinner     2  0.220264
  101       15.38  3.00    Yes  Fri  Dinner     2  0.195059
  220       12.16  2.20    Yes  Fri   Lunch     2  0.180921
  2

In [74]:
grouped_pct=grouped["tip_pct"]
list(grouped_pct)

[(('Fri', 'No'),
  91     0.155625
  94     0.142857
  99     0.120385
  223    0.187735
  Name: tip_pct, dtype: float64),
 (('Fri', 'Yes'),
  90     0.103555
  92     0.173913
  93     0.263480
  95     0.117750
  96     0.146628
  97     0.124688
  98     0.142789
  100    0.220264
  101    0.195059
  220    0.180921
  221    0.259314
  222    0.223776
  224    0.117735
  225    0.153657
  226    0.198216
  Name: tip_pct, dtype: float64),
 (('Sat', 'No'),
  19     0.162228
  20     0.227679
  21     0.135535
  22     0.141408
  23     0.192288
  24     0.160444
  25     0.131387
  26     0.149589
  27     0.157604
  28     0.198157
  29     0.152672
  30     0.151832
  31     0.136240
  32     0.199203
  33     0.118415
  34     0.183915
  35     0.149626
  36     0.122624
  37     0.181335
  38     0.123596
  39     0.159898
  40     0.139651
  57     0.056797
  59     0.139424
  64     0.150085
  65     0.156873
  66     0.150152
  68     0.099357
  70     0.163894
  71     0.17574

In [75]:
grouped_pct.agg("mean")

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [76]:
grouped_pct.mean()

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [77]:
grouped_pct.agg(["mean","std",peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [81]:
grouped_pct.agg([("average","mean"),("stdev","std"),("peak_to_peak",peak_to_peak)])

Unnamed: 0_level_0,Unnamed: 1_level_0,average,stdev,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [83]:
functions=["count","mean","max"]
result=grouped[["tip_pct","total_bill"]].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [84]:
result["tip_pct"]

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


In [85]:
ftuples=[("Average","mean"),("Variance",np.var)]
grouped[["tip_pct","total_bill"]].agg(ftuples)

  grouped[["tip_pct","total_bill"]].agg(ftuples)


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Average,Variance,Average,Variance
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [86]:
grouped.agg({"tip":max,"size":sum})

  grouped.agg({"tip":max,"size":sum})
  grouped.agg({"tip":max,"size":sum})


Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [87]:
grouped.agg({"tip_pct":["min","max","mean","std"],
             "size":"sum"})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


### Returning Aggregated Data Without Row Indexes

In [88]:
grouped = tips.groupby(["day", "smoker"], as_index=False)

In [89]:
grouped.mean(numeric_only=True)

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


## 10.3: Apply: General split-apply-combine