# Chapter 10: Data Aggregation and Group Operations

In [1]:
import numpy as np
import pandas as pd

## 10.1: How to Think About Group Operations

In [2]:
df=pd.DataFrame({"key1":["a","a",None,"b","b","a",None],
                 "key2":pd.Series([1,2,1,2,1,None,1],dtype="Int64"),
                 "data1":np.random.standard_normal(7),
                 "data2":np.random.standard_normal(7)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.922777,-1.516412
1,a,2.0,-0.838157,1.224253
2,,1.0,1.534372,-0.553134
3,b,2.0,0.985158,-0.347303
4,b,1.0,-0.5918,0.45164
5,a,,-0.124712,-0.3262
6,,1.0,0.741002,1.846095


In [3]:
grouped=df["data1"].groupby(df["key1"])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002C072599E10>

In [4]:
grouped.mean()

key1
a   -0.628548
b    0.196679
Name: data1, dtype: float64

In [5]:
grouped.value_counts()

key1  data1    
a     -0.922777    1
      -0.838157    1
      -0.124712    1
b     -0.591800    1
       0.985158    1
Name: count, dtype: int64

In [6]:
means=df["data1"].groupby([df["key1"],df["key2"]]).mean()
means

key1  key2
a     1      -0.922777
      2      -0.838157
b     1      -0.591800
      2       0.985158
Name: data1, dtype: float64

In [7]:
means.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.922777,-0.838157
b,-0.5918,0.985158


In [8]:
states=np.array(["OH","CA","CA","OH","OH","CA","OH"])
years=[2005,2005,2006,2005,2006,2005,2006]
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.922777,-1.516412
1,a,2.0,-0.838157,1.224253
2,,1.0,1.534372,-0.553134
3,b,2.0,0.985158,-0.347303
4,b,1.0,-0.5918,0.45164
5,a,,-0.124712,-0.3262
6,,1.0,0.741002,1.846095


In [9]:
df["states"]=states
df["years"]=years
df

Unnamed: 0,key1,key2,data1,data2,states,years
0,a,1.0,-0.922777,-1.516412,OH,2005
1,a,2.0,-0.838157,1.224253,CA,2005
2,,1.0,1.534372,-0.553134,CA,2006
3,b,2.0,0.985158,-0.347303,OH,2005
4,b,1.0,-0.5918,0.45164,OH,2006
5,a,,-0.124712,-0.3262,CA,2005
6,,1.0,0.741002,1.846095,OH,2006


In [10]:
df1=df["data1"]
df1

0   -0.922777
1   -0.838157
2    1.534372
3    0.985158
4   -0.591800
5   -0.124712
6    0.741002
Name: data1, dtype: float64

In [11]:
df1.groupby([states,years]).mean()

CA  2005   -0.481434
    2006    1.534372
OH  2005    0.031191
    2006    0.074601
Name: data1, dtype: float64

In [12]:
df=df.drop(columns=["states","years"])

In [13]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.922777,-1.516412
1,a,2.0,-0.838157,1.224253
2,,1.0,1.534372,-0.553134
3,b,2.0,0.985158,-0.347303
4,b,1.0,-0.5918,0.45164
5,a,,-0.124712,-0.3262
6,,1.0,0.741002,1.846095


In [14]:
df.groupby("key1").mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,-0.628548,-0.20612
b,1.5,0.196679,0.052169


In [15]:
df.groupby("key2").mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.1902,0.057047
2,0.073501,0.438475


In [16]:
df.groupby(["key1","key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,-0.922777,-1.516412
a,2,-0.838157,1.224253
b,1,-0.5918,0.45164
b,2,0.985158,-0.347303


In [17]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.922777,-1.516412
1,a,2.0,-0.838157,1.224253
2,,1.0,1.534372,-0.553134
3,b,2.0,0.985158,-0.347303
4,b,1.0,-0.5918,0.45164
5,a,,-0.124712,-0.3262
6,,1.0,0.741002,1.846095


In [18]:
df.groupby(["key1","key2"]).size()

key1  key2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

In [19]:
df.groupby(["key1","key2"],dropna=False).size()

key1  key2
a     1       1
      2       1
      <NA>    1
b     1       1
      2       1
NaN   1       2
dtype: int64

In [20]:
df.groupby("key1").count()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,3,3
b,2,2,2


In [21]:
df.groupby("key1").size()

key1
a    3
b    2
dtype: int64

### Iterating over Groups

In [22]:
for name,group in df.groupby("key1"):
    print(name)
    print(group)

a
  key1  key2     data1     data2
0    a     1 -0.922777 -1.516412
1    a     2 -0.838157  1.224253
5    a  <NA> -0.124712 -0.326200
b
  key1  key2     data1     data2
3    b     2  0.985158 -0.347303
4    b     1 -0.591800  0.451640


In [23]:
for (k1,k2),group in df.groupby(["key1","key2"]):
    print(k1)
    print(k2)
    print(group)

a
1
  key1  key2     data1     data2
0    a     1 -0.922777 -1.516412
a
2
  key1  key2     data1     data2
1    a     2 -0.838157  1.224253
b
1
  key1  key2   data1    data2
4    b     1 -0.5918  0.45164
b
2
  key1  key2     data1     data2
3    b     2  0.985158 -0.347303


In [24]:
pieces={name:group for name,group in df.groupby("key1")}
pieces

{'a':   key1  key2     data1     data2
 0    a     1 -0.922777 -1.516412
 1    a     2 -0.838157  1.224253
 5    a  <NA> -0.124712 -0.326200,
 'b':   key1  key2     data1     data2
 3    b     2  0.985158 -0.347303
 4    b     1 -0.591800  0.451640}

In [25]:
pieces["b"]

Unnamed: 0,key1,key2,data1,data2
3,b,2,0.985158,-0.347303
4,b,1,-0.5918,0.45164


In [26]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.922777,-1.516412
1,a,2.0,-0.838157,1.224253
2,,1.0,1.534372,-0.553134
3,b,2.0,0.985158,-0.347303
4,b,1.0,-0.5918,0.45164
5,a,,-0.124712,-0.3262
6,,1.0,0.741002,1.846095


In [27]:
grouped=df.groupby({"key1":"key","key2":"key","data1":"data","data2":"data"},axis="columns")
grouped

  grouped=df.groupby({"key1":"key","key2":"key","data1":"data","data2":"data"},axis="columns")


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002C07135FC10>

In [28]:
df.T

Unnamed: 0,0,1,2,3,4,5,6
key1,a,a,,b,b,a,
key2,1,2,1.0,2,1,,1.0
data1,-0.922777,-0.838157,1.534372,0.985158,-0.5918,-0.124712,0.741002
data2,-1.516412,1.224253,-0.553134,-0.347303,0.45164,-0.3262,1.846095


In [29]:
list(grouped)

[('data',
        data1     data2
  0 -0.922777 -1.516412
  1 -0.838157  1.224253
  2  1.534372 -0.553134
  3  0.985158 -0.347303
  4 -0.591800  0.451640
  5 -0.124712 -0.326200
  6  0.741002  1.846095),
 ('key',
     key1  key2
  0     a     1
  1     a     2
  2  None     1
  3     b     2
  4     b     1
  5     a  <NA>
  6  None     1)]

In [30]:
for group_key,group_values in grouped:
    print(group_key)
    print(group_values)

data
      data1     data2
0 -0.922777 -1.516412
1 -0.838157  1.224253
2  1.534372 -0.553134
3  0.985158 -0.347303
4 -0.591800  0.451640
5 -0.124712 -0.326200
6  0.741002  1.846095
key
   key1  key2
0     a     1
1     a     2
2  None     1
3     b     2
4     b     1
5     a  <NA>
6  None     1


### Selecting a Column or Subset of Columns

In [31]:
list(df.groupby("key1")["data1"])

[('a',
  0   -0.922777
  1   -0.838157
  5   -0.124712
  Name: data1, dtype: float64),
 ('b',
  3    0.985158
  4   -0.591800
  Name: data1, dtype: float64)]

In [32]:
list(df["data1"].groupby(df["key1"]))

[('a',
  0   -0.922777
  1   -0.838157
  5   -0.124712
  Name: data1, dtype: float64),
 ('b',
  3    0.985158
  4   -0.591800
  Name: data1, dtype: float64)]

In [33]:
df.groupby(["key1","key2"])[["data2"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,-1.516412
a,2,1.224253
b,1,0.45164
b,2,-0.347303


In [34]:
df.groupby(["key1", "key2"])["data2"].mean()

key1  key2
a     1      -1.516412
      2       1.224253
b     1       0.451640
      2      -0.347303
Name: data2, dtype: float64

### Grouping with Dictionaries and Series

In [35]:
people=pd.DataFrame(np.random.standard_normal((5,5)),
                    columns=["a","b","c","d","e"],
                    index=["Joe","Steve","Wanda","Jill","Trey"])
people

Unnamed: 0,a,b,c,d,e
Joe,0.414549,-1.475493,0.821092,0.729569,0.264797
Steve,1.177361,-0.325011,0.060358,-1.504812,-0.164848
Wanda,-0.953338,0.311632,-1.636776,0.123149,1.111949
Jill,-1.035872,-0.420272,1.113017,-0.321133,1.236514
Trey,-0.996007,0.38104,-1.130701,1.195607,-0.786891


In [36]:
people.iloc[2,[1,2]]=np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.414549,-1.475493,0.821092,0.729569,0.264797
Steve,1.177361,-0.325011,0.060358,-1.504812,-0.164848
Wanda,-0.953338,,,0.123149,1.111949
Jill,-1.035872,-0.420272,1.113017,-0.321133,1.236514
Trey,-0.996007,0.38104,-1.130701,1.195607,-0.786891


In [37]:
mapping={"a":"red","b":"red","c":"blue","d":"blue","e":"red","f":"orange"}

In [38]:
by_column=people.groupby(mapping,axis="columns")
by_column.sum()

  by_column=people.groupby(mapping,axis="columns")


Unnamed: 0,blue,red
Joe,1.550661,-0.796147
Steve,-1.444455,0.687502
Wanda,0.123149,0.158611
Jill,0.791884,-0.219629
Trey,0.064906,-1.401857


In [39]:
map_series=pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [40]:
people.groupby(map_series,axis="columns").sum()

  people.groupby(map_series,axis="columns").sum()


Unnamed: 0,blue,red
Joe,1.550661,-0.796147
Steve,-1.444455,0.687502
Wanda,0.123149,0.158611
Jill,0.791884,-0.219629
Trey,0.064906,-1.401857


### Grouping with Functions

In [41]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.414549,-1.475493,0.821092,0.729569,0.264797
4,-2.031879,-0.039231,-0.017684,0.874474,0.449623
5,0.224023,-0.325011,0.060358,-1.381664,0.947101


In [42]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.414549,-1.475493,0.821092,0.729569,0.264797
Steve,1.177361,-0.325011,0.060358,-1.504812,-0.164848
Wanda,-0.953338,,,0.123149,1.111949
Jill,-1.035872,-0.420272,1.113017,-0.321133,1.236514
Trey,-0.996007,0.38104,-1.130701,1.195607,-0.786891


In [43]:
key_list=["one","one","one","two","two"]
people.groupby([len,key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.414549,-1.475493,0.821092,0.729569,0.264797
4,two,-1.035872,-0.420272,-1.130701,-0.321133,-0.786891
5,one,-0.953338,-0.325011,0.060358,-1.504812,-0.164848


In [44]:
list(people.groupby([len,key_list]))

[((3, 'one'),
              a         b         c         d         e
  Joe  0.414549 -1.475493  0.821092  0.729569  0.264797),
 ((4, 'two'),
               a         b         c         d         e
  Jill -1.035872 -0.420272  1.113017 -0.321133  1.236514
  Trey -0.996007  0.381040 -1.130701  1.195607 -0.786891),
 ((5, 'one'),
                a         b         c         d         e
  Steve  1.177361 -0.325011  0.060358 -1.504812 -0.164848
  Wanda -0.953338       NaN       NaN  0.123149  1.111949)]

### Grouping by Index Levels

In [45]:
columns=pd.MultiIndex.from_arrays([["US","US","US","JP","JP"],
                                   [1,3,5,1,3]],
                                   names=["cty","tenor"])
columns

MultiIndex([('US', 1),
            ('US', 3),
            ('US', 5),
            ('JP', 1),
            ('JP', 3)],
           names=['cty', 'tenor'])

In [46]:
hier_df=pd.DataFrame(np.random.standard_normal((4,5)),columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,1.457694,-0.439418,0.464202,1.287013,0.913776
1,0.573183,-1.348563,0.759582,0.325169,1.328441
2,1.769417,-0.656663,-0.270215,-0.108435,0.642556
3,1.460748,-0.94017,-1.58031,1.103507,-0.544047


In [47]:
hier_df.groupby(level="cty",axis="columns").count()

  hier_df.groupby(level="cty",axis="columns").count()


cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


## 10.2: Data Aggregation

In [48]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.922777,-1.516412
1,a,2.0,-0.838157,1.224253
2,,1.0,1.534372,-0.553134
3,b,2.0,0.985158,-0.347303
4,b,1.0,-0.5918,0.45164
5,a,,-0.124712,-0.3262
6,,1.0,0.741002,1.846095


In [49]:
grouped=df.groupby("key1")
grouped["data1"].nsmallest(2)

key1   
a     0   -0.922777
      1   -0.838157
b     4   -0.591800
      3    0.985158
Name: data1, dtype: float64

In [50]:
def peak_to_peak(arr):
    return arr.max()-arr.min()

In [51]:
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0.798065,2.740665
b,1,1.576958,0.798943


In [52]:
list(grouped)

[('a',
    key1  key2     data1     data2
  0    a     1 -0.922777 -1.516412
  1    a     2 -0.838157  1.224253
  5    a  <NA> -0.124712 -0.326200),
 ('b',
    key1  key2     data1     data2
  3    b     2  0.985158 -0.347303
  4    b     1 -0.591800  0.451640)]

In [53]:
grouped.describe()

Unnamed: 0_level_0,key2,key2,key2,key2,key2,key2,key2,key2,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
a,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,3.0,-0.628548,...,-0.481434,-0.124712,3.0,-0.20612,1.374273,-1.516412,-0.921306,-0.3262,0.449027,1.224253
b,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,2.0,0.196679,...,0.590919,0.985158,2.0,0.052169,0.564938,-0.347303,-0.147567,0.052169,0.251904,0.45164


### Column-Wise and Multiple Function Application

In [54]:
tips=pd.read_csv("../../examples/tips.csv")
tips["tip_pct"]=tips["tip"]/tips["total_bill"]
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.50,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.139780
4,24.59,3.61,No,Sun,Dinner,4,0.146808
...,...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3,0.203927
240,27.18,2.00,Yes,Sat,Dinner,2,0.073584
241,22.67,2.00,Yes,Sat,Dinner,2,0.088222
242,17.82,1.75,No,Sat,Dinner,2,0.098204


In [55]:
grouped=tips.groupby(["day","smoker"])
list(grouped)

[(('Fri', 'No'),
       total_bill   tip smoker  day    time  size   tip_pct
  91        22.49  3.50     No  Fri  Dinner     2  0.155625
  94        22.75  3.25     No  Fri  Dinner     2  0.142857
  99        12.46  1.50     No  Fri  Dinner     2  0.120385
  223       15.98  3.00     No  Fri   Lunch     3  0.187735),
 (('Fri', 'Yes'),
       total_bill   tip smoker  day    time  size   tip_pct
  90        28.97  3.00    Yes  Fri  Dinner     2  0.103555
  92         5.75  1.00    Yes  Fri  Dinner     2  0.173913
  93        16.32  4.30    Yes  Fri  Dinner     2  0.263480
  95        40.17  4.73    Yes  Fri  Dinner     4  0.117750
  96        27.28  4.00    Yes  Fri  Dinner     2  0.146628
  97        12.03  1.50    Yes  Fri  Dinner     2  0.124688
  98        21.01  3.00    Yes  Fri  Dinner     2  0.142789
  100       11.35  2.50    Yes  Fri  Dinner     2  0.220264
  101       15.38  3.00    Yes  Fri  Dinner     2  0.195059
  220       12.16  2.20    Yes  Fri   Lunch     2  0.180921
  2

In [56]:
grouped_pct=grouped["tip_pct"]
list(grouped_pct)

[(('Fri', 'No'),
  91     0.155625
  94     0.142857
  99     0.120385
  223    0.187735
  Name: tip_pct, dtype: float64),
 (('Fri', 'Yes'),
  90     0.103555
  92     0.173913
  93     0.263480
  95     0.117750
  96     0.146628
  97     0.124688
  98     0.142789
  100    0.220264
  101    0.195059
  220    0.180921
  221    0.259314
  222    0.223776
  224    0.117735
  225    0.153657
  226    0.198216
  Name: tip_pct, dtype: float64),
 (('Sat', 'No'),
  19     0.162228
  20     0.227679
  21     0.135535
  22     0.141408
  23     0.192288
  24     0.160444
  25     0.131387
  26     0.149589
  27     0.157604
  28     0.198157
  29     0.152672
  30     0.151832
  31     0.136240
  32     0.199203
  33     0.118415
  34     0.183915
  35     0.149626
  36     0.122624
  37     0.181335
  38     0.123596
  39     0.159898
  40     0.139651
  57     0.056797
  59     0.139424
  64     0.150085
  65     0.156873
  66     0.150152
  68     0.099357
  70     0.163894
  71     0.17574

In [57]:
grouped_pct.agg("mean")

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [58]:
grouped_pct.mean()

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [59]:
grouped_pct.agg(["mean","std",peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [60]:
grouped_pct.agg([("average","mean"),("stdev","std"),("peak_to_peak",peak_to_peak)])

Unnamed: 0_level_0,Unnamed: 1_level_0,average,stdev,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [61]:
functions=["count","mean","max"]
result=grouped[["tip_pct","total_bill"]].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [62]:
result["tip_pct"]

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


In [63]:
ftuples=[("Average","mean"),("Variance",np.var)]
grouped[["tip_pct","total_bill"]].agg(ftuples)

  grouped[["tip_pct","total_bill"]].agg(ftuples)


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Average,Variance,Average,Variance
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [64]:
grouped.agg({"tip":max,"size":sum})

  grouped.agg({"tip":max,"size":sum})
  grouped.agg({"tip":max,"size":sum})


Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [65]:
grouped.agg({"tip_pct":["min","max","mean","std"],
             "size":"sum"})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


### Returning Aggregated Data Without Row Indexes

In [66]:
grouped = tips.groupby(["day", "smoker"], as_index=False)

In [67]:
grouped.mean(numeric_only=True)

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


## 10.3: Apply: General split-apply-combine

In [68]:
def top(df,n=5,column="tip_pct"):
    return df.sort_values(column,ascending=False)[:n]

In [69]:
top(tips,n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
232,11.61,3.39,No,Sat,Dinner,2,0.29199
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [70]:
tips.groupby("smoker").apply(top)

  tips.groupby("smoker").apply(top)


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [71]:
tips.groupby(["smoker","day"]).apply(top,n=1,column="total_bill")

  tips.groupby(["smoker","day"]).apply(top,n=1,column="total_bill")


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


In [72]:
result=tips.groupby("smoker")["tip_pct"].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [73]:
result.unstack("smoker")

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

### Suppressing the Group Keys

In [74]:
tips.groupby("smoker",group_keys=False).apply(top)

  tips.groupby("smoker",group_keys=False).apply(top)


Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
232,11.61,3.39,No,Sat,Dinner,2,0.29199
149,7.51,2.0,No,Thur,Lunch,2,0.266312
51,10.29,2.6,No,Sun,Dinner,2,0.252672
185,20.69,5.0,No,Sun,Dinner,5,0.241663
88,24.71,5.85,No,Thur,Lunch,2,0.236746
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


### Quantile and Bucket Analysis

In [75]:
frame=pd.DataFrame({"data1":np.random.standard_normal(1000),
                    "data2":np.random.standard_normal(1000)})
frame

Unnamed: 0,data1,data2
0,0.254421,1.356260
1,-0.911180,-0.029634
2,-0.691502,1.838154
3,0.762900,-0.557152
4,-1.569130,0.015494
...,...,...
995,-0.939175,1.265224
996,-1.291625,0.768810
997,1.420135,1.339135
998,0.663489,0.539755


In [76]:
quartiles=pd.cut(frame["data1"],4)
quartiles

0      (-0.106, 1.738]
1      (-1.95, -0.106]
2      (-1.95, -0.106]
3      (-0.106, 1.738]
4      (-1.95, -0.106]
            ...       
995    (-1.95, -0.106]
996    (-1.95, -0.106]
997    (-0.106, 1.738]
998    (-0.106, 1.738]
999    (-0.106, 1.738]
Name: data1, Length: 1000, dtype: category
Categories (4, interval[float64, right]): [(-3.801, -1.95] < (-1.95, -0.106] < (-0.106, 1.738] < (1.738, 3.581]]

In [79]:
grouped=frame.groupby(quartiles)
list(grouped)

  grouped=frame.groupby(quartiles)


[(Interval(-3.801, -1.95, closed='right'),
          data1     data2
  32  -2.151295  0.861908
  57  -2.067252  1.345355
  105 -2.337913  0.224396
  117 -2.125475 -0.339253
  191 -1.989087  0.612054
  219 -3.793970 -0.976552
  381 -2.016345 -0.821415
  403 -2.068639  0.475534
  468 -2.349988 -1.006029
  484 -1.977181  0.862709
  498 -2.287910  0.090032
  507 -2.537852 -1.119129
  526 -2.726256  1.357933
  535 -2.760784  0.133286
  587 -2.836531 -1.073713
  670 -2.039576  1.412466
  714 -2.398108  0.001677
  814 -2.483403 -1.271678),
 (Interval(-1.95, -0.106, closed='right'),
          data1     data2
  1   -0.911180 -0.029634
  2   -0.691502  1.838154
  4   -1.569130  0.015494
  6   -0.229663 -0.544760
  7   -0.257072  0.893398
  ..        ...       ...
  990 -0.534512 -0.720854
  991 -1.245445  1.681848
  992 -0.181796  0.909988
  995 -0.939175  1.265224
  996 -1.291625  0.768810
  
  [447 rows x 2 columns]),
 (Interval(-0.106, 1.738, closed='right'),
          data1     data2
  0    

In [80]:
def get_stats(group):
    return pd.DataFrame(
        {"min":group.min(),
         "max":group.max(),
         "count":group.count(),
         "mean":group.mean()}
    )

In [82]:
grouped.apply(get_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-3.801, -1.95]",data1,-3.79397,-1.977181,18,-2.385976
"(-3.801, -1.95]",data2,-1.271678,1.412466,18,0.042754
"(-1.95, -0.106]",data1,-1.943904,-0.107341,447,-0.771723
"(-1.95, -0.106]",data2,-2.828521,3.380168,447,0.021177
"(-0.106, 1.738]",data1,-0.104821,1.725515,495,0.643481
"(-0.106, 1.738]",data2,-3.523409,2.766562,495,-0.067226
"(1.738, 3.581]",data1,1.754842,3.581328,40,2.235151
"(1.738, 3.581]",data2,-1.889568,1.954218,40,-0.04042


In [83]:
grouped.agg(["min","max","count","mean"])

Unnamed: 0_level_0,data1,data1,data1,data1,data2,data2,data2,data2
Unnamed: 0_level_1,min,max,count,mean,min,max,count,mean
data1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
"(-3.801, -1.95]",-3.79397,-1.977181,18,-2.385976,-1.271678,1.412466,18,0.042754
"(-1.95, -0.106]",-1.943904,-0.107341,447,-0.771723,-2.828521,3.380168,447,0.021177
"(-0.106, 1.738]",-0.104821,1.725515,495,0.643481,-3.523409,2.766562,495,-0.067226
"(1.738, 3.581]",1.754842,3.581328,40,2.235151,-1.889568,1.954218,40,-0.04042


In [86]:
quartiles_samp=pd.qcut(frame["data1"],4,labels=False)
quartiles_samp

0      2
1      0
2      0
3      3
4      0
      ..
995    0
996    0
997    3
998    2
999    3
Name: data1, Length: 1000, dtype: int64

In [87]:
grouped=frame.groupby(quartiles_samp)
list(grouped)

[(0,
          data1     data2
  1   -0.911180 -0.029634
  2   -0.691502  1.838154
  4   -1.569130  0.015494
  8   -1.527563 -0.491903
  16  -1.606006  2.171310
  ..        ...       ...
  988 -1.521148  0.185460
  989 -0.874726  0.539176
  991 -1.245445  1.681848
  995 -0.939175  1.265224
  996 -1.291625  0.768810
  
  [250 rows x 2 columns]),
 (1,
          data1     data2
  6   -0.229663 -0.544760
  7   -0.257072  0.893398
  9   -0.152082 -0.352122
  13  -0.381930 -1.267709
  14  -0.149354  1.229564
  ..        ...       ...
  979 -0.314882 -0.151033
  984 -0.344270  0.118844
  985 -0.172584 -0.844218
  990 -0.534512 -0.720854
  992 -0.181796  0.909988
  
  [250 rows x 2 columns]),
 (2,
          data1     data2
  0    0.254421  1.356260
  10   0.037509 -1.175823
  12   0.615727 -0.630565
  18   0.210930 -1.471205
  24   0.690763 -0.422210
  ..        ...       ...
  972  0.583677  0.273813
  976  0.669307  0.115351
  986  0.085733  0.200805
  994  0.629118 -0.673609
  998  0.663489

In [88]:
grouped.apply(get_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,data1,-3.79397,-0.626807,250,-1.240403
0,data2,-2.480133,2.761004,250,0.001985
1,data1,-0.625399,0.009985,250,-0.31857
1,data2,-2.828521,3.380168,250,0.012243
2,data1,0.010625,0.69423,250,0.341334
2,data2,-3.523409,2.250345,250,-0.059259
3,data1,0.697223,3.581328,250,1.297725
3,data2,-2.073436,2.766562,250,-0.053599


### Example: Filling Missing Values with Group-Specific Values