# Data Aggregation and Group Operations

In [83]:
import numpy as np
from numpy.random import randn

import pandas as pd
from pandas import Series, DataFrame

## GroupBy Mechanics

In [84]:
# Small demo frame: two label columns (key1, key2) and two integer data columns.
df = DataFrame({
    'key1': list('aabba'),
    'key2': list('ABABA'),
    'data1': np.arange(5),
    'data2': np.arange(0, 10, 2),
})
df

Unnamed: 0,data1,data2,key1,key2
0,0,0,a,A
1,1,2,a,B
2,2,4,b,A
3,3,6,b,B
4,4,8,a,A


In [85]:
# Split the data1 values into groups labelled by key1; the result is a
# GroupBy object, no aggregate has been computed yet.
grouped = df['data1'].groupby(by=df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x10f464b90>

In [86]:
# Average of data1 within each key1 group.
grouped.mean()

key1
a    1.666667
b    2.500000
Name: data1, dtype: float64

In [87]:
# Grouping by two key Series yields a result indexed by (key1, key2).
means = (
    df['data1']
    .groupby(by=[df['key1'], df['key2']])
    .mean()
)
means

key1  key2
a     A       2
      B       1
b     A       2
      B       3
Name: data1, dtype: int64

In [88]:
# Pivot the inner index level (key2) into columns, leaving key1 as the row index.
means.unstack()

key2,A,B
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2,1
b,2,3


In [89]:
# Average every numeric column per key1 group.
# numeric_only=True is required because key2 holds strings, which cannot be
# averaged; without it current pandas raises instead of dropping the column.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.666667,3.333333
b,2.5,5.0


In [90]:
# Column names can be passed as keys; both remaining columns are numeric.
df.groupby(by=['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,A,2,4
a,B,1,2
b,A,2,4
b,B,3,6


### Iterating Over Groups

In [91]:
# Iterating a GroupBy yields (group key, sub-frame) pairs.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
for name, group in df.groupby('key1'):
    print(name)
    print(group)

a
   data1  data2 key1 key2
0      0      0    a    A
1      1      2    a    B
4      4      8    a    A
b
   data1  data2 key1 key2
2      2      4    b    A
3      3      6    b    B


In [92]:
# Group the columns (axis=1) by their dtype; each iteration yields the dtype
# and the sub-frame holding the columns of that dtype.
# The loop variable is named `dtype` so the builtin `type` is not shadowed for
# the rest of the notebook, and print(...) is used so the cell also runs on
# Python 3.
# NOTE(review): grouping with axis=1 is reported as deprecated in recent pandas
# releases — confirm against the pandas version this notebook targets.
for dtype, group in df.groupby(df.dtypes, axis=1):
    print(dtype)
    print(group)

int64
   data1  data2
0      0      0
1      1      2
2      2      4
3      3      6
4      4      8
object
  key1 key2
0    a    A
1    a    B
2    b    A
3    b    B
4    a    A


### Selecting a Column or Subset of Columns

In [93]:
# Count of data1 entries per key1 group, selecting the column from the GroupBy.
df.groupby(by='key1')['data1'].count()

key1
a    3
b    2
Name: data1, dtype: int64

In [94]:
# Count of data1 entries per key1 group, grouping the Series directly.
df['data1'].groupby(by=df['key1']).count()

key1
a    3
b    2
Name: data1, dtype: int64

### Grouping with Dicts and Series

In [95]:
# 5x5 frame of the integers 0..24 with labelled rows and columns.
people = DataFrame(
    np.arange(25).reshape(5, 5),
    index=['one', 'two', 'three', 'four', 'five'],
    columns=list('abcde'),
)
people

Unnamed: 0,a,b,c,d,e
one,0,1,2,3,4
two,5,6,7,8,9
three,10,11,12,13,14
four,15,16,17,18,19
five,20,21,22,23,24


In [96]:
# Column label -> colour group. The 'f' entry matches no column of `people`.
mapping = dict(a='red', b='red', c='blue', d='blue', e='red', f='orange')

# Sum the columns that share a colour group.
people.groupby(by=mapping, axis=1).sum()

Unnamed: 0,blue,red
one,5,5
two,15,20
three,25,35
four,35,50
five,45,65


In [97]:
# The same mapping as a Series: its index holds the column labels and its
# values the group each label belongs to.
map_series = Series(mapping)
people.groupby(by=map_series, axis=1).sum()

Unnamed: 0,blue,red
one,5,5
two,15,20
three,25,35
four,35,50
five,45,65


### Grouping with Functions

In [98]:
# A function used as the key is applied to each row label; here rows are
# grouped by the length of their label.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,5,7,9,11,13
4,35,37,39,41,43
5,10,11,12,13,14


In [99]:
# Group rows by the first character of their label.
people.groupby(lambda label: label[0]).sum()

Unnamed: 0,a,b,c,d,e
f,35,37,39,41,43
o,0,1,2,3,4
t,15,17,19,21,23


### Grouping by Index Levels

In [100]:
# Two-level column index: outer level named 'city', inner level named 'tenor'.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['city', 'tenor'],
)

# 5x5 frame of the integers 0..24 under the hierarchical columns.
hier_df = DataFrame(np.arange(25).reshape(5, 5), columns=columns)
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [101]:
# Group the columns by the 'city' level of the column index and count the
# entries in each group, row by row.
hier_df.groupby(level='city', axis=1).count()

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
4,2,3


## Data Aggregation

In [102]:
# Rebind `grouped` to the whole frame grouped by key1 and show each group.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
grouped = df.groupby('key1')
for key, group in grouped:
    print(key)
    print(group)

a
   data1  data2 key1 key2
0      0      0    a    A
1      1      2    a    B
4      4      8    a    A
b
   data1  data2 key1 key2
2      2      4    b    A
3      3      6    b    B


In [103]:
# 0.5 quantile of data1 within each group.
grouped['data1'].quantile(0.5)

key1
a    1.0
b    2.5
Name: data1, dtype: float64

In [104]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum."""
    lowest, highest = arr.min(), arr.max()
    return highest - lowest

# Apply the custom aggregation to the numeric columns only: peak_to_peak
# subtracts values, which is undefined for the string column key2.
grouped[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,4,8
b,1,2


In [105]:
# Mean of data1 within each group.
grouped['data1'].mean()

key1
a    1.666667
b    2.500000
Name: data1, dtype: float64

In [106]:
# Median of data1 within each group.
grouped['data1'].median()

key1
a    1.0
b    2.5
Name: data1, dtype: float64

### Column-wise and Multiple Function Application

In [107]:
# Load the tips CSV and preview its first three rows.
tips = pd.read_csv('ch08/tips.csv')
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


In [108]:
# Store the tip as a fraction of the total bill in a new tip_pct column,
# then preview the first three rows.
tips['tip_pct'] = tips['tip'].div(tips['total_bill'])
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587


In [109]:
# Rebind `grouped` to the tips data, keyed on the (sex, smoker) pair.
grouped = tips.groupby(by=['sex', 'smoker'])

In [110]:
# Mean tip_pct per (sex, smoker) group, with smoker pivoted into columns.
(
    grouped['tip_pct']
    .agg('mean')
    .unstack()
)

smoker,No,Yes
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.156921,0.18215
Male,0.160669,0.152771


In [111]:
# A list of aggregations yields one result column per entry; built-in
# aggregation names and a user-defined function can be mixed.
grouped['tip_pct'].agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,0.156921,0.036421,0.195876
Female,Yes,0.18215,0.071595,0.360233
Male,No,0.160669,0.041849,0.220186
Male,Yes,0.152771,0.090588,0.674707


In [113]:
# Each (label, aggregation) pair sets the name of the resulting column.
# NOTE(review): 'avarage' looks like a misspelling of 'average'; it is kept
# unchanged here because it is the column label in the recorded output.
grouped['tip_pct'].agg(
    [('avarage', 'mean'), ('standard deviation', 'std')]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,avarage,standard deviation
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,No,0.156921,0.036421
Female,Yes,0.18215,0.071595
Male,No,0.160669,0.041849
Male,Yes,0.152771,0.090588


In [116]:
# Select several columns with a list: tuple-style selection
# (grouped['a', 'b']) is no longer accepted by current pandas, while the list
# form works on old and new versions alike.
# Each selected column gets a mean and a std, giving two-level result columns.
grouped[['tip_pct', 'total_bill']].agg(['mean', 'std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Female,No,0.156921,0.036421,18.105185,7.286455
Female,Yes,0.18215,0.071595,17.977879,9.189751
Male,No,0.160669,0.041849,19.791237,8.726566
Male,Yes,0.152771,0.090588,22.2845,9.911845


In [118]:
# A dict maps each column to the aggregation applied to it.
grouped.agg({
    'tip': 'max',
    'size': 'sum',
})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,No,5.2,140
Female,Yes,6.5,74
Male,No,9.0,263
Male,Yes,10.0,150


### Returning Aggregated Data in “unindexed” Form

In [121]:
# as_index=False returns the group keys as ordinary columns instead of an index.
# numeric_only=True skips the remaining text columns (day, time), which have
# no mean; without it current pandas raises instead of dropping them.
tips.groupby(['sex', 'smoker'], as_index=False).mean(numeric_only=True)

Unnamed: 0,sex,smoker,total_bill,tip,size,tip_pct
0,Female,No,18.105185,2.773519,2.592593,0.156921
1,Female,Yes,17.977879,2.931515,2.242424,0.18215
2,Male,No,19.791237,3.113402,2.71134,0.160669
3,Male,Yes,22.2845,3.051167,2.5,0.152771


## Group-wise Operations and Transformations