In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Demo frame: two string key columns (key1, key2) and two integer data columns.
df = DataFrame(dict(
    key1=['a', 'a', 'b', 'b', 'a'],
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=[2, 4, 6, 3, 5],
    data2=[8, 5, 9, 1, 4],
))

df

Unnamed: 0,data1,data2,key1,key2
0,2,8,a,one
1,4,5,a,two
2,6,9,b,one
3,3,1,b,two
4,5,4,a,one


In [3]:
grouped=df['data1'].groupby(df['key1']) 

# Splits the 'data1' column by the values of 'key1' (groups 'a' and 'b').
# `grouped` is a GroupBy object; the following cells call mean() and max() on it.

In [4]:
grouped.mean()

key1
a    3.666667
b    4.500000
Name: data1, dtype: float64

In [31]:
grouped.max()

key1
a    5
b    6
Name: data1, dtype: int64

In [32]:
# Grouping and descriptive statistics in one step.
# numeric_only=True keeps the string column 'key2' out of the mean; pandas >= 2.0
# no longer drops non-numeric columns silently and raises a TypeError instead.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3.666667,5.666667
b,4.5,5.0


In [33]:
df

Unnamed: 0,data1,data2,key1,key2
0,2,8,a,one
1,4,5,a,two
2,6,9,b,one
3,3,1,b,two
4,5,4,a,one


In [34]:
df.groupby(['key1', 'key2']).mean() # hierarchical grouping

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,3.5,6.0
a,two,4.0,5.0
b,one,6.0,9.0
b,two,3.0,1.0


In [6]:
# for large data sets, it is desirable to aggregate only a few columns:

df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,6
a,two,5
b,one,9
b,two,1


In [7]:
# Larger demo frame with a third key ('c') and missing values (NaN) in both
# data columns, used to show how group aggregations treat NaN.
df1 = DataFrame(dict(
    key1=['a', 'a', 'b', 'b', 'a', 'c', 'c', 'a', 'b'],
    key2=['x', 'y', 'x', 'y', 'x', 'y', 'x', 'x', 'x'],
    data1=[2, 4, 6, 3, 5, np.nan, 8, 8, 10],
    data2=[8, np.nan, np.nan, 5, 9, 1, 4, 1, 2],
))

df1

Unnamed: 0,data1,data2,key1,key2
0,2.0,8.0,a,x
1,4.0,,a,y
2,6.0,,b,x
3,3.0,5.0,b,y
4,5.0,9.0,a,x
5,,1.0,c,y
6,8.0,4.0,c,x
7,8.0,1.0,a,x
8,10.0,2.0,b,x


In [37]:
# numeric_only=True keeps the string column 'key2' out of the mean
# (pandas >= 2.0 raises a TypeError on it otherwise).
df1.groupby('key1').mean(numeric_only=True)

# NaN values are skipped column by column (the rest of the row is still used):
# group 'c' has data1 values NaN and 8, so its mean is 8, not 8/2=4

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,4.75,6.0
b,6.333333,3.5
c,8.0,2.5


In [38]:
df1.groupby(['key1', 'key2']).size() # returns the size of the groups

key1  key2
a     x       3
      y       1
b     x       2
      y       1
c     x       1
      y       1
dtype: int64

In [39]:
# Iterating Over Groups:

df

Unnamed: 0,data1,data2,key1,key2
0,2,8,a,one
1,4,5,a,two
2,6,9,b,one
3,3,1,b,two
4,5,4,a,one


In [40]:
# Iterating a GroupBy yields (group key, sub-frame) pairs.
# Group by the plain label 'key1' (not the one-element list ['key1']) so that
# `name` is the scalar key ('a', 'b') rather than a 1-tuple on current pandas.
for name, group in df.groupby('key1'):
    print(name)   # print() call: the Python 2 print statement is a SyntaxError on Python 3
    print(group)
    

a
   data1  data2 key1 key2
0      2      8    a  one
1      4      5    a  two
4      5      4    a  one
b
   data1  data2 key1 key2
2      6      9    b  one
3      3      1    b  two


In [41]:
# With two grouping keys each group name is a (key1, key2) tuple, unpacked here.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print(k1, k2)  # print() call: the Python 2 print statement is a SyntaxError on Python 3
    print(group)
    

a one
   data1  data2 key1 key2
0      2      8    a  one
4      5      4    a  one
a two
   data1  data2 key1 key2
1      4      5    a  two
b one
   data1  data2 key1 key2
2      6      9    b  one
b two
   data1  data2 key1 key2
3      3      1    b  two


In [42]:
df 

Unnamed: 0,data1,data2,key1,key2
0,2,8,a,one
1,4,5,a,two
2,6,9,b,one
3,3,1,b,two
4,5,4,a,one


In [43]:
# To reuse the grouped data elsewhere, store the groups in a dict keyed by
# group name; a single group can then be looked up directly, e.g. pieces['b'].

pieces = {group_key: sub_frame for group_key, sub_frame in df.groupby('key1')}
pieces

{'a':    data1  data2 key1 key2
 0      2      8    a  one
 1      4      5    a  two
 4      5      4    a  one, 'b':    data1  data2 key1 key2
 2      6      9    b  one
 3      3      1    b  two}

In [44]:
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,6,9,b,one
3,3,1,b,two


In [15]:
# MAPPING:

# Seed the generator so the random frame, and every output derived from it,
# is the same on each Restart & Run All.
np.random.seed(0)
ppl = DataFrame(np.random.randn(5, 5),
                columns=['a', 'b', 'c', 'd', 'e'],
                index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
ppl

Unnamed: 0,a,b,c,d,e
Joe,-0.054207,-0.493048,0.647527,-0.141348,0.546321
Steve,-0.384353,0.218688,-1.944327,-1.486251,1.475841
Wes,-2.092732,-0.958025,-0.997385,-1.157949,2.059358
Jim,-1.592933,-0.818601,0.422756,-0.220683,-1.314243
Travis,0.240147,-1.074445,-0.626517,-0.324649,0.0784


In [121]:
# The dict: column label -> colour group. 'f' is not among ppl's columns
# (a-e), so its group 'orange' has nothing to aggregate.
mapping = dict(a='red', b='red', c='blue',
               d='blue', e='red', f='orange')

In [122]:
# Group the columns of ppl (axis=1) by the colour each column label maps to.
# NOTE(review): recent pandas releases appear to deprecate axis=1 in groupby --
# confirm against the installed version before re-running.
by_col=ppl.groupby(mapping, axis=1)

In [123]:
by_col.sum() # orange is out

Unnamed: 0,blue,red
Joe,-2.047855,-1.111062
Steve,0.614477,1.035648
Wes,0.332473,-1.813702
Jim,-0.609661,1.318457
Travis,-1.063358,0.90401


In [125]:
by_col.max()

Unnamed: 0,blue,red
Joe,-0.304701,0.201638
Steve,0.434041,1.001468
Wes,0.894659,-0.072952
Jim,0.619933,0.810391
Travis,-0.109864,2.212077


In [49]:
# using def

df

Unnamed: 0,data1,data2,key1,key2
0,2,8,a,one
1,4,5,a,two
2,6,9,b,one
3,3,1,b,two
4,5,4,a,one


In [50]:
# Split the whole frame by the values of 'key1' (groups 'a' and 'b');
# the GroupBy object is passed to agg() with a custom function further down.
grouped=df.groupby(df['key1'])

In [51]:
def max_minus_min(arr):
    """Own aggregation function: spread of ``arr`` (largest minus smallest value).

    ``arr`` is any object exposing ``max()`` and ``min()`` whose results can be
    subtracted, e.g. a pandas Series or a NumPy array.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [52]:
# Aggregate with a user-defined function (def).
# Only the numeric columns are selected: max() - min() is undefined for the
# string column 'key2' and raises a TypeError on current pandas.
grouped[['data1', 'data2']].agg(max_minus_min)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3,4
b,3,8


In [45]:
# Load the tipping data set from the working directory.
tips = pd.read_csv('tips.csv')

# Add col 'tip_pct': the tip as a fraction of the total bill.
tips['tip_pct'] = tips['tip'].div(tips['total_bill'])

# Show the first six rows.
tips.head(6)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
5,25.29,4.71,Male,No,Sun,Dinner,4,0.18624


In [46]:
grouped=tips.groupby(['sex', 'smoker'])

In [47]:
grouped_pct=grouped['tip_pct']

In [49]:
# Note that for descriptive statistics you can pass the name of the
# function as a string:

grouped_pct.agg('mean')

sex     smoker
Female  No        0.156921
        Yes       0.182150
Male    No        0.160669
        Yes       0.152771
Name: tip_pct, dtype: float64

In [52]:
grouped_pct.agg(['mean', 'std', max_minus_min])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,max_minus_min
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,0.156921,0.036421,0.195876
Female,Yes,0.18215,0.071595,0.360233
Male,No,0.160669,0.041849,0.220186
Male,Yes,0.152771,0.090588,0.674707


In [58]:
functions=['count', 'mean', 'max']

In [59]:
# Apply every function in `functions` to both selected columns.
# Several columns must be selected with a list (double brackets): the
# tuple form grouped['tip_pct', 'total_bill'] was removed in pandas 2.0.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Female,No,54,0.156921,0.252672,54,18.105185,35.83
Female,Yes,33,0.18215,0.416667,33,17.977879,44.3
Male,No,97,0.160669,0.29199,97,19.791237,48.33
Male,Yes,60,0.152771,0.710345,60,22.2845,50.81


In [60]:
# A dict applies a different aggregation to each column:
# here the maximum of 'tip' and the sum of 'size' per group.
# The aggregations are named by string ('max' rather than np.max), matching
# 'sum' and avoiding the FutureWarning recent pandas emits for NumPy callables.

grouped.agg({'tip': 'max', 'size': 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,No,5.2,140
Female,Yes,6.5,74
Male,No,9.0,263
Male,Yes,10.0,150


In [61]:
grouped.max() # just a check

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,day,time,size,tip_pct
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Female,No,35.83,5.2,Thur,Lunch,6,0.252672
Female,Yes,44.3,6.5,Thur,Lunch,4,0.416667
Male,No,48.33,9.0,Thur,Lunch,6,0.29199
Male,Yes,50.81,10.0,Thur,Lunch,5,0.710345


In [62]:
# Just a check of the 'size' sums above.
# numeric_only=True restricts the sum to numeric columns; without it current
# pandas also "sums" the string columns (day, time) by concatenating them.
grouped.sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size,tip_pct
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,No,977.68,149.77,140,8.473732
Female,Yes,593.27,96.74,74,6.010962
Male,No,1919.75,302.0,263,15.584865
Male,Yes,1337.07,183.07,150,9.166271


In [8]:
df

Unnamed: 0,data1,data2,key1,key2
0,2,8,a,one
1,4,5,a,two
2,6,9,b,one
3,3,1,b,two
4,5,4,a,one


In [11]:
# To get the group means as extra columns next to the existing ones
# (instead of in a separate table):

# step 1: per-key1 means of the numeric columns, renamed mean_<col>.
# numeric_only=True keeps the string column 'key2' out of the mean
# (pandas >= 2.0 raises a TypeError on it otherwise).
k1_means = df.groupby('key1').mean(numeric_only=True).add_prefix('mean_')
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3.666667,5.666667
b,4.5,5.0


In [12]:
# step 2: attach the group means to every row by matching the 'key1' column
# of df against the index of k1_means.
df.merge(k1_means, left_on='key1', right_index=True)

Unnamed: 0,data1,data2,key1,key2,mean_data1,mean_data2
0,2,8,a,one,3.666667,5.666667
1,4,5,a,two,3.666667,5.666667
4,5,4,a,one,3.666667,5.666667
2,6,9,b,one,4.5,5.0
3,3,1,b,two,4.5,5.0


In [158]:
# TRANSFORM:

# Row labels of the frame.
names = ['Joe', 'Steve', 'Wes', 'Jim', 'Travis']

# Four integer columns (a-d) and one string column (e).
data = dict(
    a=[1, 2, 3, 4, 5],
    b=[5, 4, 3, 2, 1],
    c=[2, 4, 6, 3, 5],
    d=[8, 4, 6, 1, 3],
    e=['x', 'x', 'y', 'x', 'y'],
)

people = DataFrame(data=data, index=names)
people

Unnamed: 0,a,b,c,d,e
Joe,1,5,2,8,x
Steve,2,4,4,4,x
Wes,3,3,6,6,y
Jim,4,2,3,1,x
Travis,5,1,5,3,y


In [159]:
key=['one', 'two', 'one', 'two', 'one'] # rows

In [160]:
# One row of means per group label in `key`.
# numeric_only=True keeps the string column 'e' out of the mean
# (pandas >= 2.0 raises a TypeError on it otherwise).
people.groupby(key).mean(numeric_only=True)

Unnamed: 0,a,b,c,d
one,3.0,3.0,4.333333,5.666667
two,3.0,3.0,3.5,2.5


In [161]:
# transform broadcasts each group's mean back onto the rows of that group,
# so the result keeps the original index (one row per person).
# Only the numeric columns are selected (the mean of string column 'e' raises
# on current pandas), and the aggregation is named by string instead of np.mean.
people.groupby(key)[['a', 'b', 'c', 'd']].transform('mean')

Unnamed: 0,a,b,c,d
Joe,3,3,4.333333,5.666667
Steve,3,3,3.5,2.5
Wes,3,3,4.333333,5.666667
Jim,3,3,3.5,2.5
Travis,3,3,4.333333,5.666667


In [162]:
people

Unnamed: 0,a,b,c,d,e
Joe,1,5,2,8,x
Steve,2,4,4,4,x
Wes,3,3,6,6,y
Jim,4,2,3,1,x
Travis,5,1,5,3,y


In [163]:
# APPLY:
# rows with the n largest values of a column (default: top three of col 'd'), sorted ascending:

def top(df, n=3, column='d'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame
        Frame to select rows from.
    n : int, default 3
        Number of rows to keep; ``n=0`` gives an empty frame.
    column : str, default 'd'
        Column to rank the rows by.

    Returns
    -------
    DataFrame
        The selected rows sorted ascending by ``column`` (largest value last).
    """
    # tail(n) instead of the slice [-n:]: [-0:] is the same as [0:], so the
    # slice returned every row when n == 0.
    return df.sort_values(by=column).tail(n)

# use .head(n) instead of .tail(n) for the n smallest values (also asc order)

In [164]:
top(people, n=5)

Unnamed: 0,a,b,c,d,e
Jim,4,2,3,1,x
Travis,5,1,5,3,y
Steve,2,4,4,4,x
Wes,3,3,6,6,y
Joe,1,5,2,8,x


In [165]:
people.groupby('e').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d,e
e,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
x,Jim,4,2,3,1,x
x,Steve,2,4,4,4,x
x,Joe,1,5,2,8,x
y,Travis,5,1,5,3,y
y,Wes,3,3,6,6,y


In [166]:
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Note: this redefines (shadows) the earlier ``top`` in this notebook, whose
    defaults were ``n=3`` and ``column='d'``; here the defaults suit the tips data.

    Parameters
    ----------
    df : DataFrame
        Frame to select rows from.
    n : int, default 5
        Number of rows to keep; ``n=0`` gives an empty frame.
    column : str, default 'tip_pct'
        Column to rank the rows by.

    Returns
    -------
    DataFrame
        The selected rows sorted ascending by ``column`` (largest value last).
    """
    # tail(n) instead of the slice [-n:]: [-0:] is the same as [0:], so the
    # slice returned every row when n == 0.
    return df.sort_values(by=column).tail(n)

In [137]:
top(tips, n=6)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [138]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
