In [5]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [6]:
# Seed NumPy's global random generator so the random columns below (and
# every result derived from them) are identical on each Restart & Run All.
# Saved outputs need a re-run to reflect the seeded values.
SEED = 0
np.random.seed(SEED)

# Let's make a dframe: two label columns (k1, k2) to group on and two
# columns of standard-normal random numbers to aggregate.
dframe = DataFrame({'k1':['X','X','Y','Y','Z'],
                    'k2':['alpha','beta','alpha','beta','alpha'],
                    'dataset1':np.random.randn(5),
                    'dataset2':np.random.randn(5)})

#Show
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,0.352443,-0.614699
1,X,beta,-2.183894,0.959237
2,Y,alpha,-0.938476,-1.443402
3,Y,beta,1.264512,0.327238
4,Z,alpha,0.984216,-3.92761


In [13]:
# First look at groupby

# Take the dataset2 column and split it into groups keyed by the
# values found in the k2 column
k2_keys = dframe["k2"]
group2 = dframe["dataset2"].groupby(k2_keys)

# Display the groupby object itself (no aggregation has been applied yet)
group2

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000021D96161D48>

In [14]:
# With the groups defined, aggregate each one -- here, the mean per k2 value
group2_means = group2.mean()
group2_means

k2
alpha   -1.995237
beta     0.643237
Name: dataset2, dtype: float64

In [16]:
# Group keys do not have to be columns of the frame: any arrays with one
# entry per row can be used as keys.

# For example, build two NumPy arrays of labels, one entry per row of dframe
cities = np.array(['NY','LA','LA','NY','NY'])
month = np.array(['JAN','FEB','JAN','FEB','JAN'])

# Group dataset1 by (city, month) and take the mean of each combination
by_city_and_month = dframe["dataset1"].groupby([cities, month])
by_city_and_month.mean()

LA  FEB   -2.183894
    JAN   -0.938476
NY  FEB    1.264512
    JAN    0.668330
Name: dataset1, dtype: float64

In [19]:
# Let's display the original dframe again for reference.
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,0.352443,-0.614699
1,X,beta,-2.183894,0.959237
2,Y,alpha,-0.938476,-1.443402
3,Y,beta,1.264512,0.327238
4,Z,alpha,0.984216,-3.92761


In [20]:
# We can also pass column names as group keys.
# numeric_only=True restricts the mean to the numeric dataset columns; the
# frame still contains the non-numeric k1 column, which recent pandas
# versions no longer drop silently and would fail to average.
dframe.groupby("k2").mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k2,Unnamed: 1_level_1,Unnamed: 2_level_1
alpha,0.132728,-1.995237
beta,-0.459691,0.643237


In [21]:
# A list of column names groups on every combination of their values
both_keys = ['k1', 'k2']
dframe.groupby(both_keys).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,0.352443,-0.614699
X,beta,-2.183894,0.959237
Y,alpha,-0.938476,-1.443402
Y,beta,1.264512,0.327238
Z,alpha,0.984216,-3.92761


In [22]:
# size() reports how many rows fall into each group
k1_group_sizes = dframe.groupby("k1").size()
k1_group_sizes

k1
X    2
Y    2
Z    1
dtype: int64

In [32]:
# A groupby object is iterable: each step yields the group key together
# with the rows belonging to that group.

# Walk through the k1 groups one at a time
k1_groups = dframe.groupby("k1")
for key, rows in k1_groups:
    print("This is the %s group" % key)
    print(rows)
    print("\n")

This is the X group
  k1     k2  dataset1  dataset2
0  X  alpha  0.352443 -0.614699
1  X   beta -2.183894  0.959237


This is the Y group
  k1     k2  dataset1  dataset2
2  Y  alpha -0.938476 -1.443402
3  Y   beta  1.264512  0.327238


This is the Z group
  k1     k2  dataset1  dataset2
4  Z  alpha  0.984216  -3.92761




In [34]:
# Iterate over the k2 groups, printing each group's name and its rows
for name,group in dframe.groupby("k2"):
    print("The group name is %s" %name)
    print(group)
    # "\n" (backslash) is the newline escape; "/n" would print the two
    # literal characters '/' and 'n' instead of a blank separator
    print("\n")

The group name is alpha
  k1     k2  dataset1  dataset2
0  X  alpha  0.352443 -0.614699
2  Y  alpha -0.938476 -1.443402
4  Z  alpha  0.984216 -3.927610
/n
The group name is beta
  k1    k2  dataset1  dataset2
1  X  beta -2.183894  0.959237
3  Y  beta  1.264512  0.327238
/n


In [36]:
# Iteration works with several keys too: the group key is then a
# (k1 value, k2 value) tuple
for key_pair, rows in dframe.groupby(['k1','k2']):
    first_key, second_key = key_pair
    print("Key1 = %s Key2 = %s" % (first_key, second_key))
    print(rows)
    print('\n')

Key1 = X Key2 = alpha
  k1     k2  dataset1  dataset2
0  X  alpha  0.352443 -0.614699


Key1 = X Key2 = beta
  k1    k2  dataset1  dataset2
1  X  beta -2.183894  0.959237


Key1 = Y Key2 = alpha
  k1     k2  dataset1  dataset2
2  Y  alpha -0.938476 -1.443402


Key1 = Y Key2 = beta
  k1    k2  dataset1  dataset2
3  Y  beta  1.264512  0.327238


Key1 = Z Key2 = alpha
  k1     k2  dataset1  dataset2
4  Z  alpha  0.984216  -3.92761




In [40]:
# A handy trick: turn the groups into a dictionary that maps each k1 value
# to the sub-frame holding that group's rows
group_dict = {k1_value: rows for k1_value, rows in dframe.groupby("k1")}
# Show the group stored under the key 'Z'
group_dict['Z']

Unnamed: 0,k1,k2,dataset1,dataset2
4,Z,alpha,0.984216,-3.92761


In [41]:
# We could also have grouped the columns instead of the rows.
# The original form of this cell, dframe.groupby(dframe.dtypes, axis=1),
# relies on the `axis` argument of groupby, which is deprecated in recent
# pandas releases, so the same split is built here without it.

# Collect the column labels that share each dtype
columns_by_dtype = {}
for column, dtype in dframe.dtypes.items():
    columns_by_dtype.setdefault(dtype, []).append(column)

# Let's create a dictionary mapping each dtype to the sub-frame of its columns
# (ordered by dtype name so the displayed order is stable)
group_dict_axis1 = {dtype: dframe[columns]
                    for dtype, columns in sorted(columns_by_dtype.items(),
                                                 key=lambda item: str(item[0]))}

#show
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0  0.352443 -0.614699
 1 -2.183894  0.959237
 2 -0.938476 -1.443402
 3  1.264512  0.327238
 4  0.984216 -3.927610,
 dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [42]:
# Next we'll learn how to select specific columns from a groupby

In [43]:
# For example, group on both key columns but keep only the dataset2 column.
# Indexing with a list ([['dataset2']]) selects it as a one-column frame.
grouped_by_both_keys = dframe.groupby(['k1','k2'])
dataset2_group = grouped_by_both_keys[['dataset2']]

dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,-0.614699
X,beta,0.959237
Y,alpha,-1.443402
Y,beta,0.327238
Z,alpha,-3.92761


In [44]:
#Next we'll have a quick lesson on grouping with dictionaries and series!