In [1]:
# Hadley Wickham - coined the term 'split-apply-combine' for describing group operations

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.984448,-0.699696
1,a,two,-0.491166,-0.336191
2,b,one,0.436987,-1.631336
3,b,two,-2.418742,2.878405
4,a,one,0.666245,0.492571


In [4]:
# compute mean of the data1 column using the labels from key1
# access data1 and call groupby with the column which is a Series at key1

In [5]:
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x108ef10b8>

In [6]:
# the above grouped variable is now a GroupBy object
# this object has not computed anything yet
# only intermediate data about the group key df['key'] is inferred here
# this object has all the information needed to apply some ops to each of the groups

In [7]:
grouped.mean()

key1
a   -0.269790
b   -0.990878
Name: data1, dtype: float64

In [8]:
# here the data has been aggregated as per the group key
# results in a new Series indexed by the unique values in key1 column
# result index has name 'key1' as the DF column df['key1'] has the same name

In [9]:
# pass multiple arrays as a list
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one    -0.159102
      two    -0.491166
b     one     0.436987
      two    -2.418742
Name: data1, dtype: float64

In [10]:
# result Series has a hierarchical index with unique pair of keys
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.159102,-0.491166
b,0.436987,-2.418742


In [11]:
# using group keys as arrays of the right length
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])

In [12]:
years = np.array([2005, 2005, 2006, 2005, 2006])

In [13]:
df['data1'].groupby([states, years]).mean()

California  2005   -0.491166
            2006    0.436987
Ohio        2005   -1.701595
            2006    0.666245
Name: data1, dtype: float64

In [14]:
# when grouping information is found in the same DF as the data
# pass column names as the group key
df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.26979,-0.181105
b,-0.990878,0.623534


In [15]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.159102,-0.103562
a,two,-0.491166,-0.336191
b,one,0.436987,-1.631336
b,two,-2.418742,2.878405


In [16]:
# size - returns a Series containing group sizes
# returns the count/size of similar elements
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [17]:
# missing values in a group key will be excluded from the result

### ITERATING OVER GROUPS

In [18]:
# GroupBy supports iteration, generating sequence of 2-tuples
# tuples contain the group name and chunk of data

In [19]:
for name, group in df.groupby('key1'):
    print(name)
    print(group)

a
  key1 key2     data1     data2
0    a  one -0.984448 -0.699696
1    a  two -0.491166 -0.336191
4    a  one  0.666245  0.492571
b
  key1 key2     data1     data2
2    b  one  0.436987 -1.631336
3    b  two -2.418742  2.878405


In [20]:
# in case of multiple keys, 1st element in the tuple will be a tuple of key values
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.984448 -0.699696
4    a  one  0.666245  0.492571
('a', 'two')
  key1 key2     data1     data2
1    a  two -0.491166 -0.336191
('b', 'one')
  key1 key2     data1     data2
2    b  one  0.436987 -1.631336
('b', 'two')
  key1 key2     data1     data2
3    b  two -2.418742  2.878405


In [21]:
# computing a dict of data pieces as a one-liner
pieces = dict(list(df.groupby('key1')))

In [22]:
pieces['a']

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.984448,-0.699696
1,a,two,-0.491166,-0.336191
4,a,one,0.666245,0.492571


In [23]:
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,0.436987,-1.631336
3,b,two,-2.418742,2.878405


In [24]:
# groupby groups on axis=0
# grouping the columns by dtype
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [25]:
grouped = df.groupby(df.dtypes, axis=1)

In [26]:
for dtype, group in grouped:
    print(dtype)
    print(group)

float64
      data1     data2
0 -0.984448 -0.699696
1 -0.491166 -0.336191
2  0.436987 -1.631336
3 -2.418742  2.878405
4  0.666245  0.492571
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


### SELECTING A COLUMN OR SUBSET OF COLUMNS

In [27]:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.103562
a,two,-0.336191
b,one,-1.631336
b,two,2.878405


In [28]:
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x108f9ef28>

In [29]:
s_grouped.mean()

key1  key2
a     one    -0.103562
      two    -0.336191
b     one    -1.631336
      two     2.878405
Name: data2, dtype: float64

### GROUPING WITH DICTS AND SERIES

In [30]:
# grouping information may exist in a form other than array

In [31]:
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people

Unnamed: 0,a,b,c,d,e
Joe,-0.537801,-1.106557,0.646316,0.8473,-1.841414
Steve,-0.37277,-0.296597,0.209077,0.335427,-0.204965
Wes,0.908998,-1.904504,-0.648145,0.889276,-1.519975
Jim,0.922281,-0.376487,-0.71937,-0.890622,0.198481
Travis,-0.61204,1.160461,0.206832,0.818875,-0.42786


In [32]:
# adding few NA values
people.iloc[2:3, [1, 2]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-0.537801,-1.106557,0.646316,0.8473,-1.841414
Steve,-0.37277,-0.296597,0.209077,0.335427,-0.204965
Wes,0.908998,,,0.889276,-1.519975
Jim,0.922281,-0.376487,-0.71937,-0.890622,0.198481
Travis,-0.61204,1.160461,0.206832,0.818875,-0.42786


In [33]:
# using a group correspondence for columns, sum together the columns by group
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}

In [34]:
# construct an array from the above dict
by_column = people.groupby(mapping, axis=1)
by_column.sum()

Unnamed: 0,blue,red
Joe,1.493617,-3.485772
Steve,0.544504,-0.874332
Wes,0.889276,-0.610977
Jim,-1.609993,0.744276
Travis,1.025707,0.120561


In [35]:
# same funtionalities hold for Series, viewed as a fixed-size mapping
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [36]:
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


### GROUPING WITH FUNCTIONS

In [None]:
# using Python functions is more generic way of defining a group mapping compared with dict or Series
# any function passed as a group key will be called once per index value
# return values will be used as group names