# Group By: split-apply-combine

Reference: pandas user guide, "Group by: split-apply-combine" — https://pandas.pydata.org/docs/user_guide/groupby.html

### Splitting an object into groups

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Build the demo frame: two categorical key columns (A, B) and two numeric
# columns (C, D) filled with standard-normal draws.
# The draws come from a dedicated, seeded generator so that C and D are the
# same on every "Restart & Run All"; unseeded np.random.randn calls give
# different numbers on each run, so any saved output goes stale.
rng = np.random.RandomState(0)
df = pd.DataFrame({'A':['foo','bar','foo','bar',
                      'foo','bar','foo','foo'],
                 'B':['one','one','two','three',
                     'two','two','one','three'],
                 'C':rng.randn(8),
                 'D':rng.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,0.742826,0.513073
1,bar,one,0.174551,1.803872
2,foo,two,-0.702926,-2.756253
3,bar,three,0.015365,-1.084945
4,foo,two,1.632889,-0.497022
5,bar,two,-0.230613,0.875787
6,foo,one,-0.62469,-1.624208
7,foo,three,-0.142209,-0.09769


In [61]:
# .groupby() returns a GroupBy object, which exposes a variety
# of useful methods

# group by column A
grouped = df.groupby('A')
grouped  # not displayed: a notebook cell only shows its last expression

# group by both columns A and B (this rebinds `grouped`)
grouped = df.groupby(['A','B'])
grouped

# both of the groupby() calls above split the DataFrame
# along its index (rows)

<pandas.core.groupby.DataFrameGroupBy object at 0x1125a2be0>

In [21]:
# can also split by columns (see tutorial again)

##### Split a Series

In [34]:
# index labels for the Series: 1, 2, 3 repeated twice
lst = [1,2,3] * 2

# build the Series from six values, labelled by the list above
s = pd.Series(data=[1,2,3,10,20,30], index=lst)
s
# note: the index labels are not unique (1, 2 and 3 each appear twice)

1     1
2     2
3     3
1    10
2    20
3    30
dtype: int64

In [35]:
# non-unique index values are used as the group key
# in a groupby operation (below). So all values that share the same
# index value end up in one group
grouped = s.groupby(level=0) # level=0 groups by the first (here, the only) level of the index

# .first() returns the first value within each group
grouped.first()

1    1
2    2
3    3
dtype: int64

In [36]:
# .last() returns the last value within each group
grouped.last()

1    10
2    20
3    30
dtype: int64

In [37]:
# .sum() adds up the values within each group
grouped.sum()

1    11
2    22
3    33
dtype: int64

In [51]:
# Demo frame whose keys appear in the order B, B, A, A; it is used to
# show that groupby sorts the group keys by default
df2 = pd.DataFrame(dict(X=list('BBAA'), Y=[1, 2, 3, 4]))
df2

Unnamed: 0,X,Y
0,B,1
1,B,2
2,A,3
3,A,4


In [77]:
df2.groupby(['X']).sum()
# note: this groups the rows by column X, sums Y within each group,
# and returns the group keys in sorted order (A before B)

In [79]:
# .groupby() returns a GroupBy object. The result does not become
# a DataFrame until an aggregation such as sum() is applied to it

# This is just a GroupBy object
x = df2.groupby(['X'])
type(x)

pandas.core.groupby.DataFrameGroupBy

In [81]:
# calling .sum() on the GroupBy object returns a DataFrame
y = x.sum()
type(y)

pandas.core.frame.DataFrame

In [82]:
# display the aggregated frame: one row per group key in X
y

Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
A,7
B,3


In [40]:
# sort=False overrides the default key sorting: groups come back in
# order of first appearance (B, then A)
df2.groupby(['X'], sort=False).sum()

Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
B,3
A,7


In [44]:
# A second demo frame in which the A and B rows are interleaved
df3 = pd.DataFrame(dict(X=['A', 'B'] * 2,
                        Y=[3, 4, 1, 2]))
df3

Unnamed: 0,X,Y
0,A,3
1,B,4
2,A,1
3,B,2


In [84]:
# groupby column X, get group A
# note that row order is preserved within each group
# Group by the scalar 'X' rather than the one-element list ['X']: with a
# length-1 list, recent pandas versions expect get_group to receive the
# tuple ('A',) and warn on a bare 'A'. A scalar grouper with a scalar key
# works on every pandas version and returns the same rows.
a = df3.groupby('X').get_group('A')
a

Unnamed: 0,X,Y
0,A,3
2,A,1


In [85]:
# same for group B
# Group by the scalar 'X' rather than the one-element list ['X']: with a
# length-1 list, recent pandas versions expect get_group to receive the
# tuple ('B',) and warn on a bare 'B'. A scalar grouper with a scalar key
# works on every pandas version and returns the same rows.
b = df3.groupby('X').get_group('B')
b

Unnamed: 0,X,Y
1,B,4
3,B,2


In [88]:
# concatenate the groups
# pd.concat keeps each row's original index label, so the index reads
# 0, 2, 1, 3: group A's rows followed by group B's rows
# (passing ignore_index=True would renumber the rows 0..3 instead)
pd.concat([a,b])

Unnamed: 0,X,Y
0,A,3
2,A,1
1,B,4
3,B,2


In [53]:
# redisplay df for reference before inspecting its groups
df

Unnamed: 0,A,B,C,D
0,foo,one,0.742826,0.513073
1,bar,one,0.174551,1.803872
2,foo,two,-0.702926,-2.756253
3,bar,three,0.015365,-1.084945
4,foo,two,1.632889,-0.497022
5,bar,two,-0.230613,0.875787
6,foo,one,-0.62469,-1.624208
7,foo,three,-0.142209,-0.09769


In [63]:
# .groups is an attribute of the GroupBy object.
# It is a dict whose keys are the unique group keys and whose
# values are the axis (row) labels of the members of each group.
# Here: every 'bar' row has label 1, 3 or 5,
# and every 'foo' row has label 0, 2, 4, 6 or 7

df.groupby('A').groups

{'bar': [1, 3, 5], 'foo': [0, 2, 4, 6, 7]}

In [65]:
# group by column A first, then B: each key of .groups is an (A, B) tuple
grouped = df.groupby(['A','B'])
grouped.groups

{('bar', 'one'): [1],
 ('bar', 'three'): [3],
 ('bar', 'two'): [5],
 ('foo', 'one'): [0, 6],
 ('foo', 'three'): [7],
 ('foo', 'two'): [2, 4]}

In [67]:
# len() of a GroupBy object is the number of groups
len(grouped)

6

In [66]:
# A function can serve as the group key: it maps each axis label to the
# name of the group that label belongs to.

# key function used to group the single-letter column labels
def get_letter_type(letter):
    """Classify a single-letter label as 'vowel' or 'consonant'.

    Parameters
    ----------
    letter : str
        The label to classify; compared case-insensitively.

    Returns
    -------
    str
        'vowel' if ``letter`` is one of a, e, i, o, u; otherwise 'consonant'.
    """
    # Test membership in a set of single letters, not `in 'aeiou'`: the
    # substring test also matched '' and runs such as 'ei'.
    if letter.lower() in {'a', 'e', 'i', 'o', 'u'}:
        return 'vowel'
    return 'consonant'

# Group the column labels rather than the rows. Transposing first turns the
# column labels into the index, so a plain groupby applies the key function
# to them; the groupby(axis=1) spelling is deprecated in pandas 2.1+.
df.T.groupby(get_letter_type).groups

{'consonant': ['B', 'C', 'D'], 'vowel': ['A']}

In [72]:
gb = df.groupby(['A'])
# The pandas docs write "gb.<TAB>" to mean: type "gb." and press the Tab key
# to list the object's attributes and methods via tab completion. It is a
# keystroke hint, not Python; left in the cell it is a SyntaxError.
gb

<pandas.core.groupby.DataFrameGroupBy object at 0x1125de630>