In [2]:
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell 
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Group by: split-apply-combine

## Splitting an object into groups

> `df.groupby()`

In [18]:
# Toy animal dataset: one row per animal, labelled by animal name in the index.
# (Written as plain Python: the pasted IPython "...:" continuation prompts are
# removed so the cell is valid outside IPython's prompt stripping as well.)
df = pd.DataFrame(
    [
        ('bird', 'Falconiformes', 389.0),
        ('bird', 'Psittaciformes', 24.0),
        ('mammal', 'Carnivora', 80.2),
        ('mammal', 'Primates', np.nan),  # max_speed is missing (NaN)
        ('mammal', 'Carnivora', 58),
    ],
    index=['falcon', 'parrot', 'lion', 'monkey', 'leopard'],
    columns=('class', 'order', 'max_speed'),
)
df

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [19]:
# Group rows by a single column; axis=0 (rows) is the default.
grouped = df.groupby('class')
# NOTE(review): the `axis` argument of groupby is deprecated in recent pandas
# releases — confirm against the pandas version this notebook targets.
grouped = df.groupby('order', axis='columns')
# Group rows by several columns at once; aggregating such a grouping yields a
# result indexed by a MultiIndex built from the group keys.
grouped = df.groupby(['class', 'order'])

> `MultiIndex`

> 聚合函数输出只包括**唯一的索引值**

* index重复将被合并

In [12]:
# The labels 1, 2, 3 each occur twice, so the Series index contains duplicates.
lst = [1, 2, 3, 1, 2, 3]
s = pd.Series(data=[1, 2, 3, 10, 20, 30], index=lst)
s

1     1
2     2
3     3
1    10
2    20
3    30
dtype: int64

In [13]:
# Group on index level 0, so entries sharing an index label fall into one group.
grouped = s.groupby(level=0)
grouped.first()  # first value of each group
grouped.last()  # last value of each group
grouped.sum()  # sum of the values in each group

1    1
2    2
3    3
dtype: int64

1    10
2    20
3    30
dtype: int64

1    11
2    22
3    33
dtype: int64

In [14]:
# Grouping a Series produces a SeriesGroupBy object.
type(grouped)

pandas.core.groupby.groupby.SeriesGroupBy

In [16]:
# Reach SeriesGroupBy through the `pandas.core.groupby` package export rather
# than the `groupby.groupby` submodule, whose contents depend on the pandas
# version's internal module layout.
help(pd.core.groupby.SeriesGroupBy.first)

Help on function first in module pandas.core.groupby.groupby:

first(self, **kwargs)
    Compute first of group values
    
    See also
    --------
    pandas.Series.groupby
    pandas.DataFrame.groupby
    pandas.Panel.groupby



### groupby sorting

In [21]:
# Small frame whose key column 'X' lists 'B' before 'A'.
df2 = pd.DataFrame({'X': list('BBAA'), 'Y': list(range(1, 5))})
df2

Unnamed: 0,X,Y
0,B,1
1,B,2
2,A,3
3,A,4


* 默认`groupby`时，group keys会被排序
    * 设置`sort=False`可关闭排序 --> 提升性能

In [24]:
df2.groupby(['X']).sum()  # default sort=True: group keys come out sorted (A, B)
df2.groupby(['X'], sort=False).sum()  # sort=False: keys keep their order of first appearance (B, A)

Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
A,7
B,3


Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
B,3
A,7


* groupby保留了每个分组内各行在原DataFrame中的顺序

In [26]:
df3 = pd.DataFrame({'X': ['A', 'B', 'A', 'B'], 'Y': [1, 4, 3, 2]})
df3
# Group by the scalar key 'X' instead of the one-element list ['X']: the group
# names are then plain scalars, so get_group('A') matches them directly and
# recent pandas versions do not emit a FutureWarning about length-1 groupers.
# Rows inside each group keep their original relative order.
df3.groupby('X').get_group('A')
df3.groupby('X').get_group('B')

Unnamed: 0,X,Y
0,A,1
1,B,4
2,A,3
3,B,2


Unnamed: 0,X,Y
0,A,1
2,A,3


Unnamed: 0,X,Y
1,B,4
3,B,2


## iterating groups

In [57]:
# Level values for a two-level row index (pasted IPython "....:" continuation
# prompts removed so the cell is plain, valid Python).
arrays = [
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
]

index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
# NOTE: this rebinds `df`, replacing the animal frame built at the top of the notebook.
df = pd.DataFrame(
    {'A': [1, 1, 1, 1, 2, 2, 3, 3], 'B': np.arange(8)},
    index=index,
)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


* 迭代类似于`itertools.groupby()`

In [33]:
grouped = df.groupby('A')
# Iterating over a GroupBy yields (group name, sub-frame) pairs; print the
# name and then the frame, each on its own line.
for name, group in grouped:
    print(name, group, sep='\n')

1
              A  B
first second      
bar   one     1  0
      two     1  1
baz   one     1  2
      two     1  3
2
              A  B
first second      
foo   one     2  4
      two     2  5
3
              A  B
first second      
qux   one     3  6
      two     3  7


* 对于*multiple keys*：group name是一个**元组**

In [35]:
# With several grouping keys, each group name is a tuple of the key values.
# (Pasted IPython "....:" continuation prompts removed from the loop body.)
for name, group in df.groupby(['A', 'B']):
    print(name)
    print(group)

(1, 0)
              A  B
first second      
bar   one     1  0
(1, 1)
              A  B
first second      
bar   two     1  1
(1, 2)
              A  B
first second      
baz   one     1  2
(1, 3)
              A  B
first second      
baz   two     1  3
(2, 4)
              A  B
first second      
foo   one     2  4
(2, 5)
              A  B
first second      
foo   two     2  5
(3, 6)
              A  B
first second      
qux   one     3  6
(3, 7)
              A  B
first second      
qux   two     3  7


## selecting a group
> `get_group()`

In [36]:
# Show the MultiIndex frame again for reference before selecting groups.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


In [48]:
# 'second' is the name of an index level here; pick the group whose key is 'one'.
df.groupby('second').get_group('one')

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
baz,one,1,2
foo,one,2,4
qux,one,3,6


* 对于按多个key分组的group：传入**元组**作为group name

In [50]:
# With two grouping keys the group name is a tuple with one entry per key.
df.groupby(['first', 'second']).get_group(('bar', 'one'))

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0


## aggregation聚合

> 在分组的group中使用计算功能: `aggregate()` `agg()`

* e.g. `.aggregate(np.sum)` / `.aggregate('sum')` / `.sum()`

In [51]:
# Show the MultiIndex frame again for reference before aggregating.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


In [54]:
df.groupby('first').first()  # first row of each group
# Name the aggregation as a string: it selects the same built-in groupby sum
# that np.sum was mapped to, without the FutureWarning recent pandas versions
# raise when a NumPy callable is passed to aggregate().
df.groupby('first').aggregate('sum')

Unnamed: 0_level_0,A,B
first,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1,0
baz,1,2
foo,2,4
qux,3,6


Unnamed: 0_level_0,A,B
first,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,2,1
baz,2,5
foo,4,9
qux,6,13


In [60]:
# Every (first, second) pair is unique, so each group holds exactly one row and
# both results reproduce the original values.
df.groupby(['first', 'second']).first()
# Aggregation named as a string: same built-in groupby sum that np.sum was
# mapped to, without the FutureWarning recent pandas versions raise for a
# NumPy callable.
df.groupby(['first', 'second']).aggregate('sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


> 聚合后的**索引index**将被修改

* 使用`as_index=False`：分组key不会被设为结果的index
* 方法二：使用`reset_index`

In [75]:
# Option 1: as_index=False keeps the group keys out of the result's index.
grouped = df.groupby(['first', 'second'], as_index=False)
# Aggregation named as a string: same built-in groupby sum that np.sum was
# mapped to, without the FutureWarning recent pandas versions raise for a
# NumPy callable.
grouped.aggregate('sum')
# Option 2: aggregate normally, then move the key index levels back into columns.
df.groupby(['first', 'second']).sum().reset_index()

Unnamed: 0,A,B
0,1,0
1,1,1
2,1,2
3,1,3
4,2,4
5,2,5
6,3,6
7,3,7


Unnamed: 0,first,second,A,B
0,bar,one,1,0
1,bar,two,1,1
2,baz,one,1,2
3,baz,two,1,3
4,foo,one,2,4
5,foo,two,2,5
6,qux,one,3,6
7,qux,two,3,7


In [74]:
df.groupby('first', as_index=False).sum()  # group key is not used as the result index
df.groupby('first', as_index=True).sum().reset_index()  # key becomes the index, then reset_index turns it back into a column

Unnamed: 0,A,B
0,2,1
1,2,5
2,4,9
3,6,13


Unnamed: 0,first,A,B
0,bar,2,1
1,baz,2,5
2,foo,4,9
3,qux,6,13


> `size`:计算每个group的大小

In [77]:
grouped = df.groupby(['first', 'second'])
grouped.size()  # number of rows in each group

first  second
bar    one       1
       two       1
baz    one       1
       two       1
foo    one       1
       two       1
qux    one       1
       two       1
dtype: int64

In [78]:
# Per-group summary statistics (count, mean, std, min, quartiles, max) for each column.
grouped.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,A,A,A,A,A,A,A,B,B,B,B,B,B,B,B
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
bar,one,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
bar,two,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
baz,one,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
baz,two,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,3.0,,3.0,3.0,3.0,3.0,3.0
foo,one,1.0,2.0,,2.0,2.0,2.0,2.0,2.0,1.0,4.0,,4.0,4.0,4.0,4.0,4.0
foo,two,1.0,2.0,,2.0,2.0,2.0,2.0,2.0,1.0,5.0,,5.0,5.0,5.0,5.0,5.0
qux,one,1.0,3.0,,3.0,3.0,3.0,3.0,3.0,1.0,6.0,,6.0,6.0,6.0,6.0,6.0
qux,two,1.0,3.0,,3.0,3.0,3.0,3.0,3.0,1.0,7.0,,7.0,7.0,7.0,7.0,7.0


In [89]:
# Time row selection expressed as a query string.
%timeit df.query('A == 1')

1.21 ms ± 74.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [92]:
# Time the same row selection done with a boolean mask, for comparison.
%timeit df[df['A']==1]

302 µs ± 17.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
