In [1]:
import pandas as pd
import numpy as np

## 分组计算

分组计算三部曲：拆分 -> 应用 -> 合并

* 拆分：根据什么进行分组？
* 应用：每个分组进行什么样的计算？
* 合并：把每个分组的计算结果合并起来。


![groupby](groupby.png)

In [2]:
# Demo frame: two string key columns (key1, key2) and two columns of random
# integers produced by np.random.randint(1, 10, 5).
# No seed is set in this cell, so the numbers change on every run.
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                  'key2': ['one', 'two', 'one', 'two', 'one'],
                  'data1': np.random.randint(1, 10, 5),
                  'data2': np.random.randint(1, 10, 5)})
# Last expression of the cell: display the frame.
df

Unnamed: 0,data1,data2,key1,key2
0,6,6,a,one
1,1,6,a,two
2,8,7,b,one
3,8,8,b,two
4,2,4,a,one


### 对 Series 进行分组

通过索引对齐关联起来

In [3]:
# Group the data1 Series by the values of the key1 Series; the two are
# matched up through their shared index.
grouped = df['data1'].groupby(df['key1'])

In [4]:
# Mean of data1 within each key1 group.
grouped.mean()

key1
a    3
b    8
Name: data1, dtype: int64

In [5]:
# Group data1 by two key Series at once; the result is indexed by (key1, key2).
df['data1'].groupby([df['key1'], df['key2']]).mean()

key1  key2
a     one     4
      two     1
b     one     8
      two     8
Name: data1, dtype: int64

### 对 DataFrame 进行分组

In [6]:
# Mean of the numeric columns per key1 group.
# The numeric columns are selected explicitly: `key2` holds strings, and
# current pandas raises instead of silently dropping such columns from mean().
df.groupby('key1')[['data1', 'data2']].mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3,5.333333
b,8,7.5


In [7]:
# Average every remaining column per (key1, key2) group, then keep only the
# data1 column of that result.
means = df.groupby(['key1', 'key2']).mean()['data1']
# Display the resulting Series.
means

key1  key2
a     one     4
      two     1
b     one     8
      two     8
Name: data1, dtype: int64

In [8]:
# Pivot the inner index level (key2) into columns, giving a key1 x key2 table.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,4,1
b,8,8


In [9]:
# Same grouping as before, but data1 is selected before mean() is called,
# so only that column is aggregated.
df.groupby(['key1', 'key2'])['data1'].mean()

key1  key2
a     one     4
      two     1
b     one     8
      two     8
Name: data1, dtype: int64

### 每个分组的元素个数

In [10]:
# Number of rows in each (key1, key2) group.
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

### 对分组进行迭代

In [11]:
# Iterating over a groupby yields (group name, sub-frame) pairs.
for name, group in df.groupby('key1'):
    # print() as a function call: valid on Python 3, and for a single
    # argument it prints the same text on Python 2.
    print(name)
    print(group)


a
   data1  data2 key1 key2
0      6      6    a  one
1      1      6    a  two
4      2      4    a  one
b
   data1  data2 key1 key2
2      8      7    b  one
3      8      8    b  two


In [12]:
# With two grouping keys, each group name is a (key1, key2) tuple.
for name, group in df.groupby(['key1', 'key2']):
    # print() as a function call: valid on Python 3, and for a single
    # argument (here one tuple) it prints the same text on Python 2.
    print(name)
    print(group)

('a', 'one')
   data1  data2 key1 key2
0      6      6    a  one
4      2      4    a  one
('a', 'two')
   data1  data2 key1 key2
1      1      6    a  two
('b', 'one')
   data1  data2 key1 key2
2      8      7    b  one
('b', 'two')
   data1  data2 key1 key2
3      8      8    b  two


### 转化为字典

In [13]:
# Turn the groupby into a plain dict: group name -> sub-frame for that group.
d = {name: group for name, group in df.groupby('key1')}
# Display the dict.
d

{'a':    data1  data2 key1 key2
 0      6      6    a  one
 1      1      6    a  two
 4      2      4    a  one, 'b':    data1  data2 key1 key2
 2      8      7    b  one
 3      8      8    b  two}

In [14]:
# Look up the sub-frame stored under group name 'a'.
d['a']

Unnamed: 0,data1,data2,key1,key2
0,6,6,a,one
1,1,6,a,two
4,2,4,a,one


### 按列分组

In [15]:
# Show the dtype of every column; these dtypes are the grouping key used next.
df.dtypes

data1     int64
data2     int64
key1     object
key2     object
dtype: object

In [16]:
# Group the column labels by their dtype. Grouping the transposed frame
# replaces the deprecated `axis=1` keyword of groupby().
grouped = df.T.groupby(df.dtypes)
# Map each dtype to the matching columns. The columns are taken from the
# original `df` (not from the transposed groups) so they keep their dtypes.
{dtype: df[labels.index] for dtype, labels in grouped}

{dtype('int64'):    data1  data2
 0      6      6
 1      1      6
 2      8      7
 3      8      8
 4      2      4, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

### 通过字典进行分组

In [17]:
# 5x5 frame of random integers from np.random.randint(1, 10, (5, 5)), with
# columns 'a'..'e' and people's names as the row index.
# No seed is set in this cell, so the numbers change on every run.
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)), 
                  columns=['a', 'b', 'c', 'd', 'e'], 
                  index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])
# Display the frame.
df

Unnamed: 0,a,b,c,d,e
Alice,2,9,4,5,3
Bob,2,5,1,2,5
Candy,5,1,5,7,4
Dark,9,2,7,4,9
Emily,5,4,4,1,2


In [18]:
# Columns at positions 1 and 2 ('b' and 'c') are about to receive NaN.
# Cast them to float first so the assignment below does not have to change
# an integer column's dtype in place.
df = df.astype({col: float for col in df.columns[1:3]})
# `.ix` no longer exists in pandas; `.iloc` is the purely positional
# equivalent (row 1, columns 1..2). `np.nan` replaces the removed `np.NaN` alias.
df.iloc[1, 1:3] = np.nan
# Display the frame with the missing values.
df

Unnamed: 0,a,b,c,d,e
Alice,2,9.0,4.0,5,3
Bob,2,,,2,5
Candy,5,1.0,5.0,7,4
Dark,9,2.0,7.0,4,9
Emily,5,4.0,4.0,1,2


In [19]:
# Column label -> group label. Columns that share a colour end up in one group.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'orange', 'e': 'blue'}
# axis=1 makes the dict apply to the column labels instead of the row labels.
# NOTE(review): the `axis` keyword of groupby() is deprecated in recent pandas
# releases - confirm against the pandas version this notebook targets.
grouped = df.groupby(mapping, axis=1)

In [20]:
# For every row, add up the values of the columns in each colour group.
grouped.sum()

Unnamed: 0,blue,orange,red
Alice,7,5,11
Bob,5,2,2
Candy,9,7,6
Dark,16,4,11
Emily,6,1,9


In [21]:
# For every row, count the entries in each colour group; as the recorded
# output shows, the NaN cells in Bob's row are not counted.
grouped.count()

Unnamed: 0,blue,orange,red
Alice,2,1,2
Bob,1,1,1
Candy,2,1,2
Dark,2,1,2
Emily,2,1,2


In [22]:
# Number of columns that were mapped to each colour group.
grouped.size()

blue      2
orange    1
red       2
dtype: int64

### 通过函数来分组

当函数作为分组依据时，数据表里的每个索引（可以是行索引，也可以是列索引）都会调用一次函数，函数的返回值作为分组的索引，即相同的返回值分在同一组。

In [24]:
# Fresh 5x5 frame of random integers (no NaN this time), with columns
# 'a'..'e' and people's names as the row index. No seed is set here either.
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)), 
                  columns=['a', 'b', 'c', 'd', 'e'], 
                  index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])
# Display the frame.
df

Unnamed: 0,a,b,c,d,e
Alice,9,9,6,6,2
Bob,3,9,2,3,2
Candy,4,8,3,9,6
Dark,7,1,2,1,7
Emily,4,6,3,4,3


In [26]:
def _dummy_group(idx):
    print idx
    return idx
# Group by function along the default axis: the printed output shows the
# function being called with each row label.
df.groupby(_dummy_group)

Alice
Bob
Candy
Dark
Emily


<pandas.core.groupby.DataFrameGroupBy object at 0x106311390>

In [27]:
# Group by function over the column labels. Grouping the transposed frame
# replaces the deprecated `axis=1` keyword; the function still receives each
# column label ('a'..'e') once.
df.T.groupby(_dummy_group)

a
b
c
d
e


<pandas.core.groupby.DataFrameGroupBy object at 0x106311110>

In [29]:
# Use the built-in len as the grouping function: each row label is passed to
# len(), so rows are grouped by the length of their name.
grouped = df.groupby(len)

In [30]:
# Column-wise sum of the rows in each name-length group.
grouped.sum()

Unnamed: 0,a,b,c,d,e
3,3,9,2,3,2
4,7,1,2,1,7
5,17,23,12,19,11


In [31]:
# Number of rows in each name-length group.
grouped.size()

3    1
4    1
5    3
dtype: int64

In [32]:
# Per-column count of entries in each name-length group.
grouped.count()

Unnamed: 0,a,b,c,d,e
3,1,1,1,1,1
4,1,1,1,1,1
5,3,3,3,3,3


### 多级索引数据根据索引级别来分组

In [36]:
# Two-level column index: the outer level is named 'country', the inner
# level is named 'index'.
columns = pd.MultiIndex.from_arrays([['China', 'USA', 'China', 'USA', 'China'],
                                     ['A', 'A', 'B', 'C', 'B']], names=['country', 'index'])
# 5x5 frame of random integers under those columns (no seed is set here).
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
# Display the frame.
df

country,China,USA,China,USA,China
index,A,A,B,C,B.1
0,3,4,2,6,2
1,3,4,1,2,7
2,2,2,3,3,2
3,4,1,4,2,2
4,6,7,6,1,8


In [39]:
# Count, for every row, the columns that belong to each country.
# Transpose -> group the (now row) level 'country' -> transpose back; this
# replaces the deprecated `axis=1` keyword and yields the same table.
df.T.groupby(level='country').count().T

country,China,USA
0,3,2
1,3,2
2,3,2
3,3,2
4,3,2


In [40]:
# Sum, for every row, the values of the columns that belong to each country.
# Transpose -> group the (now row) level 'country' -> transpose back; this
# replaces the deprecated `axis=1` keyword and yields the same table.
df.T.groupby(level='country').sum().T

country,China,USA
0,7,10
1,11,6
2,7,5
3,10,3
4,20,8


In [42]:
# Count, for every row, the columns that share each value of the 'index' level.
# Transpose -> group the (now row) level 'index' -> transpose back; this
# replaces the deprecated `axis=1` keyword and yields the same table.
df.T.groupby(level='index').count().T

index,A,B,C
0,2,2,1
1,2,2,1
2,2,2,1
3,2,2,1
4,2,2,1


## 数据聚合

分组运算会先根据一定规则对数据进行拆分，然后对拆分后的数据进行聚合运算，如前面见到的 `mean()`, `sum()` 等就是聚合的例子。聚合时，拆分后每一个索引对应的数据都会依次传给聚合函数进行运算。最后再把运算结果合并起来，生成最终结果。

聚合函数除了内置的 `sum()`, `min()`, `max()`, `mean()` 等等之外，还可以自定义聚合函数。自定义聚合函数时，使用 `agg()` 或 `aggregate()` 函数。

### 内置聚合函数

In [45]:
# Rebuild the key1/key2/data1/data2 demo frame for the aggregation examples.
# data1 and data2 are new random draws (no seed is set in this cell).
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                  'key2': ['one', 'two', 'one', 'two', 'one'],
                  'data1': np.random.randint(1, 10, 5),
                  'data2': np.random.randint(1, 10, 5)})
# Display the frame.
df

Unnamed: 0,data1,data2,key1,key2
0,9,7,a,one
1,4,8,a,two
2,2,2,b,one
3,7,8,b,two
4,1,3,a,one


In [47]:
# Built-in aggregation: total of data1 within each key1 group.
df['data1'].groupby(df['key1']).sum()

key1
a    14
b     9
Name: data1, dtype: int64

### 自定义聚合函数

In [56]:
def peak_verbose(s):
    """Return ``s.max() - s.min()`` and print the type of ``s`` first.

    The printed type shows what kind of object the aggregation machinery
    passes to a custom aggregation function.
    """
    # print() as a function call: valid on Python 3, and for a single
    # argument it prints the same text on Python 2.
    print(type(s))
    return s.max() - s.min()

def peak(s):
    """Return the spread of ``s``: its maximum minus its minimum."""
    highest = s.max()
    lowest = s.min()
    return highest - lowest

In [50]:
# Group the whole frame by key1; the cells that follow aggregate this object.
grouped = df.groupby('key1')

In [54]:
# Apply the custom aggregator to the numeric columns only. `key2` holds
# strings, for which max() - min() is undefined, and current pandas raises
# instead of silently dropping such columns.
grouped[['data1', 'data2']].agg(peak_verbose)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,8,5
b,5,6


### 应用多个聚合函数

In [59]:
# Apply several aggregations to both data columns at once.
# Several columns must be selected with a list; the bare tuple form
# grouped['data1', 'data2'] is no longer accepted by pandas.
grouped[['data1', 'data2']].agg(['mean', 'std', peak])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,mean,std,peak,mean,std,peak
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,4.666667,4.041452,8,6,2.645751,5
b,4.5,3.535534,5,5,4.242641,6


In [60]:
# Name the aggregated columns: each (label, function) pair supplies the
# output column label together with the aggregation to apply.
# NOTE(review): 'agerage' looks like a typo for 'average'; it is left as is
# because the recorded output below uses the same label.
grouped['data1'].agg([('agerage', 'mean'), ('max-range', peak)])

Unnamed: 0_level_0,agerage,max-range
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,4.666667,8
b,4.5,5


### 给不同的列应用不同的聚合函数

使用 dict 作为参数来实现

In [62]:
# Column name -> aggregation(s) for that column: data1 gets four
# aggregations (including the custom `peak`), data2 gets only its sum.
d = {'data1': ['mean', peak, 'max', 'min'],
     'data2': 'sum'}
# Apply the per-column specification to the key1 groups.
grouped.agg(d)

Unnamed: 0_level_0,data1,data1,data1,data1,data2
Unnamed: 0_level_1,mean,peak,max,min,sum
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,4.666667,8,9,1,18
b,4.5,5,7,2,10


### 重置索引

In [65]:
# Move the key1 group labels out of the index into an ordinary column.
grouped.agg(d).reset_index()

Unnamed: 0_level_0,key1,data1,data1,data1,data1,data2
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,peak,max,min,sum
0,a,4.666667,8,9,1,18
1,b,4.5,5,7,2,10


In [68]:
# as_index=False keeps key1 as a regular column from the start; the recorded
# output matches the reset_index() result of the previous cell.
df.groupby('key1', as_index=False).agg(d)

Unnamed: 0_level_0,key1,data1,data1,data1,data1,data2
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,peak,max,min,sum
0,a,4.666667,8,9,1,18
1,b,4.5,5,7,2,10
