In [1]:
import pandas as pd
import numpy as np

## 分组计算

分组计算三部曲：拆分 -> 应用 -> 合并

* 拆分：根据什么进行分组？
* 应用：每个分组进行什么样的计算？
* 合并：把每个分组的计算结果合并起来。


![groupby](groupby.png)

In [8]:
# Build a small demo frame: two string key columns and two integer data columns.
# A dedicated, seeded generator makes the random values identical on every
# Restart & Run All, without touching NumPy's global random state.
rng = np.random.RandomState(42)
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': rng.randint(1, 10, 5),  # 5 integers drawn from [1, 10)
                   'data2': rng.randint(1, 10, 5)},
                  # Pin the column order so the display does not depend on how
                  # the installed pandas version orders dict keys.
                  columns=['data1', 'data2', 'key1', 'key2'])
df

Unnamed: 0,data1,data2,key1,key2
0,6,1,a,one
1,2,8,a,two
2,4,7,b,one
3,4,6,b,two
4,1,7,a,one


### 对 Series 进行分组

分组键（`df['key1']`）与被分组的数据（`df['data1']`）是两个独立的 Series，pandas 通过索引对齐把它们关联起来。

In [9]:
# Split the values of data1 into groups according to the matching entries of key1.
grouped = df['data1'].groupby(df['key1'])

In [10]:
# Average of the grouped values within each group.
grouped.mean()

key1
a    3
b    4
Name: data1, dtype: int32

In [13]:
# Passing a list of Series groups by several keys at once; the averaged data1
# values are then indexed by the (key1, key2) pair.
df['data1'].groupby([df['key1'], df['key2']]).mean()

key1  key2
a     one     3.5
      two     2.0
b     one     4.0
      two     4.0
Name: data1, dtype: float64

### 对 DataFrame 进行分组

In [25]:
# Mean of every numeric column within each key1 group.
# numeric_only=True skips the string column key2; without it, pandas 2.0+
# raises a TypeError instead of silently dropping non-numeric columns.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3,5.333333
b,4,6.5


In [26]:
# Average the remaining columns for each (key1, key2) pair, then keep only
# data1; the result is a Series indexed by the two grouping keys.
means = df.groupby(by=['key1', 'key2']).mean().loc[:, 'data1']
means

key1  key2
a     one     3.5
      two     2.0
b     one     4.0
      two     4.0
Name: data1, dtype: float64

In [27]:
# Pivot the inner index level (key2) into columns, giving a key1 x key2 table.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3.5,2
b,4.0,4


In [64]:
# Select data1 on the GroupBy object before aggregating, then average it per
# (key1, key2) pair.
df.groupby(['key1', 'key2'])['data1'].mean()

key1  key2
a     one     3.5
      two     2.0
b     one     4.0
      two     4.0
Name: data1, dtype: float64

### 每个分组的元素个数

In [29]:
# Number of rows that fall into each (key1, key2) group.
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

### 对分组进行迭代

In [35]:
# Iterating over a GroupBy yields (group key, sub-DataFrame) pairs,
# one pair per distinct key1 value.
for name, group in df.groupby('key1'):
    # print() is called as a function: the Python 2 print statement is a
    # SyntaxError on Python 3, while this form prints the same text on both.
    print(name)
    print(group)


a
   data1  data2 key1 key2
0      6      1    a  one
1      2      8    a  two
4      1      7    a  one
b
   data1  data2 key1 key2
2      4      7    b  one
3      4      6    b  two


In [37]:
# With several grouping keys, each group name is a tuple such as ('a', 'one').
for name, group in df.groupby(['key1', 'key2']):
    # print() is called as a function: the Python 2 print statement is a
    # SyntaxError on Python 3, while this form prints the same text on both.
    print(name)
    print(group)

('a', 'one')
   data1  data2 key1 key2
0      6      1    a  one
4      1      7    a  one
('a', 'two')
   data1  data2 key1 key2
1      2      8    a  two
('b', 'one')
   data1  data2 key1 key2
2      4      7    b  one
('b', 'two')
   data1  data2 key1 key2
3      4      6    b  two


### 转化为字典

In [40]:
# Materialise the groups as a plain dict: group key -> sub-DataFrame.
d = {key: frame for key, frame in df.groupby('key1')}
d

{'a':    data1  data2 key1 key2
 0      6      1    a  one
 1      2      8    a  two
 4      1      7    a  one, 'b':    data1  data2 key1 key2
 2      4      7    b  one
 3      4      6    b  two}

In [41]:
# Look up the sub-DataFrame stored under group key 'a'.
d['a']

Unnamed: 0,data1,data2,key1,key2
0,6,1,a,one
1,2,8,a,two
4,1,7,a,one


### 按列分组

In [55]:
# dtype of each column; this Series serves as the grouping key in the next cell.
df.dtypes

data1     int32
data2     int32
key1     object
key2     object
dtype: object

In [63]:
# Group the column labels by dtype instead of calling groupby(..., axis=1),
# which newer pandas releases deprecate. to_series() yields a Series indexed
# by the column labels, so it aligns with df.dtypes (indexed the same way).
grouped = df.columns.to_series().groupby(df.dtypes)
# One sub-DataFrame per dtype, holding exactly the columns of that dtype.
{dtype: df[list(labels)] for dtype, labels in grouped}

{dtype('int32'):    data1  data2
 0      6      1
 1      2      8
 2      4      7
 3      4      6
 4      1      7, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

### 通过字典进行分组

In [83]:
# 5 x 5 frame of integers drawn from [1, 10), with named rows and columns.
# A dedicated, seeded generator makes the values identical on every
# Restart & Run All, without touching NumPy's global random state.
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randint(1, 10, (5, 5)),
                  columns=['a', 'b', 'c', 'd', 'e'],
                  index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])
df

Unnamed: 0,a,b,c,d,e
Alice,8,5,3,7,3
Bob,2,3,3,6,9
Candy,3,9,7,9,1
Dark,1,8,3,6,4
Emily,2,3,1,1,3


In [84]:
# Blank out two cells: row position 1, column positions 1 and 2.
# NaN is a float, so cast the affected columns to float first; this avoids the
# implicit int -> float upcast on assignment that newer pandas warns about.
df = df.astype({col: float for col in df.columns[1:3]})
# .ix was removed in pandas 1.0 and the np.NaN alias in NumPy 2.0; .iloc makes
# the purely positional selection explicit.
df.iloc[1, 1:3] = np.nan
df

Unnamed: 0,a,b,c,d,e
Alice,8,5.0,3.0,7,3
Bob,2,,,6,9
Candy,3,9.0,7.0,9,1
Dark,1,8.0,3.0,6,4
Emily,2,3.0,1.0,1,3


In [87]:
# Map each column label to a colour; columns sharing a colour form one group.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'orange', 'e': 'blue'}
# axis=1 groups the columns (via their mapped colour) rather than the rows.
# NOTE(review): newer pandas releases appear to deprecate axis=1 in groupby;
# confirm against the pandas version this notebook targets.
grouped = df.groupby(mapping, axis=1)

In [88]:
# Per-row sum of the columns in each colour group.
grouped.sum()

Unnamed: 0,blue,orange,red
Alice,6,7,13
Bob,9,6,2
Candy,8,9,12
Dark,7,6,9
Emily,4,1,5


In [89]:
# Per-row count of non-NaN cells in each colour group (compare with size()).
grouped.count()

Unnamed: 0,blue,orange,red
Alice,2,1,2
Bob,1,1,1
Candy,2,1,2
Dark,2,1,2
Emily,2,1,2


In [90]:
# Number of columns assigned to each colour group.
grouped.size()

blue      2
orange    1
red       2
dtype: int64