# 数据的聚合和分组运算


In [13]:
import pandas as pd
import numpy as np

In [14]:
# Toy dataset for the group-by examples: seven animals, each described by an
# adult flag, a species, a size class (S/M/L) and a weight.
data = pd.DataFrame(dict(
    adult=[False] * 5 + [True] * 2,
    animal='cat dog cat fish dog cat cat'.split(),
    size=list('SSMMMLL'),
    weight=[8, 10, 11, 1, 20, 12, 12],
))

In [15]:
# Display the whole demo frame (it has only seven rows).
data

Unnamed: 0,adult,animal,size,weight
0,False,cat,S,8
1,False,dog,S,10
2,False,cat,M,11
3,False,fish,M,1
4,False,dog,M,20
5,True,cat,L,12
6,True,cat,L,12


In [19]:
# Average weight per animal species, computed with a group-by.
# The grouping key is passed as a Series here (the 'animal' column itself).
data['weight'].groupby(data['animal']).mean()

animal
cat     10.75
dog     15.00
fish     1.00
Name: weight, dtype: float64

In [20]:
# Same aggregation, this time naming the key column by label.
# The result is a Series indexed by animal.
data.groupby(by='animal')['weight'].agg('mean')

animal
cat     10.75
dog     15.00
fish     1.00
Name: weight, dtype: float64

In [21]:
# data.groupby('animal') is a DataFrameGroupBy object; calling mean() on it
# aggregates every numeric column at once and returns a DataFrame indexed by animal.
# numeric_only=True skips the string column 'size': pandas >= 2.0 no longer drops
# non-numeric columns silently and raises TypeError without it. The boolean
# 'adult' column is still averaged (fraction of adults per group).
data.groupby('animal').mean(numeric_only=True)

Unnamed: 0_level_0,adult,weight
animal,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,0.5,10.75
dog,0.0,15.0
fish,0.0,1.0


In [22]:
# Group on two keys at once: mean weight for every (animal, adult) combination.
data1 = (
    data
    .groupby(by=['animal', 'adult'])['weight']
    .mean()
)

In [23]:
# Inspect the two-key result: a Series whose index has the levels (animal, adult).
data1

animal  adult
cat     False     9.5
        True     12.0
dog     False    15.0
fish    False     1.0
Name: weight, dtype: float64

In [24]:
# Pivot the inner index level ('adult') into columns; (animal, adult)
# combinations that do not occur in the data show up as NaN.
data1.unstack(level='adult')

adult,False,True
animal,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.5,12.0
dog,15.0,
fish,1.0,


In [25]:
# Number of rows that fall into each animal group.
data.groupby(by='animal').size()

animal
cat     4
dog     2
fish    1
dtype: int64

In [26]:
# Same per-animal mean weight, but returned as a one-column DataFrame
# instead of a Series.
data.groupby('animal')['weight'].mean().to_frame()

Unnamed: 0_level_0,weight
animal,Unnamed: 1_level_1
cat,10.75
dog,15.0
fish,1.0


In [27]:
# Build a copy of the frame that uses the 'animal' column as its row index.
df1 = data.set_index(keys='animal', drop=True)

In [28]:
# Show the re-indexed frame: 'animal' is now the row index rather than a column.
df1

Unnamed: 0_level_0,adult,size,weight
animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,False,S,8
dog,False,S,10
cat,False,M,11
fish,False,M,1
dog,False,M,20
cat,True,L,12
cat,True,L,12


In [31]:
# Re-display the original frame: set_index returned a new object, so `data`
# itself still has its default integer index and the 'animal' column.
data


Unnamed: 0,adult,animal,size,weight
0,False,cat,S,8
1,False,dog,S,10
2,False,cat,M,11
3,False,fish,M,1
4,False,dog,M,20
5,True,cat,L,12
6,True,cat,L,12


In [30]:
# apply() with a custom function: centre each weight on the mean of its animal group.
# group_keys=False keeps the original row index (0..6) on the result; since
# pandas 2.0 the default group_keys=True prepends the group label as an extra
# index level even for like-indexed results such as this one.
data.groupby('animal', group_keys=False)['weight'].apply(lambda w: w - w.mean())

0   -2.75
1   -5.00
2    0.25
3    0.00
4    5.00
5    1.25
6    1.25
Name: weight, dtype: float64

In [35]:
# Size class of the heaviest animal within each species.
# Selecting ['size', 'weight'] first keeps the grouping column out of the frame
# handed to the function (applying over grouping columns is deprecated since
# pandas 2.2), and .loc[row, 'size'] replaces the chained x['size'][...] lookup
# with a single label-based access. On ties idxmax picks the first row.
data.groupby('animal')[['size', 'weight']].apply(
    lambda g: g.loc[g['weight'].idxmax(), 'size']
)

animal
cat     L
dog     M
fish    M
dtype: object

In [36]:
# Pull out the rows that belong to a single group, here 'cat'.
data.groupby(by='animal').get_group(name='cat')

Unnamed: 0,adult,animal,size,weight
0,False,cat,S,8
2,False,cat,M,11
5,True,cat,L,12
6,True,cat,L,12
