## 聚合

### 单变量分组聚合


In [4]:
import pandas as pd

In [5]:
# Load the Gapminder dataset from a tab-separated file.
df = pd.read_csv('data/gapminder.tsv',sep='\t')



In [6]:
# groupby splits the rows into groups, one per distinct value of the key.
# Grouping on 'year' therefore yields one group for each year in the data;
# the mean of lifeExp is then computed inside every group.
df.groupby('year')['lifeExp'].mean()


year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [7]:
# Collect the distinct years that occur in the data.
years = df['year'].unique()


In [8]:
# The per-group mean produced by groupby can also be computed by hand:
# select only the rows for 1952, then average lifeExp over that subset.
y1952 = df.loc[df['year'] == 1952]
y1952['lifeExp'].mean()


49.057619718309866


### Pandas内置的聚合方法

In [9]:
# Instead of a single mean, describe() reports several summary statistics
# (count, mean, std, min, quartiles, max) for every group at once.
df.groupby('continent')['lifeExp'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Africa,624.0,48.86533,9.15021,23.599,42.3725,47.792,54.4115,76.442
Americas,300.0,64.658737,9.345088,37.579,58.41,67.048,71.6995,80.653
Asia,396.0,60.064903,11.864532,28.801,51.42625,61.7915,69.50525,82.603
Europe,360.0,71.903686,5.433178,43.585,69.57,72.241,75.4505,81.757
Oceania,24.0,74.326208,3.795611,69.12,71.205,73.665,77.5525,81.235


### 聚合函数

In [10]:
# A NumPy function such as np.mean can be passed to agg as the aggregator.
# NOTE(review): newer pandas releases may emit a FutureWarning for NumPy
# callables here and suggest the string 'mean' instead — confirm for the
# installed version.
import numpy as np
df.groupby('continent').lifeExp.agg(np.mean)




continent
Africa      48.865330
Americas    64.658737
Asia        60.064903
Europe      71.903686
Oceania     74.326208
Name: lifeExp, dtype: float64

In [11]:
# aggregate behaves the same as agg; the result matches the previous cell.
df.groupby('continent').lifeExp.aggregate(np.mean)


continent
Africa      48.865330
Americas    64.658737
Asia        60.064903
Europe      71.903686
Oceania     74.326208
Name: lifeExp, dtype: float64

In [12]:
# User-defined aggregation function.
# When the statistic you need is not provided by pandas or another library,
# write a plain function and pass it to agg/aggregate.
def my_mean(values):
    """Return the arithmetic mean of ``values``.

    Parameters
    ----------
    values : sized iterable of numbers
        The values of one group (a pandas Series when called through
        ``groupby(...).agg``), or any other sized iterable such as a list.

    Returns
    -------
    float
        Sum of the values divided by their count, or NaN when ``values``
        is empty (rather than raising ZeroDivisionError).
    """
    n = len(values)  # number of entries in the group
    if n == 0:
        # An empty group has no defined mean; report NaN instead of crashing.
        return float('nan')
    # Named ``total`` so that the built-in ``sum`` is not shadowed.
    total = 0
    for value in values:
        total += value
    return total / n


In [13]:
# Call the user-defined function through agg.
# my_mean has a single parameter, values, but it receives a whole group of
# values at once, which is why it iterates over values to reach each entry.
df.groupby('year').lifeExp.agg(my_mean)



year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [15]:
# A user-defined aggregator may take several parameters: the first receives
# the values of one group, the others are supplied by the caller via agg.
# This one measures how far a group's mean lies from a reference value.
def my_mean_diff(values,diff_value):
    """Return the mean of ``values`` minus ``diff_value``.

    Parameters
    ----------
    values : sized iterable of numbers
        The values of one group (a pandas Series when called through
        ``groupby(...).agg``), or any other sized iterable such as a list.
    diff_value : number
        Reference value subtracted from the group mean.

    Returns
    -------
    float
        ``mean(values) - diff_value``, or NaN when ``values`` is empty
        (rather than raising ZeroDivisionError).
    """
    n = len(values)
    if n == 0:
        # An empty group has no defined mean, so the difference is undefined too.
        return float('nan')
    # ``total`` / ``average`` avoid shadowing the built-in ``sum``.
    total = 0
    for value in values:
        total += value
    average = total / n
    return average - diff_value
# Mean life expectancy over the entire dataset (all years together).
global_mean = df.lifeExp.mean()
# Apply the user-defined function: the extra keyword argument given to agg
# (diff_value) is handed on to my_mean_diff.
df.groupby('year').lifeExp.agg(my_mean_diff,diff_value = global_mean)

year
1952   -10.416820
1957    -7.967038
1962    -5.865190
1967    -3.796150
1972    -1.827053
1977     0.095718
1982     2.058758
1987     3.738173
1992     4.685899
1997     5.540237
2002     6.220483
2007     7.532983
Name: lifeExp, dtype: float64

### 同时传入多个函数


In [16]:
# To compute several aggregations at once, put the functions in a Python list
# and pass the whole list to agg or aggregate.
# Per year: count of non-zero lifeExp values, mean, and standard deviation.
df.groupby('year').lifeExp.agg([np.count_nonzero,np.mean,np.std])


Unnamed: 0_level_0,count_nonzero,mean,std
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,142.0,49.05762,12.225956
1957,142.0,51.507401,12.231286
1962,142.0,53.609249,12.097245
1967,142.0,55.67829,11.718858
1972,142.0,57.647386,11.381953
1977,142.0,59.570157,11.227229
1982,142.0,61.533197,10.770618
1987,142.0,63.212613,10.556285
1992,142.0,64.160338,11.22738
1997,142.0,65.014676,11.559439


### 向agg/aggregate中传入字典


In [18]:
# After grouping, different columns can be aggregated in different ways by
# passing a dict that maps column name -> aggregation.
df.groupby('year').agg({'lifeExp':'mean','pop':'median','gdpPercap':'median'})


Unnamed: 0_level_0,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,49.05762,3943953.0,1968.528344
1957,51.507401,4282942.0,2173.220291
1962,53.609249,4686039.5,2335.439533
1967,55.67829,5170175.5,2678.334741
1972,57.647386,5877996.5,3339.129407
1977,59.570157,6404036.5,3798.609244
1982,61.533197,7007320.0,4216.228428
1987,63.212613,7774861.5,4280.300366
1992,64.160338,8688686.5,4386.085502
1997,65.014676,9735063.5,4781.825478


In [19]:
# With a dict passed to agg, the result columns keep the source column names
# (lifeExp, pop, gdpPercap). rename gives them descriptive labels, and
# reset_index turns the 'year' index back into an ordinary column.
(
    df.groupby('year')
    .agg({'lifeExp':'mean','pop':'median','gdpPercap':'median'})
    .rename(columns={'lifeExp':'平均寿命','pop':'人口','gdpPercap':'人均Gdp'})
    .reset_index()
)



Unnamed: 0,year,平均寿命,人口,人均Gdp
0,1952,49.05762,3943953.0,1968.528344
1,1957,51.507401,4282942.0,2173.220291
2,1962,53.609249,4686039.5,2335.439533
3,1967,55.67829,5170175.5,2678.334741
4,1972,57.647386,5877996.5,3339.129407
5,1977,59.570157,6404036.5,3798.609244
6,1982,61.533197,7007320.0,4216.228428
7,1987,63.212613,7774861.5,4280.300366
8,1992,64.160338,8688686.5,4386.085502
9,1997,65.014676,9735063.5,4781.825478


## 转换

### 使用transform分组计算z分数

In [20]:
# z-score: (x - mean) / standard deviation
def my_zscore(x):
    """Standardise ``x``: subtract its mean, then divide by its standard deviation."""
    centred = x - x.mean()
    return centred / x.std()



In [21]:
# Group by year and compute the z-score of lifeExp within each year.
df.groupby('year').lifeExp.transform(my_zscore)



0      -1.656854
1      -1.731249
2      -1.786543
3      -1.848157
4      -1.894173
          ...   
1699   -0.081621
1700   -0.336974
1701   -1.574962
1702   -2.093346
1703   -1.948180
Name: lifeExp, Length: 1704, dtype: float64

In [22]:
# Shape of the dataset: its row count equals the length of the transform
# result above, i.e. transform keeps one output value per input row.
df.shape

(1704, 6)

### transform分组填充缺失值

In [23]:
# Load the tips data and draw a 10-row sample (random_state fixes the draw).
tips_10 = pd.read_csv('data/tips.csv').sample(10,random_state = 42)


In [24]:
# Introduce missing values on purpose.
# np.random.permutation shuffles the index labels; the first 4 shuffled labels
# select the rows whose total_bill is blanked out.
# NOTE: no seed is set here, so a different set of rows is chosen on each run.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
tips_10.loc[np.random.permutation(tips_10.index)[:4],'total_bill'] = np.nan


In [25]:
# Inspect missingness: count() tallies the non-missing values of every column
# within each sex group.
# NOTE(review): the result is only assigned, so this cell displays nothing as written.
count_sex = tips_10.groupby('sex').count()


In [26]:
# Fill missing values with the mean of the data they belong to.
def fill_na_mean(x):
    """Return ``x`` with its missing entries replaced by the mean of ``x``."""
    return x.fillna(x.mean())


In [27]:
# Apply fill_na_mean to total_bill separately within each sex group.
total_bill_group_mean = tips_10.groupby('sex').total_bill.transform(fill_na_mean)


In [28]:
# Store the filled values in a new column.
# Comparing total_bill with fill_total_bill shows that Male and Female rows
# received different fill values.

tips_10['fill_total_bill'] = total_bill_group_mean


In [29]:
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,fill_total_bill
24,,3.18,Male,No,Sat,Dinner,2,20.486
6,8.77,2.0,Male,No,Sun,Dinner,2,8.77
153,24.55,2.0,Male,No,Sun,Dinner,4,24.55
211,25.89,5.16,Male,Yes,Sat,Dinner,4,25.89
198,13.0,2.0,Female,Yes,Thur,Lunch,2,13.0
176,,2.0,Male,Yes,Sun,Dinner,2,20.486
192,28.44,2.56,Male,Yes,Thur,Lunch,2,28.44
124,,2.52,Female,No,Thur,Lunch,2,13.0
9,14.78,3.23,Male,No,Sun,Dinner,2,14.78
101,,3.0,Female,Yes,Fri,Dinner,2,13.0


### transform练习


In [30]:
# weight_loss dataset: find the winner of a weight-loss contest.
# Step 1: load the data.
weight_loss = pd.read_csv('data/weight_loss.csv')


In [32]:
weight_loss

Unnamed: 0,Name,Month,Week,Weight
0,Bob,Jan,Week 1,291
1,Amy,Jan,Week 1,197
2,Bob,Jan,Week 2,288
3,Amy,Jan,Week 2,189
4,Bob,Jan,Week 3,283
5,Amy,Jan,Week 3,189
6,Bob,Jan,Week 4,283
7,Amy,Jan,Week 4,190
8,Bob,Feb,Week 1,283
9,Amy,Feb,Week 1,190


In [31]:
# Step 2: the data holds the weigh-in records of two people, Bob and Amy,
# from January to April.
# Look at January only; query filters rows much like a SQL WHERE clause.
weight_loss.query('Month == "Jan"')


Unnamed: 0,Name,Month,Week,Weight
0,Bob,Jan,Week 1,291
1,Amy,Jan,Week 1,197
2,Bob,Jan,Week 2,288
3,Amy,Jan,Week 2,189
4,Bob,Jan,Week 3,283
5,Amy,Jan,Week 3,189
6,Bob,Jan,Week 4,283
7,Amy,Jan,Week 4,190


In [33]:
# Step 3: define a function that computes the weight-loss fraction, then test it.
def find_perc_loss(s):
    """Return the fraction of weight lost relative to the first entry of ``s``.

    Parameters
    ----------
    s : pandas.Series of numbers
        Weights; the first entry (``s.iloc[0]``) is used as the baseline.

    Returns
    -------
    pandas.Series
        ``(baseline - weight) / baseline`` for every entry: 0 for the
        baseline itself, positive when weight was lost, negative when
        weight was gained.
    """
    start = s.iloc[0]
    # Signed on purpose: wrapping this in abs() would report a weight gain
    # as if it were a loss of the same size.
    return (start - s) / start


In [34]:
# Select Bob's January records.
bob_jan = weight_loss.query('Name=="Bob" and Month=="Jan"')



In [36]:
bob_jan

Unnamed: 0,Name,Month,Week,Weight
0,Bob,Jan,Week 1,291
2,Bob,Jan,Week 2,288
4,Bob,Jan,Week 3,283
6,Bob,Jan,Week 4,283


In [35]:
# Try the weight-loss function on Bob's January weights.
find_perc_loss(bob_jan['Weight'])


0    0.000000
2    0.010309
4    0.027491
6    0.027491
Name: Weight, dtype: float64

In [37]:
# Step 4: compute the weight-loss fraction for every row.
# Grouping by Name and Month means the first row of each (Name, Month) group
# is the baseline that find_perc_loss compares against.
pcnt_loss = weight_loss.groupby(['Name', 'Month'])['Weight'].transform(find_perc_loss)
pcnt_loss.head(8)


0    0.000000
1    0.000000
2    0.010309
3    0.040609
4    0.027491
5    0.040609
6    0.027491
7    0.035533
Name: Weight, dtype: float64

In [39]:
# Attach the result, rounded to 3 decimals, as a new column.
weight_loss['Perc Weight Loss'] = pcnt_loss.round(3)


In [40]:
# Keep only the last week of each month; these rows are used to compare results.
week4 = weight_loss.query('Week == "Week 4"')
week4


Unnamed: 0,Name,Month,Week,Weight,Perc Weight Loss
6,Bob,Jan,Week 4,283,0.027
7,Amy,Jan,Week 4,190,0.036
14,Bob,Feb,Week 4,268,0.053
15,Amy,Feb,Week 4,173,0.089
22,Bob,Mar,Week 4,261,0.026
23,Amy,Mar,Week 4,170,0.017
30,Bob,Apr,Week 4,250,0.042
31,Amy,Apr,Week 4,161,0.053


In [41]:
# Step 5: from the week-4 rows, pull out Bob's and Amy's results separately.

week4_Bob = week4.query('Name == "Bob"')[['Month','Perc Weight Loss']]
week4_Bob



Unnamed: 0,Month,Perc Weight Loss
6,Jan,0.027
14,Feb,0.053
22,Mar,0.026
30,Apr,0.042


In [42]:
# The same week-4 selection, this time for Amy.
week4_Amy = week4.query('Name == "Amy"')[['Month','Perc Weight Loss']]
week4_Amy


Unnamed: 0,Month,Perc Weight Loss
7,Jan,0.036
15,Feb,0.089
23,Mar,0.017
31,Apr,0.053


In [43]:
# Step 6: compare Bob and Amy month by month (Bob minus Amy).
# A negative value means Amy lost the larger fraction that month; she is
# ahead in three of the four months.
week4_Bob.set_index('Month')-week4_Amy.set_index('Month')


Unnamed: 0_level_0,Perc Weight Loss
Month,Unnamed: 1_level_1
Jan,-0.009
Feb,-0.036
Mar,0.009
Apr,-0.011


## 过滤

### 使用方法


In [44]:
# groupby can also be used to filter data.
# filter takes a function that returns a boolean for each group; the rows of
# groups for which it returns False are dropped.
# Step 1: load the tips data used earlier.
tips = pd.read_csv('data/tips.csv')



In [45]:
# Step 2: count how many records there are for each party size.
tips['size'].value_counts()


2    156
3     38
4     37
5      5
6      4
1      4
Name: size, dtype: int64

In [46]:
# Step 3: party sizes 1, 5 and 6 have very few records, so drop them by
# keeping only the groups whose 'size' column has more than 30 values.
tips_filtered = tips.groupby('size').filter(lambda x: x['size'].count()>30)



In [47]:
tips_filtered

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [48]:
# Step 4: check the result; only the party sizes that passed the filter remain.
tips_filtered['size'].value_counts()


2    156
3     38
4     37
Name: size, dtype: int64

## DataFrameGroupBy对象

### 分组

In [49]:
# Step 1: prepare the data, a fresh 10-row sample of the tips file.
tips_10 = pd.read_csv('data/tips.csv').sample(10,random_state = 42)


In [51]:
# Step 2: call groupby to create the grouping object.
grouped = tips_10.groupby('sex')


In [52]:
# Step 3: inspect grouped; it is a DataFrameGroupBy object.
grouped



<pandas.core.groupby.generic.DataFrameGroupBy object at 0x122537b70>

In [54]:
# Step 4: the groups attribute shows the computed groups.
grouped.groups
# The values are index labels of the DataFrame, i.e. the row labels of the
# original data that belong to each group (not a count of rows).



{'Female': Int64Index([198, 124, 101], dtype='int64'),
 'Male': Int64Index([24, 6, 153, 211, 176, 192, 9], dtype='int64')}

In [56]:
# Step 5: aggregate/transform computations can be run directly on the
# DataFrameGroupBy object.
# numeric_only=True restricts the mean to numeric columns explicitly; without
# it pandas 2.x raises a TypeError on the string columns (smoker, day, time).
grouped.mean(numeric_only=True)
# Result: the per-sex mean of every numeric column; non-numeric columns are left out.

Unnamed: 0_level_0,total_bill,tip,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,13.62,2.506667,2.0
Male,20.02,2.875714,2.571429


In [57]:
 
# Step 6: pick out a single group with get_group.
female = grouped.get_group('Female')
female


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
198,13.0,2.0,Female,Yes,Thur,Lunch,2
124,12.48,2.52,Female,No,Thur,Lunch,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


### 遍历分组

In [58]:
# The groups of a groupby object can be iterated over.
# Sometimes a plain for loop is simpler than using aggregate, transform or
# filter after groupby.
for sex_group in grouped:
    print(sex_group)




('Female',      total_bill   tip     sex smoker   day    time  size
198       13.00  2.00  Female    Yes  Thur   Lunch     2
124       12.48  2.52  Female     No  Thur   Lunch     2
101       15.38  3.00  Female    Yes   Fri  Dinner     2)
('Male',      total_bill   tip   sex smoker   day    time  size
24        19.82  3.18  Male     No   Sat  Dinner     2
6          8.77  2.00  Male     No   Sun  Dinner     2
153       24.55  2.00  Male     No   Sun  Dinner     4
211       25.89  5.16  Male    Yes   Sat  Dinner     4
176       17.89  2.00  Male    Yes   Sun  Dinner     2
192       28.44  2.56  Male    Yes  Thur   Lunch     2
9         14.78  3.23  Male     No   Sun  Dinner     2)


In [59]:
# A DataFrameGroupBy object cannot be indexed by position: grouped[0] is
# treated as a column lookup and raises KeyError.
# The error is caught and printed so that the notebook still runs from top
# to bottom while the message stays visible.
try:
    grouped[0]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'Column not found: 0'

In [60]:
# Inspect the first item produced by iterating over grouped, then stop.
for sex_group in grouped:
    print(type(sex_group)) # type of the item yielded by iterating over grouped
    print(len(sex_group)) # number of elements in the item
    print(sex_group[0]) # first element
    print(type(sex_group[0])) # type of the first element
    print(sex_group[1]) # second element
    print(type(sex_group[1])) # type of the second element
    break


<class 'tuple'>
2
Female
<class 'str'>
     total_bill   tip     sex smoker   day    time  size
198       13.00  2.00  Female    Yes  Thur   Lunch     2
124       12.48  2.52  Female     No  Thur   Lunch     2
101       15.38  3.00  Female    Yes   Fri  Dinner     2
<class 'pandas.core.frame.DataFrame'>


### 多个分组

In [62]:
# So far groupby received a single key; several keys can be passed as a list.
# Step 1: mean of the numeric tip data, grouped by sex and meal time.
# numeric_only=True keeps the string columns (smoker, day) out of the mean,
# which pandas 2.x would otherwise reject with a TypeError.
group_avg = tips_10.groupby(['sex','time']).mean(numeric_only=True)


In [63]:
# Step 2: look at the column names and then the row index of the grouped result.
group_avg.columns


Index(['total_bill', 'tip', 'size'], dtype='object')

In [64]:
group_avg.index


MultiIndex([('Female', 'Dinner'),
            ('Female',  'Lunch'),
            (  'Male', 'Dinner'),
            (  'Male',  'Lunch')],
           names=['sex', 'time'])

In [65]:
# Grouping on several keys produces a MultiIndex on the result.
# Step 3: calling reset_index on the result gives an ordinary DataFrame.
group_avg.reset_index()


Unnamed: 0,sex,time,total_bill,tip,size
0,Female,Dinner,15.38,3.0,2.0
1,Female,Lunch,12.74,2.26,2.0
2,Male,Dinner,18.616667,2.928333,2.666667
3,Male,Lunch,28.44,2.56,2.0


In [66]:

# Step 4: alternatively pass as_index=False (the default is True) when
# grouping; the result matches calling reset_index() afterwards.
# numeric_only=True avoids the TypeError pandas 2.x raises for string columns.
tips_10.groupby(['sex','time'],as_index = False).mean(numeric_only=True)


Unnamed: 0,sex,time,total_bill,tip,size
0,Female,Dinner,15.38,3.0,2.0
1,Female,Lunch,12.74,2.26,2.0
2,Male,Dinner,18.616667,2.928333,2.666667
3,Male,Lunch,28.44,2.56,2.0
