In [2]:
import numpy as np
import pandas as pd

# 一、分组模式及其对象
### 1. 分组的一般模式

广泛使用 ： 
> --- 1. 性别分组：统计全国人口寿命的平均值
>
> --- 2. 季节分组：对每一个季节的温度进行组内标准化
>
> --- 3. 班级分组：筛选出组内数学分数的平均值超过80的班级

三要素：
> --- 1. 分组依据 
>
> --- 2. 数据来源
>
> --- 3. 操作机器返回结果

` df.groupby('Gender')['Longevity'].mean()`

` df.groupby(分组依据)[数据来源].使用操作`



In [5]:
df = pd.read_csv('data/learn_pandas.csv')
df.groupby('Gender')['Height'].median()

Gender
Female    159.6
Male      173.4
Name: Height, dtype: float64

### 2. 分组依据的本质
groupby 中传入相应列名构成的列表，实现多维度分组

In [6]:
df.groupby(['School', 'Gender'])['Height'].mean()

School                         Gender
Fudan University               Female    158.776923
                               Male      174.212500
Peking University              Female    158.666667
                               Male      172.030000
Shanghai Jiao Tong University  Female    159.122500
                               Male      176.760000
Tsinghua University            Female    159.753333
                               Male      171.638889
Name: Height, dtype: float64

In [9]:
# 通过复杂逻辑分组--根据学生体重是否超过总体均值--》 计算身高均值
condition = df.Weight > df.Weight.mean()
df.groupby(condition)['Height'].mean()

Weight
False    159.034646
True     172.705357
Name: Height, dtype: float64

### 练一练
请根据上下四分位数分割，将体重分成high, normal, low 三组，统计身高均值

In [16]:
condition1 = df.Weight > df.Weight.quantile(0.75)
condition2 = df.Weight < df.Weight.quantile(0.25)
condition3 = (df.Weight <= df.Weight.quantile(0.75)) & (df.Weight >= df.Weight.quantile(0.25))
condition = df.Weight.mask(condition1, 'high').mask(condition2, 'low').mask(condition3, 'normal')
df.groupby(condition)['Height'].mean()


Weight
high      174.935714
low       153.753659
normal    161.883516
Name: Height, dtype: float64

In [18]:
# 最后的结果按照条件列表中元素值来分组
# 验证：
item = np.random.choice(list('abc'), df.shape[0])
df.groupby(item)['Height'].mean()

a    163.658730
b    164.078571
c    162.031250
Name: Height, dtype: float64

In [19]:
# 多个序列：分组依据 -- 》 这两个序列对应行的唯一组合
df.groupby([condition, item])['Height'].mean()

Weight   
high    a    174.889474
        b    178.075000
        c    171.590909
low     a    153.543750
        b    154.390909
        c    153.492857
normal  a    161.230769
        b    162.218182
        c    162.068750
Name: Height, dtype: float64

In [22]:
# 之前传入列名 等价传入一个或多个列，
# 分组依据来自数据来源组合的unique值
# 通过 drop_duplicates 知道具体的组类别
df[['School', 'Gender']].drop_duplicates()

Unnamed: 0,School,Gender
0,Shanghai Jiao Tong University,Female
1,Peking University,Male
2,Shanghai Jiao Tong University,Male
3,Fudan University,Female
4,Fudan University,Male
5,Tsinghua University,Female
9,Peking University,Female
16,Tsinghua University,Male


In [23]:
df.groupby([df['School'], df['Gender']])['Height'].mean()

School                         Gender
Fudan University               Female    158.776923
                               Male      174.212500
Peking University              Female    158.666667
                               Male      172.030000
Shanghai Jiao Tong University  Female    159.122500
                               Male      176.760000
Tsinghua University            Female    159.753333
                               Male      171.638889
Name: Height, dtype: float64

### 3. Groupby 对象
方便的属性

In [28]:
gb = df.groupby(['School', 'Grade'])

In [29]:
gb.ngroups

16

In [26]:
res = gb.groups
res.keys()

dict_keys([('Fudan University', 'Freshman'), ('Fudan University', 'Junior'), ('Fudan University', 'Senior'), ('Fudan University', 'Sophomore'), ('Peking University', 'Freshman'), ('Peking University', 'Junior'), ('Peking University', 'Senior'), ('Peking University', 'Sophomore'), ('Shanghai Jiao Tong University', 'Freshman'), ('Shanghai Jiao Tong University', 'Junior'), ('Shanghai Jiao Tong University', 'Senior'), ('Shanghai Jiao Tong University', 'Sophomore'), ('Tsinghua University', 'Freshman'), ('Tsinghua University', 'Junior'), ('Tsinghua University', 'Senior'), ('Tsinghua University', 'Sophomore')])

### 练一练
上述介绍 drop_duplicates得到具体的组类别，现用groups属性完成类似功能


In [30]:
len(gb.groups.keys())

16

In [31]:
# size:
# DataFrame属性时，返回表长乘以表宽
# groupby属性时， 表示统计每个组的元素个数
gb.size()

School                         Grade    
Fudan University               Freshman      9
                               Junior       12
                               Senior       11
                               Sophomore     8
Peking University              Freshman     13
                               Junior        8
                               Senior        8
                               Sophomore     5
Shanghai Jiao Tong University  Freshman     13
                               Junior       17
                               Senior       22
                               Sophomore     5
Tsinghua University            Freshman     17
                               Junior       22
                               Senior       14
                               Sophomore    16
dtype: int64

get_group 直接获取所在组对应的行，知道组名


In [33]:
gb.get_group(('Fudan University', 'Freshman')).iloc[:3, :3]  # 展示一部分


Unnamed: 0,School,Grade,Name
15,Fudan University,Freshman,Changqiang Yang
28,Fudan University,Freshman,Gaoqiang Qin
63,Fudan University,Freshman,Gaofeng Zhao


两个属性，两个方法，
> mean
>
> median
>
>
>

### 4. 分组的三大操作


ex1：返回标量值（可能组容量 size）

ex2：做了原序列标准化处理，返回 Series 类型

ex3：返回组所在行的本身，返回DataFrame类型

引申出组的三大操作：聚合，变换，过滤， 分别对应
agg，transform， filter函数及操作

# 二、聚合函数
### 1. 内置聚合函数


groupby聚合函数:

返回标量原则：max、min、mean、median、count、all、any、any、idxmax、idxmin、mad、nunique、skew、quantity、sum、std、var、sem、size、prod

In [35]:
gb = df.groupby('Gender')['Height']
 
print(gb.idxmin())

gb.quantile(0.95)


Gender
Female    143
Male      199
Name: Height, dtype: int64


Gender
Female    166.8
Male      185.9
Name: Height, dtype: float64

请查阅文档，明确all, any, mad, skew, sem, prod

### all

In [38]:
gb.all()  # 有一个 False 就为 False

Gender
Female    True
Male      True
Name: Height, dtype: bool

In [40]:
gb.any()  # 只要有一个 True 就为 True

Gender
Female    True
Male      True
Name: Height, dtype: bool

In [43]:
gb.mad()  # mad 平均绝对离差，用于统计学中对分组的每对数据做离散程度分析

Gender
Female    4.088108
Male      5.394617
Name: Height, dtype: float64

$$ M_{i}=\frac{1}{n} \sum_{k=1}^{n}\left|x_{k}-\bar{x}\right|$$ 

In [44]:
gb.skew()  # skew偏度，分组后每组数据分布的偏态程度

Gender
Female   -0.219253
Male      0.437535
Name: Height, dtype: float64

$$S K_{i}=\frac{n \sqrt{n-1}}{n-2} \frac{\sum_{k=1}^{n}\left(x_{k}-\bar{x}\right)^{3}}{\left(\sum_{k=1}^{n}\left(x_{k}-\bar{x}\right)^{2}\right)^{\frac{3}{2}}}$$

In [46]:
# 分组后的gb对象没有kurt方法，计算峰度，apply取出Series方可调用
try:
    gb.kurt()
except Exception as e:
    print(e)


Cannot access callable attribute 'kurt' of 'SeriesGroupBy' objects, try using the 'apply' method


In [49]:
gb.apply(lambda x: x.kurt())  # 用来反映分组后每组数据分布的平尖程度

Gender
Female   -0.324085
Male      0.920630
Name: Height, dtype: float64

$$K_{i}=\frac{n(n+1)(n-1)}{(n-2)(n-3)} \frac{\sum_{k=1}^{n}\left(x_{k}-\bar{x}\right)^{4}}{\left(\sum_{k=1}^{n}\left(x_{k}-\bar{x}\right)^{2}\right)^{2}}-\frac{3(n-1)^{2}}{(n-2)(n-3)}$$

In [52]:
gb.sem()  # 均值标准误差，多个均值样本的标准差，无偏估计标准差，体现均值抽样分布的离散程度，反映样本均值间差异

Gender
Female    0.439893
Male      0.986985
Name: Height, dtype: float64

$$ S E M_{i}=\frac{s}{\sqrt{N}} $$

In [54]:
gb.prod()  # 连乘

Gender
Female    4.232080e+290
Male      1.594210e+114
Name: Height, dtype: float64

当聚合函数传入的数据来源包含多个列时， 按列进行迭代计算

In [56]:
gb = df.groupby('Gender')[['Height', 'Weight']]
gb.max()

Unnamed: 0_level_0,Height,Weight
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,170.2,63.0
Male,193.9,89.0


### 2. agg 方法

groupby内置和函数不便之处
> -- 无法同时使用多个函数
> 
> -- 无法对特定的列使用特定的聚合函数
>
> -- 无法使用自定义的聚合函数
>
> -- 无法直接对结果的列名在聚合前进行自定义命名
>

In [97]:
# a. agg解决使用多个函数
# 使用多个聚类函数，用列表形式，
# 内置聚合函数的对应的字符串传入
# 先前提到的字符串都合法
gb.agg(['sum', 'idxmax', 'skew'])
# 第一层为数据源
# 第二层为使用的聚合方法
# 逐一对列使用聚合

Unnamed: 0_level_0,Height,Height,Height,Weight,Weight,Weight
Unnamed: 0_level_1,sum,idxmax,skew,sum,idxmax,skew
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Female,21014.0,28,-0.219253,6469.0,28,-0.268482
Male,8854.9,193,0.437535,3929.0,2,-0.332393


In [60]:
# b. 对特定的列使用特定的聚合函数
# 方法和列的特殊对应，通过构造字典传入agg中实现
# 字典以列名为键，以以聚合字符串为值
gb.agg({'Height':['mean', 'max'], 'Weight':'count'})

Unnamed: 0_level_0,Height,Height,Weight
Unnamed: 0_level_1,mean,max,count
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Female,159.19697,170.2,135
Male,173.62549,193.9,54


### 练一练

使用b中的传入字典的方法完成a中等价的聚合任务

In [62]:
gb.agg({'Height':['sum', 'idxmax', 'skew'], 'Weight':['sum', 'idxmax', 'skew']})

Unnamed: 0_level_0,Height,Height,Height,Weight,Weight,Weight
Unnamed: 0_level_1,sum,idxmax,skew,sum,idxmax,skew
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Female,21014.0,28,-0.219253,6469.0,28,-0.268482
Male,8854.9,193,0.437535,3929.0,2,-0.332393


In [63]:
# c. 使用自定义函数
# agg 自定义函数
# attention： 传入函数的参数是之前数据源中的列，逐列计算
# 分组计算身高和体重的极差
gb.agg(lambda x: x.mean() - x.min())

Unnamed: 0_level_0,Height,Weight
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,13.79697,13.918519
Male,17.92549,21.759259


### 练一练
在groupby对象中可以使用 describe 方法，进行统计信息汇总

请使用多个聚合函数，完成与该方法相同的功能

In [64]:
gb.describe()

Unnamed: 0_level_0,Height,Height,Height,Height,Height,Height,Height,Height,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Female,132.0,159.19697,5.053982,145.4,155.675,159.6,162.825,170.2,135.0,47.918519,5.405983,34.0,44.0,48.0,52.0,63.0
Male,51.0,173.62549,7.048485,155.7,168.9,173.4,177.15,193.9,54.0,72.759259,7.772557,51.0,69.0,73.0,78.75,89.0


In [74]:
gb.agg([('count', 'count'), ('mean', 'mean'), ('std', 'std'), ('min', 'min'), ('25%', lambda x: x.quantile(0.25)), ('50%', lambda x: x.quantile(0.5)), ('75%', lambda x: x.quantile(0.75))])


Unnamed: 0_level_0,Height,Height,Height,Height,Height,Height,Height,Weight,Weight,Weight,Weight,Weight,Weight,Weight
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,count,mean,std,min,25%,50%,75%
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
Female,132,159.19697,5.053982,145.4,155.675,159.6,162.825,135,47.918519,5.405983,34.0,44.0,48.0,52.0
Male,51,173.62549,7.048485,155.7,168.9,173.4,177.15,54,72.759259,7.772557,51.0,69.0,73.0,78.75


In [66]:
def my_func(s):
    res = 'High'
    if s.mean() <= df[s.name].mean():
        res = 'Low'
    return res
gb.agg(my_func)

Unnamed: 0_level_0,Height,Weight
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,Low,Low
Male,High,High


聚合结果重命名

In [80]:
gb.agg([('range', lambda x: x.max() - x.min()), ('my_sum', 'sum')])


Unnamed: 0_level_0,Height,Height,Weight,Weight
Unnamed: 0_level_1,range,my_sum,range,my_sum
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,24.8,21014.0,29.0,6469.0
Male,38.2,8854.9,38.0,3929.0


In [81]:
gb.agg({'Height': [('my_func', my_func), 'sum'],
        'Weight': lambda x:x.max()})
# 单个聚合时，重命名需要加方括号
# 否则不知道是新名字还是手误输错的内置函数字符串


Unnamed: 0_level_0,Height,Height,Weight
Unnamed: 0_level_1,my_func,sum,<lambda>
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Female,159.19697,21014.0,63.0
Male,173.62549,8854.9,89.0


In [82]:
gb.agg([('my_sum', 'sum')])

Unnamed: 0_level_0,Height,Weight
Unnamed: 0_level_1,my_sum,my_sum
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2
Female,21014.0,6469.0
Male,8854.9,3929.0


# 三、变换和过滤
### 1. 变换函数与transform方法
返回值为同长度序列，内置变换函数为累计函数

cumcount/cumsum/cumprod/cummax/cummin : 组内累计

groupby对象上还定义了，填充类和划窗类

In [83]:
gb.cummax().head()

Unnamed: 0,Height,Weight
0,158.9,46.0
1,166.5,70.0
2,188.9,89.0
3,,46.0
4,188.9,89.0


### 练一练
groupby对象中，rank方法也是一个使用的变换函数，查阅功能，给出使用的例子

[参考](https://blog.csdn.net/qq_34903176/article/details/111595612)

In [84]:
df6 = pd.DataFrame({'a':[0]*4 + [1]*2,
                    'b':[2, 3, 3, 4, -1, -2],
                    'c':[True, False, False, True, True, False],
                    'd':[3, 2, np.nan, 1, -3, np.nan]})
df6


Unnamed: 0,a,b,c,d
0,0,2,True,3.0
1,0,3,False,2.0
2,0,3,False,
3,0,4,True,1.0
4,1,-1,True,-3.0
5,1,-2,False,


In [87]:
# （分组后排名，method 默认为 average）数值越小排名值越小
# bool值视为0 、 1计算， NaN默认不参与排名
df6.groupby('a').rank()


Unnamed: 0,b,c,d
0,1.0,3.5,3.0
1,2.5,1.5,2.0
2,2.5,1.5,
3,4.0,3.5,1.0
4,2.0,2.0,1.0
5,1.0,1.0,


In [86]:
df6.groupby('b').rank()

Unnamed: 0,a,c,d
0,1.0,1.0,1.0
1,1.5,1.5,1.0
2,1.5,1.5,
3,1.0,1.0,1.0
4,1.0,1.0,1.0
5,1.0,1.0,


In [90]:
# method 为 max，相同排名，用名次值max代替
df6.groupby('a').rank(method = 'max')

Unnamed: 0,b,c,d
0,1.0,4.0,3.0
1,3.0,2.0,2.0
2,3.0,2.0,
3,4.0,4.0,1.0
4,2.0,2.0,1.0
5,1.0,1.0,


In [92]:
df6.groupby('a').rank(method = 'dense')
# 1 2 2 3 3 4 

Unnamed: 0,b,c,d
0,1.0,2.0,3.0
1,2.0,1.0,2.0
2,2.0,1.0,
3,3.0,2.0,1.0
4,2.0,2.0,1.0
5,1.0,1.0,


In [93]:
df6.groupby('a').rank(method = 'first')
# 1 2 3 3 5 6 7

Unnamed: 0,b,c,d
0,1.0,3.0,3.0
1,2.0,1.0,2.0
2,3.0,2.0,
3,4.0,4.0,1.0
4,2.0,2.0,1.0
5,1.0,1.0,


> ascending 控制排名升序或降序
>
> na_option 控制 NaN 的处理方式
>
> --- 默认 keep 不处理
>
> --- top 表现优先排NaN
>
> --- bottom 表示最后排 NaN
>
> pct 表示将排名后的名次转化为前百分比形式


In [94]:
df6.groupby('a').rank(ascending = False, na_option = 'top', pct = True)


Unnamed: 0,b,c,d
0,1.0,0.375,0.5
1,0.625,0.875,0.75
2,0.625,0.875,0.25
3,0.25,0.375,1.0
4,0.5,0.5,1.0
5,1.0,1.0,0.5


自定义变换时需要使用 transform , 被调用的自定义函数
其传入值为数据源的序列，与agg的传入类型一致
其最后的返回结果是行列索引与数据源一致的DataFrame


In [96]:
# 现对身高和体重进行分组标准化，减去组均值后除以组的标准差
gb.transform(lambda x: (x - x.mean()) / x.std()).head()

Unnamed: 0,Height,Weight
0,-0.05876,-0.354888
1,-1.010925,-0.355
2,2.167063,2.089498
3,,-1.279789
4,0.053133,0.159631


### 练一练
对于 transform 方法无法像 agg 一样，通过传入字典来对指定的列使用特定的变换
如果需要在一次transform中实现，给出解决方案


In [102]:
# 1. 字典处理
gb.transform(lambda x :{'Height': eval('x.cumsum()'),
              'Weight': eval('x.cumprod()')}[x.name]).head()

Unnamed: 0,Height,Weight
0,158.9,46.0
1,166.5,70.0
2,355.4,6230.0
3,,1886.0
4,529.4,461020.0


In [103]:
# 分支 if-else
gb.transform(lambda x:x.cummin() if x.name == 'Height' else x.rank()).head()

Unnamed: 0,Height,Weight
0,158.9,47.5
1,166.5,19.0
2,166.5,54.0
3,,14.5
4,166.5,31.5


tranform 返回同长度的序列，事实上还能返回一个标量，是的结果被广播到其所在的整个组，标量广播很常见


In [105]:
# 构造两列新特征来分别表示样本所在性别组的身高均值，和体重均值
gb.transform('mean').head()  # 传入返回标量的函数也是可以的

Unnamed: 0,Height,Weight
0,159.19697,47.918519
1,173.62549,72.759259
2,173.62549,72.759259
3,159.19697,47.918519
4,173.62549,72.759259


### 2. 组索引与过滤

索引和过滤有什么区别呢
> 过滤在分组中是对于组的过滤；索引对于行的过滤
>
> 组过滤是行过滤的推广：对一个组的全体所在行进行统计的结果返回True则会被保留，False被过滤，返回保留的行拼接的DataFrame
>
> filter : 组的筛选
>
> 自定义函数输入参数为数据源构成的DataFrame
>
> 保证输出为布尔值

In [106]:
# 在原表中通过过滤得到所有容量 > 100 的组
gb.filter(lambda x: x.shape[0] > 100).head()


Unnamed: 0,Height,Weight
0,158.9,46.0
3,,41.0
5,158.0,51.0
6,162.5,52.0
7,161.9,50.0


### 练一练
概念上说，索引功能是组过滤功能的子集，使用filter函数完成loc\[.\]功能，.是元素列表

In [126]:
# gb.filter(lambda x: x.index in [0, 1, 2, 3, 4, 5])

In [127]:
# def func(x):
    
#     return x.index.isin([1,2,3])
# gb.filter(func)

In [136]:
df.index.isin([1, 2, 3, 4])[:5]
df.groupby(df.index.isin([1,2,3,4])).head(3)  # 对应⑥列

Unnamed: 0,School,Grade,Name,Gender,Height,Weight,Transfer,Test_Number,Test_Date,Time_Record
0,Shanghai Jiao Tong University,Freshman,Gaopeng Yang,Female,158.9,46.0,N,1,2019/10/5,0:04:34
1,Peking University,Freshman,Changqiang You,Male,166.5,70.0,N,1,2019/9/4,0:04:20
2,Shanghai Jiao Tong University,Senior,Mei Sun,Male,188.9,89.0,N,2,2019/9/12,0:05:22
3,Fudan University,Sophomore,Xiaojuan Sun,Female,,41.0,N,2,2020/1/3,0:04:08
5,Tsinghua University,Freshman,Xiaoli Qian,Female,158.0,51.0,N,1,2019/10/31,0:03:47
6,Shanghai Jiao Tong University,Freshman,Qiang Chu,Female,162.5,52.0,N,1,2019/12/12,0:03:53


In [138]:
df.groupby(df.index.isin([11,3,139,172,54])).filter(lambda x: x.name)

Unnamed: 0,School,Grade,Name,Gender,Height,Weight,Transfer,Test_Number,Test_Date,Time_Record
3,Fudan University,Sophomore,Xiaojuan Sun,Female,,41.0,N,2,2020/1/3,0:04:08
11,Tsinghua University,Junior,Xiaoquan Lv,Female,153.2,43.0,N,2,2019/9/16,0:04:49
54,Peking University,Freshman,Xiaojuan Chu,Male,162.4,58.0,Y,3,2019/11/29,0:03:42
139,Tsinghua University,Sophomore,Qiang Zhou,Female,150.5,36.0,N,1,2019/11/4,0:04:27
172,Shanghai Jiao Tong University,Junior,Quan Zhao,Female,160.6,53.0,N,2,2019/10/4,0:03:45


# 跨列分组
### 1. apply的引用

 如：BMI无法处理

agg是逐列处理，不能多列数据同时处理，用apply来解决

### apply 使用

In [140]:
def BMI(x):
    Height = x['Height'] / 100
    Weight = x['Weight'] 
    BMI_value = Weight / Height**2
    return BMI_value.mean()
gb.apply(BMI)

Gender
Female    18.860930
Male      24.318654
dtype: float64

apply 还可以返回Series和DataFrame

In [142]:
# 1. 结果得到 Series
gb.df.groupby(['Gender', 'Test_number'])[['Height', 'Weight']]


AttributeError: 'DataFrameGroupBy' object has no attribute 'df'