In [1]:
import numpy as np
import pandas as pd

# 数据框的分组与聚合

In [46]:
np.random.seed(2023)
df = pd.DataFrame({
    "工号": np.arange(801, 809),
    "部门": np.random.choice(['A', 'B'], size=8),
    "年龄": np.random.randint(15, 50, 8),
    "绩效": np.random.randint(5, 50, 8),
})
df

Unnamed: 0,工号,部门,年龄,绩效
0,801,B,37,33
1,802,B,47,22
2,803,A,37,12
3,804,B,32,37
4,805,B,28,34
5,806,A,20,45
6,807,B,39,48
7,808,A,18,43


In [47]:
df.groupby(["部门"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f6b82f65ea0>

In [48]:
for dept, grp in df.groupby(["部门"]):
    print(f"== 部门:", dept)
    display(grp)

== 部门: ('A',)


Unnamed: 0,工号,部门,年龄,绩效
2,803,A,37,12
5,806,A,20,45
7,808,A,18,43


== 部门: ('B',)


Unnamed: 0,工号,部门,年龄,绩效
0,801,B,37,33
1,802,B,47,22
3,804,B,32,37
4,805,B,28,34
6,807,B,39,48


In [49]:
df.groupby(["部门"]).agg('mean')

Unnamed: 0_level_0,工号,年龄,绩效
部门,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,805.666667,25.0,33.333333
B,803.8,36.6,34.8


In [50]:
df.groupby(["部门"]).mean()

Unnamed: 0_level_0,工号,年龄,绩效
部门,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,805.666667,25.0,33.333333
B,803.8,36.6,34.8


In [51]:
df.groupby("部门")[['年龄', '绩效']].mean()

Unnamed: 0_level_0,年龄,绩效
部门,Unnamed: 1_level_1,Unnamed: 2_level_1
A,25.0,33.333333
B,36.6,34.8


In [52]:
df.groupby("部门").年龄.mean()

部门
A    25.0
B    36.6
Name: 年龄, dtype: float64

In [53]:
df.groupby("部门", as_index=False)[['年龄', '绩效']].mean()

Unnamed: 0,部门,年龄,绩效
0,A,25.0,33.333333
1,B,36.6,34.8


In [54]:
df.groupby("部门")[['年龄', '绩效']].mean().reset_index()

Unnamed: 0,部门,年龄,绩效
0,A,25.0,33.333333
1,B,36.6,34.8


In [55]:
df2 = df.groupby('部门').agg({'绩效': ['mean', 'sum', 'std'], '年龄':'median'})
df2

Unnamed: 0_level_0,绩效,绩效,绩效,年龄
Unnamed: 0_level_1,mean,sum,std,median
部门,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,33.333333,100,18.502252,20.0
B,34.8,174,9.311283,37.0


In [56]:
df2[[('绩效',   'mean'), ('绩效',    'sum')]]

Unnamed: 0_level_0,绩效,绩效
Unnamed: 0_level_1,mean,sum
部门,Unnamed: 1_level_2,Unnamed: 2_level_2
A,33.333333,100
B,34.8,174


### 分组transform数据转换

In [57]:
df

Unnamed: 0,工号,部门,年龄,绩效
0,801,B,37,33
1,802,B,47,22
2,803,A,37,12
3,804,B,32,37
4,805,B,28,34
5,806,A,20,45
6,807,B,39,48
7,808,A,18,43


In [58]:
df.groupby('部门').绩效.mean()

部门
A    33.333333
B    34.800000
Name: 绩效, dtype: float64

In [59]:
df.groupby('部门')['绩效'].transform('mean')

0    34.800000
1    34.800000
2    33.333333
3    34.800000
4    34.800000
5    33.333333
6    34.800000
7    33.333333
Name: 绩效, dtype: float64

In [63]:
df3 = df.copy()
df3['部门平均绩效'] = df.groupby('部门')['绩效'].transform('mean')
df3['部门内绩效差异'] = df3.绩效 - df3.部门平均绩效
df3

Unnamed: 0,工号,部门,年龄,绩效,部门平均绩效,部门内绩效差异
0,801,B,37,33,34.8,-1.8
1,802,B,47,22,34.8,-12.8
2,803,A,37,12,33.333333,-21.333333
3,804,B,32,37,34.8,2.2
4,805,B,28,34,34.8,-0.8
5,806,A,20,45,33.333333,11.666667
6,807,B,39,48,34.8,13.2
7,808,A,18,43,33.333333,9.666667


<br>

排名函数或秩函数[`rank()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rank.html)

In [64]:
pd.Series([30, 25, 10, 25, 25, 40]).sort_values()

2    10
1    25
3    25
4    25
0    30
5    40
dtype: int64

In [14]:
pd.Series([30, 25, 10, 25, 25, 40]).rank()

0    5.0
1    3.0
2    1.0
3    3.0
4    3.0
5    6.0
dtype: float64

In [65]:
df.groupby('部门').绩效.rank()

0    2.0
1    1.0
2    1.0
3    4.0
4    3.0
5    3.0
6    5.0
7    2.0
Name: 绩效, dtype: float64

In [66]:
df.groupby('部门').绩效.rank(ascending=False)

0    4.0
1    5.0
2    3.0
3    2.0
4    3.0
5    1.0
6    1.0
7    2.0
Name: 绩效, dtype: float64

In [67]:
df.groupby('部门')['绩效'].transform("rank", ascending=False)

0    4.0
1    5.0
2    3.0
3    2.0
4    3.0
5    1.0
6    1.0
7    2.0
Name: 绩效, dtype: float64

In [68]:
df3['部门内绩效排名'] = df.groupby('部门').绩效.rank(ascending=False)
df3

Unnamed: 0,工号,部门,年龄,绩效,部门平均绩效,部门内绩效差异,部门内绩效排名
0,801,B,37,33,34.8,-1.8,4.0
1,802,B,47,22,34.8,-12.8,5.0
2,803,A,37,12,33.333333,-21.333333,3.0
3,804,B,32,37,34.8,2.2,2.0
4,805,B,28,34,34.8,-0.8,3.0
5,806,A,20,45,33.333333,11.666667,1.0
6,807,B,39,48,34.8,13.2,1.0
7,808,A,18,43,33.333333,9.666667,2.0
