In [1]:
import numpy as np
import pandas as pd

# 数据框的分组与聚合

In [2]:
# Seed NumPy's legacy global generator so the sample frame is reproducible.
np.random.seed(99)

# Eight employees. The three random draws share one random stream, so their
# order (department, age, performance) determines the generated values.
df = pd.DataFrame(
    data={
        "工号": np.arange(801, 809),
        "部门": np.random.choice(a=["A", "B"], size=8),
        "年龄": np.random.randint(low=15, high=50, size=8),
        "绩效": np.random.randint(low=5, high=50, size=8),
    }
)
df

Unnamed: 0,工号,部门,年龄,绩效
0,801,B,20,17
1,802,B,16,45
2,803,B,38,13
3,804,A,16,25
4,805,B,44,44
5,806,A,38,20
6,807,A,17,32
7,808,A,15,42


In [3]:
# groupby alone only builds a grouping object; nothing is computed until an
# aggregation or iteration is requested.
df.groupby(by=["部门"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f6bec160340>

In [4]:
# Iterate over the groups: each step yields the group label and the sub-frame
# holding that group's rows.
# Group by the scalar key "部门" rather than the one-element list ["部门"]:
# with a list key, pandas >= 2.0 yields 1-tuple labels such as ('A',), which
# is what ended up being printed. A scalar key yields the plain label 'A'.
for dept, grp in df.groupby("部门"):
    # Interpolate the label into the f-string (it previously had no placeholder).
    print(f"== 部门: {dept}")
    display(grp)

== 部门: ('A',)


Unnamed: 0,工号,部门,年龄,绩效
3,804,A,16,25
5,806,A,38,20
6,807,A,17,32
7,808,A,15,42


== 部门: ('B',)


Unnamed: 0,工号,部门,年龄,绩效
0,801,B,20,17
1,802,B,16,45
2,803,B,38,13
4,805,B,44,44


In [5]:
# Per-department average of every non-key column, requested by function name
# through the generic aggregate entry point (agg is its alias).
df.groupby(["部门"]).aggregate("mean")

Unnamed: 0_level_0,工号,年龄,绩效
部门,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,806.25,21.5,29.75
B,802.75,29.5,29.75


In [6]:
# Per-department average of every non-key column, using the dedicated
# mean() method on the grouping object.
(
    df
    .groupby(["部门"])
    .mean()
)

Unnamed: 0_level_0,工号,年龄,绩效
部门,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,806.25,21.5,29.75
B,802.75,29.5,29.75


In [7]:
# Restrict the aggregation to the columns of interest before averaging.
(
    df
    .groupby("部门")[["年龄", "绩效"]]
    .mean()
)

Unnamed: 0_level_0,年龄,绩效
部门,Unnamed: 1_level_1,Unnamed: 2_level_1
A,21.5,29.75
B,29.5,29.75


In [8]:
# as_index=False keeps the grouping key as an ordinary column instead of
# turning it into the index of the result.
(
    df
    .groupby("部门", as_index=False)[["年龄", "绩效"]]
    .mean()
)

Unnamed: 0,部门,年龄,绩效
0,A,21.5,29.75
1,B,29.5,29.75


In [9]:
# Aggregate with the key as the index, then move that index back into a
# regular column with reset_index().
(
    df
    .groupby("部门")[["年龄", "绩效"]]
    .mean()
    .reset_index()
)

Unnamed: 0,部门,年龄,绩效
0,A,21.5,29.75
1,B,29.5,29.75


In [10]:
# Per-department summary with a different set of statistics per column.
# Passing a list of function names yields a two-level column index
# (source column, statistic).
df2 = df.groupby("部门").agg(
    {
        "绩效": ["mean", "sum", "std"],
        "年龄": "median",
    }
)
df2

Unnamed: 0_level_0,绩效,绩效,绩效,年龄
Unnamed: 0_level_1,mean,sum,std,median
部门,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,29.75,119,9.535023,16.5
B,29.75,119,17.114808,29.0


In [11]:
# Selecting a top-level column label returns the sub-frame of its statistics.
df2["绩效"]

Unnamed: 0_level_0,mean,sum,std
部门,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,29.75,119,9.535023
B,29.75,119,17.114808


### 分组 `transform` 数据转换

In [12]:
# Mean performance per department: one value per group, indexed by department.
df.groupby("部门")["绩效"].mean()

部门
A    29.75
B    29.75
Name: 绩效, dtype: float64

In [13]:
# Build df3 as a new frame (df itself is left untouched) with two extra columns.
# assign evaluates its keyword arguments in order, so the second callable can
# read the column created by the first.
df3 = df.assign(
    **{
        # transform("mean") broadcasts each department's mean back onto its rows.
        "部门平均绩效": lambda t: t.groupby("部门")["绩效"].transform("mean"),
        # Signed distance of each employee from their department's mean.
        "部门内绩效差异": lambda t: t["绩效"] - t["部门平均绩效"],
    }
)
df3

Unnamed: 0,工号,部门,年龄,绩效,部门平均绩效,部门内绩效差异
0,801,B,20,17,29.75,-12.75
1,802,B,16,45,29.75,15.25
2,803,B,38,13,29.75,-16.75
3,804,A,16,25,29.75,-4.75
4,805,B,44,44,29.75,14.25
5,806,A,38,20,29.75,-9.75
6,807,A,17,32,29.75,2.25
7,808,A,15,42,29.75,12.25


<br>

排名函数(秩函数)[`rank()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rank.html):默认按升序排名,并列的值取其名次的平均值。

In [14]:
# Ascending rank; method="average" is the default and is spelled out here:
# the three tied 25s would occupy ranks 2, 3 and 4, so each receives 3.0.
pd.Series([30, 25, 10, 25, 25, 40]).rank(method="average")

0    5.0
1    3.0
2    1.0
3    3.0
4    3.0
5    6.0
dtype: float64

In [15]:
# Rank performance within each department, ascending: the lowest score in a
# department gets rank 1. The result keeps the original row index.
df.groupby("部门")["绩效"].rank()

0    2.0
1    4.0
2    1.0
3    2.0
4    3.0
5    1.0
6    3.0
7    4.0
Name: 绩效, dtype: float64

In [16]:
# Descending rank within each department: the highest score gets rank 1.
df.groupby("部门")["绩效"].rank(ascending=False)

0    3.0
1    1.0
2    4.0
3    3.0
4    2.0
5    4.0
6    2.0
7    1.0
Name: 绩效, dtype: float64

In [17]:
# The same descending in-group rank expressed through transform: the method is
# given by name and the extra keyword argument is forwarded to it.
(
    df
    .groupby("部门")["绩效"]
    .transform("rank", ascending=False)
)

0    3.0
1    1.0
2    4.0
3    3.0
4    2.0
5    4.0
6    2.0
7    1.0
Name: 绩效, dtype: float64

In [18]:
# Add each employee's rank within their department (1.0 = highest performance).
# The ranks carry df's row index, so they align with df3 row by row.
df3["部门内绩效排名"] = (
    df.groupby("部门")["绩效"].rank(ascending=False)
)
df3

Unnamed: 0,工号,部门,年龄,绩效,部门平均绩效,部门内绩效差异,部门内绩效排名
0,801,B,20,17,29.75,-12.75,3.0
1,802,B,16,45,29.75,15.25,1.0
2,803,B,38,13,29.75,-16.75,4.0
3,804,A,16,25,29.75,-4.75,3.0
4,805,B,44,44,29.75,14.25,2.0
5,806,A,38,20,29.75,-9.75,4.0
6,807,A,17,32,29.75,2.25,2.0
7,808,A,15,42,29.75,12.25,1.0
