In [1]:
import numpy as np
import pandas as pd

# 数据框的分组与聚合

In [2]:
# Pin the legacy global NumPy random state so every run produces the same table.
np.random.seed(42)

# The three random columns are drawn from the shared global stream in the
# order they are listed here; reordering them would change the generated data.
df = pd.DataFrame(
    data={
        "工号": np.arange(101, 111),
        "部门": np.random.choice(["A", "B", "C"], size=10),
        "年龄": np.random.randint(low=15, high=50, size=10),
        "绩效": np.random.randint(low=5, high=50, size=10),
    }
)

df

Unnamed: 0,工号,部门,年龄,绩效
0,101,C,25,25
1,102,A,25,37
2,103,C,38,16
3,104,C,38,26
4,105,A,17,48
5,106,A,36,29
6,107,C,16,31
7,108,B,38,46
8,109,C,44,32
9,110,C,16,20


In [3]:
# Grouping on its own shows no table: the cell displays a DataFrameGroupBy
# object, which has to be iterated or aggregated to see any data.
df.groupby(by=["部门"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f3f4fd4f940>

In [4]:
# Walk through the groups one at a time: each step yields the group key and
# the sub-frame holding that group's rows.
# Grouping by a one-element list makes every key a 1-tuple such as ('A',).
for dept, grp in df.groupby(["部门"]):
    # Plain string literal: the label has no placeholders, so no f-prefix is needed.
    print("== 部门:", dept)
    display(grp)

== 部门: ('A',)


Unnamed: 0,工号,部门,年龄,绩效
1,102,A,25,37
4,105,A,17,48
5,106,A,36,29


== 部门: ('B',)


Unnamed: 0,工号,部门,年龄,绩效
7,108,B,38,46


== 部门: ('C',)


Unnamed: 0,工号,部门,年龄,绩效
0,101,C,25,25
2,103,C,38,16
3,104,C,38,26
6,107,C,16,31
8,109,C,44,32
9,110,C,16,20


In [5]:
# Per-department average of every remaining column, with the statistic
# selected by name through the generic aggregation entry point.
df.groupby("部门").aggregate("mean")

Unnamed: 0_level_0,工号,年龄,绩效
部门,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,104.333333,26.0,38.0
B,108.0,38.0,46.0
C,105.666667,29.5,25.0


In [6]:
# Shortcut method: produces the same table as passing "mean" to agg.
df.groupby(by="部门").mean()

Unnamed: 0_level_0,工号,年龄,绩效
部门,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,104.333333,26.0,38.0
B,108.0,38.0,46.0
C,105.666667,29.5,25.0


In [7]:
# Restrict the aggregation to the two columns of interest before averaging,
# so the employee-id column is left out of the result.
(
    df.groupby("部门")[["年龄", "绩效"]]
    .mean()
)

Unnamed: 0_level_0,年龄,绩效
部门,Unnamed: 1_level_1,Unnamed: 2_level_1
A,26.0,38.0
B,38.0,46.0
C,29.5,25.0


In [8]:
# as_index=False keeps the department as an ordinary column and gives the
# result a plain 0..n-1 row index instead of indexing it by department.
(
    df.groupby("部门", as_index=False)[["年龄", "绩效"]]
    .mean()
)

Unnamed: 0,部门,年龄,绩效
0,A,26.0,38.0
1,B,38.0,46.0
2,C,29.5,25.0


In [9]:
# Different statistics per column: median and sum of 绩效, mean of 年龄.
# The result carries two-level column labels (column name, statistic name).
(
    df.groupby("部门")
    .agg({"绩效": ["median", "sum"], "年龄": "mean"})
)

Unnamed: 0_level_0,绩效,绩效,年龄
Unnamed: 0_level_1,median,sum,mean
部门,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,37.0,114,26.0
B,46.0,46,38.0
C,25.5,150,29.5


### 分组 transform 数据转换

In [10]:
# Overall average score across all rows, as a baseline for the per-group values.
df["绩效"].mean()

31.0

In [11]:
# transform returns one value per original row (same index as df): each row
# receives the mean score of the department it belongs to.
(
    df.groupby("部门")["绩效"]
    .transform("mean")
)

0    25.0
1    38.0
2    25.0
3    25.0
4    38.0
5    38.0
6    25.0
7    46.0
8    25.0
9    25.0
Name: 绩效, dtype: float64

In [12]:
# Rank of each score among all rows (1.0 is the lowest score).
df["绩效"].rank()

0     3.0
1     8.0
2     1.0
3     4.0
4    10.0
5     5.0
6     6.0
7     9.0
8     7.0
9     2.0
Name: 绩效, dtype: float64

In [13]:
# Rank of each score within its own department, aligned to the original rows.
(
    df.groupby("部门")["绩效"]
    .transform("rank")
)

0    3.0
1    2.0
2    1.0
3    4.0
4    3.0
5    1.0
6    5.0
7    1.0
8    6.0
9    2.0
Name: 绩效, dtype: float64