In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [29]:
# Toy data: each inner list is one row, laid out as [A, B].
ll = [
    ['foo', 1],
    ['foo', 2],
    ['foo', 2],
    ['bar', 1],
    ['bar', 1],
]
# Wrap the rows in a DataFrame with column labels "A" (group key) and "B" (value).
df4 = pd.DataFrame(data=ll, columns=["A", "B"])
df4

Unnamed: 0,A,B
0,foo,1
1,foo,2
2,foo,2
3,bar,1
4,bar,1


In [30]:
# Group the rows of df4 by the values in column "A".
# as_index=False asks pandas to keep the group key as a regular column
# (instead of the index) in aggregated results such as count().
grouped = df4.groupby("A", as_index=False)

In [31]:
# Number of rows in each group.
grouped.size()

A
bar    2
foo    3
dtype: int64

In [32]:
# Per-group count of non-missing values in each non-key column (here only "B").
grouped.count()

Unnamed: 0,A,B
0,bar,2
1,foo,3


In [33]:
# Summary statistics (count, mean, std, min, quartiles, max) of "B" for each group.
grouped.describe()

Unnamed: 0_level_0,B,B,B,B,B,B,B,B
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1,3.0,1.666667,0.57735,1.0,1.5,2.0,2.0,2.0


In [34]:
# Mapping from each group key to the row labels that belong to that group.
grouped.groups

{'bar': Int64Index([3, 4], dtype='int64'),
 'foo': Int64Index([0, 1, 2], dtype='int64')}

In [26]:
# Mean of "B" per group, with the value column relabelled "mean".
# The reducer is given by name ("mean") rather than as the np.mean callable:
# recent pandas releases emit a FutureWarning when a NumPy function is passed
# to groupby agg and recommend the string form to keep the current behaviour.
# The original rename also mapped "A" -> "A", which is a no-op and is dropped.
grouped["B"].agg("mean").rename(columns={"B": "mean"})

Unnamed: 0,A,mean
0,bar,1.0
1,foo,1.666667


In [39]:
# Selecting column "B" keeps the same grouping: the key -> row-label mapping
# is identical to the one reported by grouped.groups.
grouped["B"].groups

{'bar': Int64Index([3, 4], dtype='int64'),
 'foo': Int64Index([0, 1, 2], dtype='int64')}

In [40]:
# Apply two custom reducers to "B" in a single agg call. Because both are
# anonymous functions, the result columns are labelled <lambda_0>, <lambda_1>.
grouped["B"].agg(
    [
        lambda values: values.max() - values.min(),   # spread of the group
        lambda values: values.mean() - values.max(),  # mean relative to the maximum
    ]
)

Unnamed: 0_level_0,<lambda_0>,<lambda_1>
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0,0.0
foo,1,-0.333333


### Named Aggregation

In [48]:
# Example frame for the named-aggregation cells below: one categorical key
# ("kind") and two numeric measurements ("height", "weight") per animal.
# (Pasted IPython prompt markers were removed so the cell is plain Python.)
animals = pd.DataFrame(
    {
        "kind": ["cat", "dog", "cat", "dog"],
        "height": [9.1, 6.0, 9.5, 34.0],
        "weight": [7.9, 7.5, 9.9, 198.0],
    }
)

In [49]:
# Display the example frame.
animals

Unnamed: 0,height,kind,weight
0,9.1,cat,7.9
1,6.0,dog,7.5
2,9.5,cat,9.9
3,34.0,dog,198.0


In [51]:
# Named aggregation: each keyword becomes an output column, and its
# pd.NamedAgg value names the source column and the reducer applied to it.
animals.groupby("kind").agg(
    min_height=pd.NamedAgg(column="height", aggfunc="min"),
    max_height=pd.NamedAgg(column="height", aggfunc="max"),
    average_weight=pd.NamedAgg(column="weight", aggfunc="mean"),
)

Unnamed: 0_level_0,average_weight,max_height,min_height
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,8.9,9.5,9.1
dog,102.75,34.0,6.0


逻辑：groupby 之后，每个 key 对应的 height 和 weight 各自都是一个 Series，<br>
Named Aggregation 可以为不同的 Series 配置不同的聚合函数

特点1：等号左边的变量（keywords）就是输出的列名，右边如code所示：<br>
指定columns和聚合函数aggfunc<br>
特点2：对不同的columns执行不同的聚合函数<br>

In [None]:
# The pd.NamedAgg used above is a namedtuple; the next cell shows the
# equivalent form written with plain (column, aggfunc) tuples.

In [52]:
# Same named aggregation written with plain tuples: keyword = (column, aggfunc).
animals.groupby("kind").agg(
    min_height=("height", "min"),
    max_weight=("weight", "max"),
)

Unnamed: 0_level_0,max_weight,min_height
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.9,9.1
dog,198.0,6.0


当想要的输出列名不是合法的 Python 标识符时（例如包含空格的 "min height"），可以用字典 + `**` 解包的方式传入

In [54]:
# "min height" contains a space, so it cannot be written as a keyword argument;
# it is supplied through an unpacked dict, alongside an ordinary keyword.
animals.groupby("kind").agg(
    **{"min height": ("height", "min")},
    max_weight=("weight", "max"),
)

Unnamed: 0_level_0,max_weight,min height
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.9,9.1
dog,198.0,6.0


如果只关注其中一列，可以对 SeriesGroupBy 对象使用命名聚合；<br>
由于事先已经选定了列，等号右边（value 部分）只需要给出聚合函数

In [55]:
# Column picked via attribute access (.height); since the column is already
# fixed, each keyword only needs the reducer name.
animals.groupby("kind").height.agg(
    min_height="min",
    max_height="max",
)

Unnamed: 0_level_0,max_height,min_height
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.5,9.1
dog,34.0,6.0


In [56]:
# Same aggregation with the column picked via bracket indexing (["height"]).
animals.groupby("kind")["height"].agg(
    min_height="min",
    max_height="max",
)

Unnamed: 0_level_0,max_height,min_height
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.5,9.1
dog,34.0,6.0
