# groupby()

In [1]:
import pandas as pd

In [5]:
# Demo frame for the groupby examples, written row by row so it mirrors the
# rendered table: "a" is the grouping key, "b"/"c" are integers, "d" is text.
df = pd.DataFrame(
    [
        ["a", 1, 3, "我"],
        ["b", 2, 1, "是"],
        ["a", 3, 5, "一"],
        ["a", 2, 1, "只"],
        ["b", 1, 7, "cat"],
    ],
    columns=["a", "b", "c", "d"],
)
df

Unnamed: 0,a,b,c,d
0,a,1,3,我
1,b,2,1,是
2,a,3,5,一
3,a,2,1,只
4,b,1,7,cat


## groupby simple

groupby 的基础 API 用法

In [6]:
# groupby() on its own returns a DataFrameGroupBy object rather than a table;
# call an aggregation on it (see the following cells) to get results.
df.groupby(["a"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002069CDD6410>

In [14]:
# Sum every non-key column per value of "a"; the key becomes the index.
# For the text column "d", summing concatenates the strings of each group.
df.groupby("a").sum()

Unnamed: 0_level_0,b,c,d
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,6,9,我一只
b,3,8,是cat


In [30]:
# Same sums, but as_index=False keeps the key "a" as an ordinary column
# and leaves a default integer index on the result.
df.groupby("a", as_index=False).sum()

Unnamed: 0,a,b,c,d
0,a,6,9,我一只
1,b,3,8,是cat


In [8]:
# Number of values per column within each group. This frame has no missing
# data, so every column reports the same count for a given group.
df.groupby("a").count()

Unnamed: 0_level_0,b,c,d
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,3,3,3
b,2,2,2


In [19]:
# agg() accepts any callable that reduces a group's column to a single value.
# The builtin len is passed directly (no lambda wrapper needed); on this frame
# it gives the same table as count() above.
df.groupby(by="a").agg(len)

Unnamed: 0_level_0,b,c,d
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,3,3,3
b,2,2,2


## agg

In [24]:
# Aggregate only columns "b" and "c": select them on the grouped object first,
# then reduce each group's column with the builtin len (row count per group).
df.groupby(by="a")[["b", "c"]].agg(len)

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3,3
b,2,2


In [26]:
# A dict passed to agg() maps column name -> aggregation; only the listed
# columns ("b" and "c") appear in the result.
df.groupby("a").agg(
    {
        "b": len,
        "c": len,
    }
)

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3,3
b,2,2


## multi agg

In [28]:
# Several aggregations per column: a list value yields one output column per
# function, so the result gets two-level column labels (column, function).
# Lambdas are labelled "<lambda_0>" in the output header.
df.groupby("a").agg(
    {
        "b": ["sum", lambda grp: str(sum(grp)) + "~~~"],
        "c": "mean",
        "d": [" ".join, lambda grp: [text + "旺旺 " for text in grp]],
    }
)

Unnamed: 0_level_0,b,b,c,d,d
Unnamed: 0_level_1,sum,<lambda_0>,mean,join,<lambda_0>
a,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,6,6~~~,3.0,我 一 只,"[我旺旺 , 一旺旺 , 只旺旺 ]"
b,3,3~~~,4.0,是 cat,"[是旺旺 , cat旺旺 ]"


In [31]:
# Same multi-aggregation pattern, but with as_index=False the key "a" stays a
# regular column next to the two-level aggregated columns.
df.groupby("a", as_index=False).agg(
    {
        "b": ["sum", lambda grp: str(sum(grp)) + "yoyoyo~"],
        "c": "mean",
        "d": [" ".join, lambda grp: [text + "旺旺 " for text in grp]],
    }
)

Unnamed: 0_level_0,a,b,b,c,d,d
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,<lambda_0>,mean,join,<lambda_0>
0,a,6,6yoyoyo~,3.0,我 一 只,"[我旺旺 , 一旺旺 , 只旺旺 ]"
1,b,3,3yoyoyo~,4.0,是 cat,"[是旺旺 , cat旺旺 ]"


## transform

* transform 与 agg 的区别：
* agg 会把每个分组聚合成一行；transform 的结果行数不会减少，原来有多少行，结果就还是多少行（每个分组的聚合值会填回该分组的每一行）

In [39]:
# Smaller frame for the transform demos: "x" is the grouping key,
# "y" and "z" are the integer columns to be transformed.
df = pd.DataFrame(
    dict(
        x=["a", "b", "a"],
        y=[1, 2, 3],
        z=[3, 1, 5],
    )
)
df

Unnamed: 0,x,y,z
0,a,1,3
1,b,2,1
2,a,3,5


In [40]:
# transform("sum") writes each group's sum back onto every row of that group,
# so the result keeps the original row count and index, and the key column "x"
# is not part of it. as_index is omitted on purpose: it has no effect on
# transformations.
df.groupby(by=["x"]).transform("sum")

Unnamed: 0,y,z
0,4,8
1,2,1
2,4,8


In [41]:
# Transform only column "y": selecting with a list ([["y"]]) keeps the result a
# one-column DataFrame with the original row count and index. as_index is
# omitted on purpose: it has no effect on transformations.
df.groupby(by=["x"])[["y"]].transform("sum")

Unnamed: 0,y
0,4
1,2
2,4


# pivot_table

* pivot_table和groupby相似
* index(相当于groupby中的参数by)
* values(要聚合的列)
* aggfunc（聚合函数）：使用相同的聚合函数时，得到的结果和 groupby 是一样的

In [43]:
import pandas as pd

# Frame for the pivot_table demos, written row by row so it mirrors the
# rendered table: "x" is the key, "y" and "z" are the integer value columns.
df = pd.DataFrame(
    [
        ("a", 1, 3),
        ("b", 2, 1),
        ("a", 3, 5),
    ],
    columns=["x", "y", "z"],
)
df

Unnamed: 0,x,y,z
0,a,1,3
1,b,2,1
2,a,3,5


In [44]:
# Group rows by "x" and sum the "y" and "z" columns within each group.
pd.pivot_table(
    data=df,
    index=["x"],
    values=["y", "z"],
    aggfunc="sum",
)

Unnamed: 0_level_0,y,z
x,Unnamed: 1_level_1,Unnamed: 2_level_1
a,4,8
b,2,1


In [45]:
# A list of aggregation functions produces two-level column labels:
# the function name on top and the value column ("y") beneath it.
pd.pivot_table(
    data=df,
    index="x",
    values=["y"],
    aggfunc=["sum", "count"],
)

Unnamed: 0_level_0,sum,count
Unnamed: 0_level_1,y,y
x,Unnamed: 1_level_2,Unnamed: 2_level_2
a,4,2
b,2,1


In [46]:
# With values left out, the remaining columns ("y" and "z") are all aggregated.
pd.pivot_table(
    data=df,
    index=["x"],
    aggfunc="sum",
)

Unnamed: 0_level_0,y,z
x,Unnamed: 1_level_1,Unnamed: 2_level_1
a,4,8
b,2,1


In [47]:
# A dict for aggfunc chooses the aggregation(s) per column:
# "y" gets both count and sum, "z" gets sum only.
pd.pivot_table(
    data=df,
    index=["x"],
    aggfunc={
        "y": ["count", "sum"],
        "z": "sum",
    },
)

Unnamed: 0_level_0,y,y,z
Unnamed: 0_level_1,count,sum,sum
x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,2,4,8
b,1,2,1
