- Author: Benjamin Du
- Date: 2023-02-07 21:18:18
- Modified: 2023-02-08 11:44:24
- Title: Hands on GroupBy of Polars DataFrame in Python
- Slug: hands-on-GroupBy-of-polars-dataframe-in-python
- Category: Computer Science
- Tags: Computer Science, programming, Python, polars, DataFrame, GroupBy, group by

**Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!**

In [2]:
import itertools as it
import polars as pl

In [10]:
# Five-row example frame (integer id, color label, shape label) used by
# the groupby demonstrations in this notebook.
df = pl.DataFrame(
    dict(
        id=list(range(5)),
        color=["red", "green", "green", "red", "red"],
        shape=["square", "triangle", "square", "triangle", "square"],
    )
)
# Bare last expression so the notebook renders the frame.
df

id,color,shape
i64,str,str
0,"""red""","""square"""
1,"""green""","""triangle"""
2,"""green""","""square"""
3,"""red""","""triangle"""
4,"""red""","""square"""


In [4]:
def update_frame(frame):
    """Scale the "id" value in the first row of ``frame`` by 1000.

    The cell at row 0, column "id" is overwritten on ``frame`` itself,
    and that same object is returned.
    """
    first_id = frame[0, "id"]
    frame[0, "id"] = first_id * 1000
    return frame

In [5]:
# Run update_frame on the sub-frame of each color group; the per-group
# results come back as a single frame.
(
    df
    .groupby("color")
    .apply(update_frame)
)

id,color,shape
i64,str,str
0,"""red""","""square"""
3,"""red""","""triangle"""
4,"""red""","""square"""
1000,"""green""","""triangle"""
2,"""green""","""square"""


### GroupBy + Aggregation

In [6]:
# Count the rows in each color group; the count column is named "n".
# Sorting by the group key makes the displayed row order reproducible,
# because groupby does not guarantee the order of groups by default.
(
    df
    .groupby("color")
    .agg(pl.count().alias("n"))
    .sort("color")
)

color,n
str,u32
"""green""",2
"""red""",3


In [9]:
# Build one row per 4-element combination of range(52), attach a row
# number ("row_nr"), then for every distinct (column_0, column_1, column_2)
# prefix keep the smallest row number. The final sort orders the result
# by the three key columns.
(
    pl.DataFrame(data=it.combinations(range(52), 4), orient="row")
    .with_row_count()
    .groupby(["column_0", "column_1", "column_2"])
    .agg(pl.col("row_nr").min())
    .sort(["column_0", "column_1", "column_2"])
)

column_0,column_1,column_2,row_nr
i64,i64,i64,u32
0,1,2,0
0,1,3,49
0,1,4,97
0,1,5,144
0,1,6,190
0,1,7,235
0,1,8,279
0,1,9,322
0,1,10,364
0,1,11,405


### GroupBy as an Iterable

In [11]:
# Iterating over a GroupBy yields (group key, sub-frame) pairs; pair each
# key with the number of rows in its group.
# maintain_order=True asks polars to keep groups in order of first
# appearance in df, so the displayed order is reproducible across runs.
pl.Series(
    (color, group_frame.shape[0])
    for color, group_frame in df.groupby("color", maintain_order=True)
)

"('red', 3)"
"('green', 2)"
