# Chapter 13: Summarizing and Aggregating

In [1]:
import polars as pl
pl.show_versions()

--------Version info---------
Polars:               0.20.31
Index type:           UInt32
Platform:             macOS-12.5-arm64-arm-64bit
Python:               3.11.9 (main, Apr  2 2024, 16:11:47) [Clang 14.0.0 (clang-1400.0.29.202)]

----Optional dependencies----
adbc_driver_manager:  0.8.0
cloudpickle:          3.0.0
connectorx:           0.3.2
deltalake:            0.15.0
fastexcel:            0.9.1
fsspec:               2023.12.2
gevent:               23.9.1
hvplot:               0.9.2
matplotlib:           3.8.4
nest_asyncio:         1.6.0
numpy:                1.26.4
openpyxl:             3.1.2
pandas:               2.2.2
pyarrow:              14.0.2
pydantic:             2.5.3
pyiceberg:            0.5.1
pyxlsb:               <not installed>
sqlalchemy:           2.0.25
torch:                <not installed>
xlsx2csv:             0.8.2
xlsxwriter:           3.2.0


In [2]:
import polars as pl

# Read the Top 2000 workbook with the calamine engine (passing skip_rows=1 to
# the reader) and flag the "positie" column as already sorted.
top2000 = (
    pl.read_excel(
        source="data/top2000-2023.xlsx",
        engine="calamine",
        read_options={"skip_rows": 1},
    )
    .set_sorted("positie")
)

In [3]:
# Per `jaar`, collect every "artiest - titel" string into a list; newest
# `jaar` first.
(
    top2000
    .group_by("jaar")
    .agg(  # <1>
        songs=pl.concat_str("artiest", "titel", separator=" - "),  # <2>
    )
    .sort("jaar", descending=True)
)

jaar,songs
i64,list[str]
2022,"[""Son Mieux - Multicolor"", ""Bankzitters - Je Blik Richting Mij"", … ""Måneskin - THE LONELIEST""]"
2021,"[""Goldband - Noodgeval"", ""Bankzitters - Stapelgek"", … ""Olivia Rodrigo - Drivers License""]"
2020,"[""DI-RECT - Soldier On"", ""Miss Montreal - Door De Wind"", … ""Dua Lipa ft. DaBaby - Levitating""]"
2019,"[""Danny Vera - Roller Coaster"", ""Floor Jansen & Henk Poort - Phantom Of The Opera"", … ""Tino Martin - Zij Weet Het""]"
2018,"[""Lady Gaga & Bradley Cooper - Shallow"", ""White Lies - Time To Give"", … ""Calvin Harris & Dua Lipa - One Kiss""]"
…,…
1960,"[""Etta James - At Last"", ""Shadows - Apache""]"
1959,"[""Jacques Brel - Ne Me Quitte Pas"", ""Elvis Presley - Hound Dog""]"
1958,"[""Chuck Berry - Johnny B. Goode"", ""Ella Fitzgerald & Louis Armstrong - Summertime""]"
1957,"[""Johnny Cash - I Walk The Line"", ""Elvis Presley - Jailhouse Rock"", … ""Fats Domino - Blueberry Hill""]"


### The Descriptives

In [5]:
# First three rows of every `jaar` group, then the nine rows that belong to
# the highest `jaar` values.
(
    top2000
    .group_by("jaar", maintain_order=True)  # <1>
    .head(3)  # <2>
    # sort() does not promise to keep the relative order of rows with equal
    # keys unless asked; maintain_order=True preserves the within-`jaar` row
    # order produced by the grouped head().
    .sort("jaar", descending=True, maintain_order=True)
    .head(9)  # <3>
)

jaar,positie,titel,artiest
i64,i64,str,str
2022,179,"""Multicolor""","""Son Mieux"""
2022,370,"""Je Blik Richting Mij""","""Bankzitters"""
2022,395,"""L'enfer""","""Stromae"""
2021,55,"""Noodgeval""","""Goldband"""
2021,149,"""Stapelgek""","""Bankzitters"""
2021,210,"""Dat Heb Jij Gedaan""","""Meau"""
2020,19,"""Soldier On""","""DI-RECT"""
2020,38,"""Door De Wind""","""Miss Montreal"""
2020,77,"""Impossible (Orchestral Version…","""Nothing But Thieves"""


In [6]:
# Last three rows of every `jaar` group, then the nine rows that belong to
# the highest `jaar` values.
(
    top2000
    .group_by("jaar", maintain_order=True)
    .tail(3)
    # sort() does not promise to keep the relative order of rows with equal
    # keys unless asked; maintain_order=True preserves the within-`jaar` row
    # order produced by the grouped tail().
    .sort("jaar", descending=True, maintain_order=True)
    .head(9)
)

jaar,positie,titel,artiest
i64,i64,str,str
2022,1391,"""De Diepte""","""S10"""
2022,1688,"""Zeit""","""Rammstein"""
2022,1716,"""THE LONELIEST""","""Måneskin"""
2021,1865,"""Bon Gepakt""","""Donnie & Rene Froger"""
2021,1978,"""Hold On""","""Armin van Buuren ft. Davina Mi…"
2021,2000,"""Drivers License""","""Olivia Rodrigo"""
2020,1824,"""Smoorverliefd""","""Snelle"""
2020,1879,"""The Business""","""Tiësto"""
2020,1902,"""Levitating""","""Dua Lipa ft. DaBaby"""


In [7]:
# Number of rows per `artiest`, ten largest counts first.
(
    top2000
    .group_by("artiest")
    .agg(pl.len())
    .sort(by="len", descending=True)
    .head(n=10)
)

artiest,len
str,u32
"""Queen""",34
"""The Beatles""",31
"""ABBA""",25
"""The Rolling Stones""",22
"""Bruce Springsteen""",22
"""Fleetwood Mac""",20
"""Coldplay""",20
"""Michael Jackson""",20
"""U2""",18
"""David Bowie""",18


In [8]:
# Load the sales data set and list its column names.
# NOTE(review): in the recorded run this path did not exist (FileNotFoundError),
# so `df` was never bound and the cells that use it failed with NameError —
# confirm where data/sales_data.csv is supposed to come from.
df = pl.read_csv("data/sales_data.csv")
df.columns

FileNotFoundError: No such file or directory (os error 2): data/sales_data.csv

In [9]:
# Largest Unit_Price per (Product_Category, Sub_Category) pair, top ten.
(
    df
    .select(["Product_Category", "Sub_Category", "Unit_Price"])  # <1>
    .group_by(["Product_Category", "Sub_Category"])  # <2>
    .max()
    .sort(by="Unit_Price", descending=True)  # <3>
    .head(n=10)
)

NameError: name 'df' is not defined

In [10]:
# Summed Profit per Country, largest total first.
(
    df
    .select(["Country", "Profit"])
    .group_by(["Country"])
    .sum()
    .sort(by="Profit", descending=True)
)

NameError: name 'df' is not defined

In [11]:
# Number of distinct Product values per Sub_Category, top ten.
(
    df
    .select(["Sub_Category", "Product"])
    .group_by(["Sub_Category"])
    .n_unique()
    .sort(by="Product", descending=True)
    .head(n=10)
)

NameError: name 'df' is not defined

In [12]:
# Mean Order_Quantity per Age_Group, largest mean first.
(
    df
    .select(["Age_Group", "Order_Quantity"])
    .group_by(["Age_Group"])
    .mean()
    .sort(by="Order_Quantity", descending=True)
)

NameError: name 'df' is not defined

In [13]:
# 0.9 quantile of Revenue per Age_Group, largest value first.
(
    df
    .select(["Age_Group", "Revenue"])
    .group_by(["Age_Group"])
    .quantile(quantile=0.9)
    .sort(by="Revenue", descending=True)
)

NameError: name 'df' is not defined

### The Advanced

In [15]:
# No aggregation function is applied, so each Country row holds the list of
# its Profit values and the list of its Revenue values.
(
    df
    .select(["Country", "Profit", "Revenue"])
    .group_by("Country")
    .agg(pl.col("Profit", "Revenue"))
)

NameError: name 'df' is not defined

In [16]:
# Same per-Country lists, renamed: Profit through an explicit alias, Revenue
# by prefixing its existing column name.
(
    df
    .select(["Country", "Profit", "Revenue"])
    .group_by("Country")
    .agg(
        [
            pl.col("Profit").alias("All Profits Per Transactions"),
            pl.col("Revenue").name.prefix("All "),
        ]
    )
)

NameError: name 'df' is not defined

In [17]:
# Sum and mean of Profit and Revenue per Country, each under its own name.
(
    df
    .select(["Country", "Profit", "Revenue"])
    .group_by("Country")
    .agg(
        pl.sum("Profit").alias("Total Profit"),
        pl.mean("Profit").alias("Average Profit per Transaction"),
        pl.sum("Revenue").alias("Total Revenue"),
        pl.mean("Revenue").alias("Average Revenue per Transaction"),
    )
)

NameError: name 'df' is not defined

In [18]:
# Sum and mean of every selected non-key column per Country; output names are
# the source column names prefixed with "Total " / "Average ".
(
    df
    .select(["Country", "Profit", "Revenue"])
    .group_by("Country")
    .agg(
        [
            pl.all().sum().name.prefix("Total "),
            pl.all().mean().name.prefix("Average "),
        ]
    )
)

NameError: name 'df' is not defined

In [19]:
# Per Country: the list of "Profit > 1000" flags, and the sum of those flags
# (the count of rows where the comparison is true).
(
    df
    .select(["Country", "Profit"])
    .group_by("Country")
    .agg(
        pl.col("Profit").gt(1000).alias("Profit > 1000"),
        pl.col("Profit")
        .gt(1000)
        .sum()
        .alias("Number of Transactions with Profit > 1000"),
    )
)

NameError: name 'df' is not defined

In [20]:
def custom_agg(column: "pl.Expr") -> "tuple[pl.Expr, pl.Expr]":
    """Build the two "greater than 1000" aggregations for ``column``.

    Args:
        column: Expression to compare against 1000, e.g. ``pl.col("Profit")``.
            (It is an expression, not a column name: it is compared with
            ``>`` and the result must support ``.sum()`` and ``.alias()``.)

    Returns:
        A pair of expressions: the ``column > 1000`` flags aliased
        "Profit > 1000", and the sum of those flags aliased
        "Number of Transactions with Profit > 1000".
    """
    # Build the comparison once; both outputs derive from it.
    above_threshold = column > 1000
    return (
        above_threshold.alias("Profit > 1000"),
        above_threshold.sum().alias("Number of Transactions with Profit > 1000"),
    )

# Apply both expressions returned by custom_agg to each Country group.
(
    df
    .select(["Country", "Profit"])
    .group_by("Country")
    .agg(*custom_agg(pl.col("Profit")))
)

NameError: name 'df' is not defined

### User-Defined Functions

In [22]:
from textblob import TextBlob

def analyze_sentiment(review):
    """Return the TextBlob sentiment polarity of ``review``."""
    blob = TextBlob(review)
    sentiment = blob.sentiment
    return sentiment.polarity

df = pl.DataFrame(
    {
        "reviews": [
            "This product is great!",
            "Terrible service.",
            "Okay, but not what I expected.",
            "Excellent! I love it.",
        ]
    }
)

# Score each review by calling analyze_sentiment on every element of the
# "reviews" column; the result is stored as a Float64 column.
df = df.with_columns(
    sentiment_score=pl.col("reviews").map_elements(
        analyze_sentiment,
        return_dtype=pl.Float64,
    )
)
df

reviews,sentiment_score
str,f64
"""This product is great!""",1.0
"""Terrible service.""",-1.0
"""Okay, but not what I expected.""",0.2
"""Excellent! I love it.""",0.75


In [23]:
from functools import lru_cache


# Small frame with repeated values in "x".
df = pl.DataFrame({"x": [1, 1, 3, 3]})

@lru_cache(maxsize=None)
def add_one(x):
    """Return ``x + 1``; results are memoized per argument by ``lru_cache``."""
    incremented = x + 1
    return incremented

# Call the cached add_one on every element of "x" and store the Int64 result
# as "x + 1".
df.with_columns(
    pl.col("x")
    .map_elements(add_one, return_dtype=pl.Int64)
    .alias("x + 1")
)

x,x + 1
i64,i64
1,2
1,2
3,4
3,4


In [24]:
import polars.selectors as cs
import numpy as np
from scipy.special import softmax

df = pl.DataFrame(
    {
        "feature1": [0.3, 0.2, 0.4, 0.1, 0.2, 0.3, 0.5],
        "feature2": [32, 50, 70, 65, 0, 10, 15],
        "label": [1, 0, 1, 0, 1, 0, 0],
    }
)

# Pass each "feature*" column as a whole to softmax (via NumPy), keeping
# "label" as-is.
result = df.select(
    pl.col("label"),
    cs.starts_with("feature").map_batches(
        lambda column: softmax(column.to_numpy()),
    ),
)
result

label,feature1,feature2
i64,f64,f64
1,0.143782,3.1181e-17
0,0.130099,2.0474e-09
1,0.158904,0.993307
0,0.117719,0.006693
1,0.130099,3.9488e-31
0,0.143782,8.6979e-27
0,0.175616,1.2909000000000001e-24


In [25]:
from sklearn.preprocessing import StandardScaler

def scale_temperature(group):
    """Add a "scaled_feature" column holding the standardized temperatures.

    A fresh ``StandardScaler`` is fitted on the "temperature" column of
    ``group`` only, and the flattened result is appended to ``group``.
    """
    temperatures = group[["temperature"]].to_numpy()
    standardized = StandardScaler().fit_transform(temperatures)
    scaled_column = pl.Series(values=standardized.flatten(), name="scaled_feature")
    return group.with_columns(scaled_column)

df = pl.DataFrame(
    {
        "group": ["USA", "USA", "USA", "USA", "NL", "NL", "NL"],
        "temperature": [32, 50, 70, 65, 0, 10, 15],
    }
)

# Run scale_temperature once per "group" value; each call receives that
# group's rows as a DataFrame.
result = df.group_by("group").map_groups(function=scale_temperature)
result

group,temperature,scaled_feature
str,i64,f64
"""NL""",0,-1.336306
"""NL""",10,0.267261
"""NL""",15,1.069045
"""USA""",32,-1.502872
"""USA""",50,-0.287066
"""USA""",70,1.063831
"""USA""",65,0.726107


In [26]:
df = pl.DataFrame(
    {
        "group": ["USA", "USA", "USA", "USA", "NL", "NL", "NL"],
        "temperature": [32, 50, 70, 65, 0, 10, 15],
    }
)

# Iterating a GroupBy yields (key, frame) pairs; print each pair as a tuple.
for key, frame in df.group_by(["group"]):
    print((key, frame))

(('NL',), shape: (3, 2)
┌───────┬─────────────┐
│ group ┆ temperature │
│ ---   ┆ ---         │
│ str   ┆ i64         │
╞═══════╪═════════════╡
│ NL    ┆ 0           │
│ NL    ┆ 10          │
│ NL    ┆ 15          │
└───────┴─────────────┘)
(('USA',), shape: (4, 2)
┌───────┬─────────────┐
│ group ┆ temperature │
│ ---   ┆ ---         │
│ str   ┆ i64         │
╞═══════╪═════════════╡
│ USA   ┆ 32          │
│ USA   ┆ 50          │
│ USA   ┆ 70          │
│ USA   ┆ 65          │
└───────┴─────────────┘)


## Row-wise Aggregations with `reduce` and `fold`

In [28]:
df = pl.DataFrame({
    "col1": [2],
    "col2": [3],
    "col3": [4]
})

# Row-wise sum with pl.reduce: the columns are combined left to right with
# `function`, starting from the first column. Unlike pl.fold there is no
# explicit initial accumulator. (This cell used to be an exact copy of the
# pl.fold cell that follows, leaving `reduce` undemonstrated.)
df.with_columns(
    pl.reduce(
        function=lambda acc, x: acc + x,  # <1>
        exprs=pl.col("*")  # <2>
    ).alias("sum")
)

col1,col2,col3,sum
i64,i64,i64,i64
2,3,4,9


In [29]:
df = pl.DataFrame({"col1": [2], "col2": [3], "col3": [4]})

# Row-wise sum with pl.fold: start from the literal 0 and add every column in
# turn.
df.with_columns(
    pl.fold(
        acc=pl.lit(0),  # <1>
        function=lambda total, column: total + column,  # <2>
        exprs=pl.all(),  # <3>
    ).alias("sum")
)

col1,col2,col3,sum
i64,i64,i64,i64
2,3,4,9


In [30]:
df = pl.DataFrame(
    {
        "product_A": [10, 20, 30],
        "product_B": [20, 30, 40],
        "product_C": [30, 40, 50],
    }
)

# Weight applied to each product column.
weights = {"product_A": 0.5, "product_B": 1.5, "product_C": 2.0}  # <1>

# One "column * weight" expression per product, keeping the product's name.
weighted_exprs = [  # <2>
    (pl.col(name) * factor).alias(name)
    for name, factor in weights.items()
]

# Fold the weighted columns into a single row-wise total, starting from 0.
df_with_weighted_sum = df.with_columns(
    weighted_sum=pl.fold(  # <3>
        acc=pl.lit(0),  # <4>
        function=lambda total, column: total + column,  # <5>
        exprs=weighted_exprs,  # <6>
    )
)

df_with_weighted_sum

product_A,product_B,product_C,weighted_sum
i64,i64,i64,f64
10,20,30,95.0
20,30,40,135.0
30,40,50,175.0


## `over()` Expressions in Selection Context

In [32]:
# Rank "positie" within each "jaar" (window expression via over), then show a
# reproducible random sample of ten rows.
(
    top2000
    .select(
        "jaar",
        "artiest",
        "titel",
        "positie",
        year_rank=pl.col("positie").rank().over("jaar"),
    )
    .sample(n=10, seed=42)
)

jaar,artiest,titel,positie,year_rank
i64,str,str,i64,f64
2013,"""Stromae""","""Papaoutai""",318,6.0
1969,"""John Denver""","""Leaving On A Jet Plane""",607,16.0
1971,"""Led Zeppelin""","""Immigrant Song""",590,19.0
2009,"""Anouk""","""For Bitter Or Worse""",1453,23.0
2015,"""Snollebollekes""","""Links Rechts""",1076,14.0
1984,"""Alphaville""","""Forever Young""",302,11.0
1977,"""ABBA""","""Take A Chance On Me""",636,23.0
1975,"""Rod Stewart""","""Sailing""",918,20.0
1986,"""Metallica""","""Master Of Puppets""",29,1.0
2005,"""Alderliefste & Ramses Shaffy""","""Laat Me/Vivre""",463,5.0


## Dynamic Grouping with `group_by_dynamic`

## Rolling Aggregations with `rolling`

In [35]:
from datetime import date

# Daily dates for 1-14 April 2024, materialized eagerly as a Series.
dates = pl.date_range(  # <1>
    start=date(2024, 4, 1),
    end=date(2024, 4, 14),
    interval='1d',
    eager=True,  # <2>
)
# Keep only the dates whose weekday number is below 6.
dates = dates.filter(dates.dt.weekday() < 6)  # <3>
# Every remaining date twice (one row per store), in ascending order.
dates_repeated = pl.concat([dates, dates]).sort()  # <4>

# Only "date" is flagged as sorted: it is the only column that really is in
# sorted order. "store" alternates between "Store A" and "Store B" on every
# row, and flagging an unsorted column as sorted can lead to incorrect
# results.
df = pl.DataFrame({
    "date": dates_repeated,
    "store": ["Store A", "Store B"] * dates.len(),
    "sales": [
        200, 150, 220, 160, 250, 180, 270, 190, 280, 210,
        210, 170, 220, 180, 240, 190, 250, 200, 260, 210,
    ]
}).set_sorted("date")  # <5>

In [36]:
# Per store, sum "sales" over a 7-day window on "date".
# `check_sorted` is no longer passed: it is deprecated in Polars 0.20.31 (the
# version this notebook reports) and only triggered a deprecation warning.
result = (
    df.rolling(  # <1>
        index_column="date",
        period="7d",
        group_by="store",
    ).agg(  # <2>
        pl.sum("sales").alias("sum_of_last_7_days_sales")
    )
)

# Attach the rolling sums back to the original rows by (date, store).
final_df = df.join(result, on=["date", "store"])  # <3>

final_df

  df.rolling(  # <1>


date,store,sales,sum_of_last_7_days_sales
date,str,i64,i64
2024-04-01,"""Store A""",200,200
2024-04-02,"""Store A""",220,420
2024-04-03,"""Store A""",250,670
2024-04-04,"""Store A""",270,940
2024-04-05,"""Store A""",280,1220
…,…,…,…
2024-04-08,"""Store B""",170,910
2024-04-09,"""Store B""",180,930
2024-04-10,"""Store B""",190,940
2024-04-11,"""Store B""",200,950


## Conclusion