In [1]:
import numpy as np
import polars as pl
from polars import col, lit

In [2]:
# Seed NumPy's legacy global RNG so the "random" column is the same on every run.
np.random.seed(12)

# Small demo frame used by every cell below; "nrs" and "names" each hold one null.
df = pl.DataFrame(
    dict(
        nrs=[1, 2, 3, None, 5],
        names=["foo", "ham", "spam", "egg", None],
        random=np.random.rand(5),
        groups=["A", "A", "B", "C", "B"],
    )
)
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.154163,"""A"""
2.0,"""ham""",0.74005,"""A"""
3.0,"""spam""",0.263315,"""B"""
,"""egg""",0.533739,"""C"""
5.0,,0.014575,"""B"""


## Polars expressions

Below are two Polars expressions evaluated together in a single `select`:

In [3]:
df.select(
    [
        # The two smallest "random" values, in ascending order.
        pl.col("random").sort().head(2),
        # Sum of "nrs" over the rows whose name is in the list. The sum is a
        # scalar, so it will get broadcast across the rows of the result.
        pl.col("nrs")
        .filter(pl.col("names").is_in(["foo", "ham", "spam"]))
        .sum(),
    ]
)

random,nrs
f64,i64
0.014575,6
0.154163,6


All expressions in a `select` are run in parallel.

### Unique values

In [4]:
df.select(
    [
        # Direct route: ask for the number of distinct values.
        pl.col("names").n_unique().alias("unique_names_1"),
        # Alternative route: deduplicate first, then count what is left.
        # The two results carry different aliases because Polars doesn't like
        # duplicate column names.
        pl.col("names").unique().count().alias("unique_names_2"),
    ]
)

unique_names_1,unique_names_2
u32,u32
5,5


## Aggregations

In [6]:
# Three aggregations of the same column, each reduced to a single value and
# given its own output name.
df.select(
    [
        pl.col("random").sum().alias("sum"),
        pl.col("random").min().alias("min"),
        pl.col("random").std().alias("std"),
    ]
)

sum,min,std
f64,f64,f64
1.705842,0.014575,0.293209


## Filtering and conditionals

In [8]:
df.select(
    [
        # Keep only the names that end in "am" and count them; the pattern is
        # a regular expression, which `str.contains` supports natively.
        pl.col("names")
        .filter(pl.col("names").str.contains(r"am$"))
        .count()
    ]
)

names
u32
2


## Binary functions and modification

Polars uses a `when`/`then`/`otherwise` construct for ternary operations. The `when` function requires a predicate expression (one that evaluates to a boolean `Series`).

In [9]:
df.select(
    [
        # Where "random" exceeds 0.5 use 0, otherwise keep the original value;
        # the result is then multiplied by the column-wide sum of "nrs".
        # No alias is given, so the output column is named "literal".
        pl.when(pl.col("random") > 0.5).then(0).otherwise(pl.col("random"))
        * pl.sum("nrs")
    ]
)

literal
f64
1.695791
0.0
2.896465
0.0
0.160325


## Window expressions

Polars allows you to do an implicit groupby/aggregation/join in a single expression. Window expressions are an efficient way to determine group statistics, and they are all computed automatically in parallel.

**Question**: Is there a way to get one row per group, or are we always going to preserve the same number of rows in a Window expression?

In [18]:
df.select(
    [
        # Keep every original column.
        pl.col("*"),
        # Per-group total of "random", repeated on each row of its group.
        pl.col("random").sum().over("groups").alias("sum[random]/groups"),
        # "random" values collected into a list per distinct name.
        pl.col("random").list().over("names").alias("random/name"),
    ]
)

nrs,names,random,groups,sum[random]/groups,random/name
i64,str,f64,str,f64,list[f64]
1.0,"""foo""",0.154163,"""A""",0.894213,[0.154163]
2.0,"""ham""",0.74005,"""A""",0.894213,[0.74005]
3.0,"""spam""",0.263315,"""B""",0.27789,[0.263315]
,"""egg""",0.533739,"""C""",0.533739,[0.533739]
5.0,,0.014575,"""B""",0.27789,[0.014575]
