# Examples

In [4]:
import polars as pl
from polars import col, lit

In [2]:
# Small example frame reused by every cell below.
# "optional" holds one missing value (None) to show how nulls are displayed.
df = pl.DataFrame(
    dict(
        A=[1, 2, 3, 4, 5],
        fruits=["banana", "banana", "apple", "apple", "banana"],
        B=[5, 4, 3, 2, 1],
        cars=["beetle", "audi", "beetle", "beetle", "beetle"],
        optional=[28, 300, None, 2, -30],
    )
)
df

A,fruits,B,cars,optional
i64,str,i64,str,i64
1,"""banana""",5,"""beetle""",28.0
2,"""banana""",4,"""audi""",300.0
3,"""apple""",3,"""beetle""",
4,"""apple""",2,"""beetle""",2.0
5,"""banana""",1,"""beetle""",-30.0


## Selection context

In [5]:
# One projection mixing explicit expressions, bare column names and literals.
df.select(
    [
        col("A"),       # explicit column expression
        "B",            # a bare string is read as a column name
        lit("B"),       # the constant string "B", not the column
        col("fruits"),
    ]
)

A,B,literal,fruits
i64,i64,str,str
1,5,"""B""","""banana"""
2,4,"""B""","""banana"""
3,3,"""B""","""apple"""
4,2,"""B""","""apple"""
5,1,"""B""","""banana"""


You can select columns with a regex by starting the pattern with `^` and ending it with `$`

In [7]:
# The pattern picks columns A and B; the sum is applied to each of them.
df.select(
    [
        col("^A|B$").sum(),
    ]
)

A,B
i64,i64
15,15


You can also select several columns at once by passing a list of names

In [8]:
# A list of names selects several columns; the sum is applied to each of them.
df.select(
    [
        col(["A", "B"]).sum(),
    ]
)

A,B
i64,i64
15,15


Select everything, then select everything reversed

In [10]:
df.select(
    [
        pl.all(),  # every column as-is
        # every column again, reversed independently and renamed with a suffix
        pl.all().reverse().suffix("_reverse"),
    ]
)

A,fruits,B,cars,optional,A_reverse,fruits_reverse,B_reverse,cars_reverse,optional_reverse
i64,str,i64,str,i64,i64,str,i64,str,i64
1,"""banana""",5,"""beetle""",28.0,5,"""banana""",1,"""beetle""",-30.0
2,"""banana""",4,"""audi""",300.0,4,"""apple""",2,"""beetle""",2.0
3,"""apple""",3,"""beetle""",,3,"""apple""",3,"""beetle""",
4,"""apple""",2,"""beetle""",2.0,2,"""banana""",4,"""audi""",300.0
5,"""banana""",1,"""beetle""",-30.0,1,"""banana""",5,"""beetle""",28.0


Select everything, then sum everything, with each sum broadcast to the same height as the df

In [13]:
df.select(
    [
        pl.all(),
        # Each sum is a single value that is repeated on every row of the result.
        pl.all().sum().suffix("_sum"),
    ]
)

A,fruits,B,cars,optional,A_sum,fruits_sum,B_sum,cars_sum,optional_sum
i64,str,i64,str,i64,i64,str,i64,str,i64
1,"""banana""",5,"""beetle""",28.0,15,,15,,300
2,"""banana""",4,"""audi""",300.0,15,,15,,300
3,"""apple""",3,"""beetle""",,15,,15,,300
4,"""apple""",2,"""beetle""",2.0,15,,15,,300
5,"""banana""",1,"""beetle""",-30.0,15,,15,,300


## `str` and `dt` namespaces

There are specialised namespaces for strings and dates

In [14]:
# Boolean expression: True where the fruit name matches the regex.
predicate = col("fruits").str.contains("^b.*")
df.select(
    [
        predicate,
    ]
)

fruits
bool
True
True
False
False
True


Use the predicate to filter

In [15]:
# Keep only the rows for which `predicate` evaluates to True.
df.filter(predicate)

A,fruits,B,cars,optional
i64,str,i64,str,i64
1,"""banana""",5,"""beetle""",28
2,"""banana""",4,"""audi""",300
5,"""banana""",1,"""beetle""",-30


We can combine multiple different computations into the same dataframe as long as the results have the same number of rows

In [17]:
df.select([
    # Sum of A over the rows whose fruit starts with "b".
    col('A').filter(col('fruits').str.contains('^b.*$')).sum(),
    # Sum of B over the rows whose car starts with "b", times the total of B.
    # The product is parenthesised so that the alias names the whole expression;
    # without the parentheses `.alias` binds only to the right-hand operand and
    # the result keeps the left operand's name ("B").
    (
        col('B').filter(col('cars').str.contains('^b.*$')).sum() * pl.col("B").sum()
    ).alias("some_compute()"),
])

A,some_compute()
i64,i64
8,165


In [22]:
# Plain Python values can be mixed freely into expressions.
some_var = 1000

df.select(
    [
        (
            (col("A") / 124.0 * col("B"))
            / pl.sum("B")
            * some_var
        ).alias("computed"),
    ]
)

computed
f64
2.688172
4.301075
4.83871
4.301075
2.688172


We can combine columns with a predicate

In [24]:
df.select(
    [
        "fruits",
        "B",
        # Row-wise conditional: B where the fruit is a banana, -1 everywhere else.
        pl.when(col("fruits") == "banana")
        .then(col("B"))
        .otherwise(-1)
        .alias("b"),
    ]
)

fruits,B,b
str,i64,i64
"""banana""",5,5
"""banana""",4,4
"""apple""",3,-1
"""apple""",2,-1
"""banana""",1,1


In [25]:
# Show the full example frame for reference.
df

A,fruits,B,cars,optional
i64,str,i64,str,i64
1,"""banana""",5,"""beetle""",28.0
2,"""banana""",4,"""audi""",300.0
3,"""apple""",3,"""beetle""",
4,"""apple""",2,"""beetle""",2.0
5,"""banana""",1,"""beetle""",-30.0


We can perform a horizontal `fold` that combines several columns into one value per row, which is similar to using `functools.reduce`

In [27]:
df.select(
    [
        "A",
        "B",
        # Per row: A + B + B^2 + A/2, accumulated from a starting value of 0.
        pl.fold(
            0,
            lambda acc, term: acc + term,
            [col("A"), "B", col("B") ** 2, col("A") / 2],
        ).alias("fold"),
    ]
)

A,B,fold
i64,i64,f64
1,5,31.5
2,4,23.0
3,3,16.5
4,2,12.0
5,1,9.5


## Aggregation context

Aggregations are applied over groups instead of columns

In [29]:
# Show the full example frame for reference.
df

A,fruits,B,cars,optional
i64,str,i64,str,i64
1,"""banana""",5,"""beetle""",28.0
2,"""banana""",4,"""audi""",300.0
3,"""apple""",3,"""beetle""",
4,"""apple""",2,"""beetle""",2.0
5,"""banana""",1,"""beetle""",-30.0


In [34]:
(
    df.sort("cars")
    .groupby("fruits")
    .agg(
        [
            col("B").sum().alias("B_sum"),
            pl.sum("B").alias("B_sum2"),  # shorthand for the expression above
            pl.first("cars").alias("cars_first"),
            pl.first("fruits").alias("fruit_first"),  # the grouping key can be used too
            pl.count("A").alias("A_count"),
            col("cars").reverse(),  # not reduced: yields a list per group
        ]
    )
)

fruits,B_sum,B_sum2,cars_first,fruit_first,A_count,cars
str,i64,i64,str,str,u32,list[str]
"""banana""",10,10,"""audi""","""banana""",3,"[""beetle"", ""beetle"", ""audi""]"
"""apple""",5,5,"""beetle""","""apple""",2,"[""beetle"", ""beetle""]"


We can also explode the aggregated list `cars`

In [35]:
(
    df.sort("cars")
    .groupby("fruits")
    .agg(
        [
            col("B").sum().alias("B_sum"),
            pl.sum("B").alias("B_sum2"),  # shorthand for the expression above
            pl.first("cars").alias("cars_first"),
            pl.first("fruits").alias("fruit_first"),  # the grouping key can be used too
            pl.count("A").alias("A_count"),
            col("cars").reverse(),
        ]
    )
    # Unnest the list column: one output row per list element.
    .explode("cars")
)

fruits,B_sum,B_sum2,cars_first,fruit_first,A_count,cars
str,i64,i64,str,str,u32,str
"""banana""",10,10,"""audi""","""banana""",3,"""beetle"""
"""banana""",10,10,"""audi""","""banana""",3,"""beetle"""
"""banana""",10,10,"""audi""","""banana""",3,"""audi"""
"""apple""",5,5,"""beetle""","""apple""",2,"""beetle"""
"""apple""",5,5,"""beetle""","""apple""",2,"""beetle"""


In [37]:
# Collect the cars of each group twice: in sorted order and reversed within the group.
(
    df.sort("cars")
    .groupby("fruits")
    .agg(
        [
            col("cars"),
            col("cars").reverse().alias("cars_reversed"),
        ]
    )
)

fruits,cars,cars_reversed
str,list[str],list[str]
"""banana""","[""audi"", ""beetle"", ""beetle""]","[""beetle"", ""beetle"", ""audi""]"
"""apple""","[""beetle"", ""beetle""]","[""beetle"", ""beetle""]"


We can do predicates in the `groupby` too

In [42]:
(
    df.groupby("fruits")
    .agg(
        [
            # Filter within each group, then gather the remaining values into a list.
            col("B").filter(col("B") > 1).list(),
        ]
    )
)

fruits,B
str,list[i64]
"""banana""","[5, 4]"
"""apple""","[3, 2]"


In [43]:
(
    df.groupby("fruits")
    .agg(
        [
            # Filter within each group, then average the remaining values.
            col("B").filter(col("B") > 1).mean(),
        ]
    )
)

fruits,B
str,f64
"""banana""",4.5
"""apple""",2.5


Shift + fill

In [44]:
(
    df.groupby("fruits")
    .agg(
        [
            # Shift B by one position inside each group, filling the gap with 0.
            col("B").shift_and_fill(1, fill_value=0).alias("shifted"),
            # The same shifted values, reduced to a single sum per group.
            col("B").shift_and_fill(1, fill_value=0).sum().alias("shifted_sum"),
        ]
    )
)

fruits,shifted,shifted_sum
str,list[i64],i64
"""apple""","[0, 3]",3
"""banana""","[0, 5, 4]",9


## Window function

Aggregations in a selection context. This lets you perform multiple groupby/aggregations simultaneously.

In [45]:
# Show the full example frame for reference.
df

A,fruits,B,cars,optional
i64,str,i64,str,i64
1,"""banana""",5,"""beetle""",28.0
2,"""banana""",4,"""audi""",300.0
3,"""apple""",3,"""beetle""",
4,"""apple""",2,"""beetle""",2.0
5,"""banana""",1,"""beetle""",-30.0


In [47]:
df.select(
    [
        "fruits",
        "cars",
        "B",
        # Two independent groupings in one projection; each row receives its group's total.
        col("B").sum().over("fruits").alias("B_sum_by_fruits"),
        col("B").sum().over("cars").alias("B_sum_by_cars"),
    ]
)

fruits,cars,B,B_sum_by_fruits,B_sum_by_cars
str,str,i64,i64,i64
"""banana""","""beetle""",5,10,11
"""banana""","""audi""",4,10,4
"""apple""","""beetle""",3,5,11
"""apple""","""beetle""",2,5,11
"""banana""","""beetle""",1,10,11


In [49]:
# Reverse B within each fruit group while keeping the rows in their original positions.
df.select(
    [
        "fruits",
        "B",
        col("B").reverse().over("fruits").alias("B_reversed_by_fruits"),
    ]
)

fruits,B,B_reversed_by_fruits
str,i64,i64
"""banana""",5,1
"""banana""",4,4
"""apple""",3,2
"""apple""",2,3
"""banana""",1,5


In [52]:
df.select(
    [
        "fruits",
        "B",
        # Previous B value within the same fruit group; the first row of each group is null.
        col("B").shift().over("fruits").alias("lag_B_by_fruit"),
    ]
)

fruits,B,lag_B_by_fruit
str,i64,i64
"""banana""",5,
"""banana""",4,5.0
"""apple""",3,
"""apple""",2,3.0
"""banana""",1,4.0
