# Expressions and contexts

In [1]:
import polars as pl

# A named Series holding the integers 1..5, stored explicitly as unsigned 32-bit.
s = pl.Series(name="my_nums", values=list(range(1, 6)), dtype=pl.UInt32)
print(s)

shape: (5,)
Series: 'my_nums' [u32]
[
	1
	2
	3
	4
	5
]


In [2]:
from datetime import date

# Demo table used by the cells below: one row per person.
people = {
    "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
    "birthdate": [
        date(year, month, day)
        for year, month, day in (
            (1997, 1, 10),
            (1985, 2, 15),
            (1983, 3, 22),
            (1981, 4, 30),
        )
    ],
    "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
    "height": [1.56, 1.77, 1.65, 1.75],  # (m)
}
df = pl.DataFrame(people)

print(df)

shape: (4, 4)
┌────────────────┬────────────┬────────┬────────┐
│ name           ┆ birthdate  ┆ weight ┆ height │
│ ---            ┆ ---        ┆ ---    ┆ ---    │
│ str            ┆ date       ┆ f64    ┆ f64    │
╞════════════════╪════════════╪════════╪════════╡
│ Alice Archer   ┆ 1997-01-10 ┆ 57.9   ┆ 1.56   │
│ Ben Brown      ┆ 1985-02-15 ┆ 72.5   ┆ 1.77   │
│ Chloe Cooper   ┆ 1983-03-22 ┆ 53.6   ┆ 1.65   │
│ Daniel Donovan ┆ 1981-04-30 ┆ 83.1   ┆ 1.75   │
└────────────────┴────────────┴────────┴────────┘


In [3]:
# glimpse() writes its summary to stdout itself and returns None by default,
# so wrapping it in print() only appends a stray "None" line to the output.
df.glimpse()

Rows: 4
Columns: 4
$ name       <str> 'Alice Archer', 'Ben Brown', 'Chloe Cooper', 'Daniel Donovan'
$ birthdate <date> 1997-01-10, 1985-02-15, 1983-03-22, 1981-04-30
$ weight     <f64> 57.9, 72.5, 53.6, 83.1
$ height     <f64> 1.56, 1.77, 1.65, 1.75

None


In [4]:
# `imc` is an expression, not data: weight divided by height squared, cast to
# 32-bit float. Nothing is computed until it is used inside a context
# (select / with_columns / agg ...); printing it only shows the expression tree.
height_squared = pl.col("height").pow(2)
imc = pl.col("weight").truediv(height_squared).cast(pl.Float32)
print(imc)

[(col("weight")) / (col("height").pow([dyn int: 2]))].strict_cast(Float32)


In [5]:
# 1) with_columns: keep every original column and append `imc` (weight / height²).
print(df.with_columns(imc=imc))

# 2) select: only the listed expressions are returned. The aggregations
#    (mean / std) yield a single value that is repeated on every row.
imc_stats = df.select(
    imc=imc,
    avg_imc=imc.mean(),
    imc_std=imc.std(),
    imc_norm=(imc - imc.mean()) / imc.std(),
)
print(imc_stats)

# 3) filter: keep only the rows whose normalized value is non-negative.
print(imc_stats.filter(pl.col("imc_norm") >= 0))


# Group key: birth year floored to its decade (e.g. 1985 -> 1980).
decade = (pl.col("birthdate").dt.year() // 10 * 10).alias("decade")
# Row predicate: names that do NOT start with "D".
name_not_start_with_d = ~(pl.col("name").str.starts_with("D"))
# Mean of weight and height, emitted as `avg_weight` / `avg_height`.
avg_h_w = pl.col("weight", "height").mean().name.prefix("avg_")
print(
    df.filter(name_not_start_with_d)
    # maintain_order=True returns the groups in first-appearance order, so the
    # printed table is the same on every run.
    .group_by(decade, maintain_order=True)
    .agg(
        pl.col("name"),
        # Without an alias this column inherits the root column name "weight",
        # which mislabels the mean of `imc` next to `avg_weight`.
        imc.mean().alias("avg_imc"),
        avg_h_w,
    )
)

shape: (4, 5)
┌────────────────┬────────────┬────────┬────────┬───────────┐
│ name           ┆ birthdate  ┆ weight ┆ height ┆ imc       │
│ ---            ┆ ---        ┆ ---    ┆ ---    ┆ ---       │
│ str            ┆ date       ┆ f64    ┆ f64    ┆ f32       │
╞════════════════╪════════════╪════════╪════════╪═══════════╡
│ Alice Archer   ┆ 1997-01-10 ┆ 57.9   ┆ 1.56   ┆ 23.791914 │
│ Ben Brown      ┆ 1985-02-15 ┆ 72.5   ┆ 1.77   ┆ 23.141499 │
│ Chloe Cooper   ┆ 1983-03-22 ┆ 53.6   ┆ 1.65   ┆ 19.687786 │
│ Daniel Donovan ┆ 1981-04-30 ┆ 83.1   ┆ 1.75   ┆ 27.134693 │
└────────────────┴────────────┴────────┴────────┴───────────┘
shape: (4, 4)
┌───────────┬───────────┬──────────┬───────────┐
│ imc       ┆ avg_imc   ┆ imc_std  ┆ imc_norm  │
│ ---       ┆ ---       ┆ ---      ┆ ---       │
│ f32       ┆ f32       ┆ f32      ┆ f32       │
╞═══════════╪═══════════╪══════════╪═══════════╡
│ 23.791914 ┆ 23.438972 ┆ 3.051929 ┆ 0.115645  │
│ 23.141499 ┆ 23.438972 ┆ 3.051929 ┆ -0.097471 │
│ 19.6877

# Lazy API

In [15]:
# Build a large benchmark CSV: the original header followed by the data rows
# of data/iris.csv repeated 10,000 times.
with open("data/iris.csv", "r") as f:
    # Make sure every line ends with a newline. If the source file has no
    # trailing newline, its last row would otherwise be glued to the first row
    # of the next repetition, corrupting one record per copy.
    data = [line if line.endswith("\n") else line + "\n" for line in f]

header = [data[0]]
c = data[1:]
big_data = c * 10_000

with open("data/iris_big.csv", "w") as f:
    f.writelines(header + big_data)

In [20]:
%%timeit -n 10 -r 5
df = pl.read_csv("data/iris_big.csv")
df_agg = df.filter(
        pl.col("sepal_length") > 5
    ).group_by("species").agg(
        pl.col("sepal_width").mean()
    )

82.4 ms ± 3.14 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)


In [21]:
%%timeit -n 10 -r 5
df = pl.scan_csv("data/iris_big.csv")
df_agg = df.filter(
        pl.col("sepal_length") > 5
    ).group_by("species").agg(
        pl.col("sepal_width").mean()
    ).collect()

64.2 ms ± 2.91 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)


In [26]:
# Build the same lazy query without collecting it, then print the query plan
# that explain() reports for it.
plan = (
    pl.scan_csv("data/iris_big.csv")
    .filter(pl.col("sepal_length") > 5)
    .group_by("species")
    .agg(pl.col("sepal_width").mean())
)

print(plan.explain())

AGGREGATE
	[col("sepal_width").mean()] BY [col("species")] FROM
  simple π 3/3 ["sepal_width", "species", ... 1 other column]
    Csv SCAN [data/iris_big.csv]
    PROJECT 3/5 COLUMNS
    SELECTION: [(col("sepal_length")) > (5.0)]
