In [291]:
import polars as pl
import numpy as np
import pandas as pd

np.random.seed(0)

In [6]:
# Hand the NumPy array to Polars directly: going through `.tolist()` would first
# materialise 100 million Python int objects, which is slow and memory-hungry.
# The explicit int64 dtype keeps the column Int64 on every platform.
df = pl.DataFrame(
    {"numbers": np.random.randint(0, 100_000_000, 100_000_000, dtype=np.int64)}
)

In [7]:
%%timeit -n 1 -r 5

result = df.select(
    pl.col("numbers").n_unique().alias("n_unique"),
    # pl.col("numbers").approx_n_unique().alias("approx_n_unique"),
)

1.25 s ± 108 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [8]:
%%timeit -n 1 -r 5

result = df.select(
    # pl.col("numbers").n_unique().alias("n_unique"),
    pl.col("numbers").approx_n_unique().alias("approx_n_unique"),
)

272 ms ± 4.84 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [22]:
# Smaller frame so the exact and approximate counts can be printed side by side.
sample = np.random.randint(0, 100_000, 100_000).tolist()
df = pl.DataFrame({"numbers": sample})

numbers = pl.col("numbers")
result = df.select(
    n_unique=numbers.n_unique(),
    approx_n_unique=numbers.approx_n_unique(),
)

print(result)

shape: (1, 2)
┌──────────┬─────────────────┐
│ n_unique ┆ approx_n_unique │
│ ---      ┆ ---             │
│ u32      ┆ u32             │
╞══════════╪═════════════════╡
│ 63319    ┆ 63192           │
└──────────┴─────────────────┘


In [23]:
# Show the column names and dtypes of the current frame.
df.schema

Schema([('numbers', Int64)])

In [24]:
# A plain string in `select` is interpreted as a column name.
df.select("numbers")

numbers
i64
26405
99043
4488
6144
69291
…
74863
72030
13152
87616


In [29]:
# Label each number "Par" (even) or "Impar" (odd).
# The output column is named exactly once, by the keyword passed to
# `with_columns`; an extra `.alias(...)` on the expression would be overridden
# by that keyword and only obscure where the name comes from.
conditional = (
    pl.when(pl.col("numbers") % 2 == 0)
    .then(pl.lit("Par"))
    .otherwise(pl.lit("Impar"))
)

df.with_columns(Parity=conditional)

numbers,Parity
i64,str
26405,"""Impar"""
99043,"""Impar"""
4488,"""Par"""
6144,"""Par"""
69291,"""Impar"""
…,…
74863,"""Impar"""
72030,"""Par"""
13152,"""Par"""
87616,"""Par"""


In [32]:
# Price data for five tickers, one list per column.
stock_snapshot = {  # As of 14th October 2024, ~3pm UTC
    "ticker": ["AAPL", "NVDA", "MSFT", "GOOG", "AMZN"],
    "company_name": ["Apple", "NVIDIA", "Microsoft", "Alphabet (Google)", "Amazon"],
    "price": [229.9, 138.93, 420.56, 166.41, 188.4],
    "day_high": [231.31, 139.6, 424.04, 167.62, 189.83],
    "day_low": [228.6, 136.3, 417.52, 164.78, 188.44],
    "year_high": [237.23, 140.76, 468.35, 193.31, 201.2],
    "year_low": [164.08, 39.23, 324.39, 121.46, 118.35],
}
df = pl.DataFrame(stock_snapshot)

print(df.head())

shape: (5, 7)
┌────────┬───────────────────┬────────┬──────────┬─────────┬───────────┬──────────┐
│ ticker ┆ company_name      ┆ price  ┆ day_high ┆ day_low ┆ year_high ┆ year_low │
│ ---    ┆ ---               ┆ ---    ┆ ---      ┆ ---     ┆ ---       ┆ ---      │
│ str    ┆ str               ┆ f64    ┆ f64      ┆ f64     ┆ f64       ┆ f64      │
╞════════╪═══════════════════╪════════╪══════════╪═════════╪═══════════╪══════════╡
│ AAPL   ┆ Apple             ┆ 229.9  ┆ 231.31   ┆ 228.6   ┆ 237.23    ┆ 164.08   │
│ NVDA   ┆ NVIDIA            ┆ 138.93 ┆ 139.6    ┆ 136.3   ┆ 140.76    ┆ 39.23    │
│ MSFT   ┆ Microsoft         ┆ 420.56 ┆ 424.04   ┆ 417.52  ┆ 468.35    ┆ 324.39   │
│ GOOG   ┆ Alphabet (Google) ┆ 166.41 ┆ 167.62   ┆ 164.78  ┆ 193.31    ┆ 121.46   │
│ AMZN   ┆ Amazon            ┆ 188.4  ┆ 189.83   ┆ 188.44  ┆ 201.2     ┆ 118.35   │
└────────┴───────────────────┴────────┴──────────┴─────────┴───────────┴──────────┘


In [33]:
eur_usd_rate = 1.09  # As of 14th October 2024

# Divide each explicitly named price column by the rate, rounded to 2 decimals.
price_columns = ["price", "day_high", "day_low", "year_high", "year_low"]
result = df.with_columns((pl.col(price_columns) / eur_usd_rate).round(2))
print(result)

shape: (5, 7)
┌────────┬───────────────────┬────────┬──────────┬─────────┬───────────┬──────────┐
│ ticker ┆ company_name      ┆ price  ┆ day_high ┆ day_low ┆ year_high ┆ year_low │
│ ---    ┆ ---               ┆ ---    ┆ ---      ┆ ---     ┆ ---       ┆ ---      │
│ str    ┆ str               ┆ f64    ┆ f64      ┆ f64     ┆ f64       ┆ f64      │
╞════════╪═══════════════════╪════════╪══════════╪═════════╪═══════════╪══════════╡
│ AAPL   ┆ Apple             ┆ 210.92 ┆ 212.21   ┆ 209.72  ┆ 217.64    ┆ 150.53   │
│ NVDA   ┆ NVIDIA            ┆ 127.46 ┆ 128.07   ┆ 125.05  ┆ 129.14    ┆ 35.99    │
│ MSFT   ┆ Microsoft         ┆ 385.83 ┆ 389.03   ┆ 383.05  ┆ 429.68    ┆ 297.61   │
│ GOOG   ┆ Alphabet (Google) ┆ 152.67 ┆ 153.78   ┆ 151.17  ┆ 177.35    ┆ 111.43   │
│ AMZN   ┆ Amazon            ┆ 172.84 ┆ 174.16   ┆ 172.88  ┆ 184.59    ┆ 108.58   │
└────────┴───────────────────┴────────┴──────────┴─────────┴───────────┴──────────┘


In [34]:
# Same conversion, but the columns are picked by dtype instead of by name.
float_columns = pl.col(pl.Float64)
result = df.with_columns((float_columns / eur_usd_rate).round(2))
print(result)

shape: (5, 7)
┌────────┬───────────────────┬────────┬──────────┬─────────┬───────────┬──────────┐
│ ticker ┆ company_name      ┆ price  ┆ day_high ┆ day_low ┆ year_high ┆ year_low │
│ ---    ┆ ---               ┆ ---    ┆ ---      ┆ ---     ┆ ---       ┆ ---      │
│ str    ┆ str               ┆ f64    ┆ f64      ┆ f64     ┆ f64       ┆ f64      │
╞════════╪═══════════════════╪════════╪══════════╪═════════╪═══════════╪══════════╡
│ AAPL   ┆ Apple             ┆ 210.92 ┆ 212.21   ┆ 209.72  ┆ 217.64    ┆ 150.53   │
│ NVDA   ┆ NVIDIA            ┆ 127.46 ┆ 128.07   ┆ 125.05  ┆ 129.14    ┆ 35.99    │
│ MSFT   ┆ Microsoft         ┆ 385.83 ┆ 389.03   ┆ 383.05  ┆ 429.68    ┆ 297.61   │
│ GOOG   ┆ Alphabet (Google) ┆ 152.67 ┆ 153.78   ┆ 151.17  ┆ 177.35    ┆ 111.43   │
│ AMZN   ┆ Amazon            ┆ 172.84 ┆ 174.16   ┆ 172.88  ┆ 184.59    ┆ 108.58   │
└────────┴───────────────────┴────────┴──────────┴─────────┴───────────┴──────────┘


In [35]:
# Only columns whose names match the regex (those starting with "day") are rescaled.
day_columns = pl.col("^day.*$")
result = df.with_columns((day_columns / eur_usd_rate).round(2))
print(result)

shape: (5, 7)
┌────────┬───────────────────┬────────┬──────────┬─────────┬───────────┬──────────┐
│ ticker ┆ company_name      ┆ price  ┆ day_high ┆ day_low ┆ year_high ┆ year_low │
│ ---    ┆ ---               ┆ ---    ┆ ---      ┆ ---     ┆ ---       ┆ ---      │
│ str    ┆ str               ┆ f64    ┆ f64      ┆ f64     ┆ f64       ┆ f64      │
╞════════╪═══════════════════╪════════╪══════════╪═════════╪═══════════╪══════════╡
│ AAPL   ┆ Apple             ┆ 229.9  ┆ 212.21   ┆ 209.72  ┆ 237.23    ┆ 164.08   │
│ NVDA   ┆ NVIDIA            ┆ 138.93 ┆ 128.07   ┆ 125.05  ┆ 140.76    ┆ 39.23    │
│ MSFT   ┆ Microsoft         ┆ 420.56 ┆ 389.03   ┆ 383.05  ┆ 468.35    ┆ 324.39   │
│ GOOG   ┆ Alphabet (Google) ┆ 166.41 ┆ 153.78   ┆ 151.17  ┆ 193.31    ┆ 121.46   │
│ AMZN   ┆ Amazon            ┆ 188.4  ┆ 174.16   ┆ 172.88  ┆ 201.2     ┆ 118.35   │
└────────┴───────────────────┴────────┴──────────┴─────────┴───────────┴──────────┘


In [47]:
# Keep every column except the "day*" ones and tag the survivors with a prefix.
non_day_columns = pl.all().exclude("^day.*$")
result = df.select(non_day_columns.name.prefix("not a day: "))
print(result)

shape: (5, 5)
┌───────────────────┬───────────────────┬──────────────────┬────────────┬─────────────────────┐
│ not a day: ticker ┆ not a day:        ┆ not a day: price ┆ not a day: ┆ not a day: year_low │
│ ---               ┆ company_name      ┆ ---              ┆ year_high  ┆ ---                 │
│ str               ┆ ---               ┆ f64              ┆ ---        ┆ f64                 │
│                   ┆ str               ┆                  ┆ f64        ┆                     │
╞═══════════════════╪═══════════════════╪══════════════════╪════════════╪═════════════════════╡
│ AAPL              ┆ Apple             ┆ 229.9            ┆ 237.23     ┆ 164.08              │
│ NVDA              ┆ NVIDIA            ┆ 138.93           ┆ 140.76     ┆ 39.23               │
│ MSFT              ┆ Microsoft         ┆ 420.56           ┆ 468.35     ┆ 324.39              │
│ GOOG              ┆ Alphabet (Google) ┆ 166.41           ┆ 193.31     ┆ 121.46              │
│ AMZN              ┆ Amaz

In [52]:
def amplitude_expressions(time_periods):
    """Lazily yield one "high minus low" expression per time period.

    For each prefix in ``time_periods`` the yielded expression subtracts the
    ``<prefix>_low`` column from the ``<prefix>_high`` column and is named
    ``<prefix>_amplitude``.
    """
    for period in time_periods:
        high = pl.col(f"{period}_high")
        low = pl.col(f"{period}_low")
        amplitude = high - low
        yield amplitude.alias(f"{period}_amplitude")

# `with_columns` accepts the generator of expressions directly.
amplitude_columns = amplitude_expressions(["day", "year"])
result = df.with_columns(amplitude_columns)
print(result)

shape: (5, 9)
┌────────┬──────────────┬────────┬──────────┬───┬───────────┬──────────┬─────────────┬─────────────┐
│ ticker ┆ company_name ┆ price  ┆ day_high ┆ … ┆ year_high ┆ year_low ┆ day_amplitu ┆ year_amplit │
│ ---    ┆ ---          ┆ ---    ┆ ---      ┆   ┆ ---       ┆ ---      ┆ de          ┆ ude         │
│ str    ┆ str          ┆ f64    ┆ f64      ┆   ┆ f64       ┆ f64      ┆ ---         ┆ ---         │
│        ┆              ┆        ┆          ┆   ┆           ┆          ┆ f64         ┆ f64         │
╞════════╪══════════════╪════════╪══════════╪═══╪═══════════╪══════════╪═════════════╪═════════════╡
│ AAPL   ┆ Apple        ┆ 229.9  ┆ 231.31   ┆ … ┆ 237.23    ┆ 164.08   ┆ 2.71        ┆ 73.15       │
│ NVDA   ┆ NVIDIA       ┆ 138.93 ┆ 139.6    ┆ … ┆ 140.76    ┆ 39.23    ┆ 3.3         ┆ 101.53      │
│ MSFT   ┆ Microsoft    ┆ 420.56 ┆ 424.04   ┆ … ┆ 468.35    ┆ 324.39   ┆ 6.52        ┆ 143.96      │
│ GOOG   ┆ Alphabet     ┆ 166.41 ┆ 167.62   ┆ … ┆ 193.31    ┆ 121.46   ┆ 2.84

In [64]:
import polars.selectors as cs

# Passing a data type instance to `select` raises; the message is printed
# instead of letting the cell fail.
try:
    df.select(pl.String())
except TypeError as error:
    print(error)

print("================")
# The selector is the working way to pick all string columns.
string_columns = cs.string()
df.select(string_columns)

cannot create expression literal for value of type String.

Hint: Pass `allow_object=True` to accept any value and create a literal of type Object.


ticker,company_name
str,str
"""AAPL""","""Apple"""
"""NVDA""","""NVIDIA"""
"""MSFT""","""Microsoft"""
"""GOOG""","""Alphabet (Google)"""
"""AMZN""","""Amazon"""


In [67]:
# The selector itself is a selector; its `as_expr()` result is not.
selector = cs.starts_with("adorei")
print(cs.is_selector(selector))
print(cs.is_selector(selector.as_expr()))

True
False


In [69]:
people = pl.DataFrame(
    {
        "name": ["Anna", "Bob"],
        "has_partner": [True, False],
        "has_kids": [False, False],
        "has_tattoos": [True, False],
        "is_alive": [True, True],
    }
)

# Resolve the selector against the frame to see which column names it matches.
has_columns = cs.expand_selector(people, cs.starts_with("has_"))
print(has_columns)

('has_partner', 'has_kids', 'has_tattoos')


In [87]:
# Baseline: 100,000 random floats stored with the default float dtype.
random_values = np.random.random(100_000).tolist()
df = pl.DataFrame({"numbers": random_values})
print(df.schema)
print(f"{df.estimated_size('kb'):,} kb")

Schema({'numbers': Float64})
781.25 kb


In [88]:
# Same amount of data, cast to Float32 before measuring the size.
df = pl.DataFrame({"numbers": np.random.random(100_000).tolist()}).select(
    pl.col("numbers").cast(pl.Float32)
)
print(df.schema)
print(f"{df.estimated_size('kb'):,} kb")

Schema({'numbers': Float32})
390.625 kb


In [92]:
from datetime import date, datetime, time

# Two rows: a reference point and a later point, for each temporal dtype.
df = pl.DataFrame(
    {
        "date": [date(1970, 1, 1), date(1970, 1, 10)],  # epoch, 9 days later
        "datetime": [
            datetime(1970, 1, 1, 0, 0, 0),  # epoch
            datetime(1970, 1, 10, 0, 0, 0),  # 9 days later
        ],
        "time": [time(0, 0, 0), time(0, 0, 1)],  # reference time, 1 second later
    }
)

# Casting to Int64 exposes the integer behind each temporal value.
result = df.select(
    days_since_epoch=pl.col("date").cast(pl.Int64),
    us_since_epoch=pl.col("datetime").cast(pl.Int64),
    ns_since_midnight=pl.col("time").cast(pl.Int64),
)
print(result)


shape: (2, 3)
┌──────────────────┬────────────────┬───────────────────┐
│ days_since_epoch ┆ us_since_epoch ┆ ns_since_midnight │
│ ---              ┆ ---            ┆ ---               │
│ i64              ┆ i64            ┆ i64               │
╞══════════════════╪════════════════╪═══════════════════╡
│ 0                ┆ 0              ┆ 0                 │
│ 9                ┆ 777600000000   ┆ 1000000000        │
└──────────────────┴────────────────┴───────────────────┘


In [95]:
df = pl.DataFrame({"name": ["João", "Marko"]})

# Byte length and character length differ for non-ASCII text such as "ã".
name = pl.col("name")
df.with_columns(
    size_in_bytes=name.str.len_bytes(),
    size_in_letters=name.str.len_chars(),
)

name,size_in_bytes,size_in_letters
str,u32,u32
"""João""",5,4
"""Marko""",5,5


In [96]:
# Each station reports its readings as one space-separated string;
# tokens starting with "E" stand in for readings that are not numbers.
station_names = [f"Station {idx}" for idx in range(1, 6)]
raw_readings = [
    "20 5 5 E1 7 13 19 9 6 20",
    "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40",
    "19 24 E9 16 6 12 10 22",
    "E2 E0 15 7 8 10 E1 24 17 13 6",
    "14 8 E0 16 22 24 E1",
]
weather = pl.DataFrame({"station": station_names, "temperatures": raw_readings})

print(weather)

shape: (5, 2)
┌───────────┬─────────────────────────────────┐
│ station   ┆ temperatures                    │
│ ---       ┆ ---                             │
│ str       ┆ str                             │
╞═══════════╪═════════════════════════════════╡
│ Station 1 ┆ 20 5 5 E1 7 13 19 9 6 20        │
│ Station 2 ┆ 18 8 16 11 23 E2 8 E2 E2 E2 90… │
│ Station 3 ┆ 19 24 E9 16 6 12 10 22          │
│ Station 4 ┆ E2 E0 15 7 8 10 E1 24 17 13 6   │
│ Station 5 ┆ 14 8 E0 16 22 24 E1             │
└───────────┴─────────────────────────────────┘


In [109]:
# Build the split expression once and reuse it for both new columns.
split_temperatures = pl.col("temperatures").str.split(by=" ")
weather = weather.with_columns(
    temperatures_list=split_temperatures,
    size=split_temperatures.list.len(),
)
print(weather)

shape: (5, 4)
┌───────────┬─────────────────────────────────┬──────────────────────┬──────┐
│ station   ┆ temperatures                    ┆ temperatures_list    ┆ size │
│ ---       ┆ ---                             ┆ ---                  ┆ ---  │
│ str       ┆ str                             ┆ list[str]            ┆ u32  │
╞═══════════╪═════════════════════════════════╪══════════════════════╪══════╡
│ Station 1 ┆ 20 5 5 E1 7 13 19 9 6 20        ┆ ["20", "5", … "20"]  ┆ 10   │
│ Station 2 ┆ 18 8 16 11 23 E2 8 E2 E2 E2 90… ┆ ["18", "8", … "40"]  ┆ 13   │
│ Station 3 ┆ 19 24 E9 16 6 12 10 22          ┆ ["19", "24", … "22"] ┆ 8    │
│ Station 4 ┆ E2 E0 15 7 8 10 E1 24 17 13 6   ┆ ["E2", "E0", … "6"]  ┆ 11   │
│ Station 5 ┆ 14 8 E0 16 22 24 E1             ┆ ["14", "8", … "E1"]  ┆ 7    │
└───────────┴─────────────────────────────────┴──────────────────────┴──────┘


In [135]:
# A reading is an error when the non-strict cast to Int32 yields null.
is_error = pl.element().cast(pl.Int32, strict=False).is_null()
num_errors = pl.col("temperatures_list").list.eval(is_error).list.sum()

# Share of errors among all readings, formatted as a percentage string.
pct_errors = (pl.col("num_errors") / pl.col("size") * 100).round(1).cast(pl.String()) + "%"

weather.with_columns(num_errors=num_errors).with_columns(pct_errors=pct_errors)

station,temperatures,temperatures_list,size,num_errors,pct_errors
str,str,list[str],u32,u32,str
"""Station 1""","""20 5 5 E1 7 13 19 9 6 20""","[""20"", ""5"", … ""20""]",10,1,"""10.0%"""
"""Station 2""","""18 8 16 11 23 E2 8 E2 E2 E2 90…","[""18"", ""8"", … ""40""]",13,4,"""30.8%"""
"""Station 3""","""19 24 E9 16 6 12 10 22""","[""19"", ""24"", … ""22""]",8,1,"""12.5%"""
"""Station 4""","""E2 E0 15 7 8 10 E1 24 17 13 6""","[""E2"", ""E0"", … ""6""]",11,3,"""27.3%"""
"""Station 5""","""14 8 E0 16 22 24 E1""","[""14"", ""8"", … ""E1""]",7,2,"""28.6%"""


In [160]:
weather_by_day = pl.DataFrame(
    {
        "station": [f"Station {idx}" for idx in range(1, 11)],
        "day_1": [17, 11, 8, 22, 9, 21, 20, 8, 8, 17],
        "day_2": [15, 11, 10, 8, 7, 14, 18, 21, 15, 13],
        "day_3": [16, 15, 24, 24, 8, 23, 19, 23, 16, 10],
    }
)

# Collect the per-day columns of each station into a single list column.
day_readings = pl.concat_list(pl.col("^day..$"))
result = weather_by_day.select(day_readings.alias("timeseries"))
print(result)

# Rank inside each list: the highest reading gets rank 1 and ties share the
# lowest rank. The result is an integer rank, not a percentage, so it is named
# accordingly and not rounded (rounding integer ranks changes nothing).
rank_desc = pl.element().rank("min", descending=True)
result = result.select(pl.col("timeseries").list.eval(rank_desc))
print(result)

# Attach the ranks next to the original columns in one step, without creating
# a temporary "timeseries" column that then has to be excluded again.
result = weather_by_day.with_columns(
    temps_rank=day_readings.list.eval(rank_desc, parallel=True),
)
print(result)

shape: (10, 1)
┌──────────────┐
│ timeseries   │
│ ---          │
│ list[i64]    │
╞══════════════╡
│ [17, 15, 16] │
│ [11, 11, 15] │
│ [8, 10, 24]  │
│ [22, 8, 24]  │
│ [9, 7, 8]    │
│ [21, 14, 23] │
│ [20, 18, 19] │
│ [8, 21, 23]  │
│ [8, 15, 16]  │
│ [17, 13, 10] │
└──────────────┘
shape: (10, 1)
┌────────────┐
│ timeseries │
│ ---        │
│ list[u32]  │
╞════════════╡
│ [1, 3, 2]  │
│ [2, 2, 1]  │
│ [3, 2, 1]  │
│ [2, 3, 1]  │
│ [1, 3, 2]  │
│ [2, 3, 1]  │
│ [1, 3, 2]  │
│ [3, 2, 1]  │
│ [3, 2, 1]  │
│ [1, 2, 3]  │
└────────────┘
shape: (10, 5)
┌────────────┬───────┬───────┬───────┬────────────┐
│ station    ┆ day_1 ┆ day_2 ┆ day_3 ┆ temps_rank │
│ ---        ┆ ---   ┆ ---   ┆ ---   ┆ ---        │
│ str        ┆ i64   ┆ i64   ┆ i64   ┆ list[u32]  │
╞════════════╪═══════╪═══════╪═══════╪════════════╡
│ Station 1  ┆ 17    ┆ 15    ┆ 16    ┆ [1, 3, 2]  │
│ Station 2  ┆ 11    ┆ 11    ┆ 15    ┆ [2, 2, 1]  │
│ Station 3  ┆ 8     ┆ 10    ┆ 24    ┆ [3, 2, 1]  │
│ Station 4  ┆ 22    ┆ 8   

In [307]:
# One (100_000, 5) int32 matrix shared by all three frames.
nums = np.random.randint(100, size=(100_000, 5)).astype(np.int32)

df_pandas = pd.DataFrame(nums)
print(df_pandas.shape)

# Same data as a Polars List column (explicit schema) and with the dtype
# Polars infers from the 2-D array.
df_list = pl.DataFrame({"my_nums": nums}, schema={"my_nums": pl.List(pl.Int32)})
df_arr = pl.DataFrame({"my_nums": nums})
for frame in (df_list, df_arr):
    print(frame.schema)

(100000, 5)
Schema({'my_nums': List(Int32)})
Schema({'my_nums': Array(Int32, shape=(5,))})


In [304]:
%%timeit
df_pandas.mean(axis=1)

5.81 ms ± 36.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [306]:
%%timeit
df_list.select(
    pl.col("my_nums").list.max()
)

1.04 ms ± 9.53 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [305]:
%%timeit
df_arr.select(
    pl.col("my_nums").arr.max()
)

830 μs ± 7.45 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
