# Polars
Worked examples following the official Polars getting-started guide: https://docs.pola.rs/user-guide/getting-started/

In [2]:
import polars as pl
import pandas as pd
import numpy as np
import datetime as dt

## Basics

In [3]:
# Build a small example frame: one row per person.
df = pl.DataFrame(
    {
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        "birthdate": [
            dt.date(1997, 1, 10),
            dt.date(1985, 2, 15),
            dt.date(1983, 3, 22),
            dt.date(1981, 4, 30),
        ],
        "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
        "height": [1.56, 1.77, 1.65, 1.75],  # (m)
    }
)

# IO round-trips: https://docs.pola.rs/user-guide/io/
# NOTE: read_csv is called without date parsing, so "birthdate" is expected to
# come back as text after the CSV round-trip; it is cast back to Date below.
df.write_csv("people.csv")
df = pl.read_csv("people.csv")
df.write_parquet("people.parquet")
df = pl.read_parquet("people.parquet")

# pandas round-trip, done in one expression so `df` is never bound to a pandas object
df = pl.from_pandas(df.to_pandas())

# Restore "birthdate" to a Date column. The cast is strict (the default), so a
# value that cannot be converted raises instead of silently becoming null.
df = df.with_columns(pl.col("birthdate").cast(pl.Date))

# Add derived columns, then drop them again
df = df.with_columns(
    (pl.col("weight") / (pl.col("height") ** 2)).alias("BMI"),
    pl.col("birthdate").dt.year().alias("birth_year"),
)
df = df.drop("birth_year", "BMI")

df

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [15]:
# Project a subset of columns and derive new ones with a single select()
last_row_name = df[-1]["name"]  # last row of the frame, then its "name" column
print(f'df[-1]["name"]: {last_row_name}')

birth_year = pl.col("birthdate").dt.year().alias("birth_year")
bmi = (pl.col("weight") / (pl.col("height") ** 2)).alias("bmi")
# One expression applied to two columns; the suffix keeps the output names distinct.
reduced_by_5_percent = (pl.col("weight", "height") * 0.95).round(2).name.suffix("-5%")

result = df.select(pl.col("name"), birth_year, bmi, reduced_by_5_percent)
result

df[-1]["name"]: shape: (1,)
Series: 'name' [str]
[
	"Daniel Donovan"
]


name,birth_year,bmi,weight-5%,height-5%
str,i32,f64,f64,f64
"""Alice Archer""",1997,23.791913,55.01,1.48
"""Ben Brown""",1985,23.141498,68.88,1.68
"""Chloe Cooper""",1983,19.687787,50.92,1.57
"""Daniel Donovan""",1981,27.134694,78.94,1.66


In [6]:
# Extend the frame: with_columns() keeps the existing columns and appends the new ones
derived_columns = [
    pl.col("birthdate").dt.year().alias("birth_year"),
    (pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"),
    (pl.col("weight", "height") * 0.95).round(2).name.suffix("-5%"),
]
result = df.with_columns(derived_columns)
result

name,birthdate,weight,height,birth_year,bmi,weight-5%,height-5%
str,date,f64,f64,i32,f64,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56,1997,23.791913,55.01,1.48
"""Ben Brown""",1985-02-15,72.5,1.77,1985,23.141498,68.88,1.68
"""Chloe Cooper""",1983-03-22,53.6,1.65,1983,19.687787,50.92,1.57
"""Daniel Donovan""",1981-04-30,83.1,1.75,1981,27.134694,78.94,1.66


In [7]:
# Keep only the rows that satisfy every condition.
# The predicates are AND-ed explicitly here; passing them to filter() as
# separate positional arguments is the equivalent spelling.
is_heavy = pl.col("weight") > 70
is_tall = pl.col("height") > 1.7
result = df.filter(is_heavy & is_tall)
result

name,birthdate,weight,height
str,date,f64,f64
"""Ben Brown""",1985-02-15,72.5,1.77
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [8]:
# Aggregate per birth decade (year floored to a multiple of 10)
birth_decade = (pl.col("birthdate").dt.year() // 10 * 10).alias("decade")
by_decade = df.group_by(birth_decade, maintain_order=True)
result = by_decade.agg(
    sample_size=pl.len(),
    avg_weight=pl.col("weight").mean().round(2),
    tallest=pl.col("height").max(),
)
print(result)

shape: (2, 4)
┌────────┬─────────────┬────────────┬─────────┐
│ decade ┆ sample_size ┆ avg_weight ┆ tallest │
│ ---    ┆ ---         ┆ ---        ┆ ---     │
│ i32    ┆ u32         ┆ f64        ┆ f64     │
╞════════╪═════════════╪════════════╪═════════╡
│ 1990   ┆ 1           ┆ 57.9       ┆ 1.56    │
│ 1980   ┆ 3           ┆ 69.73      ┆ 1.77    │
└────────┴─────────────┴────────────┴─────────┘


In [9]:
# Group by and apply calculation to each group
def some_function(group: pl.DataFrame) -> pl.DataFrame:
    """Print one group and return its decade column next to the group's mean BMI.

    The mean is a single value while "decade" is the group's full column, so
    the returned frame has one row per input row, each carrying the same
    ``bmi_mean``.
    """
    print(group)
    bmi = group["weight"] / (group["height"] ** 2)
    columns = {"decade": group["decade"], "bmi_mean": bmi.mean()}
    return pl.DataFrame(columns)
# Materialise the grouping key as a real column first: some_function reads
# group["decade"] from each sub-frame it receives.
with_decade = df.with_columns(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
)
by_decade = with_decade.group_by("decade", maintain_order=True)
result = by_decade.map_groups(some_function)
result

shape: (1, 5)
┌──────────────┬────────────┬────────┬────────┬────────┐
│ name         ┆ birthdate  ┆ weight ┆ height ┆ decade │
│ ---          ┆ ---        ┆ ---    ┆ ---    ┆ ---    │
│ str          ┆ date       ┆ f64    ┆ f64    ┆ i32    │
╞══════════════╪════════════╪════════╪════════╪════════╡
│ Alice Archer ┆ 1997-01-10 ┆ 57.9   ┆ 1.56   ┆ 1990   │
└──────────────┴────────────┴────────┴────────┴────────┘
shape: (3, 5)
┌────────────────┬────────────┬────────┬────────┬────────┐
│ name           ┆ birthdate  ┆ weight ┆ height ┆ decade │
│ ---            ┆ ---        ┆ ---    ┆ ---    ┆ ---    │
│ str            ┆ date       ┆ f64    ┆ f64    ┆ i32    │
╞════════════════╪════════════╪════════╪════════╪════════╡
│ Ben Brown      ┆ 1985-02-15 ┆ 72.5   ┆ 1.77   ┆ 1980   │
│ Chloe Cooper   ┆ 1983-03-22 ┆ 53.6   ┆ 1.65   ┆ 1980   │
│ Daniel Donovan ┆ 1981-04-30 ┆ 83.1   ┆ 1.75   ┆ 1980   │
└────────────────┴────────────┴────────┴────────┴────────┘


decade,bmi_mean
i32,f64
1990,23.791913
1980,23.321326
1980,23.321326
1980,23.321326


In [10]:
# Combine multiple DataFrames with a join on the shared "name" column
family_rows = [
    # (name, parent, siblings)
    ("Ben Brown", True, 1),
    ("Daniel Donovan", False, 2),
    ("Alice Archer", False, 3),
    ("Chloe Cooper", False, 4),
]
names, is_parent, sibling_counts = zip(*family_rows)
df2 = pl.DataFrame(
    {
        "name": list(names),
        "parent": list(is_parent),
        "siblings": list(sibling_counts),
    }
)

result = df.join(df2, on="name", how="left")
result

name,birthdate,weight,height,parent,siblings
str,date,f64,f64,bool,i64
"""Alice Archer""",1997-01-10,57.9,1.56,False,3
"""Ben Brown""",1985-02-15,72.5,1.77,True,1
"""Chloe Cooper""",1983-03-22,53.6,1.65,False,4
"""Daniel Donovan""",1981-04-30,83.1,1.75,False,2


In [11]:
# Concatenate DataFrames: stack a second frame with the same columns underneath df
more_names = ["Ethan Edwards", "Fiona Foster", "Grace Gibson", "Henry Harris"]
more_birthdates = [
    dt.date(year, month, day)
    for year, month, day in [(1977, 5, 10), (1975, 6, 23), (1973, 7, 22), (1971, 8, 3)]
]
more_weights = [67.9, 72.5, 57.6, 93.1]  # (kg)
more_heights = [1.76, 1.6, 1.66, 1.8]  # (m)

df3 = pl.DataFrame(
    {
        "name": more_names,
        "birthdate": more_birthdates,
        "weight": more_weights,
        "height": more_heights,
    }
)

result = pl.concat([df, df3], how="vertical")
result

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75
"""Ethan Edwards""",1977-05-10,67.9,1.76
"""Fiona Foster""",1975-06-23,72.5,1.6
"""Grace Gibson""",1973-07-22,57.6,1.66
"""Henry Harris""",1971-08-03,93.1,1.8


## Performance test

In [12]:
# Synthetic benchmark data: every device has N_METRICS metrics, and every
# (device, metric) series has N_SAMPLES one-minute samples.
N_DEVICES = 10000
N_METRICS = 10
N_SAMPLES = 40
N_WITH_VALUE2 = 19  # only the first samples of each series carry a value2; the rest are None
SEED = 42

# Seeded generator so the pandas and polars results below can be reproduced.
rng = np.random.default_rng(SEED)

# The timestamps are identical for every series, so build them once instead of
# re-creating a Timestamp and a Timedelta for each of the generated rows.
start = pd.Timestamp('2023-10-01')
timestamps = [start + pd.Timedelta(minutes=i) for i in range(N_SAMPLES)]

data = []
for d in range(N_DEVICES):
    for m in range(N_METRICS):
        for i, timestamp in enumerate(timestamps):
            data.append({
                'device': f'device_{d}',
                'metric': f'metric_{d}_{m}',
                'timestamp': timestamp,
                'value': rng.random(),
                'value2': rng.random() if i < N_WITH_VALUE2 else None,
            })

# Time DataFrame construction from the same list of dicts in both libraries.
time1 = dt.datetime.now()
df_pl = pl.DataFrame(data)
time2 = dt.datetime.now()
df_pd = pd.DataFrame(data)
time3 = dt.datetime.now()
print("Polars time:", time2 - time1)
print("Pandas time:", time3 - time2)
len(data)

Polars time: 0:00:01.744126
Pandas time: 0:00:02.001543


4000000

In [13]:
# Pandas GroupBy and Apply
def complex_calculation(group):
    """Summarise one group as a two-entry Series.

    'Weighted_Avg' is sum(value * value2) / sum(value2) and 'Value_Range' is
    max(value) - min(value), both computed over the rows of ``group``.
    """
    values = group['value']
    weights = group['value2']
    summary = {
        'Weighted_Avg': (values * weights).sum() / weights.sum(),
        'Value_Range': values.max() - values.min(),
    }
    return pd.Series(summary)
# Time the per-(device, metric) apply in pandas.
time1 = dt.datetime.now()
per_series = df_pd.groupby(['device', 'metric'], group_keys=False)
result = per_series.apply(complex_calculation, include_groups=False)
time2 = dt.datetime.now()
print("Pandas GroupBy and Apply time:", time2 - time1)

print(result)

Pandas GroupBy and Apply time: 0:00:14.185337
                           Weighted_Avg  Value_Range
device      metric                                  
device_0    metric_0_0         0.477292     0.906841
            metric_0_1         0.483693     0.946895
            metric_0_2         0.477622     0.989749
            metric_0_3         0.433504     0.972337
            metric_0_4         0.624798     0.988115
...                                 ...          ...
device_9999 metric_9999_5      0.355268     0.981846
            metric_9999_6      0.442956     0.945352
            metric_9999_7      0.552146     0.967406
            metric_9999_8      0.469246     0.953041
            metric_9999_9      0.436128     0.947307

[100000 rows x 2 columns]


In [16]:
def complex_calculation(group):
    """Summarise one polars group as a single-row frame.

    The group's 'device_metric' value is split on "___" to recover the
    'device' and 'metric' labels; 'Weighted_Avg' is
    sum(value * value2) / sum(value2) and 'Value_Range' is
    max(value) - min(value).
    """
    values = group['value']
    weights = group['value2']
    weighted_avg = (values * weights).sum() / weights.sum()
    value_range = values.max() - values.min()
    # The key is read from the group's first row.
    key_parts = group['device_metric'][0].split("___")
    return pl.DataFrame({
        'device': key_parts[0],
        'metric': key_parts[1],
        'Weighted_Avg': weighted_avg,
        'Value_Range': value_range
    })
# Step 1: fuse the two key columns into one "device___metric" string column.
time1 = dt.datetime.now()
composite_key = (pl.col("device") + "___" + pl.col("metric")).alias("device_metric")
merged = df_pl.with_columns(composite_key)
time2 = dt.datetime.now()
print("Polars merge columns time:", time2 - time1)

# Step 2: apply the Python callback once per composite key.
by_series = merged.group_by('device_metric', maintain_order=True)
result = by_series.map_groups(complex_calculation)
time3 = dt.datetime.now()
print("Polars GroupBy and Apply time:", time3 - time2)

# Optional follow-up (not executed): sort `result` by ["device", "metric"]
# and time that step separately.
result

Polars merge columns time: 0:00:00.103138
Polars GroupBy and Apply time: 0:00:08.715043


device,metric,Weighted_Avg,Value_Range
str,str,f64,f64
"""device_0""","""metric_0_0""",0.477292,0.906841
"""device_0""","""metric_0_1""",0.483693,0.946895
"""device_0""","""metric_0_2""",0.477622,0.989749
"""device_0""","""metric_0_3""",0.433504,0.972337
"""device_0""","""metric_0_4""",0.624798,0.988115
…,…,…,…
"""device_9999""","""metric_9999_5""",0.355268,0.981846
"""device_9999""","""metric_9999_6""",0.442956,0.945352
"""device_9999""","""metric_9999_7""",0.552146,0.967406
"""device_9999""","""metric_9999_8""",0.469246,0.953041
