### Polars

In [1]:
!pip install polars -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [1]:
import polars as pl

In [2]:
#v 0.20.5

In [3]:
df = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "string": [
            "one",
            "two",
            "three",
        ],
        "float": [4.0, 5.0, 6.0],
    }
)

print(df)

shape: (3, 3)
┌─────────┬────────┬───────┐
│ integer ┆ string ┆ float │
│ ---     ┆ ---    ┆ ---   │
│ i64     ┆ str    ┆ f64   │
╞═════════╪════════╪═══════╡
│ 1       ┆ one    ┆ 4.0   │
│ 2       ┆ two    ┆ 5.0   │
│ 3       ┆ three  ┆ 6.0   │
└─────────┴────────┴───────┘


> Basics

- datatypes: enum and categorial

In [4]:
## enum: when known
e_dtype = pl.Enum(["Fire", "Water", "Air"])
e_series = pl.Series(["Fire", "Air", "Water", "Fire", "Water"], dtype=e_dtype)
e2_series = pl.Series(["Fire", "Air", "Water", "Fire", "Water"], dtype=e_dtype)
print(e_series.append(e2_series))

shape: (10,)
Series: '' [enum]
[
	"Fire"
	"Air"
	"Water"
	"Fire"
	"Water"
	"Fire"
	"Air"
	"Water"
	"Fire"
	"Water"
]


In [5]:
try:
    e3_series = pl.Series(["Air", "Water", "Land", "Fire", "Fire"], dtype=e_dtype)
except Exception as e:
    print(e)
## raised OutOfBounds error as "Land" is not defined in enum type

value 'Land' is not present in Enum: LargeUtf8Array[Fire, Water, Air]


In [6]:
# compare
dtype = pl.Enum(["Red", "Blue", "Pink"])
c_series = pl.Series(["Red", "Pink", "Blue"], dtype=dtype)
c2_series = pl.Series(["Pink", "Blue", "Blue"], dtype=dtype)

print(c_series == c2_series) # F F T

shape: (3,)
Series: '' [bool]
[
	false
	false
	true
]


In [7]:
print(c_series <= "Blue")

shape: (3,)
Series: '' [bool]
[
	true
	false
	true
]


In [8]:
## categorial: when unknown
cat_series = pl.Series(["Air", "Water", "Land", "Fire", "Fire"],
                       dtype=pl.Categorical)
cat2_series = pl.Series(["Water", "Air", "Land", "Fire", "Fire"], 
                        dtype=pl.Categorical)
print(cat_series.append(cat2_series)) 

shape: (10,)
Series: '' [cat]
[
	"Air"
	"Water"
	"Land"
	"Fire"
	"Fire"
	"Water"
	"Air"
	"Land"
	"Fire"
	"Fire"
]


  print(cat_series.append(cat2_series))


In [9]:
pl.enable_string_cache()

In [10]:
with pl.StringCache():
    c_series = pl.Series(["A", "C", "P"], dtype=pl.Categorical)
    c2_series = pl.Series(["P", "C", "A"], dtype=pl.Categorical)
    print(c_series == c2_series)

shape: (3,)
Series: '' [bool]
[
	false
	true
	false
]


#### context

In [12]:
import numpy as np

In [13]:
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["rio", "kio", "lio", "zio", None],
        "random": np.random.rand(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
print(df)

shape: (5, 4)
┌──────┬───────┬──────────┬────────┐
│ nrs  ┆ names ┆ random   ┆ groups │
│ ---  ┆ ---   ┆ ---      ┆ ---    │
│ i64  ┆ str   ┆ f64      ┆ str    │
╞══════╪═══════╪══════════╪════════╡
│ 1    ┆ rio   ┆ 0.202561 ┆ A      │
│ 2    ┆ kio   ┆ 0.0136   ┆ A      │
│ 3    ┆ lio   ┆ 0.173634 ┆ B      │
│ null ┆ zio   ┆ 0.487294 ┆ C      │
│ 5    ┆ null  ┆ 0.762614 ┆ B      │
└──────┴───────┴──────────┴────────┘


In [15]:
## selection: expression on col
## produce same lenght series or of length 1
## may produce new col accordingly

out = df.select(
    pl.sum("nrs"),
    pl.col("names").sort(),
    pl.col("names").first().alias("first name"),
    (pl.mean("nrs") * 10).alias("10xnrs"),
)
print(out)

shape: (5, 4)
┌─────┬───────┬────────────┬────────┐
│ nrs ┆ names ┆ first name ┆ 10xnrs │
│ --- ┆ ---   ┆ ---        ┆ ---    │
│ i64 ┆ str   ┆ str        ┆ f64    │
╞═════╪═══════╪════════════╪════════╡
│ 11  ┆ null  ┆ rio        ┆ 27.5   │
│ 11  ┆ kio   ┆ rio        ┆ 27.5   │
│ 11  ┆ lio   ┆ rio        ┆ 27.5   │
│ 11  ┆ rio   ┆ rio        ┆ 27.5   │
│ 11  ┆ zio   ┆ rio        ┆ 27.5   │
└─────┴───────┴────────────┴────────┘


In [16]:
## with columns
## retains original cols

df = df.with_columns(
    pl.sum("nrs").alias("nrs_sum"),
    pl.col("random").count().alias("count"),
)
print(df)

shape: (5, 6)
┌──────┬───────┬──────────┬────────┬─────────┬───────┐
│ nrs  ┆ names ┆ random   ┆ groups ┆ nrs_sum ┆ count │
│ ---  ┆ ---   ┆ ---      ┆ ---    ┆ ---     ┆ ---   │
│ i64  ┆ str   ┆ f64      ┆ str    ┆ i64     ┆ u32   │
╞══════╪═══════╪══════════╪════════╪═════════╪═══════╡
│ 1    ┆ rio   ┆ 0.202561 ┆ A      ┆ 11      ┆ 5     │
│ 2    ┆ kio   ┆ 0.0136   ┆ A      ┆ 11      ┆ 5     │
│ 3    ┆ lio   ┆ 0.173634 ┆ B      ┆ 11      ┆ 5     │
│ null ┆ zio   ┆ 0.487294 ┆ C      ┆ 11      ┆ 5     │
│ 5    ┆ null  ┆ 0.762614 ┆ B      ┆ 11      ┆ 5     │
└──────┴───────┴──────────┴────────┴─────────┴───────┘


In [17]:
## filtering 
## evaluates df on boolean data types

out = df.filter(pl.col("nrs") > 2)
print(out)

shape: (2, 6)
┌─────┬───────┬──────────┬────────┬─────────┬───────┐
│ nrs ┆ names ┆ random   ┆ groups ┆ nrs_sum ┆ count │
│ --- ┆ ---   ┆ ---      ┆ ---    ┆ ---     ┆ ---   │
│ i64 ┆ str   ┆ f64      ┆ str    ┆ i64     ┆ u32   │
╞═════╪═══════╪══════════╪════════╪═════════╪═══════╡
│ 3   ┆ lio   ┆ 0.173634 ┆ B      ┆ 11      ┆ 5     │
│ 5   ┆ null  ┆ 0.762614 ┆ B      ┆ 11      ┆ 5     │
└─────┴───────┴──────────┴────────┴─────────┴───────┘


In [19]:
## group_by

out = df.group_by("groups").agg(
    pl.sum("nrs"),  # sum nrs by groups
    pl.col("random").count().alias("count"),  # count group members
    # sum random where name != null
    pl.col("random").filter(pl.col("names").is_not_null()).sum().name.suffix("_sum"),
    pl.col("names").reverse().alias("reversed names"),
)
print(out)

shape: (3, 5)
┌────────┬─────┬───────┬────────────┬────────────────┐
│ groups ┆ nrs ┆ count ┆ random_sum ┆ reversed names │
│ ---    ┆ --- ┆ ---   ┆ ---        ┆ ---            │
│ str    ┆ i64 ┆ u32   ┆ f64        ┆ list[str]      │
╞════════╪═════╪═══════╪════════════╪════════════════╡
│ C      ┆ 0   ┆ 1     ┆ 0.487294   ┆ ["zio"]        │
│ A      ┆ 3   ┆ 2     ┆ 0.216161   ┆ ["kio", "rio"] │
│ B      ┆ 8   ┆ 2     ┆ 0.173634   ┆ [null, "lio"]  │
└────────┴─────┴───────┴────────────┴────────────────┘


In [23]:
## expression (can use many combinations)
pl.col("names").sort().head(2)

In [24]:
df

nrs,names,random,groups,nrs_sum,count
i64,str,f64,str,i64,u32
1.0,"""rio""",0.202561,"""A""",11,5
2.0,"""kio""",0.0136,"""A""",11,5
3.0,"""lio""",0.173634,"""B""",11,5
,"""zio""",0.487294,"""C""",11,5
5.0,,0.762614,"""B""",11,5


In [28]:
df.select(pl.col("nrs").sort().head(3), pl.col("names").filter(pl.col("nrs") == 1).sum())

nrs,names
i64,str
,
1.0,
2.0,
