In [7]:
import polars as pl
import pathlib
import numpy as np

# Relative location of the Titanic CSV; it is resolved against the notebook's working directory.
path_to_data = pathlib.Path("data/titanic.csv")

In [2]:
# Read the CSV into an eager (in-memory) DataFrame and preview the first three rows.
df = pl.read_csv(path_to_data)
df.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow...","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. ...","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis...","""female""",26.0,0,0,"""STON/O2. 31012...",7.925,,"""S"""


In [3]:
# a single integer inside `[]` selects that row by position; the result is a one-row DataFrame
df[0]

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow...","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""


In [4]:
# a list of integers selects the rows at those positions
df[[2, 3]]

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
3,1,3,"""Heikkinen, Mis...","""female""",26.0,0,0,"""STON/O2. 31012...",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs....","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""


In [5]:
# a slice selects a contiguous run of rows (here: the first two)
df[:2]

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow...","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. ...","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


In [6]:
# a `range` is accepted too and selects the same rows as the list [2, 3]
df[range(2, 4)]

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
3,1,3,"""Heikkinen, Mis...","""female""",26.0,0,0,"""STON/O2. 31012...",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs....","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""


In [8]:
# a NumPy integer array of row positions works as well
df[np.arange(0, 3)]

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow...","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. ...","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis...","""female""",26.0,0,0,"""STON/O2. 31012...",7.925,,"""S"""


In [10]:
# NOTE: `[]` on an eager Polars DataFrame does not accept a boolean mask.
# This cell fails on purpose: the all-True list ends up as a boolean Series,
# which `__getitem__` rejects with a ValueError.
df[[True] * len(df)]

ValueError: Cannot __getitem__ on DataFrame with item: 'shape: (891,)
Series: '' [bool]
[
	true
	true
	true
	true
	true
	true
	true
	true
	true
	true
	true
	true
	...
	true
	true
	true
	true
	true
	true
	true
	true
	true
	true
	true
	true
	true
]' of type: '<class 'polars.internals.series.series.Series'>'.

In [11]:
# so this very familiar pandas idiom (indexing with a boolean comparison) doesn't work either;
# the comparison yields a boolean Series, which `[]` rejects with the same ValueError
df[df["Age"] > 30]

ValueError: Cannot __getitem__ on DataFrame with item: 'shape: (891,)
Series: 'Age' [bool]
[
	false
	true
	false
	true
	true
	null
	true
	false
	false
	false
	false
	true
	...
	null
	true
	false
	true
	false
	false
	false
	true
	false
	false
	null
	false
	true
]' of type: '<class 'polars.internals.series.series.Series'>'.

# Filtering in Eager Mode
The primary way to filter rows in Polars is to use the `filter` method with the Expression API. The main use case of the bracket accessor is to select rows when quickly inspecting data in interactive mode.

In [12]:
# Keep only first-class passengers via an expression, then preview three rows.
df.filter(pl.col('Pclass') == 1).head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. ...","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
4,1,1,"""Futrelle, Mrs....","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
7,0,1,"""McCarthy, Mr. ...","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""


`[]` indexing can **only** be used in eager mode, whereas `filter` can be used in both. Also, `filter` expressions are optimized by the query optimizer in lazy mode.

In [14]:
# Add an explicit, zero-based row number column called "row_nr"; it is placed in front of the other columns.
# NOTE(review): newer Polars releases appear to deprecate `with_row_count` in favour of
# `with_row_index` - confirm against the installed version before upgrading.
df_with_row_number = df.with_row_count(name="row_nr")
df_with_row_number.head(3)

row_nr,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
0,1,0,3,"""Braund, Mr. Ow...","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
1,2,1,1,"""Cumings, Mrs. ...","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
2,3,1,3,"""Heikkinen, Mis...","""female""",26.0,0,0,"""STON/O2. 31012...",7.925,,"""S"""


In [16]:
# With a row number column in place, position-based row selection can be written as a
# `filter` expression. This cell runs on the eager frame, but because it only relies on
# `filter`, the same expression also works in lazy mode, where `[]` indexing is unavailable.
df_with_row_number.filter(pl.col('row_nr') < 4)

row_nr,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
0,1,0,3,"""Braund, Mr. Ow...","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
1,2,1,1,"""Cumings, Mrs. ...","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
2,3,1,3,"""Heikkinen, Mis...","""female""",26.0,0,0,"""STON/O2. 31012...",7.925,,"""S"""
3,4,1,1,"""Futrelle, Mrs....","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""


In eager mode, each chained `.filter` call is executed on its own and produces an intermediate DataFrame.
To get the benefits of Polars, combine the conditions inside a single `.filter()` call with the `&` operator.
The combined predicate is then evaluated in a single pass through the data.

In [17]:
# Naive approach: one `.filter` call per condition.
# In eager mode each call is executed separately, so the Pclass filter produces an
# intermediate DataFrame that the Age filter then processes again.
df_naive_multiple_conditions = df.filter(pl.col('Pclass') == 1).filter(pl.col('Age') > 70)
df_naive_multiple_conditions.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
97,0,1,"""Goldschmidt, M...","""male""",71.0,0,0,"""PC 17754""",34.6542,"""A5""","""C"""
494,0,1,"""Artagaveytia, ...","""male""",71.0,0,0,"""PC 17609""",49.5042,,"""C"""
631,1,1,"""Barkworth, Mr....","""male""",80.0,0,0,"""27042""",30.0,"""A23""","""S"""


In [19]:
# Better: both conditions joined with `&` (logical AND) inside a single `.filter` call.
# The parentheses are required because `&` binds tighter than `==` and `>`.
df_better_multiple_conditions = df.filter(
    (pl.col('Pclass') == 1) & (pl.col('Age') > 70)
)
df_better_multiple_conditions.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
97,0,1,"""Goldschmidt, M...","""male""",71.0,0,0,"""PC 17754""",34.6542,"""A5""","""C"""
494,0,1,"""Artagaveytia, ...","""male""",71.0,0,0,"""PC 17609""",49.5042,,"""C"""
631,1,1,"""Barkworth, Mr....","""male""",80.0,0,0,"""27042""",30.0,"""A23""","""S"""


In [20]:
# Same single-call pattern, but the conditions are joined with `|` (logical OR):
# a row is kept when it is first class OR the passenger is older than 70.
df_or_better_multiple_conditions = df.filter(
    (pl.col('Pclass') == 1) | (pl.col('Age') > 70)
)
df_or_better_multiple_conditions.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. ...","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
4,1,1,"""Futrelle, Mrs....","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
7,0,1,"""McCarthy, Mr. ...","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""


# Filtering in Lazy Mode

In [22]:
# `scan_csv` sets up a lazy query over the file instead of loading it up front like `read_csv`.
df_lazy = pl.scan_csv(path_to_data)
df_lazy

In [23]:
# Attach a filter to the lazy query; the result is again a lazy query, not filtered data.
df_with_lazy_filter = df_lazy.filter(pl.col("Age") > 30)
df_with_lazy_filter

In [25]:
# The optimized plan is returned as a multi-line string; `print` renders it line by line.
print(df_with_lazy_filter.describe_optimized_plan())

  CSV SCAN data/titanic.csv
  PROJECT */12 COLUMNS
  SELECTION: [(col("Age")) > (30f64)]



The optimized plan below shows predicate pushdown at work: the two chained `filter` calls are combined into a single `SELECTION` that is applied during the CSV scan, i.e. in a single pass over the data.

In [26]:
# Two chained filters on the lazy query; the optimizer is free to merge them.
df_with_multiple_lazy_filters = df_lazy.filter(pl.col("Pclass") == 1).filter(pl.col("Age") > 70)
# The plan is a plain string: displayed as a bare expression its newlines show up
# escaped (`\n`); pass it to `print` for a line-by-line view.
df_with_multiple_lazy_filters.describe_optimized_plan()

'  CSV SCAN data/titanic.csv\n  PROJECT */12 COLUMNS\n  SELECTION: [([(col("Age")) > (70f64)]) & ([(col("Pclass")) == (1i64)])]\n'