In [1]:
import polars as pl
import pathlib
import plotly.express as px

In [2]:
# Location of the Titanic CSV, given relative to the current working directory.
path_to_data = pathlib.Path("data") / "titanic.csv"

In [3]:
# Eagerly read the CSV into a Polars DataFrame (the lazy counterpart,
# scan_csv, is shown in the "Lazy Mode" section further down).
df = pl.read_csv(path_to_data)

In [4]:
# Preview the first five rows to check the column names, dtypes and sample values.
df.head(n=5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow...","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. ...","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis...","""female""",26.0,0,0,"""STON/O2. 31012...",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs....","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil...","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [5]:
# Compact per-column overview: dtype plus the first few values of each column.
# NOTE(review): in the recorded run this call returned a string, so the cell
# output is its repr with literal "\n" sequences. Confirm how the installed
# Polars version behaves before wrapping the call in print().
df.glimpse()

'Rows: 891\nColumns: 12\n$ PassengerId   <Int64> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10\n$ Survived      <Int64> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1\n$ Pclass        <Int64> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2\n$ Name           <Utf8> Braund, Mr. Owen Harris, Cumings, Mrs. John Bradley (Florence Briggs Thayer), Heikkinen, Miss. Laina, Futrelle, Mrs. Jacques Heath (Lily May Peel), Allen, Mr. William Henry, Moran, Mr. James, McCarthy, Mr. Timothy J, Palsson, Master. Gosta Leonard, Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg), Nasser, Mrs. Nicholas (Adele Achem)\n$ Sex            <Utf8> male, female, female, female, male, male, male, male, female, female\n$ Age         <Float64> 22.0, 38.0, 26.0, 35.0, 35.0, None, 54.0, 2.0, 27.0, 14.0\n$ SibSp         <Int64> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1\n$ Parch         <Int64> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0\n$ Ticket         <Utf8> A/5 21171, PC 17599, STON/O2. 3101282, 113803, 373450, 330877, 17463, 349909, 347742, 237736\n$ Fare        <Float64> 7.25, 71.2833, 7.925, 53

You can use bracket accessors as you would in pandas, but they are not as efficient as the **expression API**.

In [6]:
# pandas-style bracket access: the first three rows, restricted to three columns.
columns_of_interest = ["Pclass", "Name", "Age"]
df[:3, columns_of_interest]

Pclass,Name,Age
i64,str,f64
3,"""Braund, Mr. Ow...",22.0
1,"""Cumings, Mrs. ...",38.0
3,"""Heikkinen, Mis...",26.0


In [7]:
# The same column projection written with the expression API (PySpark-like
# syntax). pl.col accepts a list of names and expands to one expression per
# column, in the order given.
df.select(pl.col(["Pclass", "Name", "Age"]))

Pclass,Name,Age
i64,str,f64
3,"""Braund, Mr. Ow...",22.0
1,"""Cumings, Mrs. ...",38.0
3,"""Heikkinen, Mis...",26.0
1,"""Futrelle, Mrs....",35.0
3,"""Allen, Mr. Wil...",35.0
3,"""Moran, Mr. Jam...",
1,"""McCarthy, Mr. ...",54.0
3,"""Palsson, Maste...",2.0
3,"""Johnson, Mrs. ...",27.0
2,"""Nasser, Mrs. N...",14.0


With the expression API you can transform individual columns before they are returned. Polars also runs those operations in parallel, and can apply query optimization when running in lazy mode.

In [8]:
# Expression API with per-column transformations: each expression describes
# how its output column is derived, and select() evaluates them together.
passenger_class = pl.col("Pclass")                  # passed through unchanged
name_lowercase = pl.col("Name").str.to_lowercase()  # lower-cased names
age_rounded = pl.col("Age").round(2)                # ages rounded to 2 decimals

df.select([passenger_class, name_lowercase, age_rounded])

Pclass,Name,Age
i64,str,f64
3,"""braund, mr. ow...",22.0
1,"""cumings, mrs. ...",38.0
3,"""heikkinen, mis...",26.0
1,"""futrelle, mrs....",35.0
3,"""allen, mr. wil...",35.0
3,"""moran, mr. jam...",
1,"""mccarthy, mr. ...",54.0
3,"""palsson, maste...",2.0
3,"""johnson, mrs. ...",27.0
2,"""nasser, mrs. n...",14.0


In [9]:
# Count passengers for every (Survived, Pclass) combination.
# NOTE(review): newer Polars releases spell this method `group_by`; confirm
# against the Polars version this notebook is meant to run on.
passenger_count = pl.col("PassengerId").count().alias("counts")

df.groupby(["Survived", "Pclass"]).agg(passenger_count)

Survived,Pclass,counts
i64,i64,u32
0,3,372
0,1,80
0,2,97
1,2,87
1,1,136
1,3,119


Using the expression API, Polars will do the groupby aggregation itself in parallel, and if you have multiple different aggregations, it'll do those in parallel as well.

# Visualization

In [11]:
# Scatter plot of fare against age, one point per passenger.
# The axis labels and title are set explicitly so the figure is readable on
# its own; without `labels`, array-like inputs may be shown under the generic
# names "x" and "y".
px.scatter(
    x=df["Age"],
    y=df["Fare"],
    labels={"x": "Age", "y": "Fare"},
    title="Titanic passengers: fare vs. age",
)

# Lazy Mode
***
In lazy mode your code is not executed immediately. Instead, Polars builds a logical execution plan, and the transformations run only when an action method (such as `.collect()`) is called.

Polars optimizes that plan before running it, so the query executes more efficiently.

To make Polars evaluate lazily, use `scan_csv` instead of `read_csv`.

In [10]:
# Describe the aggregation lazily (scan_csv instead of read_csv); no action
# method is called on the query in this cell.
lazy_counts = (
    pl.scan_csv(path_to_data)
    .groupby(["Survived", "Pclass"])
    .agg(pl.col("PassengerId").count().alias("counts"))
)

# Print the execution plan Polars reports after applying its optimizations.
print(lazy_counts.describe_optimized_plan())

  Aggregate
  	[col("PassengerId").count().alias("counts")] BY [col("Survived"), col("Pclass")] FROM
  	  CSV SCAN data/titanic.csv
  PROJECT 3/12 COLUMNS




So, Polars has determined that it can optimize the execution plan by reading only the 3 columns required to compute this query (`PROJECT 3/12 COLUMNS`).

# Streaming
***
The Polars streaming API lets us work with out-of-core datasets (data too large to fit in memory) by streaming the data through memory and performing our operations in batches.

You enable Polars streaming by passing `streaming=True` to the `.collect()` method.

So Polars will use lazy execution, and process all the data in the file in batches, while performing our aggregations and transformations.

In [12]:
# Same aggregation as in the lazy example, defined on a lazily scanned CSV.
counts_query = (
    pl.scan_csv(path_to_data)
    .groupby(["Survived", "Pclass"])
    .agg(pl.col("PassengerId").count().alias("counts"))
)

# collect() is the action that runs the query; streaming=True requests the
# batch-wise streaming execution described above.
counts_query.collect(streaming=True)

Survived,Pclass,counts
i64,i64,u32
1,1,136
0,1,80
0,3,372
1,2,87
0,2,97
1,3,119
