# Why should you use Polars?
* Polars is much faster out of the box than other dataframe libraries such as pandas
* Polars has a lazy API: you describe a query declaratively and its engine optimizes the query plan before executing it
* Polars code follows a syntax structure similar to the well-known PySpark API

In [1]:
import polars as pl
import pandas as pd
import pathlib

In [2]:
# Relative path to the Parquet file that the cells below read.
path_to_data = pathlib.Path("data/yellow_tripdata_2022-01.parquet")

In [3]:
# Preview the data file: build a lazy scan, keep the first two rows,
# and collect the result so it is displayed as the cell output.
(
    pl.scan_parquet(path_to_data)
    .head(2)
    .collect()
)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,"""N""",142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,"""N""",236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0


Now we can compare how long it takes to read this data file with Polars and with pandas.

In [4]:
%%timeit
# Time an eager read of the whole Parquet file into a Polars DataFrame.
pl.read_parquet(path_to_data)

124 ms ± 5.38 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
%%timeit
# Time the same whole-file read with pandas (no engine is passed, so pandas picks its default).
pd.read_parquet(path_to_data)

226 ms ± 3.41 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Now Let's Try Running a Query

In [6]:
%%timeit
polars_results = (
    pl.scan_parquet(path_to_data)
    .groupby("passenger_count")
    .agg(
        [
            pl.col("trip_distance").mean().suffix("_mean"),
            pl.col("trip_distance").max().suffix("_max")
        ]
    )
    .collect()
)

62.9 ms ± 2.81 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
# Run both aggregations once, outside %%timeit, so the results are kept in the
# notebook namespace for inspection.

# Polars: lazy scan -> group by passenger count -> mean / max trip distance.
# `group_by` and `.name.suffix` are the current spellings of the deprecated
# `groupby` / `Expr.suffix`, which newer Polars releases no longer provide.
polars_results = (
    pl.scan_parquet(path_to_data)
    .group_by("passenger_count")
    .agg(
        [
            pl.col("trip_distance").mean().name.suffix("_mean"),
            pl.col("trip_distance").max().name.suffix("_max"),
        ]
    )
    .collect()
)

# pandas: read only the two columns the query needs, then aggregate per group.
# The dict-of-lists spec produces two-level column labels
# (trip_distance -> mean, max).
pandas_results = (
    pd.read_parquet(path_to_data, engine="pyarrow", columns=["passenger_count", "trip_distance"])
    .groupby("passenger_count")
    .agg({"trip_distance": ["mean", "max"]})
)

In [8]:
%%timeit
pandas_results = (
    pd.read_parquet(path_to_data, engine="pyarrow", columns=["passenger_count", "trip_distance"])
    .groupby("passenger_count")
    .agg({"trip_distance": ["mean", "max"]})
)

113 ms ± 1.62 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
# Order the Polars result by passenger count and drop rows containing nulls,
# so the table can be compared line by line with the pandas result.
(
    polars_results
    .sort("passenger_count")
    .drop_nulls()
)

passenger_count,trip_distance_mean,trip_distance_max
f64,f64,f64
0.0,2.746838,622.0
1.0,3.0153,620.0
2.0,3.503723,651.0
3.0,3.343565,196.9
4.0,3.48676,616.8
5.0,3.00871,254.88
6.0,3.149393,57.58
7.0,0.792222,6.81
8.0,0.51125,4.09
9.0,9.506667,28.4


In [21]:
# Display the pandas aggregation; its columns are two-level (trip_distance -> mean / max).
pandas_results

Unnamed: 0_level_0,trip_distance,trip_distance
Unnamed: 0_level_1,mean,max
passenger_count,Unnamed: 1_level_2,Unnamed: 2_level_2
0.0,2.746838,622.0
1.0,3.0153,620.0
2.0,3.503723,651.0
3.0,3.343565,196.9
4.0,3.48676,616.8
5.0,3.00871,254.88
6.0,3.149393,57.58
7.0,0.792222,6.81
8.0,0.51125,4.09
9.0,9.506667,28.4


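So the conclusion here is that Polars is roughly 1.8 times as fast as pandas with parquet and pyarrow (124 ms vs. 226 ms to read the file, and 62.9 ms vs. 113 ms to run the query), and we have the exact same results between the two methods, up to numerical uncertainty, once the Polars output is sorted and its null rows are dropped.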