# Polars

__https://pola-rs.github.io/polars-book/user-guide/__

>The goal is to introduce Polars by comparing it to other solutions.

>Polars is completely written in Rust (no runtime overhead!), which gives it C/C++ performance and allows it to fully control the performance-critical parts of a query engine.

## The goal of Polars is to provide a lightning-fast DataFrame library that:

>Utilizes all available cores on your machine.

>Optimizes queries to reduce unneeded work/memory allocations.

>Handles datasets much larger than your available RAM.

>Has an API that is consistent and predictable.

>Has a strict schema (data-types should be known before running the query).



## Installation:

`pip install polars`

OR, with conda (package page: https://anaconda.org/conda-forge/polars):

`conda install -c conda-forge polars`


In [1]:
import polars as pl

In [2]:
# Build a named, one-dimensional Series; the integer values are stored as i64.
s = pl.Series(name="a", values=[1, 2, 3, 4, 5])
print(s)

shape: (5,)
Series: 'a' [i64]
[
	1
	2
	3
	4
	5
]


In [3]:
from datetime import datetime
# Each dictionary key becomes a column. The dtypes are inferred from the Python
# values: i64 for the integers, datetime[μs] for the datetimes, f64 for the floats.
df = pl.DataFrame(
    {
        "integer": list(range(1, 6)),
        # 2022-01-01 through 2022-01-05, one timestamp per row
        "date": [datetime(2022, 1, day) for day in range(1, 6)],
        "float": [float(value) for value in range(4, 9)],
    }
)

print(df)

shape: (5, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
│ 4       ┆ 2022-01-04 00:00:00 ┆ 7.0   │
│ 5       ┆ 2022-01-05 00:00:00 ┆ 8.0   │
└─────────┴─────────────────────┴───────┘


In [4]:
# Peek at the first three rows.
df.head(n=3)

integer,date,float
i64,datetime[μs],f64
1,2022-01-01 00:00:00,4.0
2,2022-01-02 00:00:00,5.0
3,2022-01-03 00:00:00,6.0


In [5]:
# Peek at the last three rows.
df.tail(n=3)

integer,date,float
i64,datetime[μs],f64
3,2022-01-03 00:00:00,6.0
4,2022-01-04 00:00:00,7.0
5,2022-01-05 00:00:00,8.0


In [6]:
# Draw two rows at random. No seed is passed, so the rows shown will generally
# differ from one execution to the next.
df.sample(n=2)

integer,date,float
i64,datetime[μs],f64
4,2022-01-04 00:00:00,7.0
3,2022-01-03 00:00:00,6.0


In [7]:
# Per-column summary statistics (count, null_count, mean, std, min, max, median
# and quartiles). As the output shows, mean/std/median/quartiles are null for
# the datetime column.
df.describe()

describe,integer,date,float
str,f64,str,f64
"""count""",5.0,"""5""",5.0
"""null_count""",0.0,"""0""",0.0
"""mean""",3.0,,6.0
"""std""",1.581139,,1.581139
"""min""",1.0,"""2022-01-01 00:…",4.0
"""max""",5.0,"""2022-01-05 00:…",8.0
"""median""",3.0,,6.0
"""25%""",2.0,,5.0
"""75%""",4.0,,7.0


In [8]:
import numpy as np

In [9]:
# Demo frame with missing data: the `None` entries become nulls in "nrs" and "names".
# The "random" column is drawn from a seeded generator so that Restart & Run All
# reproduces the same values (and the same downstream results) every time.
# Outputs rendered by an earlier, unseeded run are refreshed on the next execution.
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        "random": np.random.default_rng(seed=42).random(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.455021,"""A"""
2.0,"""ham""",0.113997,"""A"""
3.0,"""spam""",0.917982,"""B"""
,"""egg""",0.288977,"""C"""
5.0,,0.877943,"""B"""


In [11]:
# `select` evaluates every expression against the frame and returns only the
# resulting columns. The single-value results (sum, first, mean * 10) are
# repeated on every row alongside the sorted "names" column; nulls are skipped
# by the sum and the mean.
out = df.select(
    [
        pl.col("nrs").sum(),
        pl.col("names").sort(),
        pl.col("names").first().alias("first name"),
        (pl.col("nrs").mean() * 10).alias("10xnrs"),
    ]
)
out

nrs,names,first name,10xnrs
i64,str,str,f64
11,,"""foo""",27.5
11,"""egg""","""foo""",27.5
11,"""foo""","""foo""",27.5
11,"""ham""","""foo""",27.5
11,"""spam""","""foo""",27.5


In [12]:
# `with_columns` keeps the existing columns and appends the new ones. Both
# expressions are aggregations, so their single value is repeated on every row.
# NOTE: this rebinds `df`; from here on it carries "nrs_sum" and "count" as well.
df = df.with_columns(
    [
        pl.col("nrs").sum().alias("nrs_sum"),
        pl.col("random").count().alias("count"),
    ]
)
df

nrs,names,random,groups,nrs_sum,count
i64,str,f64,str,i64,u32
1.0,"""foo""",0.455021,"""A""",11,5
2.0,"""ham""",0.113997,"""A""",11,5
3.0,"""spam""",0.917982,"""B""",11,5
,"""egg""",0.288977,"""C""",11,5
5.0,,0.877943,"""B""",11,5


In [13]:
# Show the first rows of the extended frame (it has only five rows, so all of them appear).
df.head()

nrs,names,random,groups,nrs_sum,count
i64,str,f64,str,i64,u32
1.0,"""foo""",0.455021,"""A""",11,5
2.0,"""ham""",0.113997,"""A""",11,5
3.0,"""spam""",0.917982,"""B""",11,5
,"""egg""",0.288977,"""C""",11,5
5.0,,0.877943,"""B""",11,5


In [14]:
# Keep only the rows whose "nrs" value is greater than 2. The row with a null
# "nrs" is dropped as well, as the output shows.
out = df.filter(pl.col("nrs").gt(2))
out

nrs,names,random,groups,nrs_sum,count
i64,str,f64,str,i64,u32
3,"""spam""",0.917982,"""B""",11,5
5,,0.877943,"""B""",11,5


In [15]:
# Several aggregations per group in a single pass.
# NOTE(review): the stored output lists the groups as A, C, B, i.e. not in
# first-appearance order. If a stable order matters, check the `maintain_order`
# option for the installed Polars version.
out = df.groupby("groups").agg(
    [
        # total of "nrs" within each group
        pl.col("nrs").sum(),
        # number of rows in each group
        pl.col("random").count().alias("count"),
        # sum of "random" restricted to rows whose "names" value is not null
        pl.col("random")
        .filter(pl.col("names").is_not_null())
        .sum()
        .suffix("_sum"),
        # each group's names in reversed row order, collected into a list
        pl.col("names").reverse().alias("reversed names"),
    ]
)
out

groups,nrs,count,random_sum,reversed names
str,i64,u32,f64,list[str]
"""A""",3.0,2,0.569018,"[""ham"", ""foo""]"
"""C""",,1,0.288977,"[""egg""]"
"""B""",8.0,2,0.917982,"[null, ""spam""]"


In [16]:
import pandas as pd
import polars as pl

In [17]:
# Load the flights dataset with pandas.
# NOTE: this rebinds `df`, which held a Polars DataFrame in the cells above,
# to a pandas DataFrame; the cells below rely on the pandas API.
df = pd.read_parquet(path="flights.parquet")
df.head(n=5)

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00


In [18]:
# (rows, columns) of the pandas flights frame.
df.shape

(336776, 19)

In [None]:
# Export the flights table to CSV. `index=False` keeps pandas' automatic row
# index out of the file; with the default it is written as an extra unnamed
# first column that shows up as "Unnamed: 0" when the CSV is read back.
df.to_csv("flights.csv", index=False)

In [None]:
# Number of flights per carrier code, most frequent first.
df["carrier"].value_counts()

In [19]:
%%timeit
df = pd.read_parquet("flights.parquet")
agg = df.groupby(['carrier']).agg(
    {'dep_delay':'mean',
    'arr_delay':'mean'}
).sort_values('dep_delay')
agg

50.1 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
%%timeit
agg = (pl.read_parquet("flights.parquet")
       .groupby(['carrier'])
       .agg(
    [pl.col('dep_delay').mean().alias("dep_delay"),
     pl.col('arr_delay').mean().alias("arr_delay")
    ]
    ).sort(pl.col('dep_delay'))
      )
agg

18.8 ms ± 843 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Happy Learning