In [3]:
from datetime import datetime
import numpy as np
import polars as pl
from pathlib import Path
from polars import col, lit
import pandas as pd
import time
from pandas.io.common import get_handle

In [4]:
# Display the installed pandas version so the timings below can be tied to it.
pd.__version__

'1.5.2'

In [5]:
# Display the installed Polars version so the timings below can be tied to it.
pl.__version__

'0.14.28'

In [6]:
# Download a large CSV to benchmark against. This is slow and only needed
# once: the download is skipped whenever big_csv already exists on disk.
big_csv = Path("./big.csv")
csv_url = "http://sdm.lbl.gov/fastbit/data/star2002-full.csv.gz"

# The decompressed file is of moderate size: 15,857,624 rows and 16 columns.
if not big_csv.exists():
    # Write to a sibling ".part" file and rename only after the copy finishes.
    # Writing straight to big_csv would leave a truncated file behind if the
    # download were interrupted, and the exists() guard above would then treat
    # that truncated file as a complete download on every later run.
    partial_csv = big_csv.with_name(big_csv.name + ".part")
    with get_handle(csv_url, compression="gzip", mode="r") as fh_in, open(partial_csv, "bw") as fh_out:
        # mode="r" yields a text wrapper; .buffer is its underlying binary
        # stream, so bytes are copied through without a decode/encode round trip.
        # Copy in 1 MiB chunks instead of materialising the whole decompressed
        # payload as a single bytes object.
        for chunk in iter(lambda: fh_in.handle.buffer.read(1 << 20), b""):
            fh_out.write(chunk)
    partial_csv.replace(big_csv)

# Load Data:

In [7]:
# Time how long pandas takes to parse the whole CSV. header=None tells pandas
# the file has no header row, so columns get integer labels.
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
s = time.perf_counter()
df_pandas = pd.read_csv(big_csv, header=None)
e = time.perf_counter()
pd_time = e - s
print("Pandas Loading Time = {}".format(pd_time))

Pandas Loading Time = 36.55296516418457


In [8]:
# Time how long Polars takes to parse the same CSV. has_header=False tells
# Polars the first row is data, not column names.
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
s = time.perf_counter()
df_pypolars = pl.read_csv(big_csv, has_header=False)
e = time.perf_counter()
pl_time = e - s
print("PyPolars Loading Time = {}".format(pl_time))

PyPolars Loading Time = 10.709691047668457


# Filter:

In [27]:
# Time a boolean-mask row filter in pandas: keep rows whose column labelled 5
# is greater than 500.
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
s = time.perf_counter()
temp = df_pandas[df_pandas[5] > 500]
e = time.perf_counter()
pd_time = e - s
print("Pandas Filter Time = {}".format(pd_time))

Pandas Filter Time = 6.666182994842529


In [25]:
# Time the equivalent row filter in Polars: keep rows whose "column_6" value
# is greater than 500.
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
s = time.perf_counter()
temp = df_pypolars.filter(pl.col("column_6") > 500)
e = time.perf_counter()
pl_time = e - s
print("PyPolars Filter Time = {}".format(pl_time))

PyPolars Filter Time = 1.117527723312378


# Groupby:

In [30]:
# Time a pandas group-by: group on column 0, then sum column 2 and average
# column 3.
# The aggregations are named by string ("sum", "mean") rather than passed as
# NumPy callables: the result is the same, and newer pandas releases warn
# when np.sum / np.mean are handed to agg().
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
s = time.perf_counter()
temp = df_pandas.groupby(by=0).agg({2: "sum", 3: "mean"})
e = time.perf_counter()
pd_time = e - s
print("Pandas GroupBy Time = {}".format(pd_time))

Pandas GroupBy Time = 0.9216609001159668


In [38]:
# Time a lazy Polars group-by: group on column_1, then sum column_3 and
# average column_4.
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
s = time.perf_counter()
q = (
    df_pypolars
    .lazy()
    .groupby(by='column_1')
    .agg(
        [
            pl.col('column_3').sum(),
            pl.col('column_4').mean(),
        ]
    )
)
# q is only a lazy query at this point; collect() is the call that runs it,
# so it has to stay inside the timed region.
q.collect()
e = time.perf_counter()
# Store the Polars measurement in pl_time (as the loading and filter cells do)
# so it does not overwrite the pandas timing held in pd_time.
pl_time = e - s
print("PyPolars GroupBy Time = {}".format(pl_time))


PyPolars GroupBy Time = 0.19597697257995605


# Describe:

In [40]:
# Time pandas computing summary statistics over the whole frame.
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
s = time.perf_counter()
temp = df_pandas.describe()
e = time.perf_counter()
pd_time = e - s
print("Pandas Describe Time = {}".format(pd_time))

Pandas Describe Time = 19.44732904434204


There is no describe method in Polars, so we need to rely on transforming the Polars DataFrame into a pandas DataFrame and then applying the describe method.

In [46]:
# Time the Polars route to summary statistics: convert the frame to pandas,
# then call pandas' describe(). The conversion is deliberately inside the
# timed region because it is part of the cost of this approach.
# The frame is converted directly; selecting every column first
# (df[df.columns]) returns the same columns and only adds work to the timing.
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
s = time.perf_counter()
temp = df_pypolars.to_pandas().describe()
e = time.perf_counter()
# Store the Polars measurement in pl_time (as the loading and filter cells do)
# so it does not overwrite the pandas timing held in pd_time.
pl_time = e - s
print("PyPolars Describe Time = {}".format(pl_time))

PyPolars Describe Time = 20.313318014144897


# Uniques:

In [61]:
# Time pandas extracting the distinct values of the column labelled 5.
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
s = time.perf_counter()
temp = df_pandas[5].unique()
e = time.perf_counter()
pd_time = e - s
print("Pandas Unique Time = {}".format(pd_time))

Pandas Unique Time = 0.5552530288696289


In [60]:
# Time Polars extracting the distinct values of "column_6".
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
s = time.perf_counter()
temp = df_pypolars['column_6'].unique()
e = time.perf_counter()
# Store the Polars measurement in pl_time (as the loading and filter cells do)
# so it does not overwrite the pandas timing held in pd_time.
pl_time = e - s
print("PyPolars Unique Time = {}".format(pl_time))

PyPolars Unique Time = 0.2614939212799072


# Saving:

In [53]:
# Time pandas writing the full frame back out as CSV. index=False leaves the
# row index out of the file.
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
s = time.perf_counter()
df_pandas.to_csv("temp_table.csv", index=False)
e = time.perf_counter()
pd_time = e - s
print("Pandas Saving Time = {}".format(pd_time))

Pandas Saving Time = 144.83919620513916


In [55]:
# Time Polars writing the full frame back out as CSV.
# NOTE: this writes to the same "temp_table.csv" path as the pandas saving
# cell, so whichever cell runs last overwrites the other's file.
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
s = time.perf_counter()
df_pypolars.write_csv("temp_table.csv")
e = time.perf_counter()
# Store the Polars measurement in pl_time (as the loading and filter cells do)
# so it does not overwrite the pandas timing held in pd_time.
pl_time = e - s
print("PyPolars Saving Time = {}".format(pl_time))

PyPolars Saving Time = 13.505635261535645
