In [None]:
import pathlib
import pandas as pd
from datafusion import SessionContext
import pyarrow as pa
import polars as pl

# Data

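Tick data for six ETFs, downloaded from https://www.dukascopy.com/swiss/english/marketwatch/historical/ and stored as one Parquet file per instrument (covering 2020-01-01 to 2020-06-01, per the file names).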

In [None]:
# Folder holding the downloaded tick-data Parquet files. It is used below both
# as a directory argument and as the base of "*.parquet" glob patterns.
FOLDER_PATH = "./dukascopy/download/"

In [None]:
!ls -laRh ./dukascopy/download/

# 29M dvyususd-tick-2020-01-01-2020-06-01.parquet
# 23M eemususd-tick-2020-01-01-2020-06-01.parquet
# 19M fxiususd-tick-2020-01-01-2020-06-01.parquet
# 35M ibbususd-tick-2020-01-01-2020-06-01.parquet
# 47M iveususd-tick-2020-01-01-2020-06-01.parquet
# 42M iwdususd-tick-2020-01-01-2020-06-01.parquet

# DuckDB

In [None]:
import duckdb

In [None]:
# Open a DuckDB connection. No database path is given, so this relies on
# DuckDB's default, which its documentation describes as an in-memory database.
con = duckdb.connect()

In [None]:
# FOLDER_PATH ends with "/", so strip it before appending the pattern; this
# gives "<folder>/*.parquet" instead of "<folder>//*.parquet".
parquet_glob = f"{FOLDER_PATH.rstrip('/')}/*.parquet"

# Select every row from all Parquet files matched by the glob and fetch the
# result as Arrow data.
q = con.execute(f"""
   SELECT *
   FROM '{parquet_glob}'
""")
result = q.arrow()

In [None]:
# Sanity check on the DuckDB load: expected (rows, columns) across all files.
# The message reports the actual shape so a mismatch is easy to diagnose.
assert result.shape == (29_431_347, 3), f"unexpected DuckDB result shape: {result.shape}"

In [None]:
%%timeit 
q = con.execute(f"""
   SELECT *
   FROM '{FOLDER_PATH}/*.parquet'
""")
result = q.arrow()
# 654 ms ± 9.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

# Polars

In [None]:
# FOLDER_PATH ends with "/", so strip it before appending the pattern; this
# gives "<folder>/*.parquet" instead of "<folder>//*.parquet".
parquet_glob = f"{FOLDER_PATH.rstrip('/')}/*.parquet"

# Lazily scan all matching files, then materialise and convert to Arrow.
s = pl.scan_parquet(parquet_glob)
result = s.collect().to_arrow()

In [None]:
# Sanity check on the Polars load: expected (rows, columns) across all files.
# The message reports the actual shape so a mismatch is easy to diagnose.
assert result.shape == (29_431_347, 3), f"unexpected Polars result shape: {result.shape}"

In [None]:
%%timeit 
s = pl.scan_parquet(f'{FOLDER_PATH}/*.parquet')
result = s.collect().to_arrow()
# 354 ms ± 7.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

# DataFusion

In [None]:
# Register the data folder as a single table named "quote_ticks" in a fresh
# session, select everything from it, and assemble the collected record
# batches into one pyarrow Table.
ctx = SessionContext()
ctx.register_parquet("quote_ticks", str(FOLDER_PATH))
q = ctx.sql("select * from quote_ticks").collect()
table = pa.Table.from_batches(q)
result = table

In [None]:
# Sanity check on the DataFusion load: expected (rows, columns) across all files.
# The message reports the actual shape so a mismatch is easy to diagnose.
assert result.shape == (29_431_347, 3), f"unexpected DataFusion result shape: {result.shape}"

In [None]:
%%timeit 
ctx = SessionContext()
ctx.register_parquet('quote_ticks', f'{FOLDER_PATH}')
q = ctx.sql("select * from quote_ticks").collect()
result = table = pa.Table.from_batches(q)
# 222 ms ± 20.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

# Pandas

In [None]:
# Load the whole folder into one DataFrame and validate it like the other
# engines. Only the row count is checked, because how pandas splits the file
# columns between index and data columns is not visible here.
dfs = pd.read_parquet(FOLDER_PATH)
assert len(dfs) == 29_431_347, f"unexpected pandas row count: {len(dfs)}"

In [None]:
%%timeit 

df = pd.read_parquet(FOLDER_PATH, pre_buffer=True)
result = pa.Table.from_pandas(df)
# 363 ms ± 37.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)