In [None]:
from datetime import date
from pathlib import Path

import dask.dataframe as dd
import duckdb
import pandas as pd
import polars as pl

- Polars: https://pola-rs.github.io/polars/user-guide/io/parquet/
- Dask: https://docs.dask.org/en/latest/dataframe-parquet.html
- DuckDB: https://duckdb.org/docs/data/parquet/overview.html

In [None]:
# Eager read: load every Parquet file matching the glob into a single frame.
pl_df = pl.read_parquet("./data/parquet/*.parquet")

# Keep the rows dated 2023-12-22 whose close is at least 1.5.
is_target_date = pl.col("date") == pl.lit(date(2023, 12, 22))
has_min_close = pl.col("close") >= 1.5
pl_df.filter(is_target_date & has_min_close)

In [None]:
# Lazy variant of the same query: scan_parquet only describes the read, and
# nothing is materialised until .collect() is called.
pl_lazy_df = pl.scan_parquet("./data/parquet/*.parquet")
(
    pl_lazy_df
    .filter((pl.col("date") == pl.lit(date(2023, 12, 22))) & (pl.col("close") >= 1.5))
    .collect()
)

In [None]:
# Same filter with Dask: build a boolean mask, select with it, then compute
# the result.
dd_df = dd.read_parquet("./data/parquet/*.parquet")
row_mask = (dd_df["date"] == date(2023, 12, 22)) & (dd_df["close"] >= 1.5)
dd_df[row_mask].compute()

In [None]:
# Same filter with DuckDB, querying the Parquet glob directly as a table.
TABLE = "./data/parquet/*.parquet"
filter_sql = f"""SELECT * FROM '{TABLE}' WHERE date = '2023-12-22' AND close >= 1.5"""
duckdb.query(filter_sql)

In [None]:
# Pivot to one row per ticker with one column per date (values taken from
# "close"), then gather every non-ticker column into a single list column
# named "allclose".
pl_final_df = pl_df.pivot(index="ticker", columns="date", values="close").with_columns(
    pl.concat_list(pl.all().exclude("ticker")).alias("allclose")
)

# Make sure the output directory exists so the write below does not fail on a
# fresh checkout (Restart Kernel -> Run All).
Path("./tmp").mkdir(parents=True, exist_ok=True)
pl_final_df.write_parquet("./tmp/polars.parquet")

In [None]:
dd_df = dd.read_parquet("./data/parquet/*.parquet")

# Render the dates as "YYYY-MM-DD" strings and convert them to a categorical
# with known categories so the column can be used as the pivot column.
dd_df = dd_df.assign(
    date=dd_df["date"].dt.strftime("%Y-%m-%d").astype("category").cat.as_known()
)
dd_pivot_df = dd_df.pivot_table(index="ticker", columns="date", values="close")

# Collect each column's values into one list per ticker; the list of dates
# itself is not wanted in the output, so drop it.
dd_agg_df = dd_df.groupby("ticker").agg(list).drop(columns="date")

# Join the pivoted frame with the per-ticker lists, materialise the result,
# and expose the list of closes under the name "allclose".
dd_final_df = (
    dd_pivot_df.merge(dd_agg_df, on="ticker")
    .compute()
    .rename(columns={"close": "allclose"})
)

# Make sure the output directory exists so the write below does not fail on a
# fresh checkout (Restart Kernel -> Run All).
Path("./tmp").mkdir(parents=True, exist_ok=True)
dd_final_df.to_parquet("./tmp/dask.parquet")

In [None]:
TABLE = "./data/parquet/*.parquet"
OUTPUT = "./tmp/duckdb.parquet"

# Make sure the output directory exists so the COPY below does not fail on a
# fresh checkout (Restart Kernel -> Run All).
Path(OUTPUT).parent.mkdir(parents=True, exist_ok=True)

# Single statement: pivot close by date per ticker, join it with the per-ticker
# list of closes ordered by date, and write the joined result to OUTPUT as
# Parquet.
duckdb.query(f"""
COPY (
    SELECT pivot_t.*, agg_t.allclose
    FROM (
        PIVOT '{TABLE}' ON date USING first(close) GROUP BY ticker
    ) AS pivot_t
    INNER JOIN (
        SELECT ticker, list(close ORDER BY date ASC) AS allclose 
        FROM '{TABLE}' 
        GROUP BY ticker
    ) AS agg_t ON pivot_t.ticker = agg_t.ticker
) TO '{OUTPUT}' (FORMAT PARQUET)
""")

In [None]:
# Load the Dask-written file with pyarrow-backed dtypes, then convert it to
# Polars while overriding the "allclose" column to a list of Float64.
df = dd.read_parquet(
    "./tmp/dask.parquet",
    engine="pyarrow",
    dtype_backend="pyarrow",
).compute()
allclose_schema = {"allclose": pl.List(pl.Float64)}
pl.from_pandas(df, schema_overrides=allclose_schema)

In [None]:
# Round-trip the Dask-written file: read it with Dask's default options and
# save the computed frame again under tmp/.
# Variants that were tried and left disabled: passing engine="pyarrow" and
# dtype_backend="pyarrow" to the read, and loading the copy back with
# pd.read_parquet("tmp/tmp.parquet").
dask_output_ddf = dd.read_parquet("./tmp/dask.parquet")
df = dask_output_ddf.compute()
df.to_parquet("tmp/tmp.parquet")

In [None]:
# Round-trip the Polars-written file: read it through Dask with the pyarrow
# engine, save the computed frame, and load the copy back with pandas.
# Variant left disabled: additionally passing dtype_backend="pyarrow" to the read.
polars_output_ddf = dd.read_parquet("./tmp/polars.parquet", engine="pyarrow")
df = polars_output_ddf.compute()
df.to_parquet("tmp/tmp.parquet")
pd.read_parquet("tmp/tmp.parquet")

In [None]:
# Round-trip the DuckDB-written file: read it through Dask with the pyarrow
# engine, save the computed frame, and load the copy back with pandas.
duckdb_output_ddf = dd.read_parquet("./tmp/duckdb.parquet", engine="pyarrow")
df = duckdb_output_ddf.compute()
df.to_parquet("tmp/tmp.parquet")
pd.read_parquet("tmp/tmp.parquet")

In [None]:
# Read the Polars-written file through Dask with default options, then convert
# the computed frame back to Polars.
polars_output_ddf = dd.read_parquet("./tmp/polars.parquet")
df = polars_output_ddf.compute()
pl.from_pandas(df)

In [None]:
# Same conversion as with the default read options, but asking Dask for the
# pyarrow engine and pyarrow-backed dtypes.
df = dd.read_parquet(
    "./tmp/polars.parquet",
    engine="pyarrow",
    dtype_backend="pyarrow",
).compute()
pl.from_pandas(df)

In [None]:
# Read the Dask-written Parquet file directly with Polars.
dask_parquet_path = "./tmp/dask.parquet"
pl.read_parquet(dask_parquet_path)