# Load all parquet with polars

With `read_parquet` I can load up to 300M rows but not 400M, and `scan_parquet` doesn't let me ignore a column.

`use_pyarrow` doesn't let me restrict the number of rows to read. The Dask-generated parquet has an optional `path` column as a categorical which Polars doesn't like.

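Separately, it seems that a `LazyFrame` doesn't support `value_counts` as a frame-level method (I'm not sure why); the expression form, `select(pl.col(...).value_counts())`, does work on a `LazyFrame`, as shown further down.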

In [1]:
# polars provides every parquet reader exercised in this notebook.
import polars as pl

# Record the polars version next to the measurements below.
display(pl.__version__)

# Enable per-cell profiling; the "In [N] used ... MiB RAM in ...s" lines printed
# after each cell appear to come from this profiler.
from cell_profiler import cell_profiler as cp
%start_cell_profiler

'0.17.10'

'Profiling enabled'

In [1] used 0.0 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 78.8 MiB


In [2]:
# Directory holding the Dask-generated parquet parts; the later cells glob "/*.parquet" inside it.
parquet_test_result = "../test_result.parquet"

# Peek at the schema (column name -> dtype) of one part file.
pl.read_parquet_schema(
    parquet_test_result + "/part.1.parquet"
)

{'test_id': Int64,
 'vehicle_id': Int64,
 'test_date': Datetime(time_unit='ns', time_zone=None),
 'test_class_id': Int64,
 'test_type': Utf8,
 'test_result': Utf8,
 'test_mileage': Int64,
 'postcode_area': Utf8,
 'make': Utf8,
 'model': Utf8,
 'colour': Utf8,
 'fuel_type': Utf8,
 'cylinder_capacity': Int64,
 'first_use_date': Datetime(time_unit='ns', time_zone=None)}

In [2] used 0.5 MiB RAM in 0.13s (system mean cpu 8%, single max cpu 15%), peaked 0.0 MiB above final usage, current RAM usage now 79.3 MiB


In [3]:
# Experiment log (nothing in this cell executes): eager reads with few or no
# column restrictions, annotated with the observed RAM/time cost or failure.
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=10, columns='make')
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", columns='make') # costs 9GB 5s, 639M rows
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", columns=['make', 'fuel_type']) # costs 17GB 10s
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=1_000_000, ) # costs FAILS with:
# ComputeError(ErrString("cannot concat categoricals coming from a different source; consider setting a global StringCache"))', /home/runner/work/polars/polars/polars/polars-core/src/frame/mod.rs:923:36


In [3] used 0.0 MiB RAM in 0.10s (system mean cpu 9%, single max cpu 23%), peaked 0.0 MiB above final usage, current RAM usage now 79.7 MiB


In [None]:
# Columns to load: everything except the Dask-written categorical 'path' column,
# which triggers the "cannot concat categoricals" error when it is included.
cols = [
    'test_id', 'vehicle_id', 'test_date', 'test_class_id', 'test_type',
    'test_result', 'test_mileage', 'postcode_area', 'make', 'model',
    'colour', 'fuel_type', 'cylinder_capacity', 'first_use_date',
] # 'path'

# --- Experiment log: each variant is annotated with its observed cost/outcome ---
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=1_000_000, columns=cols) # costs 10GB 45s
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=100_000_000, columns=cols) # costs 20GB 95s 
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", columns=cols) # EXPLODES AND RUNS OUT OF RAM

#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=1_000_000, columns=cols) # costs 10GB 50S
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=1_000_000, columns=cols, low_memory=True) # costs 10GB 60s - not sure it behaves much differently?
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=1_000_000, columns=cols, use_pyarrow=True) # CAN'T mix n_rows and use_pyarrow, Use pyarrow instead of the Rust native parquet reader. The pyarrow reader is more stable.

#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", columns=cols, use_pyarrow=True, low_memory=True) # CAN'T mix n_rows and use_pyarrow 
# and this gives "FileNotFoundError: ../test_result.parquet/*.parquet"

#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=100_000_000, columns=cols, low_memory=True) # costs 20GB 72s
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=300_000_000, columns=cols, low_memory=True) # costs 27GB 162s
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", columns=cols, low_memory=True) # FAILS runs out of swap, same for 400_ and 500_000_000 rows

# BEHAVIOUR reading eats Swap and RAM quickly maxes at 64GB and stays there, then reduces after the load completes
# it looks like Polars eats a lot of swap

# SO 300M rows is ok, more eats too much swap

# Active read: the cheapest configuration recorded above as working. Without an
# active read `dfp` is undefined on a fresh kernel and `dfp.shape` raises NameError.
# Swap in one of the commented variants to repeat a larger experiment.
dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=1_000_000, columns=cols)

dfp.shape



In [None]:
# Column selection for the low_memory=True measurement.
cols = [
    'test_id',
    'vehicle_id',
    'test_date',
    'test_class_id',
    'test_type',
    'test_result',
    'test_mileage',
    'postcode_area',
    'make',
    'model',
    'colour',
    'fuel_type',
    'cylinder_capacity',
    'first_use_date',
]
# ignoring 'path' from Dask output which is categorical

# Disabled 300M-row read, followed by the profiler line recorded when it was run.
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=300_000_000, columns=cols, low_memory=True) # costs 35GB 132s
# In [5] used 35820.2 MiB RAM in 134.01s (system mean cpu 22%, single max cpu 100%), peaked 24928.4 MiB above final usage, current RAM usage now 35906.9 MiB

In [None]:
# Column selection for the low_memory=False comparison run.
cols = [
    'test_id',
    'vehicle_id',
    'test_date',
    'test_class_id',
    'test_type',
    'test_result',
    'test_mileage',
    'postcode_area',
    'make',
    'model',
    'colour',
    'fuel_type',
    'cylinder_capacity',
    'first_use_date',
]

# Disabled 300M-row read, followed by the profiler line recorded when it was run.
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=300_000_000, columns=cols, low_memory=False) # costs ?
# In [4] used 34525.3 MiB RAM in 142.73s (system mean cpu 22%, single max cpu 100%), peaked 25902.7 MiB above final usage, current RAM usage now 34613.7 MiB

Filed as an issue: https://github.com/pola-rs/polars/issues/8925

In [None]:
# Disabled: uncomment to drop the `dfp` name (presumably to release the loaded frame before re-measuring).
#del dfp

In [None]:
# Force a full garbage-collection pass; the int returned by collect() is shown as the cell output.
import gc

gc.collect()

In [None]:
# List the names currently defined in the interactive namespace.
%whos

In [None]:
# Print polars' version report (useful context for the GitHub issue linked above).
pl.show_versions()

In [None]:
# Column selection for the rechunk=False experiment.
cols = [
    'test_id',
    'vehicle_id',
    'test_date',
    'test_class_id',
    'test_type',
    'test_result',
    'test_mileage',
    'postcode_area',
    'make',
    'model',
    'colour',
    'fuel_type',
    'cylinder_capacity',
    'first_use_date',
]

#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=300_000_000, columns=cols, rechunk=False) # costs 35GB 140s
# WARNING: this 400M-row read is left active as the failing reproduction; it is
# recorded as exhausting memory just like the earlier attempts.
dfp = pl.read_parquet(
    parquet_test_result + "/*.parquet",
    n_rows=400_000_000,
    columns=cols,
    rechunk=False,
) # FAILS as it did above

In [3]:
# Column selection used by the lazy scan_parquet experiments in the following cells.
cols = [
    'test_id',
    'vehicle_id',
    'test_date',
    'test_class_id',
    'test_type',
    'test_result',
    'test_mileage',
    'postcode_area',
    'make',
    'model',
    'colour',
    'fuel_type',
    'cylinder_capacity',
    'first_use_date',
]

# Disabled: collecting every column (including the categorical one) panics with the error below.
#dfp = pl.scan_parquet(parquet_test_result + "/*.parquet", n_rows=400_000_000, ).collect() # FAILS
# PanicException: should not fail: ComputeError(ErrString("cannot concat categoricals coming from a different source; consider setting a global StringCache"))

In [3] used 2.3 MiB RAM in 0.10s (system mean cpu 3%, single max cpu 6%), peaked 0.0 MiB above final usage, current RAM usage now 81.1 MiB


In [None]:
# Experiment log (nothing executes): lazy scans with the column selection inside
# the query; both variants were recorded as failing, so they stay disabled.
#dfp = pl.scan_parquet(parquet_test_result + "/*.parquet", n_rows=400_000_000, ).select(pl.col(cols)).collect() # FAILS OOM
#dfp = pl.scan_parquet(parquet_test_result + "/*.parquet", n_rows=400_000_000, ).select(pl.col(cols)).head(1_000).collect() # FAILS

In [10]:
# show_graph and explain
#pl.scan_parquet(parquet_test_result + "/*.parquet", n_rows=400_000_000, ).select(pl.col(cols)).head(1_000).show_graph()
#pl.scan_parquet(parquet_test_result + "/*.parquet", ).select(pl.col(cols)).head(1_000).show_graph()

In [10] used -2.4 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 147.8 MiB


In [57]:
# Eagerly load just the one column used by the value_counts comparisons below (no row limit).
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", n_rows=1_000_000, columns=cols)
dfp = pl.read_parquet(
    parquet_test_result + "/*.parquet",
    columns=['test_class_id'],
)

In [57] used 4015.0 MiB RAM in 4.32s (system mean cpu 18%, single max cpu 100%), peaked 2466.0 MiB above final usage, current RAM usage now 13737.0 MiB


In [61]:
# Frequency of each test_class_id, computed eagerly on the column as a Series.
dfp['test_class_id'].value_counts()

test_class_id,counts
i64,u32
0,37501
1,4831754
2,13385573
3,238944
4,604886542
5,920358
7,15206290


In [61] used -0.3 MiB RAM in 4.90s (system mean cpu 23%, single max cpu 48%), peaked 0.0 MiB above final usage, current RAM usage now 13730.6 MiB


In [58]:
# Same eager frequency table, ordered from most to least common class.
(
    dfp['test_class_id']
    .value_counts()
    .sort(by='counts', descending=True)
)

test_class_id,counts
i64,u32
4,604886542
7,15206290
2,13385573
1,4831754
5,920358
3,238944
0,37501


In [58] used -6.1 MiB RAM in 5.10s (system mean cpu 7%, single max cpu 9%), peaked 0.0 MiB above final usage, current RAM usage now 13730.9 MiB


In [41]:
# Disabled attempt: it subscripts a LazyFrame with ['test_class_id'], which a later
# cell shows raising "TypeError: 'LazyFrame' object is not subscriptable".
#pl.scan_parquet(parquet_test_result + "/*.parquet").head(10).select([pl.col("test_class_id")])['test_class_id'].value_counts().sort(by="counts", descending=True).collect()

In [41] used 0.0 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 9189.0 MiB


In [65]:
# Demonstration of a known failure: LazyFrame has no frame-level `value_counts`.
# The AttributeError is caught and printed so that Restart & Run All continues past
# this cell; the expression form `select(pl.col(...).value_counts())` is the
# working alternative.
try:
    pl.scan_parquet(parquet_test_result + "/*.parquet").select(pl.col("test_class_id")).value_counts().collect()
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: 'LazyFrame' object has no attribute 'value_counts'

In [65] used 1.6 MiB RAM in 0.15s (system mean cpu 3%, single max cpu 7%), peaked 0.0 MiB above final usage, current RAM usage now 13732.0 MiB


In [68]:
# Select the column lazily, materialise it, then count values eagerly on the Series.
dfp = (
    pl.scan_parquet(parquet_test_result + "/*.parquet")
    .select(pl.col("test_class_id"))
    .collect()
)
dfp['test_class_id'].value_counts().sort(by='counts', descending=True)

test_class_id,counts
i64,u32
4,604886542
7,15206290
2,13385573
1,4831754
5,920358
3,238944
0,37501


In [68] used 3940.1 MiB RAM in 8.05s (system mean cpu 27%, single max cpu 100%), peaked 2781.3 MiB above final usage, current RAM usage now 16897.4 MiB


In [19]:
# can't do sort(by='count', descending=False)
res = pl.scan_parquet(parquet_test_result + "/*.parquet").select(pl.col("test_class_id").value_counts()).unnest("test_class_id") \
.sort(by='counts', descending=True).collect()
res

test_class_id,counts
i64,u32
4,604886542
7,15206290
2,13385573
1,4831754
5,920358
3,238944
0,37501


In [19] used 16.4 MiB RAM in 17.08s (system mean cpu 13%, single max cpu 100%), peaked 8928.8 MiB above final usage, current RAM usage now 972.1 MiB


AttributeError: 'DataFrame' object has no attribute 'unest'

In [13] used 0.0 MiB RAM in 0.11s (system mean cpu 6%, single max cpu 12%), peaked 0.0 MiB above final usage, current RAM usage now 988.7 MiB


In [62]:
# Demonstration of a known failure: a LazyFrame cannot be indexed with a column
# name. The TypeError is caught and printed so that Restart & Run All continues
# past this cell; `res` is left unassigned here, exactly as when the error propagated.
try:
    res = pl.scan_parquet(parquet_test_result + "/*.parquet").head(10000).select(pl.col("test_class_id"))['test_class_id'].value_counts().collect()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'LazyFrame' object is not subscriptable (aside from slicing). Use 'select()' or 'filter()' instead.

In [62] used -3.3 MiB RAM in 0.14s (system mean cpu 4%, single max cpu 6%), peaked 3.3 MiB above final usage, current RAM usage now 13727.2 MiB


In [82]:
# Largest test_class_id across all part files, computed through the lazy API.
(
    pl.scan_parquet(parquet_test_result + "/*.parquet")
    .select(pl.col("test_class_id"))
    .max()
    .collect()
)

test_class_id
i64
7


In [82] used 55.7 MiB RAM in 2.84s (system mean cpu 30%, single max cpu 100%), peaked 7740.4 MiB above final usage, current RAM usage now 16794.7 MiB


In [83]:
# Schema of the lazily selected column, read from the LazyFrame without calling collect().
(
    pl.scan_parquet(parquet_test_result + "/*.parquet")
    .select(pl.col("test_class_id"))
    .schema
)

{'test_class_id': Int64}

In [83] used -2.0 MiB RAM in 0.23s (system mean cpu 5%, single max cpu 11%), peaked 2.0 MiB above final usage, current RAM usage now 16792.7 MiB


# Now try using the new Dask export that lacks the path categorical

In [7]:
# Disabled: retry of the pyarrow-backed reader against the new export (no 'path' column).
#dfp = pl.read_parquet(parquet_test_result + "/*.parquet", use_pyarrow=True, rechunk=False)
# still can't read as the glob isn't understood

In [7] used 0.1 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 241.2 MiB


In [9]:
# Sanity check on the new export: take the first 100 rows lazily and count test_class_id values.
(
    pl.scan_parquet(parquet_test_result + "/*.parquet")
    .head(100)
    .select(pl.col("test_class_id").count())
    .collect()
)

test_class_id
u32
100


In [9] used 97.5 MiB RAM in 10.20s (system mean cpu 9%, single max cpu 100%), peaked 4781.6 MiB above final usage, current RAM usage now 336.9 MiB
