# Try some queries

* read 1 parquet file, select date range
* count pass rates by day for petrols, same for fails - value_counts, groupby
* read all 2021 rows

How do we?

* count rows in a file using scan_parquet, without instantiating it

In [1]:
import polars as pl

display(pl.__version__)

from cell_profiler import cell_profiler as cp
%start_cell_profiler

'0.17.10'

'Profiling enabled'

In [1] used 0.0 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 78.1 MiB


# Read 1 file, calculate pass and fail rates

In [2]:
parquet_test_result = "../test_result.parquet/part.1.parquet"
#pl.read_parquet_schema(parquet_test_result + "/part.1.parquet")
df = pl.read_parquet(parquet_test_result)
df.head()

test_id,vehicle_id,test_date,test_class_id,test_type,test_result,test_mileage,postcode_area,make,model,colour,fuel_type,cylinder_capacity,first_use_date
i64,i64,datetime[ns],i64,str,str,i64,str,str,str,str,str,i64,datetime[ns]
501924826,995588368,2005-07-01 00:00:00,4,"""NT""","""F""",72145,"""NR""","""FORD""","""FIESTA""","""RED""","""PE""",1242,1998-06-30 00:00:00
52672256,844414844,2005-07-01 00:00:00,7,"""NT""","""P""",107920,"""CR""","""FORD""","""TRANSIT""","""WHITE""","""DI""",2496,2001-02-12 00:00:00
1516474096,123004922,2005-07-01 00:00:00,4,"""RT""","""P""",60483,"""NE""","""FIAT""","""PUNTO""","""BLUE""","""PE""",1242,1999-03-26 00:00:00
1322772390,1477765462,2005-07-01 00:00:00,4,"""NT""","""P""",83307,"""ME""","""FORD""","""ESCORT""","""RED""","""DI""",1753,1999-01-17 00:00:00
1924636408,1169641864,2005-07-01 00:00:00,4,"""NT""","""F""",56870,"""B""","""PEUGEOT""","""206""","""GREY""","""PE""",1124,2002-03-25 00:00:00


In [2] used 130.9 MiB RAM in 0.21s (system mean cpu 11%, single max cpu 34%), peaked 0.0 MiB above final usage, current RAM usage now 209.0 MiB


In [17]:
# Monday is 1, Sunday 7
df.filter((pl.col('test_result')=='P') & (pl.col('fuel_type')=='PE'))['test_date'].dt.weekday().value_counts()

test_date,counts
u32,u32
1,78057
2,83191
3,76524
4,65422
5,80020
6,37068
7,510


In [17] used 7.3 MiB RAM in 0.13s (system mean cpu 7%, single max cpu 18%), peaked 0.0 MiB above final usage, current RAM usage now 613.4 MiB


In [24]:
pass_cnt_days = df.filter((pl.col('test_result')=='P') & (pl.col('fuel_type')=='PE')).groupby(pl.col('test_date').dt.weekday()).count()
pass_cnt_days

test_date,count
u32,u32
1,78057
2,83191
3,76524
4,65422
5,80020
6,37068
7,510


In [24] used 14.4 MiB RAM in 0.14s (system mean cpu 7%, single max cpu 11%), peaked 0.0 MiB above final usage, current RAM usage now 672.0 MiB


In [25]:
failure_cnt_days = df.filter((pl.col('test_result')=='F') & (pl.col('fuel_type')=='PE')).groupby(pl.col('test_date').dt.weekday()).count()
failure_cnt_days

test_date,count
u32,u32
1,26352
2,27530
3,24956
4,19822
5,23575
6,10512
7,128


In [25] used 0.3 MiB RAM in 0.12s (system mean cpu 8%, single max cpu 13%), peaked 0.0 MiB above final usage, current RAM usage now 672.3 MiB


In [28]:
pass_cnt_days/failure_cnt_days

test_date,count
f64,f64
1.0,2.96209
1.0,3.021831
1.0,3.066357
1.0,3.300474
1.0,3.394274
1.0,3.526256
1.0,3.984375


In [28] used 0.0 MiB RAM in 0.10s (system mean cpu 7%, single max cpu 19%), peaked 0.0 MiB above final usage, current RAM usage now 672.1 MiB


# Try filtering on all data

In [12]:
import datetime
#df.filter(pl.col('test_date') > datetime.datetime(2006, 1, 1))
df.filter(pl.col('test_date') > datetime.datetime(2006, 1, 1)).shape

(0, 14)

In [12] used 0.3 MiB RAM in 0.11s (system mean cpu 7%, single max cpu 10%), peaked 0.0 MiB above final usage, current RAM usage now 348.3 MiB


In [None]:
df.head(5)

In [13]:
dt = datetime.datetime(2021, 1, 1)
dfp = pl.scan_parquet('../test_result.parquet/*.parquet').filter(pl.col('test_date') > dt).collect()
dfp.shape

(82012245, 14)

In [13] used 12413.6 MiB RAM in 32.20s (system mean cpu 79%, single max cpu 100%), peaked 9460.4 MiB above final usage, current RAM usage now 12761.8 MiB


In [17]:
f"{dfp.estimated_size():,} bytes"

'11,167,322,973'

In [17] used 0.0 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 12778.3 MiB


In [23]:
dfp['test_date'].max(), dfp['test_date'].min()

(datetime.datetime(2022, 12, 31, 0, 0), datetime.datetime(2021, 1, 2, 0, 0))

In [23] used 0.0 MiB RAM in 0.20s (system mean cpu 8%, single max cpu 17%), peaked 0.0 MiB above final usage, current RAM usage now 12776.0 MiB


In [26]:
passes_cnt_days = dfp.filter(pl.col('test_result')=='P').groupby(pl.col('test_date').dt.weekday()).count()

In [26] used -97.4 MiB RAM in 4.46s (system mean cpu 51%, single max cpu 100%), peaked 10736.7 MiB above final usage, current RAM usage now 11950.8 MiB


In [27]:
failures_cnt_days = dfp.filter(pl.col('test_result')=='F').groupby(pl.col('test_date').dt.weekday()).count()

In [27] used -289.6 MiB RAM in 2.33s (system mean cpu 51%, single max cpu 100%), peaked 2390.6 MiB above final usage, current RAM usage now 11661.2 MiB


In [28]:
passes_cnt_days / failures_cnt_days

test_date,count
f64,f64
1.0,3.690846
1.0,3.843502
1.0,3.93646
1.0,4.031116
1.0,4.370042
1.0,5.245285
1.0,5.428355


In [28] used 0.1 MiB RAM in 0.11s (system mean cpu 9%, single max cpu 17%), peaked 0.0 MiB above final usage, current RAM usage now 11661.3 MiB


# Try filtering and writing out

In [40]:
#pl.scan_parquet('../test_result.parquet/*.parquet').

In [40] used 0.0 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 11674.2 MiB


In [41]:
dt = datetime.datetime(2021, 1, 1)
dfp = pl.scan_parquet('../test_result.parquet/*.parquet').filter(pl.col('test_date') > dt).collect()


In [41] used 11792.8 MiB RAM in 25.84s (system mean cpu 89%, single max cpu 100%), peaked 9667.8 MiB above final usage, current RAM usage now 23467.0 MiB


In [42]:
dfp.shape

(82012245, 14)

In [42] used 0.0 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 23467.0 MiB


In [35]:
import pathlib
base_path = '../test_result.parquet'
new_path = pathlib.Path(base_path) / "test_result_2021on.parquet"
new_path

PosixPath('../test_result.parquet/test_result_2021on.parquet')

In [35] used 0.0 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 11668.5 MiB


In [37]:
# default compression zstd, 'snappy' is allowed
# dfp.write_parquet(new_path)
# writes out a 2GB file in test_result as a single file
# ought to figure out how to partition it a bit?

In [37] used 0.0 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 11678.0 MiB
