In [1]:
import polars as pl

In [2]:
%%time

# Lazily describe a per-vehicle summary of the test results: keep only the
# columns of interest, restrict to the chosen makes and models, then reduce to
# one row per vehicle_id. The query only runs when .collect() is called.
# NOTE(review): the make and model conditions are independent, so any listed
# make paired with any listed model passes — confirm that is intended.
# NOTE(review): .last() takes whichever row comes last within each group, i.e.
# it depends on input row order rather than on test_date — confirm these
# attributes are constant per vehicle or that the input is suitably ordered.
# NOTE(review): "last_known_mileage" is the maximum recorded mileage, which is
# presumably (not necessarily) the reading from the latest test.
ldf = (
    pl.scan_parquet("../../test_result.parquet/*")
    .select(
        [
            "vehicle_id",
            "make",
            "model",
            "fuel_type",
            "cylinder_capacity",
            "first_use_date",
            "test_date",
            "test_mileage",
        ]
    )
    .filter(
        (pl.col("make").is_in(["VOLVO", "ROVER", "VOLKSWAGEN"]))
        & (pl.col("model").is_in(["V50", "200", "PASSAT"]))
    )
    .groupby("vehicle_id")
    .agg(
        [
            # Descriptive attributes: one value per vehicle.
            pl.col(
                ["make", "model", "fuel_type", "cylinder_capacity", "first_use_date"]
            ).last(),
            # Most recent test date and highest mileage seen for the vehicle.
            pl.col("test_date").max().alias("last_test_date"),
            pl.col("test_mileage").max().alias("last_known_mileage"),
        ]
    )
)

CPU times: user 17.6 ms, sys: 71.9 ms, total: 89.5 ms
Wall time: 498 ms


{'vehicle_id': Int64,
 'make': Utf8,
 'model': Utf8,
 'fuel_type': Utf8,
 'cylinder_capacity': Int64,
 'first_use_date': Datetime(time_unit='ns', time_zone=None),
 'last_test_date': Datetime(time_unit='ns', time_zone=None),
 'last_known_mileage': Int64}

In [None]:
# Show the output column names and dtypes of the lazy query.
ldf.schema

In [3]:
%%time
# Execute the lazy query with the streaming engine and keep the materialised
# result in `edf`.
edf = ldf.collect(streaming=True)

PARTITIONED DS: estimated cardinality: 0.99440104 exceeded the boundary: 0.4, running default HASH AGGREGATION


CPU times: user 2min 13s, sys: 41.3 s, total: 2min 54s
Wall time: 15.1 s


In [4]:
# Preview the first rows of the collected per-vehicle summary.
edf.head()

vehicle_id,make,model,fuel_type,cylinder_capacity,first_use_date,last_test_date,last_known_mileage
i64,str,str,str,i64,datetime[ns],datetime[ns],i64
1021476656,"""VOLKSWAGEN""","""PASSAT""","""PE""",1984,2002-12-18 00:00:00,2018-07-16 00:00:00,85559
902520208,"""VOLKSWAGEN""","""PASSAT""","""DI""",1896,1999-12-31 00:00:00,2013-08-06 00:00:00,152665
477485504,"""VOLKSWAGEN""","""PASSAT""","""DI""",1968,2006-10-17 00:00:00,2022-03-24 00:00:00,106259
928707056,"""VOLKSWAGEN""","""PASSAT""","""DI""",1968,2010-03-18 00:00:00,2022-05-05 00:00:00,177222
1163839536,"""VOLKSWAGEN""","""PASSAT""","""PE""",1781,1999-10-22 00:00:00,2007-11-05 00:00:00,61711


In [5]:
%%time

# Lazily describe a per-vehicle summary read from the
# "test_result_sorted.parquet" dataset: keep only the columns of interest,
# restrict to the chosen makes and models, then reduce to one row per
# vehicle_id. The query only runs when .collect() is called.
# NOTE(review): the make and model conditions are independent, so any listed
# make paired with any listed model passes — confirm that is intended.
# NOTE(review): .last() depends on row order within each group; presumably the
# "sorted" dataset is ordered so that this is the latest record — confirm.
ldf = (
    pl.scan_parquet("../../test_result_sorted.parquet/*")
    .select(
        [
            "vehicle_id",
            "make",
            "model",
            "fuel_type",
            "cylinder_capacity",
            "first_use_date",
            "test_date",
            "test_mileage",
        ]
    )
    .filter(
        (pl.col("make").is_in(["VOLVO", "ROVER", "VOLKSWAGEN"]))
        & (pl.col("model").is_in(["V50", "200", "PASSAT"]))
    )
    .groupby("vehicle_id")
    .agg(
        [
            # Descriptive attributes: one value per vehicle.
            pl.col(
                ["make", "model", "fuel_type", "cylinder_capacity", "first_use_date"]
            ).last(),
            # Most recent test date and highest mileage seen for the vehicle.
            pl.col("test_date").max().alias("last_test_date"),
            pl.col("test_mileage").max().alias("last_known_mileage"),
        ]
    )
)

CPU times: user 103 ms, sys: 177 ms, total: 281 ms
Wall time: 556 ms


{'vehicle_id': Int64,
 'make': Utf8,
 'model': Utf8,
 'fuel_type': Utf8,
 'cylinder_capacity': Int64,
 'first_use_date': Datetime(time_unit='us', time_zone=None),
 'last_test_date': Datetime(time_unit='us', time_zone=None),
 'last_known_mileage': Int64}

In [None]:
# Show the output column names and dtypes of the lazy query.
ldf.schema

In [6]:
%%time
# Execute the lazy query with the streaming engine and keep the materialised
# result in `edf`.
edf = ldf.collect(streaming=True)

PARTITIONED DS: estimated cardinality: 0.9920015 exceeded the boundary: 0.4, running default HASH AGGREGATION


CPU times: user 2min 6s, sys: 31.3 s, total: 2min 37s
Wall time: 12.9 s


In [None]:
%%time

# Lazily describe a per-vehicle summary read from the
# "test_result_sorted.parquet" dataset, this time scanning with
# low_memory=True: keep only the columns of interest, restrict to the chosen
# makes and models, then reduce to one row per vehicle_id. The query only runs
# when .collect() is called.
# NOTE(review): the make and model conditions are independent, so any listed
# make paired with any listed model passes — confirm that is intended.
# NOTE(review): .last() depends on row order within each group; presumably the
# "sorted" dataset is ordered so that this is the latest record — confirm.
ldf = (
    pl.scan_parquet("../../test_result_sorted.parquet/*", low_memory=True)
    .select(
        [
            "vehicle_id",
            "make",
            "model",
            "fuel_type",
            "cylinder_capacity",
            "first_use_date",
            "test_date",
            "test_mileage",
        ]
    )
    .filter(
        (pl.col("make").is_in(["VOLVO", "ROVER", "VOLKSWAGEN"]))
        & (pl.col("model").is_in(["V50", "200", "PASSAT"]))
    )
    .groupby("vehicle_id")
    .agg(
        [
            # Descriptive attributes: one value per vehicle.
            pl.col(
                ["make", "model", "fuel_type", "cylinder_capacity", "first_use_date"]
            ).last(),
            # Most recent test date and highest mileage seen for the vehicle.
            pl.col("test_date").max().alias("last_test_date"),
            pl.col("test_mileage").max().alias("last_known_mileage"),
        ]
    )
)

In [8]:
# Show the output column names and dtypes of the lazy query.
ldf.schema

{'vehicle_id': Int64,
 'make': Utf8,
 'model': Utf8,
 'fuel_type': Utf8,
 'cylinder_capacity': Int64,
 'first_use_date': Datetime(time_unit='us', time_zone=None),
 'last_test_date': Datetime(time_unit='us', time_zone=None),
 'last_known_mileage': Int64}

In [None]:
%%time
# Execute the lazy query with the streaming engine and keep the materialised
# result in `edf`.
edf = ldf.collect(streaming=True)

In [None]:
%%time
# Persist the collected summary as a parquet file; the path is relative, so it
# is written to the notebook's current working directory.
edf.write_parquet("volvo_data_polars.parquet")

In [None]:
# Preview the first rows of the collected per-vehicle summary.
edf.head()