In [1]:
import polars as pl

In [15]:
%%time

# Build a lazy per-vehicle summary over the `test_result.parquet` dataset.
# `scan_parquet` is lazy, so this cell only constructs the query plan; the
# data is processed when the plan is collected.
ldf = (
    # `low_memory` is left at its default for this run.
    pl.scan_parquet('../../test_result.parquet/*')
    .select('vehicle_id', 'make', 'model', 'fuel_type', 'cylinder_capacity',
            'first_use_date', 'test_date', 'test_mileage')
    # The two filters are applied independently: a row passes when its make
    # is in the first list AND its model is in the second list.
    .filter(pl.col('make').is_in(["VOLVO", "ROVER", "VOLKSWAGEN"]))
    .filter(pl.col('model').is_in(["V50", "200", "PASSAT"]))
    # Collapse to one row per vehicle.
    .groupby('vehicle_id')
    .agg(
        # Descriptive columns: keep the last value seen in each group.
        pl.col('make', 'model', 'fuel_type', 'cylinder_capacity', 'first_use_date').last(),
        pl.col('test_date').max().alias('last_test_date'),
        # NOTE(review): this is the maximum recorded mileage, which is not
        # necessarily the mileage at `last_test_date` - confirm that is intended.
        pl.col('test_mileage').max().alias('last_known_mileage'),
    )
)

# Display the resolved output schema of the lazy plan.
ldf.schema

CPU times: user 21.9 ms, sys: 11.4 ms, total: 33.3 ms
Wall time: 32.7 ms


{'vehicle_id': Int64,
 'make': Utf8,
 'model': Utf8,
 'fuel_type': Utf8,
 'cylinder_capacity': Int64,
 'first_use_date': Datetime(time_unit='us', time_zone=None),
 'last_test_date': Datetime(time_unit='us', time_zone=None),
 'last_known_mileage': Int64}

In [16]:
%%time
# Execute the lazy plan in streaming mode and keep the eager result in `edf`.
edf = ldf.collect(streaming=True)

CPU times: user 3min 2s, sys: 51.7 s, total: 3min 54s
Wall time: 2min 15s


In [15]:
%%time

# Build a lazy per-vehicle summary over the `test_result_sorted.parquet`
# dataset. `scan_parquet` is lazy, so this cell only constructs the query
# plan; the data is processed when the plan is collected.
ldf = (
    # `low_memory` is left at its default for this run.
    pl.scan_parquet('../../test_result_sorted.parquet/*')
    .select('vehicle_id', 'make', 'model', 'fuel_type', 'cylinder_capacity',
            'first_use_date', 'test_date', 'test_mileage')
    # The two filters are applied independently: a row passes when its make
    # is in the first list AND its model is in the second list.
    .filter(pl.col('make').is_in(["VOLVO", "ROVER", "VOLKSWAGEN"]))
    .filter(pl.col('model').is_in(["V50", "200", "PASSAT"]))
    # Collapse to one row per vehicle.
    .groupby('vehicle_id')
    .agg(
        # Descriptive columns: keep the last value seen in each group.
        pl.col('make', 'model', 'fuel_type', 'cylinder_capacity', 'first_use_date').last(),
        pl.col('test_date').max().alias('last_test_date'),
        # NOTE(review): this is the maximum recorded mileage, which is not
        # necessarily the mileage at `last_test_date` - confirm that is intended.
        pl.col('test_mileage').max().alias('last_known_mileage'),
    )
)

# Display the resolved output schema of the lazy plan.
ldf.schema

CPU times: user 21.9 ms, sys: 11.4 ms, total: 33.3 ms
Wall time: 32.7 ms


{'vehicle_id': Int64,
 'make': Utf8,
 'model': Utf8,
 'fuel_type': Utf8,
 'cylinder_capacity': Int64,
 'first_use_date': Datetime(time_unit='us', time_zone=None),
 'last_test_date': Datetime(time_unit='us', time_zone=None),
 'last_known_mileage': Int64}

In [16]:
%%time
# Execute the lazy plan in streaming mode and keep the eager result in `edf`.
edf = ldf.collect(streaming=True)

CPU times: user 3min 2s, sys: 51.7 s, total: 3min 54s
Wall time: 2min 15s


In [17]:
%%time

# Lazy per-vehicle summary over the sorted dataset, this time scanning with
# `low_memory=True`. Only the query plan is built here.
ldf = (
    pl.scan_parquet("../../test_result_sorted.parquet/*", low_memory=True)
    .select(
        [
            "vehicle_id",
            "make",
            "model",
            "fuel_type",
            "cylinder_capacity",
            "first_use_date",
            "test_date",
            "test_mileage",
        ]
    )
    # Row must match one of the makes and, separately, one of the models.
    .filter(pl.col("make").is_in(["VOLVO", "ROVER", "VOLKSWAGEN"]))
    .filter(pl.col("model").is_in(["V50", "200", "PASSAT"]))
    # One output row per vehicle.
    .groupby("vehicle_id")
    .agg(
        [
            # Last value seen per group for the descriptive columns.
            pl.col(
                ["make", "model", "fuel_type", "cylinder_capacity", "first_use_date"]
            ).last(),
            # Latest test date and highest recorded mileage per vehicle.
            pl.col("test_date").max().alias("last_test_date"),
            pl.col("test_mileage").max().alias("last_known_mileage"),
        ]
    )
)

# Show the schema the plan will produce.
ldf.schema

CPU times: user 31.6 ms, sys: 0 ns, total: 31.6 ms
Wall time: 30.7 ms


{'vehicle_id': Int64,
 'make': Utf8,
 'model': Utf8,
 'fuel_type': Utf8,
 'cylinder_capacity': Int64,
 'first_use_date': Datetime(time_unit='us', time_zone=None),
 'last_test_date': Datetime(time_unit='us', time_zone=None),
 'last_known_mileage': Int64}

In [18]:
%%time
# Execute the lazy plan in streaming mode and keep the eager result in `edf`.
edf = ldf.collect(streaming=True)

CPU times: user 3min 1s, sys: 52.2 s, total: 3min 53s
Wall time: 2min 14s


In [8]:
%%time
# Persist the collected per-vehicle summary to a parquet file in the
# current working directory.
# NOTE(review): the file name says "volvo", but the queries visible in this
# notebook also keep ROVER and VOLKSWAGEN rows - confirm the name is still accurate.
edf.write_parquet('volvo_data_polars.parquet')

CPU times: user 51 ms, sys: 38.6 ms, total: 89.5 ms
Wall time: 88.3 ms


In [9]:
# Preview the first rows of the collected result as a sanity check.
edf.head()

vehicle_id,make,model,fuel_type,cylinder_capacity,first_use_date,last_test_date,last_known_mileage
i64,str,str,str,i64,datetime[μs],datetime[μs],i64
503093648,"""VOLVO""","""V50""","""DI""",1997.0,2011-11-03 00:00:00,2022-11-05 00:00:00,62625
638811922,"""VOLVO""","""V50""","""PE""",2521.0,2007-05-15 00:00:00,2022-06-14 00:00:00,108915
638742268,"""VOLVO""","""V50""","""DI""",2401.0,2007-03-27 00:00:00,2022-04-04 00:00:00,34836
656698580,"""VOLVO""","""V50""","""DI""",1998.0,2005-03-30 00:00:00,2017-12-28 00:00:00,215205
808792404,,,,,,2022-10-07 00:00:00,100490
