# Pandas groupby on parquet with numpy vs arrow

In [1]:
import pandas as pd
import polars as pl
from humanfriendly import format_size, format_number
from simpler_mpl import set_commas, set_common_mpl_styles
import matplotlib.pyplot as plt
import seaborn as sns

# Reload edited modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2

import datetime, os
# Record library versions, run time and conda environment alongside the results.
display(f"Pandas {pd.__version__}, Polars {pl.__version__}")
display(f'Running: {datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")}, env {os.getenv("CONDA_DEFAULT_ENV")}')

# Start per-cell RAM/time reporting (the "In [n] used ... MiB RAM" lines printed after each cell).
%load_ext ipython_memory_usage
%imu_start


def show_rows_memory(df, deep=False):
    """
    Show rows and memory cost of a Pandas/Polars dataframe.

    The dataframe flavour is detected by duck typing rather than by
    catching `AttributeError`, so an `AttributeError` raised *inside* a
    real `memory_usage()` / `estimated_size()` call is no longer silently
    swallowed and reported as a size of 0.

    Parameters
    ----------
    df : object
        A Pandas dataframe (has `memory_usage`) or a Polars dataframe
        (has `estimated_size`). Anything else is reported as "Unknown"
        with a size of 0; it must still expose `shape`.
    deep : bool, default False
        Forwarded to Pandas `memory_usage`. `deep=True` only has an impact
        on Pandas numpy-backed string columns, which otherwise are
        undercounted. Ignored for Polars.

    Returns
    -------
    None
        The summary string is sent to `display`.
    """
    pandas_usage = getattr(df, "memory_usage", None)
    polars_size = getattr(df, "estimated_size", None)

    # Pandas is checked first so that it takes precedence if an object
    # happens to offer both methods (same precedence as before).
    if callable(pandas_usage):
        # The index is deliberately excluded from the total.
        num_bytes = pandas_usage(deep=deep, index=False).sum()
        df_type = "Pandas"
    elif callable(polars_size):
        num_bytes = polars_size()
        df_type = "Polars"
    else:
        num_bytes = 0
        df_type = "Unknown"

    # format_size() already includes the unit (e.g. "10.4 GB"), so no
    # literal "bytes" suffix is appended.
    display(
        f"{df_type} df with {format_number(df.shape[0])} rows, {format_size(num_bytes)}"
    )

'Pandas 2.1.3, Polars 0.19.17'

'Running: 20231203 10:17:40, env pydataglobal2023'

Enabling IPython Memory Usage, use %imu_start to begin, %imu_stop to end
In [1] used 0.4 MiB RAM in 0.19s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 212.5 MiB


In [2]:
# Load the parquet file using the PyArrow dtype backend.
dfpda = pd.read_parquet("../test_result_2021on.parquet", dtype_backend="pyarrow")

In [2] used 12857.8 MiB RAM in 6.51s (system mean cpu 69%, single max cpu 100%), peaked 2520.2 MiB above final usage, current RAM usage now 13070.3 MiB


In [3]:
# Summarise row count, column dtypes and memory usage of the Arrow-backed frame.
dfpda.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82012245 entries, 0 to 82012244
Data columns (total 14 columns):
 #   Column             Dtype                 
---  ------             -----                 
 0   test_id            int64[pyarrow]        
 1   vehicle_id         int64[pyarrow]        
 2   test_date          timestamp[us][pyarrow]
 3   test_class_id      int64[pyarrow]        
 4   test_type          large_string[pyarrow] 
 5   test_result        large_string[pyarrow] 
 6   test_mileage       int64[pyarrow]        
 7   postcode_area      large_string[pyarrow] 
 8   make               large_string[pyarrow] 
 9   model              large_string[pyarrow] 
 10  colour             large_string[pyarrow] 
 11  fuel_type          large_string[pyarrow] 
 12  cylinder_capacity  int64[pyarrow]        
 13  first_use_date     timestamp[us][pyarrow]
dtypes: int64[pyarrow](5), large_string[pyarrow](7), timestamp[us][pyarrow](2)
memory usage: 10.4 GB
In [3] used -48.0 MiB RAM in 1.4

In [4]:
# Boolean mask marking rows where cylinder_capacity is missing.
mask = dfpda["cylinder_capacity"].isna()
mask.sum()  # number of rows that would be removed

242604

In [4] used 78.3 MiB RAM in 0.28s (system mean cpu 24%, single max cpu 100%), peaked 0.0 MiB above final usage, current RAM usage now 13100.6 MiB


In [5]:
# Keep only the rows that have a cylinder_capacity value (`mask` is built in the previous cell).
# NOTE(review): the memory report for this cell shows roughly 11 GiB of extra RAM, and in the
# visible notebook this frame is only referenced by commented-out code; consider deleting it
# once it is no longer needed.
df_nonan_cylcap = dfpda[~mask]
df_nonan_cylcap.shape

(81769641, 14)

In [5] used 11314.2 MiB RAM in 18.96s (system mean cpu 14%, single max cpu 100%), peaked 1163.7 MiB above final usage, current RAM usage now 24414.8 MiB


In [6]:
# agg_result = df_nonan_cylcap.groupby('make')['cylinder_capacity'].agg(['median', 'size'])
# agg_result.query('size > 10').sort_values('median')
# agg_result.query('size == 1').sort_index()

In [6] used 0.0 MiB RAM in 0.10s (system mean cpu 16%, single max cpu 91%), peaked 0.0 MiB above final usage, current RAM usage now 24414.8 MiB


In [7]:
# mask = dfpda.make.str.startswith('TESLA')
# dfpda[mask] # so TESLA 40k rows, but NA cylinder_capacity and fuel_type Electric or EL!

In [7] used 0.0 MiB RAM in 0.10s (system mean cpu 12%, single max cpu 64%), peaked 0.0 MiB above final usage, current RAM usage now 24414.8 MiB


In [8]:
# result = dfpda[~dfpda['cylinder_capacity'].isna()] \ # same speed

In [8] used 0.0 MiB RAM in 0.10s (system mean cpu 14%, single max cpu 100%), peaked 0.0 MiB above final usage, current RAM usage now 24414.8 MiB


In [9]:
%%time
# Timed run on the PyArrow-backed frame: drop rows without a cylinder_capacity,
# compute the median and row count per make, keep makes with more than 10 rows,
# and order by median.
result = (
    dfpda.dropna(subset=["cylinder_capacity"])
    .groupby("make")["cylinder_capacity"]
    .agg(["median", "count"])
    .query("count > 10")
    .sort_values("median")
)
# This cell emits:
#   "RuntimeWarning: Engine has switched to 'python'
#    because numexpr does not support extension
#    array dtypes"
# Note: this is probably caused by `.query()`, which evaluates its expression via `eval`;
# numexpr does not support extension-array dtypes, so pandas falls back to the 'python' engine.
result

CPU times: user 14.8 s, sys: 12.6 s, total: 27.3 s
Wall time: 27.4 s




Unnamed: 0_level_0,median,count
make,Unnamed: 1_level_1,Unnamed: 2_level_1
TESLA,0.0,16
VECTRIX,4.0,13
ATALA,49.0,14
BEELINE,49.0,304
BOATIAN,49.0,11
...,...,...
ERF,10000.0,30
LEYLAND NATIONAL,10450.0,15
NEOPLAN,12609.5,18
KENWORTH,14000.0,11


In [9] used -4.1 MiB RAM in 27.53s (system mean cpu 16%, single max cpu 100%), peaked 13851.7 MiB above final usage, current RAM usage now 24410.7 MiB


In [10]:
# Sanity-check one make against hard-coded expected values (single label-based lookup).
assert result.loc["ROLLS ROYCE", "median"] == 6749.0
assert result.loc["ROLLS ROYCE", "count"] == 11741.0

In [10] used 0.8 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 24411.4 MiB


In [11]:
# mask = result.index.str.startswith('ROLL')
# result[mask]

In [11] used 0.6 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 24412.0 MiB


# Repeat with the NumPy-nullable backend for comparison

In [12]:
# Load the same parquet file again, this time using the NumPy-nullable dtype backend.
dfpdn = pd.read_parquet("../test_result_2021on.parquet", dtype_backend="numpy_nullable")

In [12] used 20589.0 MiB RAM in 29.20s (system mean cpu 26%, single max cpu 100%), peaked 3026.9 MiB above final usage, current RAM usage now 45001.1 MiB


In [13]:
%%time
# Timed run of the same aggregation on the NumPy-nullable frame: median cylinder_capacity
# and row count per make, keeping makes with more than 10 rows, ordered by median.
# Alternative filter, kept for reference:
# result = dfpdn[~dfpdn['cylinder_capacity'].isna()] \
result = (
    dfpdn.dropna(subset=["cylinder_capacity"])
    .groupby("make")["cylinder_capacity"]
    .agg(["median", "count"])
    .query("count > 10")
    .sort_values("median")
)
result

CPU times: user 9.08 s, sys: 6.9 s, total: 16 s
Wall time: 13.9 s




Unnamed: 0_level_0,median,count
make,Unnamed: 1_level_1,Unnamed: 2_level_1
TESLA,0.0,16
VECTRIX,4.0,13
BOATIAN,49.0,11
EASY RIDER,49.0,575
DIRECTBIKES DB50QT-A,49.0,16
...,...,...
ERF,10000.0,30
LEYLAND NATIONAL,10450.0,15
NEOPLAN,12609.5,18
KENWORTH,14000.0,11


In [13] used -9886.1 MiB RAM in 14.00s (system mean cpu 11%, single max cpu 100%), peaked 12243.7 MiB above final usage, current RAM usage now 35114.9 MiB


In [14]:
# Sanity-check one make against hard-coded expected values (single label-based lookup).
assert result.loc["ROLLS ROYCE", "median"] == 6749.0
assert result.loc["ROLLS ROYCE", "count"] == 11741.0

In [14] used 0.0 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 35114.9 MiB
