In [1]:
import pandas as pd
import numpy as np
# Report the library versions this notebook was executed with.
for label, module in (('pandas', pd), ('numpy', np)):
    print(label, module.__version__)

pandas 2.0.0rc0
numpy 1.22.0


In [2]:
# Baseline load: default parser and default dtypes; this frame is the
# "NumPy" side of every comparison that follows.
df = pd.read_csv(filepath_or_buffer='data/Batting.csv')
# Column-by-column summary (non-null counts and dtypes) of the baseline frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102816 entries, 0 to 102815
Data columns (total 22 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   playerID  102816 non-null  object 
 1   yearID    102816 non-null  int64  
 2   stint     102816 non-null  int64  
 3   teamID    102816 non-null  object 
 4   lgID      102079 non-null  object 
 5   G         102816 non-null  int64  
 6   AB        102816 non-null  int64  
 7   R         102816 non-null  int64  
 8   H         102816 non-null  int64  
 9   2B        102816 non-null  int64  
 10  3B        102816 non-null  int64  
 11  HR        102816 non-null  int64  
 12  RBI       102392 non-null  float64
 13  SB        101516 non-null  float64
 14  CS        79360 non-null   float64
 15  BB        102816 non-null  int64  
 16  SO        94978 non-null   float64
 17  IBB       66251 non-null   float64
 18  HBP       100006 non-null  float64
 19  SH        96478 non-null   float64
 20  SF  

In [3]:
# Arrow load: parse the same CSV with the pyarrow engine and keep every column
# as a pyarrow-backed dtype; this frame is the "Arrow" side of the comparisons.
#
# The backend is requested with the `dtype_backend` keyword, which is the API
# shipped in pandas 2.0.0 final. The release-candidate spelling (setting
# `pd.options.mode.dtype_backend` and passing `use_nullable_dtypes=True`) was
# removed before the final release and raises on current pandas. Passing the
# keyword also keeps this call independent of any global option state.
df_v2 = pd.read_csv('data/Batting.csv', engine='pyarrow', dtype_backend='pyarrow')

In [4]:
# Column-by-column summary (non-null counts and dtypes) of the frame read with
# the pyarrow engine, for comparison with the baseline frame's summary.
df_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102816 entries, 0 to 102815
Data columns (total 22 columns):
 #   Column    Non-Null Count   Dtype          
---  ------    --------------   -----          
 0   playerID  102816 non-null  string[pyarrow]
 1   yearID    102816 non-null  int64[pyarrow] 
 2   stint     102816 non-null  int64[pyarrow] 
 3   teamID    102816 non-null  string[pyarrow]
 4   lgID      102816 non-null  string[pyarrow]
 5   G         102816 non-null  int64[pyarrow] 
 6   AB        102816 non-null  int64[pyarrow] 
 7   R         102816 non-null  int64[pyarrow] 
 8   H         102816 non-null  int64[pyarrow] 
 9   2B        102816 non-null  int64[pyarrow] 
 10  3B        102816 non-null  int64[pyarrow] 
 11  HR        102816 non-null  int64[pyarrow] 
 12  RBI       102392 non-null  int64[pyarrow] 
 13  SB        101516 non-null  int64[pyarrow] 
 14  CS        79360 non-null   int64[pyarrow] 
 15  BB        102816 non-null  int64[pyarrow] 
 16  SO        94978 non-

In [11]:
print("NumPy:")
%timeit df = pd.read_csv('data/Batting.csv')
print("Arrow:")
%timeit df_v2 = pd.read_csv('data/Batting.csv', engine='pyarrow', use_nullable_dtypes=True)

NumPy:
92.5 ms ± 765 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Arrow:
26.4 ms ± 344 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# Benchmark: mean of the AB column on the baseline frame and on the frame read
# with the pyarrow engine. Each timed statement includes the column lookup.
print("NumPy:")
%timeit df['AB'].mean()
print("Arrow:")
%timeit df_v2['AB'].mean()

NumPy:
158 µs ± 1.28 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Arrow:
63.7 µs ± 329 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [13]:
df['AB'] = df['AB'].astype(float)
df_v2['AB'] = df_v2['AB'].astype('float64[pyarrow]')
print("NumPy:")
%timeit df['AB'].mean()
print("Arrow:")
%timeit df_v2['AB'].mean()

NumPy:
156 µs ± 1.42 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Arrow:
63.7 µs ± 343 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [9]:
# Benchmark: vectorised prefix test on the playerID string column, run on the
# baseline frame and on the frame read with the pyarrow engine.
print("NumPy:")
%timeit df['playerID'].str.startswith('jeter')
print("Arrow:")
%timeit df_v2['playerID'].str.startswith('jeter')

NumPy:
18 ms ± 132 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Arrow:
358 µs ± 5.67 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [17]:
# Speed-up factor per task: NumPy time divided by Arrow time, to 2 decimals.
# Each pair is (numpy_time, arrow_time), both expressed in the same unit.
timing_pairs = (
    (98.2, 29.3),   # CSV import, ms
    (167, 67.4),    # mean of a float64 column, µs
    (89.9, 70.6),   # mean of an int64 column, µs
    (18900, 387),   # str.startswith, µs (18.9 ms written as 18900 µs)
)
import_time, mean_float_time, mean_int_time, str_time = (
    round(numpy_t / arrow_t, 2) for numpy_t, arrow_t in timing_pairs
)
# Display all four ratios as the cell output.
import_time, mean_float_time, mean_int_time, str_time

(3.35, 2.48, 1.27, 48.84)

| Task | NumPy Time | Arrow Time | Speed-up (NumPy ÷ Arrow) |
| --- | :-: | :-: | :-: |
| Import CSV (6MB) | 98.2 ms | 29.3 ms | 3.4x |
| Mean (float64) | 167 µs | 67.4 µs | 2.5x |
| Mean (int64) | 89.9 µs | 70.6 µs | 1.3x |
| startswith (string) | 18.9 ms | 387 µs | 49x |