In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import time

# Beyond 1

If we read the CSV file using the "pyarrow" engine, do we see any speedup? That is, can we read CSV files into memory any faster if we use a different engine?

In [2]:
# Time a full read of the CSV file using the pyarrow parsing engine
filename = '../data/nyc-parking-violations-2020.csv'
started = time.perf_counter()
df = pd.read_csv(filename, engine='pyarrow')
total_time = time.perf_counter() - started
# The `=` specifier prints the variable name alongside its value
print(f'Reading via pyarrow engine, {total_time=}')

Reading via pyarrow engine, total_time=9.923564148019068


# Beyond 2

If we specify the dtypes when reading from a CSV file, do we save any time?

In [3]:
# Re-read the CSV with the default engine, passing the dtypes of the
# existing `df` as explicit column dtypes, and time the whole operation
start_time = time.perf_counter()
column_dtypes = dict(df.dtypes)
df = pd.read_csv(filename, dtype=column_dtypes, low_memory=False)
end_time = time.perf_counter()

# Elapsed seconds; the bare expression is shown as the cell's output
total_time = end_time - start_time
total_time

63.521172957960516

# Beyond 3

How much memory does our data take up as a `pandas` data frame? How much memory does it require as an Arrow table?

In [4]:
# Bytes used by the pandas data frame: per-column memory_usage(deep=True), summed
pandas_bytes = df.memory_usage(deep=True).sum()
print(format(pandas_bytes, ','))

16,789,335,057


In [5]:
# Load the Feather file from the working directory as an Arrow table
from pyarrow import feather
read_arrow = feather.read_table('parking-violations.feather')

In [6]:
# Size of the Arrow table in bytes, printed with thousands separators
n = read_arrow.nbytes
print(format(n, ','))

4,309,680,899
