In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import time

In [2]:
# Load the complete NYC parking-violations CSV into one in-memory DataFrame.
# low_memory=False asks pandas not to parse the file in internal chunks, which
# avoids mixed-type inference within a column at the cost of more memory.
filename = '../data/nyc-parking-violations-2020.csv'
df = pd.read_csv(
    filename,
    low_memory=False,
)

In [3]:
# Common stem shared by every output file.
root = 'parking-violations'

# Format label -> bound DataFrame writer method.
# The methods are bound to the object `df` refers to when this cell runs.
write_methods = {
    'JSON': df.to_json,
    'CSV': df.to_csv,
    'feather': df.to_feather,
}

In [5]:
# Time how long each writer takes to save the DataFrame.
# Every file is named '<root>.<format label in lower case>', so the names stay
# in sync with `root`, the stem used elsewhere in the notebook.
for one_format, method in write_methods.items():
    print(f'Saving in {one_format}')
    output_filename = f'{root}.{one_format.lower()}'

    # Only the write itself is inside the timed region.
    start_time = time.perf_counter()
    method(output_filename)
    end_time = time.perf_counter()

    total_time = end_time - start_time
    print(f'\tWriting {one_format}: {total_time=}')

Saving in JSON
	Writing JSON: total_time=47.94986385299126
Saving in CSV
	Writing CSV: total_time=84.28116728103487
Saving in feather
	Writing feather: total_time=10.2521946990164


In [6]:
# Report the size in bytes of every file whose name starts with `root`.
import glob
import os

for one_filename in glob.glob(f'{root}*'):
    size_in_bytes = os.path.getsize(one_filename)
    print(f'{one_filename:27}: {size_in_bytes:,}')

parking-violations.json    : 8,820,247,015
parking-violations.csv     : 2,440,860,181
parking-violations.feather : 1,466,536,058


In [7]:
# Format label -> pandas reader function for that format.
read_methods = {
    'JSON': pd.read_json,
    'CSV': pd.read_csv,
    'feather': pd.read_feather,
}

In [8]:
# Time how long each reader takes to load its file back into a DataFrame.
# Every file is named '<root>.<format label in lower case>'.
# NOTE: `df` is rebound on every iteration, so after the loop it refers to the
# frame loaded by the last entry in `read_methods`, not to the frame that was
# originally read from the source CSV.
for one_format, method in read_methods.items():
    print(f'Reading from {one_format}')
    input_filename = f'{root}.{one_format.lower()}'

    # Only the read itself is inside the timed region.
    start_time = time.perf_counter()
    df = method(input_filename)
    end_time = time.perf_counter()

    total_time = end_time - start_time
    print(f'\tReading {one_format}: {total_time=}')

Reading from JSON
	Reading JSON: total_time=512.0497572919703
Reading from CSV


  df = read_methods[one_format](f'parking-violations.{one_format.lower()}')


	Reading CSV: total_time=44.657161787035875
Reading from feather
	Reading feather: total_time=13.85696751094656
