In [1]:
import pandas as pd
import numpy as np

## Create dataset

In [21]:
def get_dataset(size):
    # Create Fake Dataset
    df = pd.DataFrame()
    df['size'] = np.random.choice(['big','medium','small'], size)
    df['age'] = np.random.randint(1, 50, size)
    df['team'] = np.random.choice(['red','blue','yellow','green'], size)
    df['win'] = np.random.choice(['yes','no'], size)
    dates = pd.date_range('2021-01-01', '2023-01-31')
    df['date'] = np.random.choice(dates, size)
    df['prob'] = np.random.uniform(0, 1, size)
    return df

def set_dtypes(df):
    df['size'] = df['size'].astype('category')
    df['team'] = df['team'].astype('category')
    df['age'] = df['age'].astype('int16')
    df['win'] = df['win'].map({'yes':True, 'no': False})
    df['prob'] = df['prob'].astype('float32')
    return df

data = get_dataset(1_000_000)
data.head()

Unnamed: 0,size,age,team,win,date,prob
0,big,49,blue,yes,2021-12-12,0.074077
1,big,14,green,no,2021-11-17,0.231078
2,medium,48,green,yes,2021-10-16,0.033506
3,big,41,yellow,no,2022-11-12,0.396516
4,small,42,red,yes,2022-06-20,0.390289


## CSV

In [25]:
%time data.to_csv("test_csv.csv")

Wall time: 4.32 s


In [24]:
!ls -GFlash test_csv.csv

54M -rw-r--r-- 1 WW930+a844026 54M Feb  1 10:22 test_csv.csv


In [26]:
%time df = pd.read_csv("test_csv.csv", index_col=[0])

Wall time: 863 ms


- 54 MB
- 4.32 sec to save
- 0.8 sec to read

In [27]:
%time data.to_csv("test_csv.csv", index=False)

Wall time: 3.93 s


In [28]:
!ls -GFlash test_csv.csv

47M -rw-r--r-- 1 WW930+a844026 47M Feb  1 10:23 test_csv.csv


In [29]:
%time df = pd.read_csv("test_csv.csv")

Wall time: 743 ms


- 47 MB
- 3.9 sec to save
- 0.7 sec to read

## CSV vs Pickle vs Parquet vs Feather

In [35]:
df = get_dataset(5_000_000)
df = set_dtypes(data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 6 columns):
 #   Column  Dtype         
---  ------  -----         
 0   size    category      
 1   age     int16         
 2   team    category      
 3   win     object        
 4   date    datetime64[ns]
 5   prob    float32       
dtypes: category(2), datetime64[ns](1), float32(1), int16(1), object(1)
memory usage: 114.4+ MB


In [37]:
print('Reading and writing CSV')
%time df.to_csv('test.csv', index=False)
%time df_csv = pd.read_csv('test.csv')
df_csv.head()


Reading and writing CSV
Wall time: 32 s
Wall time: 4.48 s


Unnamed: 0,size,age,team,win,date,prob
0,big,19,green,,2022-08-22,0.617788
1,big,13,red,,2021-09-26,0.476119
2,big,9,red,,2022-08-18,0.843716
3,big,21,blue,,2022-09-02,0.338139
4,medium,39,yellow,,2022-03-18,0.721077


In [39]:
print('Reading and writing Pickle')
%time df.to_pickle('test.pickle')
%time df_pickle = pd.read_pickle('test.pickle')


Reading and writing Pickle
Wall time: 285 ms
Wall time: 708 ms


In [40]:
print('Reading and writing Parquet')
%time df.to_parquet('test.parquet')
%time df_parquet = pd.read_parquet('test.parquet')


Reading and writing Parquet
Wall time: 1.16 s
Wall time: 406 ms


In [42]:
print('Reading and writing Feather')
%time df.to_feather('test.feather')
%time df_feather = pd.read_feather('test.feather')
df_feather.head()

Reading and writing Feather
Wall time: 723 ms
Wall time: 198 ms


Unnamed: 0,size,age,team,win,date,prob
0,big,19,green,,2022-08-22,0.617788
1,big,13,red,,2021-09-26,0.476119
2,big,9,red,,2022-08-18,0.843716
3,big,21,blue,,2022-09-02,0.338139
4,medium,39,yellow,,2022-03-18,0.721077


In [45]:
!ls -GFlash test.csv test.pickle test.parquet test.feather

180M -rw-r--r-- 1 WW930+a844026 180M Feb  1 10:49 test.csv
 48M -rw-r--r-- 1 WW930+a844026  48M Feb  1 10:50 test.feather
 32M -rw-r--r-- 1 WW930+a844026  32M Feb  1 10:49 test.parquet
120M -rw-r--r-- 1 WW930+a844026 120M Feb  1 10:49 test.pickle


In [46]:
!rm -rf test.csv test.pickle test.parquet test.feather