In [20]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import time

# Directory (relative to the notebook's working directory) that every
# benchmark file below is written to and read back from.
DATAPATH = Path("../../temp/data")
# Create it up front so the first write does not fail on a fresh checkout;
# exist_ok makes the cell safe to re-run.
DATAPATH.mkdir(parents=True, exist_ok=True)

# create data

In [21]:
# Benchmark frame: 1,050,000 rows x 7 columns of standard-normal draws stored
# as float32. A seeded generator makes the data identical across
# Restart & Run All, so timings and file sizes are comparable between runs.
SEED = 42
rng = np.random.default_rng(SEED)
df = pd.DataFrame(rng.normal(size=(1050000, 7)),
                  dtype=np.float32)
# Append a constant column. The label is df.shape[1] + 1, i.e. 8, so label 7
# is skipped.  NOTE(review): confirm the gap in the labels is intentional.
df.loc[:, (df.shape[1] + 1)] = 0.0001

df.head()

Unnamed: 0,0,1,2,3,4,5,6,8
0,0.651282,1.251676,1.17248,-0.623277,-0.488529,0.565954,0.521299,0.0001
1,-0.32127,0.214199,0.668231,1.363346,2.232346,0.258891,0.20358,0.0001
2,1.136323,-1.01777,-0.271917,-1.328532,0.060997,-1.713914,0.074197,0.0001
3,0.463319,-0.756274,-0.37666,1.810076,2.36834,-0.612749,0.726914,0.0001
4,-3.567508,-0.652595,0.272587,0.170276,1.15743,0.531651,-0.025606,0.0001


# write data

In [22]:
# to csv
%time df.to_csv(DATAPATH / "abc.csv")
f"file size (Mb): {os.path.getsize(DATAPATH / 'abc.csv') / 1e06}"

CPU times: user 16.1 s, sys: 387 ms, total: 16.5 s
Wall time: 16.6 s


'file size (Mb): 94.813664'

In [23]:
# to csv + zip
%time df.to_csv(DATAPATH / "abc.zip", compression="zip")
f"file size (Mb): {os.path.getsize(DATAPATH / 'abc.zip') / 1e06}"

CPU times: user 29.4 s, sys: 234 ms, total: 29.6 s
Wall time: 29.7 s


'file size (Mb): 38.257632'

In [24]:
# to pickle
%time df.to_pickle(DATAPATH / "abc.pkl")
f"file size (Mb): {os.path.getsize(DATAPATH / 'abc.pkl') / 1e06}"

CPU times: user 0 ns, sys: 71.7 ms, total: 71.7 ms
Wall time: 84.2 ms


'file size (Mb): 37.800796'

In [25]:
# to hdf
%time df.to_hdf(DATAPATH / "abc.h5", key="fx/spot")
f"file size (Mb): {os.path.getsize(DATAPATH / 'abc.h5') / 1e06}"

CPU times: user 25.1 ms, sys: 35 ms, total: 60.1 ms
Wall time: 57.5 ms


'file size (Mb): 46.2106'

In [26]:
# to feather
%time df.to_feather(DATAPATH / "abc.arrow")
f"file size (Mb): {os.path.getsize(DATAPATH / 'abc.arrow') / 1e06}"

CPU times: user 269 ms, sys: 99 ms, total: 368 ms
Wall time: 134 ms


'file size (Mb): 29.451914'

In [27]:
# to parquet
%time df.to_parquet(DATAPATH / "abc.parquet")
f"file size (Mb): {os.path.getsize(DATAPATH / 'abc.parquet') / 1e06}"

CPU times: user 748 ms, sys: 181 ms, total: 930 ms
Wall time: 741 ms


'file size (Mb): 33.789854'

In [28]:
# to parquet
%time df.to_parquet(DATAPATH / "abc.zparquet", compression='gzip')
f"file size (Mb): {os.path.getsize(DATAPATH / 'abc.zparquet') / 1e06}"

CPU times: user 4.03 s, sys: 214 ms, total: 4.25 s
Wall time: 4.05 s


'file size (Mb): 31.415451'

# read data

In [29]:
# Map each file extension written above to the pandas reader that loads it.
readers = {
    "csv": pd.read_csv,
    "zip": lambda _x: pd.read_csv(_x, compression="zip"),
    "pkl": pd.read_pickle,
    "h5": pd.read_hdf,
    "arrow": pd.read_feather,
    "parquet": pd.read_parquet,
    "zparquet": pd.read_parquet,
}

# Time one read per format. Iterate over the registered readers instead of
# os.listdir(DATAPATH): a directory listing raises KeyError on any file with
# an unregistered extension, and a second file sharing an extension (e.g.
# abc-rounded.csv, written further down) would silently overwrite the "csv"
# timing when the notebook is re-run.
read_times = {}
for extension, _reader in readers.items():
    _path = DATAPATH / f"abc.{extension}"
    # perf_counter is the clock meant for measuring short intervals;
    # time.time() follows the wall clock, which can be adjusted mid-run.
    _st = time.perf_counter()
    _ = _reader(_path)
    _et = time.perf_counter()
    read_times[extension] = _et - _st

# Fastest format first, shown with four decimals.
pd.Series(read_times)\
    .sort_values()\
    .to_frame("dt (seconds)").style.format(precision=4)

Unnamed: 0,dt (seconds)
pkl,0.0454
arrow,0.1048
parquet,0.1408
zparquet,0.1888
h5,0.2094
csv,2.1112
zip,3.5975


# size reduction

In [30]:
# round
%time df.round(6).to_csv(DATAPATH / "abc-rounded.csv")
f"file size (Mb): {os.path.getsize(DATAPATH / 'abc-rounded.csv') / 1e06}"

CPU times: user 20.3 s, sys: 606 ms, total: 20.9 s
Wall time: 21.3 s


'file size (Mb): 83.591071'

In [31]:
# multiply by 10
%time df.mul(10).round(5).to_csv(DATAPATH / "abc-rounded.csv")
f"file size (Mb): {os.path.getsize(DATAPATH / 'abc-rounded.csv') / 1e06}"

CPU times: user 16.1 s, sys: 384 ms, total: 16.5 s
Wall time: 16.5 s


'file size (Mb): 77.524794'

In [32]:
# Print the first rows of both rounding variants for a side-by-side check:
# rounding to six decimals directly vs. scaling by 10, rounding to five
# decimals and scaling back by 10.
direct = df.round(6)
rescaled = df.mul(10).round(5).div(10)
print(direct.head())
print(rescaled.head())

          0         1         2         3         4         5         6  \
0  0.651282  1.251676  1.172480 -0.623277 -0.488529  0.565954  0.521299   
1 -0.321270  0.214199  0.668231  1.363346  2.232346  0.258891  0.203580   
2  1.136323 -1.017770 -0.271917 -1.328532  0.060997 -1.713914  0.074197   
3  0.463319 -0.756274 -0.376660  1.810076  2.368340 -0.612749  0.726914   
4 -3.567508 -0.652595  0.272587  0.170276  1.157430  0.531651 -0.025606   

        8  
0  0.0001  
1  0.0001  
2  0.0001  
3  0.0001  
4  0.0001  
          0         1         2         3         4         5         6  \
0  0.651282  1.251676  1.172480 -0.623277 -0.488529  0.565954  0.521299   
1 -0.321270  0.214199  0.668231  1.363346  2.232346  0.258891  0.203580   
2  1.136323 -1.017770 -0.271917 -1.328532  0.060997 -1.713914  0.074197   
3  0.463319 -0.756274 -0.376660  1.810076  2.368340 -0.612749  0.726914   
4 -3.567508 -0.652595  0.272587  0.170276  1.157430  0.531651 -0.025606   

        8  
0  0.0001  
1 