In [None]:
!rm -r ./data/
!mkdir data

In [1]:
import time

import numpy as np
import pandas as pd

Create an example dataframe
===

In [2]:
# Build the "wide" benchmark frame: one row per consecutive calendar day
# starting 2000-01-01, one column per security, uniform random values in [0, 1).
days = 365 * 10        # number of daily rows
n_securities = 3000    # number of columns

# A locally seeded generator makes the data identical on every run without
# touching NumPy's global random state.
rng = np.random.RandomState(42)

df_wide = pd.DataFrame(
    data=rng.rand(days, n_securities),
    index=pd.date_range('2000', periods=days),
    columns=['security_{}'.format(i) for i in range(1, n_securities + 1)],
)
df_wide.head()

Unnamed: 0,security_1,security_2,security_3,security_4,security_5,security_6,security_7,security_8,security_9,security_10,...,security_2991,security_2992,security_2993,security_2994,security_2995,security_2996,security_2997,security_2998,security_2999,security_3000
2000-01-01,0.085617,0.326365,0.60184,0.156531,0.839945,0.844082,0.767079,0.163527,0.289341,0.806636,...,0.172043,0.391151,0.474607,0.842958,0.655185,0.620974,0.866098,0.078042,0.567426,0.011965
2000-01-02,0.940922,0.27528,0.419693,0.269434,0.063795,0.906147,0.164838,0.998205,0.772404,0.710726,...,0.168168,0.842518,0.811813,0.542401,0.058447,0.33153,0.507732,0.751599,0.277018,0.135892
2000-01-03,0.854265,0.267392,0.197766,0.540227,0.778471,0.306053,0.58095,0.979425,0.497579,0.050785,...,0.585854,0.462822,0.152851,0.40209,0.769772,0.745623,0.231692,0.363018,0.910562,0.056302
2000-01-04,0.02166,0.392681,0.996974,0.04077,0.837398,0.657741,0.092285,0.947186,0.558361,0.604619,...,0.696493,0.933745,0.936531,0.046087,0.59788,0.941766,0.098167,0.97621,0.350679,0.50137
2000-01-05,0.595625,0.252453,0.314736,0.933267,0.820543,0.854095,0.434151,0.747225,0.797754,0.822736,...,0.498067,0.512264,0.97789,0.784289,0.2223,0.235847,0.644344,0.664169,0.812744,0.03836


Using HDF5 (fixed format) to read/write data
===

In [15]:
%time df_wide.to_hdf('data/fixed_wide.hdf', key='wide1')

CPU times: user 12 ms, sys: 28.9 ms, total: 40.9 ms
Wall time: 54.2 ms


In [16]:
# Time reading the frame stored under key 'wide1' back into `rb_wide`.
%time rb_wide = pd.read_hdf('data/fixed_wide.hdf', key='wide1')

CPU times: user 12.7 ms, sys: 37.7 ms, total: 50.4 ms
Wall time: 48.8 ms


In [20]:
# On-disk size of the uncompressed wide file, in human-readable units.
!du -h data/fixed_wide.hdf

 84M	data/fixed_wide.hdf


In [44]:
# wide, maximum compression (complevel=9, complib='blosc')

In [45]:
%time df_wide.to_hdf('data/fixed_wide_cmp.hdf', key='wide', complevel=9, complib='blosc')

CPU times: user 353 ms, sys: 34 ms, total: 387 ms
Wall time: 411 ms


In [46]:
# Time reading the compressed wide file back; note this rebinds `rb_wide`.
%time rb_wide = pd.read_hdf('data/fixed_wide_cmp.hdf', key='wide')

CPU times: user 58.7 ms, sys: 65.1 ms, total: 124 ms
Wall time: 126 ms


In [47]:
# On-disk size of the compressed wide file, in human-readable units.
!du -h data/fixed_wide_cmp.hdf

 74M	data/fixed_wide_cmp.hdf


In [17]:
# reshape wide to tall
%time df_tall = df_wide.stack().reset_index().rename(columns={'level_0': 'date', 'level_1': 'security_id', 0: 'vals'})

CPU times: user 738 ms, sys: 489 ms, total: 1.23 s
Wall time: 1.24 s


In [43]:
# tall, no compression

In [18]:
%time df_tall.to_hdf('data/fixed_tall.hdf', key='tall')

CPU times: user 457 ms, sys: 232 ms, total: 689 ms
Wall time: 769 ms


In [19]:
# Time reading the frame stored under key 'tall' back into `rb_tall`.
%time rb_tall = pd.read_hdf('data/fixed_tall.hdf', key='tall')

CPU times: user 355 ms, sys: 328 ms, total: 683 ms
Wall time: 689 ms


In [21]:
# On-disk size of the uncompressed tall file, in human-readable units.
!du -h data/fixed_tall.hdf

301M	data/fixed_tall.hdf


In [42]:
# tall, maximum compression (complevel=9, complib='blosc')

In [48]:
%time df_tall.to_hdf('data/fixed_tall_cmp.hdf', key='tall', complevel=9, complib='blosc')

CPU times: user 864 ms, sys: 257 ms, total: 1.12 s
Wall time: 1.48 s


In [49]:
# Time reading the compressed tall file back; note this rebinds `rb_tall`.
%time rb_tall = pd.read_hdf('data/fixed_tall_cmp.hdf', key='tall')

CPU times: user 436 ms, sys: 440 ms, total: 876 ms
Wall time: 1.04 s


In [50]:
# On-disk size of the compressed tall file, in human-readable units.
!du -h data/fixed_tall_cmp.hdf

259M	data/fixed_tall_cmp.hdf


Using HDF5 (table format) to read/write data
===

In [25]:
%time df_wide.to_hdf('data/tables_wide.hdf', key='wide', format='table')

CPU times: user 125 ms, sys: 163 ms, total: 288 ms
Wall time: 469 ms


In [26]:
# Time reading the table-format wide file back; note this rebinds `rb_wide`.
%time rb_wide = pd.read_hdf('data/tables_wide.hdf', key='wide')

CPU times: user 29.9 ms, sys: 35.1 ms, total: 65 ms
Wall time: 69.1 ms


In [27]:
# On-disk size of the table-format wide file, in human-readable units.
!du -h data/tables_wide.hdf

 84M	data/tables_wide.hdf


In [28]:
%time df_tall.to_hdf('data/tables_tall.hdf', key='tall', format='table')

CPU times: user 14 s, sys: 990 ms, total: 15 s
Wall time: 15.2 s


In [29]:
# Time reading the table-format tall file back; note this rebinds `rb_tall`.
%time rb_tall = pd.read_hdf('data/tables_tall.hdf', key='tall')

CPU times: user 8.17 s, sys: 1.01 s, total: 9.18 s
Wall time: 9.29 s


In [30]:
# On-disk size of the table-format tall file, in human-readable units.
!du -h data/tables_tall.hdf

388M	data/tables_tall.hdf
