# HDF5 store

https://www.hdfgroup.org/solutions/hdf5/

In [1]:
# Point the notebook at the local options data directory. The value is stored
# verbatim (including the leading "~"); it is expanded with os.path.expanduser
# where the path is read back from os.environ.
# NOTE(review): `%env VAR=value` unconditionally overrides any value already
# set in the kernel's environment — confirm that is intended.
%env OPTIONS_DATA_PATH=~/work/algo-trading/data

import os
import pandas as pd

env: OPTIONS_DATA_PATH=~/work/algo-trading/data


In [2]:
data_dir = os.path.expanduser(os.environ["OPTIONS_DATA_PATH"])
spx_dir = os.path.join(data_dir, "allspx")

data_path = os.path.join(spx_dir, "SPX_1990.csv")
os.path.getsize(data_path) / 1024**2

%timeit pd.read_csv(data_path, parse_dates=["quotedate", "expiration"])

9.05 s ± 1.03 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
# Load the raw CSV and normalise it in one non-mutating chain (no `inplace=True`),
# so the cell always rebuilds `df` from the file and is safe to re-run.
df = (
    pd.read_csv(data_path, parse_dates=["quotedate", "expiration"])
    # The CSV header has a stray leading space in " exchange".
    .rename(columns={" exchange": "exchange"})
    # Store the `type` column as a categorical instead of generic objects.
    .astype({"type": "category"})
)
df.dtypes

underlying                 object
underlying_last           float64
exchange                   object
optionroot                 object
optionext                 float64
type                     category
expiration         datetime64[ns]
quotedate          datetime64[ns]
strike                      int64
last                      float64
bid                       float64
ask                       float64
volume                      int64
openinterest                int64
impliedvol                float64
delta                     float64
gamma                     float64
theta                     float64
vega                      float64
optionalias                object
dtype: object

In [4]:
# Summarise columns, non-null counts and dtypes; memory_usage="deep" asks pandas
# to measure the actual memory held by object columns rather than estimating it.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46352 entries, 0 to 46351
Data columns (total 20 columns):
underlying         46352 non-null object
underlying_last    46352 non-null float64
exchange           46352 non-null object
optionroot         46352 non-null object
optionext          0 non-null float64
type               46352 non-null category
expiration         46352 non-null datetime64[ns]
quotedate          46352 non-null datetime64[ns]
strike             46352 non-null int64
last               46352 non-null float64
bid                46352 non-null float64
ask                46352 non-null float64
volume             46352 non-null int64
openinterest       46352 non-null int64
impliedvol         46352 non-null float64
delta              46352 non-null float64
gamma              46352 non-null float64
theta              46352 non-null float64
vega               46352 non-null float64
optionalias        46352 non-null object
dtypes: category(1), datetime64[ns](2), float64(10), int64(3), object(4)

In [5]:
# Destination HDF5 file, placed directly inside `data_dir`.
store = os.path.join(data_dir, "spx_1990.h5")

# Write the frame as a "table"-format node under /spx_1990. The file is opened
# in append mode, but append=False replaces the node's contents rather than
# adding rows. Compression is blosc at level 9 with Fletcher32 checksums.
df.to_hdf(
    path_or_buf=store,
    key="/spx_1990",
    mode="a",
    format="table",
    append=False,
    complib="blosc",
    complevel=9,
    fletcher32=True,
)

In [6]:
# Time loading the same data back from the compressed HDF5 store, for
# comparison with the CSV parse timed earlier.
%timeit pd.read_hdf(store, "/spx_1990")

300 ms ± 19.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Load the stored node back into a separate frame so it can be compared with `df`.
h5_df = pd.read_hdf(store, key="/spx_1990")

In [8]:
# Check the column dtypes after the HDF5 round trip (compare with the dtypes
# displayed right after loading the CSV).
h5_df.dtypes

underlying                 object
underlying_last           float64
exchange                   object
optionroot                 object
optionext                 float64
type                     category
expiration         datetime64[ns]
quotedate          datetime64[ns]
strike                      int64
last                      float64
bid                       float64
ask                       float64
volume                      int64
openinterest                int64
impliedvol                float64
delta                     float64
gamma                     float64
theta                     float64
vega                      float64
optionalias                object
dtype: object

In [9]:
# Size of the HDF5 file on disk, in MiB.
os.stat(store).st_size / (1024 * 1024)

2.555933952331543