# HDF5 store

https://www.hdfgroup.org/solutions/hdf5/

In [1]:
# Root directory of the options data. The "~" is stored literally here and is
# expanded later with os.path.expanduser when the variable is read back.
%env OPTIONS_DATA_PATH=~/work/algo-trading/data

import os
import pandas as pd

env: OPTIONS_DATA_PATH=~/work/algo-trading/data


In [2]:
# Resolve the data locations from the OPTIONS_DATA_PATH environment variable
# (expanding a leading "~"); the yearly SPX CSV files live under "allspx".
data_dir = os.path.expanduser(os.environ["OPTIONS_DATA_PATH"])
spx_dir = os.path.join(data_dir, "allspx")

# Size of the 1990 CSV in MiB. NOTE: this is not the cell's last expression,
# so the value is computed but not displayed.
data_path = os.path.join(spx_dir, "SPX_1990.csv")
os.path.getsize(data_path) / 1024**2

# Baseline: time a full CSV parse, including conversion of both date columns.
%timeit pd.read_csv(data_path, parse_dates=["quotedate", "expiration"])

9.05 s ± 1.03 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
# Load the 1990 quotes with both date columns parsed, strip the stray leading
# space from the " exchange" header, and hold `type` as a categorical column.
df = (
    pd.read_csv(data_path, parse_dates=["quotedate", "expiration"])
    .rename(columns={" exchange": "exchange"})
    .astype({"type": "category"})
)
df.dtypes

underlying                 object
underlying_last           float64
exchange                   object
optionroot                 object
optionext                 float64
type                     category
expiration         datetime64[ns]
quotedate          datetime64[ns]
strike                      int64
last                      float64
bid                       float64
ask                       float64
volume                      int64
openinterest                int64
impliedvol                float64
delta                     float64
gamma                     float64
theta                     float64
vega                      float64
optionalias                object
dtype: object

In [4]:
# memory_usage="deep" asks pandas to inspect object columns for their real
# memory footprint instead of estimating it from dtype and row count.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46352 entries, 0 to 46351
Data columns (total 20 columns):
underlying         46352 non-null object
underlying_last    46352 non-null float64
exchange           46352 non-null object
optionroot         46352 non-null object
optionext          0 non-null float64
type               46352 non-null category
expiration         46352 non-null datetime64[ns]
quotedate          46352 non-null datetime64[ns]
strike             46352 non-null int64
last               46352 non-null float64
bid                46352 non-null float64
ask                46352 non-null float64
volume             46352 non-null int64
openinterest       46352 non-null int64
impliedvol         46352 non-null float64
delta              46352 non-null float64
gamma              46352 non-null float64
theta              46352 non-null float64
vega               46352 non-null float64
optionalias        46352 non-null object
dtypes: category(1), datetime64[ns](2), float64(10), int64(3), object(4)

In [5]:
# Path of the single-year HDF5 file (reused by the read/size cells below).
store = os.path.join(data_dir, "spx_1990.h5")

# Write settings: table format, overwrite the node rather than appending to it,
# maximum blosc compression, and Fletcher32 checksums on the stored data.
hdf_write_options = {
    "format": "table",
    "key": "/spx_1990",
    "mode": "a",
    "append": False,
    "complevel": 9,
    "complib": "blosc",
    "fletcher32": True,
}
df.to_hdf(store, **hdf_write_options)

In [6]:
# Time reading the same 1990 data back from HDF5, for comparison with the CSV parse above.
%timeit pd.read_hdf(store, "/spx_1990")

300 ms ± 19.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Load the round-tripped frame so its schema can be compared with the original `df`.
h5_df = pd.read_hdf(store, "/spx_1990")

In [8]:
# Show the dtypes of the frame read back from HDF5 (compare with `df.dtypes` above).
h5_df.dtypes

underlying                 object
underlying_last           float64
exchange                   object
optionroot                 object
optionext                 float64
type                     category
expiration         datetime64[ns]
quotedate          datetime64[ns]
strike                      int64
last                      float64
bid                       float64
ask                       float64
volume                      int64
openinterest                int64
impliedvol                float64
delta                     float64
gamma                     float64
theta                     float64
vega                      float64
optionalias                object
dtype: object

In [9]:
# Size of the compressed single-year HDF5 file in MiB.
os.path.getsize(store) / 1024**2

2.555933952331543

In [20]:
# Build one HDF5 store containing every yearly SPX options CSV, one table per
# year under the key "spx_<year>".
store_path = os.path.join(data_dir, "options_data.h5")

# The context manager closes the file even if a year fails to load or append.
# `store` remains bound (to the now-closed HDFStore) after the block.
with pd.HDFStore(store_path) as store:
    for year in range(1990, 2019):  # 1990 through 2018 inclusive
        filename = "SPX_{}.csv".format(year)
        year_df = pd.read_csv(
            os.path.join(spx_dir, filename),
            parse_dates=["expiration", "quotedate"],
        ).rename(columns={" exchange": "exchange"})
        # Convert this year's own `type` column. Assigning the 1990 frame's
        # column instead would align on the row index, stamping 1990's
        # call/put labels onto every year and leaving NaN past its last row.
        year_df["type"] = year_df["type"].astype("category")
        key = "spx_{}".format(year)
        # Drop any table left by a previous run so that re-executing the cell
        # replaces the year's data instead of appending duplicate rows.
        if key in store:
            store.remove(key)
        store.append(key, year_df)

# Size of the uncompressed multi-year store in MiB.
os.path.getsize(store_path) / 1024**2

3328.750859260559

In [25]:
# Switch the kernel's working directory to data_dir (quietly) so the relative
# file names below resolve there; this affects every cell run afterwards.
# Then use the `ptrepack` command-line tool to copy options_data.h5 into
# options_data_compressed.h5 with blosc compression at level 9.
%cd -q $data_dir
!ptrepack --complevel=9 --complib=blosc options_data.h5 options_data_compressed.h5

In [24]:
# Size of the repacked (compressed) store in MiB, for comparison with the
# uncompressed options_data.h5 size reported above.
compressed_path = os.path.join(data_dir, "options_data_compressed.h5")
os.path.getsize(compressed_path) / 1024**2

788.7592725753784

In [27]:
# Read a single year's table back from the compressed store and preview it.
spx_2017 = pd.read_hdf(compressed_path, key="spx_2017")
spx_2017.head()

Unnamed: 0,underlying,underlying_last,exchange,optionroot,optionext,type,expiration,quotedate,strike,last,bid,ask,volume,openinterest,impliedvol,delta,gamma,theta,vega,optionalias
0,SPX,2257.83,*,SPX170120C00100000,,call,2017-01-20,2017-01-03,100,2161.5,2154.3,2158.7,0,5615,0.1214,1.0,0.0,-1.0239,0.0,SPX170120C00100000
1,SPX,2257.83,*,SPX170120C00200000,,call,2017-01-20,2017-01-03,200,2060.95,2054.4,2058.8,0,6355,0.1214,1.0,0.0,-2.0479,0.0,SPX170120C00200000
2,SPX,2257.83,*,SPX170120C00300000,,call,2017-01-20,2017-01-03,300,1960.45,1954.4,1958.8,0,2000,0.1214,1.0,0.0,-3.0718,0.0,SPX170120C00300000
3,SPX,2257.83,*,SPX170120C00400000,,call,2017-01-20,2017-01-03,400,1760.55,1854.6,1858.9,0,25,0.1214,1.0,0.0,-4.0957,0.0,SPX170120C00400000
4,SPX,2257.83,*,SPX170120C00500000,,call,2017-01-20,2017-01-03,500,1749.33,1754.6,1758.9,0,3813,0.1214,1.0,0.0,-5.1197,0.0,SPX170120C00500000


In [28]:
# Show the column dtypes of the 2017 table as read back from the compressed store.
spx_2017.dtypes

underlying                 object
underlying_last           float64
exchange                   object
optionroot                 object
optionext                 float64
type                     category
expiration         datetime64[ns]
quotedate          datetime64[ns]
strike                      int64
last                      float64
bid                       float64
ask                       float64
volume                      int64
openinterest                int64
impliedvol                float64
delta                     float64
gamma                     float64
theta                     float64
vega                      float64
optionalias                object
dtype: object

In [29]:
# Row count and in-memory footprint of the 2017 table (deep inspection of object columns).
spx_2017.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2557526 entries, 0 to 2557525
Data columns (total 20 columns):
underlying         object
underlying_last    float64
exchange           object
optionroot         object
optionext          float64
type               category
expiration         datetime64[ns]
quotedate          datetime64[ns]
strike             int64
last               float64
bid                float64
ask                float64
volume             int64
openinterest       int64
impliedvol         float64
delta              float64
gamma              float64
theta              float64
vega               float64
optionalias        object
dtypes: category(1), datetime64[ns](2), float64(10), int64(3), object(4)
memory usage: 988.4 MB
