# HDF5 store

https://www.hdfgroup.org/solutions/hdf5/

In [1]:
# Location of the options data used by every cell below. %env sets the
# variable for this kernel and replaces any OPTIONS_DATA_PATH that was
# already defined; the "~" is stored verbatim and expanded later.
%env OPTIONS_DATA_PATH=~/work/algo-trading/data

import os
import pandas as pd

env: OPTIONS_DATA_PATH=~/work/algo-trading/data


In [2]:
data_dir = os.path.expanduser(os.environ["OPTIONS_DATA_PATH"])
spx_dir = os.path.join(data_dir, "allspx")

data_path = os.path.join(spx_dir, "SPX_1990.csv")
os.path.getsize(data_path) / 1024**2

%timeit pd.read_csv(data_path, parse_dates=["quotedate", "expiration"])

12.3 s ± 402 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
# Load the 1990 quotes, strip the leading space from the " exchange" header
# and store the option type as a categorical, all in one chained expression.
df = (
    pd.read_csv(data_path, parse_dates=["quotedate", "expiration"])
    .rename(columns={" exchange": "exchange"})
    .astype({"type": "category"})
)
df.dtypes

underlying                 object
underlying_last           float64
exchange                   object
optionroot                 object
optionext                 float64
type                     category
expiration         datetime64[ns]
quotedate          datetime64[ns]
strike                      int64
last                      float64
bid                       float64
ask                       float64
volume                      int64
openinterest                int64
impliedvol                float64
delta                     float64
gamma                     float64
theta                     float64
vega                      float64
optionalias                object
dtype: object

In [4]:
# memory_usage="deep" asks pandas to measure the object (string) columns
# instead of reporting a lower bound for them.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46352 entries, 0 to 46351
Data columns (total 20 columns):
underlying         46352 non-null object
underlying_last    46352 non-null float64
exchange           46352 non-null object
optionroot         46352 non-null object
optionext          0 non-null float64
type               46352 non-null category
expiration         46352 non-null datetime64[ns]
quotedate          46352 non-null datetime64[ns]
strike             46352 non-null int64
last               46352 non-null float64
bid                46352 non-null float64
ask                46352 non-null float64
volume             46352 non-null int64
openinterest       46352 non-null int64
impliedvol         46352 non-null float64
delta              46352 non-null float64
gamma              46352 non-null float64
theta              46352 non-null float64
vega               46352 non-null float64
optionalias        46352 non-null object
dtypes: category(1), datetime64[ns](2), float64(10

In [5]:
# NOTE: `store` is a file path (str) in this part of the notebook.
store = os.path.join(data_dir, "spx_1990.h5")

# Write the 1990 frame as a queryable table, replacing the node if it
# already exists, with blosc level-9 compression and Fletcher32 checksums.
df.to_hdf(
    store,
    key="/spx_1990",
    mode="a",
    format="table",
    append=False,
    complib="blosc",
    complevel=9,
    fletcher32=True,
)

In [6]:
# Time reading the same table back from HDF5, for comparison with the
# read_csv timing measured earlier.
%timeit pd.read_hdf(store, "/spx_1990")

300 ms ± 19.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Read the table back; `store` is still the path of spx_1990.h5 here.
h5_df = pd.read_hdf(store, "/spx_1990")

In [8]:
# Column dtypes after the HDF5 round trip, to compare against df.dtypes.
h5_df.dtypes

underlying                 object
underlying_last           float64
exchange                   object
optionroot                 object
optionext                 float64
type                     category
expiration         datetime64[ns]
quotedate          datetime64[ns]
strike                      int64
last                      float64
bid                       float64
ask                       float64
volume                      int64
openinterest                int64
impliedvol                float64
delta                     float64
gamma                     float64
theta                     float64
vega                      float64
optionalias                object
dtype: object

In [9]:
# Size of the compressed single-year store on disk, in MiB.
os.path.getsize(store) / 1024**2

2.555933952331543

In [20]:
store_path = os.path.join(data_dir, "options_data.h5")

# Write one table per year under the key "spx_<year>". The context manager
# closes the HDF5 file even if one of the CSVs fails to load.
with pd.HDFStore(store_path) as store:
    for year in range(1990, 2019):
        filename = "SPX_{}.csv".format(year)
        year_df = pd.read_csv(
            os.path.join(spx_dir, filename),
            parse_dates=["expiration", "quotedate"],
        )
        year_df = year_df.rename(columns={" exchange": "exchange"})
        # Convert this year's own "type" column. Taking the column from the
        # 1990 frame `df` would align on the row index and overwrite every
        # year's labels with 1990's (and NaN past the length of `df`).
        year_df["type"] = year_df["type"].astype("category")
        key = "spx_{}".format(year)
        # put(..., format="table") replaces an existing node, so re-running
        # this cell does not append a second copy of the rows.
        store.put(key, year_df, format="table")

# Uncompressed size of the multi-year store on disk, in MiB.
os.path.getsize(store_path) / 1024**2

3328.750859260559

In [25]:
# Copy the store into a new file with blosc level-9 compression using the
# ptrepack command-line tool. %cd (-q: quiet) moves the kernel's working
# directory to data_dir so the relative file names resolve there; the
# working directory stays changed for every later cell.
%cd -q $data_dir
!ptrepack --complevel=9 --complib=blosc options_data.h5 options_data_compressed.h5

In [24]:
# Path of the repacked store and its on-disk size in MiB.
compressed_path = os.path.join(data_dir, "options_data_compressed.h5")
os.stat(compressed_path).st_size / 1024**2

788.7592725753784

In [27]:
# Pull a single year's table out of the compressed store and preview it.
spx_2017 = pd.read_hdf(compressed_path, "spx_2017")
spx_2017.head(5)

Unnamed: 0,underlying,underlying_last,exchange,optionroot,optionext,type,expiration,quotedate,strike,last,bid,ask,volume,openinterest,impliedvol,delta,gamma,theta,vega,optionalias
0,SPX,2257.83,*,SPX170120C00100000,,call,2017-01-20,2017-01-03,100,2161.5,2154.3,2158.7,0,5615,0.1214,1.0,0.0,-1.0239,0.0,SPX170120C00100000
1,SPX,2257.83,*,SPX170120C00200000,,call,2017-01-20,2017-01-03,200,2060.95,2054.4,2058.8,0,6355,0.1214,1.0,0.0,-2.0479,0.0,SPX170120C00200000
2,SPX,2257.83,*,SPX170120C00300000,,call,2017-01-20,2017-01-03,300,1960.45,1954.4,1958.8,0,2000,0.1214,1.0,0.0,-3.0718,0.0,SPX170120C00300000
3,SPX,2257.83,*,SPX170120C00400000,,call,2017-01-20,2017-01-03,400,1760.55,1854.6,1858.9,0,25,0.1214,1.0,0.0,-4.0957,0.0,SPX170120C00400000
4,SPX,2257.83,*,SPX170120C00500000,,call,2017-01-20,2017-01-03,500,1749.33,1754.6,1758.9,0,3813,0.1214,1.0,0.0,-5.1197,0.0,SPX170120C00500000


In [28]:
# Check that the dtypes (category, datetime64) survived the repack.
spx_2017.dtypes

underlying                 object
underlying_last           float64
exchange                   object
optionroot                 object
optionext                 float64
type                     category
expiration         datetime64[ns]
quotedate          datetime64[ns]
strike                      int64
last                      float64
bid                       float64
ask                       float64
volume                      int64
openinterest                int64
impliedvol                float64
delta                     float64
gamma                     float64
theta                     float64
vega                      float64
optionalias                object
dtype: object

In [29]:
# In-memory footprint of one year, including the string columns ("deep").
spx_2017.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2557526 entries, 0 to 2557525
Data columns (total 20 columns):
underlying         object
underlying_last    float64
exchange           object
optionroot         object
optionext          float64
type               category
expiration         datetime64[ns]
quotedate          datetime64[ns]
strike             int64
last               float64
bid                float64
ask                float64
volume             int64
openinterest       int64
impliedvol         float64
delta              float64
gamma              float64
theta              float64
vega               float64
optionalias        object
dtypes: category(1), datetime64[ns](2), float64(10), int64(3), object(4)
memory usage: 988.4 MB


## Storing all SPX data in a single group

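Add indices for all columns: every column is written as a data column (`data_columns=True`) and indexed after loading, so each one can be used in a `where` query.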

In [38]:
store_path = os.path.join(data_dir, "options_data_v2.h5")
# The store is deliberately left open: later cells build the table index on
# it and then call store.close().
store = pd.HDFStore(store_path, complevel=9, complib="blosc", fletcher32=True)

# Re-running this cell would otherwise append a second copy of every year to
# the existing table, so start from an empty "/SPX" node.
if "/SPX" in store:
    store.remove("/SPX")

# Running row count, used to give the combined table one continuous index.
offset = 0
# Fixed string widths for the table, so that later chunks are not limited to
# the longest string seen in the first chunk.
sizes = {"optionroot": 20, "optionalias": 20, "underlying": 10, "exchange": 5}
for year in range(1990, 2019):
    filename = "SPX_{}.csv".format(year)
    year_df = pd.read_csv(
        os.path.join(spx_dir, filename),
        parse_dates=["expiration", "quotedate"],
    )
    # Shift this year's 0..n-1 index past the rows already written.
    year_df.index += offset
    offset += len(year_df)
    year_df = year_df.rename(columns={" exchange": "exchange"})
    # Force identical dtypes for every year so each chunk matches the table.
    year_df = year_df.astype({"strike": "float", "optionalias": "object", "type": "category"})
    # index=False: skip index creation while appending.
    # data_columns=True: make every column queryable with `where`.
    store.append("/SPX", year_df, index=False, data_columns=True, min_itemsize=sizes)

# Push buffered writes to disk so the size below reflects the whole table.
store.flush()
os.path.getsize(store_path) / 1024**2

712.7310857772827

In [39]:
# Build the index now, in one pass, because the appends were done with
# index=False. optlevel=9 / kind="full" are passed through to the table's
# index creation.
store.create_table_index("SPX", optlevel=9, kind="full")
# Display the underlying table description (column types and positions).
store.get_storer("SPX").table

/SPX/table (Table(16756680,), fletcher32, shuffle, blosc(9)) ''
  description := {
  "index": Int64Col(shape=(), dflt=0, pos=0),
  "underlying": StringCol(itemsize=10, shape=(), dflt=b'', pos=1),
  "underlying_last": Float64Col(shape=(), dflt=0.0, pos=2),
  "exchange": StringCol(itemsize=5, shape=(), dflt=b'', pos=3),
  "optionroot": StringCol(itemsize=20, shape=(), dflt=b'', pos=4),
  "optionext": Float64Col(shape=(), dflt=0.0, pos=5),
  "type": Int8Col(shape=(), dflt=0, pos=6),
  "expiration": Int64Col(shape=(), dflt=0, pos=7),
  "quotedate": Int64Col(shape=(), dflt=0, pos=8),
  "strike": Float64Col(shape=(), dflt=0.0, pos=9),
  "last": Float64Col(shape=(), dflt=0.0, pos=10),
  "bid": Float64Col(shape=(), dflt=0.0, pos=11),
  "ask": Float64Col(shape=(), dflt=0.0, pos=12),
  "volume": Int64Col(shape=(), dflt=0, pos=13),
  "openinterest": Int64Col(shape=(), dflt=0, pos=14),
  "impliedvol": Float64Col(shape=(), dflt=0.0, pos=15),
  "delta": Float64Col(shape=(), dflt=0.0, pos=16),
  "gam

In [40]:
# Summary of the nodes in the store; printed so the multi-line string is
# rendered with its line breaks.
print(store.info())

<class 'pandas.io.pytables.HDFStore'>
File path: /Users/jamoroso/work/algo-trading/data/options_data_v2.h5
/SPX                           frame_table  (typ->appendable,nrows->16756680,ncols->20,indexers->[index],dc->[underlying,underlying_last,exchange,optionroot,optionext,type,expiration,quotedate,strike,last,bid,ask,volume,openinterest,impliedvol,delta,gamma,theta,vega,optionalias])
/SPX/meta/type/meta            series_table (typ->appendable,nrows->2,ncols->1,indexers->[index],dc->[values])                                                                                                                                                                          


In [41]:
# Close the file handle before reading the store back with pd.read_hdf.
store.close()

In [42]:
# No `where` clause: this loads the entire /SPX table into memory.
data = pd.read_hdf(store_path, key="/SPX")
data.head()

Unnamed: 0,underlying,underlying_last,exchange,optionroot,optionext,type,expiration,quotedate,strike,last,bid,ask,volume,openinterest,impliedvol,delta,gamma,theta,vega,optionalias
0,SPX,359.69,*,SPX900120C00225000,,call,1990-01-20,1990-01-02,225.0,0.0,135.5,135.5,0,820,0.0,0.0,0.0,0.0,0.0,SPX900120C00225000
1,SPX,359.69,*,SPX900120C00320000,,call,1990-01-20,1990-01-02,320.0,0.0,40.9,40.9,0,1088,0.0,0.0,0.0,0.0,0.0,SPX900120C00320000
2,SPX,359.69,*,SPX900120C00325000,,call,1990-01-20,1990-01-02,325.0,0.0,35.9,35.9,0,1252,0.0,0.0,0.0,0.0,0.0,SPX900120C00325000
3,SPX,359.69,*,SPX900120C00330000,,call,1990-01-20,1990-01-02,330.0,30.5,30.9,30.9,25,8738,0.0,0.0,0.0,0.0,0.0,SPX900120C00330000
4,SPX,359.69,*,SPX900120C00335000,,call,1990-01-20,1990-01-02,335.0,0.0,26.0,26.0,0,580,0.0,0.0,0.0,0.0,0.0,SPX900120C00335000


In [43]:
# Last rows of the combined table; the index labels shown here come from
# the per-year offset applied when the table was written.
data.tail()

Unnamed: 0,underlying,underlying_last,exchange,optionroot,optionext,type,expiration,quotedate,strike,last,bid,ask,volume,openinterest,impliedvol,delta,gamma,theta,vega,optionalias
16756675,SPX,2506.85,*,SPX211217P03200000,,put,2021-12-18,2018-12-31,3200.0,0.0,669.9,692.5,0,0,0.1736,-0.6858,0.0004,-6.5662,1420.1359,
16756676,SPX,2506.85,*,SPX211217P03300000,,put,2021-12-18,2018-12-31,3300.0,0.0,745.8,768.0,0,0,0.1731,-0.7195,0.0004,-1.1379,1331.32,
16756677,SPX,2506.85,*,SPX211217P03400000,,put,2021-12-18,2018-12-31,3400.0,0.0,825.4,846.5,0,0,0.1733,-0.7491,0.0004,4.1453,1240.6058,
16756678,SPX,2506.85,*,SPX211217P03500000,,put,2021-12-18,2018-12-31,3500.0,0.0,908.1,928.0,0,0,0.1744,-0.774,0.0004,9.0917,1153.7674,
16756679,SPX,2506.85,*,SPX211217P03600000,,put,2021-12-18,2018-12-31,3600.0,0.0,988.5,1012.5,0,0,0.1742,-0.7984,0.0003,14.5306,1058.9513,


In [44]:
# Inspect the row index of the combined frame.
data.index

Int64Index([       0,        1,        2,        3,        4,        5,
                   6,        7,        8,        9,
            ...
            16756670, 16756671, 16756672, 16756673, 16756674, 16756675,
            16756676, 16756677, 16756678, 16756679],
           dtype='int64', length=16756680)

In [47]:
# memory_usage=True (not "deep"): object columns are not measured in depth,
# so the reported figure is a lower bound.
data.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16756680 entries, 0 to 16756679
Data columns (total 20 columns):
underlying         object
underlying_last    float64
exchange           object
optionroot         object
optionext          float64
type               category
expiration         datetime64[ns]
quotedate          datetime64[ns]
strike             float64
last               float64
bid                float64
ask                float64
volume             int64
openinterest       int64
impliedvol         float64
delta              float64
gamma              float64
theta              float64
vega               float64
optionalias        object
dtypes: category(1), datetime64[ns](2), float64(11), int64(2), object(4)
memory usage: 2.5+ GB


In [45]:
# Total number of rows across all years.
len(data)

16756680

In [48]:
# Drop the reference to the full frame so its memory can be reclaimed.
del data

In [16]:
# Filter on the store side: only rows matching `where` are read.
# The comparison is strict, so rows whose quotedate equals 1990-12-31 are
# not returned.
# NOTE(review): if the intent is "all of 1990", this probably wants
# quotedate<'1991-01-01' - confirm.
sliced = pd.read_hdf(store_path, key="/SPX", where="quotedate<'1990-12-31'")

In [17]:
# First rows of the date-filtered result.
sliced.head()

Unnamed: 0,underlying,underlying_last,exchange,optionroot,optionext,type,expiration,quotedate,strike,last,bid,ask,volume,openinterest,impliedvol,delta,gamma,theta,vega,optionalias
0,SPX,359.69,*,SPX900120C00225000,,call,1990-01-20,1990-01-02,225,0.0,135.5,135.5,0,820,0.0,0.0,0.0,0.0,0.0,SPX900120C00225000
1,SPX,359.69,*,SPX900120C00320000,,call,1990-01-20,1990-01-02,320,0.0,40.9,40.9,0,1088,0.0,0.0,0.0,0.0,0.0,SPX900120C00320000
2,SPX,359.69,*,SPX900120C00325000,,call,1990-01-20,1990-01-02,325,0.0,35.9,35.9,0,1252,0.0,0.0,0.0,0.0,0.0,SPX900120C00325000
3,SPX,359.69,*,SPX900120C00330000,,call,1990-01-20,1990-01-02,330,30.5,30.9,30.9,25,8738,0.0,0.0,0.0,0.0,0.0,SPX900120C00330000
4,SPX,359.69,*,SPX900120C00335000,,call,1990-01-20,1990-01-02,335,0.0,26.0,26.0,0,580,0.0,0.0,0.0,0.0,0.0,SPX900120C00335000


In [18]:
# Last rows of the date-filtered result; shows the latest quotedate returned.
sliced.tail()

Unnamed: 0,underlying,underlying_last,exchange,optionroot,optionext,type,expiration,quotedate,strike,last,bid,ask,volume,openinterest,impliedvol,delta,gamma,theta,vega,optionalias
46177,SPX,328.72,*,SPX920620P00300000,,put,1992-06-20,1990-12-28,300,0.0,15.9,17.9,0,340,0.2791,-0.2209,0.0027,-4.4186,118.5917,SPX920620P00300000
46178,SPX,328.72,*,SPX920620P00325000,,put,1992-06-20,1990-12-28,325,0.0,23.1,24.9,0,215,0.2694,-0.2945,0.0032,-3.4056,137.7546,SPX920620P00325000
46179,SPX,328.72,*,SPX920620P00350000,,put,1992-06-20,1990-12-28,350,34.0,32.0,32.6,3,242,0.2563,-0.3768,0.0037,-1.3279,151.7502,SPX920620P00350000
46180,SPX,328.72,*,SPX920620P00375000,,put,1992-06-20,1990-12-28,375,44.5,42.5,44.5,1,26,0.2502,-0.4653,0.004,1.4408,158.8069,SPX920620P00375000
46181,SPX,328.72,*,SPX920620P00400000,,put,1992-06-20,1990-12-28,400,0.0,57.0,59.0,0,506,0.2533,-0.5476,0.0039,4.4686,158.2732,SPX920620P00400000


In [19]:
# Drop the reference to the filtered frame before running the next query.
del sliced

In [20]:
# Select only put options on the store side. The string literal is quoted:
# an unquoted right-hand side is first looked up as a variable name, so
# "type=put" would silently change meaning if a variable `put` existed.
puts = pd.read_hdf(store_path, key="/SPX", where="type == 'put'")

In [21]:
# First rows of the put-only selection.
puts.head()

Unnamed: 0,underlying,underlying_last,exchange,optionroot,optionext,type,expiration,quotedate,strike,last,bid,ask,volume,openinterest,impliedvol,delta,gamma,theta,vega,optionalias
14,SPX,359.69,*,SPX900120P00225000,,put,1990-01-20,1990-01-02,225,0.0,0.0,0.1,0,2016,0.7613,-0.0017,0.0001,-3.3274,0.4217,SPX900120P00225000
15,SPX,359.69,A,SPX900120P00320000,,put,1990-01-20,1990-01-02,320,0.313,0.3,0.4,5,2622,0.3161,-0.0368,0.0032,-19.8539,6.3071,SPX900120P00320000
16,SPX,359.69,*,SPX900120P00325000,,put,1990-01-20,1990-01-02,325,0.438,0.4,0.6,20,2120,0.2994,-0.0497,0.0044,-23.8023,8.0396,SPX900120P00325000
17,SPX,359.69,*,SPX900120P00330000,,put,1990-01-20,1990-01-02,330,0.75,0.3,0.6,50,4296,0.2558,-0.05,0.0051,-20.2438,8.0885,SPX900120P00330000
18,SPX,359.69,*,SPX900120P00335000,,put,1990-01-20,1990-01-02,335,0.75,0.5,0.7,3,3523,0.2379,-0.0702,0.0072,-24.2525,10.5466,SPX900120P00335000


In [22]:
# Last rows of the put-only selection.
puts.tail()

Unnamed: 0,underlying,underlying_last,exchange,optionroot,optionext,type,expiration,quotedate,strike,last,bid,ask,volume,openinterest,impliedvol,delta,gamma,theta,vega,optionalias
93077,SPX,417.09,*,SPX930619P00350000,,put,1993-06-19,1991-12-31,350,0.0,8.2,9.0,0,1506,0.2082,-0.1426,0.0021,-0.0142,1.1384,SPX930619P00350000
93078,SPX,417.09,*,SPX930619P00375000,,put,1993-06-19,1991-12-31,375,0.0,12.2,13.0,0,243,0.1954,-0.2029,0.0029,-0.0147,1.4264,SPX930619P00375000
93079,SPX,417.09,*,SPX930619P00400000,,put,1993-06-19,1991-12-31,400,19.125,19.0,19.2,62,1239,0.1889,-0.2844,0.0036,-0.0142,1.7133,SPX930619P00400000
93080,SPX,417.09,*,SPX930619P00425000,,put,1993-06-19,1991-12-31,425,0.0,27.4,28.1,0,355,0.1826,-0.3792,0.0041,-0.0111,1.9221,SPX930619P00425000
93081,SPX,417.09,*,SPX930619P00450000,,put,1993-06-19,1991-12-31,450,0.0,39.2,40.0,0,0,0.1812,-0.481,0.0044,-0.0061,2.0128,SPX930619P00450000


In [23]:
# Number of put rows returned by the query.
# NOTE(review): the recorded output for the put cells (46539 rows, ending
# in 1991, integer strikes) does not look like it came from the 16.7M-row
# float-strike table written earlier - re-run on a fresh kernel to confirm.
len(puts)

46539

In [28]:
# Sanity check: the store-side filter returned nothing but puts.
assert (puts["type"] == "put").all()

In [29]:
# Drop the reference to the put-only frame.
del puts