# HDF5 in Python

In [5]:
import os
import numpy as np
import h5py
import tables as tb

In [6]:
# Print floats with two decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=2)

In [7]:
# Enable tab-completion for groups and attributes.
# (Use `h5py.enable_ipython_completer?` interactively to read its help text.)
h5py.enable_ipython_completer()

In [8]:
# Absolute path of the `data` folder inside the current working directory.
data_dir = os.path.join(
    os.getcwd(),
    'data',
)
print(data_dir)

/home/jack/Repos/hdf5-pydata-munich/data


# h5py

The h5py library is a thin, pythonic wrapper around the HDF5 C API.

It tries to expose most of the functionality that the HDF5 library provides.

### Datasets

In [9]:
with h5py.File(name='data/my_h5py_file.h5', mode='w') as h5file:
    # Create a dataset directly from a Python list of floats.
    h5file.create_dataset(
        name='my_dataset',
        data=[1.0, 2.7, 3.7, 4.5],
    )
    # Variations to try: integer data, and integer data stored with an explicit dtype.
    # h5file.create_dataset(name='my_other_dataset', data=[1, 2, 3, 4])
    # h5file.create_dataset(name='my_other_dataset', data=[1, 2, 3, 4], dtype=np.float32)

In [10]:
with h5py.File(name='data/my_h5py_file.h5', mode='r') as h5file:
    dataset = h5file['my_dataset']
    # Printing the dataset only shows a proxy object (name, shape, type) ...
    print(dataset)
    # ... the actual values are read by slicing it, with either of these 2 syntaxes:
    # print(dataset[:])
    # print(dataset[...])

<HDF5 dataset "my_dataset": shape (4,), type "<f8">


### Preallocation on disk

In [11]:
with h5py.File(name='data/my_h5py_file.h5', mode='w') as h5file:
    # Reserve an 8x1 dataset on disk without providing any data ...
    ds = h5file.create_dataset(name='my_dataset', shape=(8, 1))
    # ... and fill the first two rows afterwards.
    ds[0], ds[1] = 5.2, 7

### Pick the correct HDF5 datatype

In [12]:
# Build the array with NumPy's default integer type first and cast afterwards:
# `astype` wraps out-of-range values modulo 256, which is the effect shown here.
# Passing out-of-range Python ints together with dtype='uint8' is rejected by
# recent NumPy releases instead of wrapping.
arr = np.array([0, 1, 254, 255, 256, -1, -2]).astype('uint8')
print(arr)

[  0   1 254 255   0 255 254]


In [13]:
# Keep the raw values in an integer ndarray so that converting them to the
# dataset's unsigned 8-bit type is left to HDF5.
# NOTE(review): newer h5py/NumPy releases appear to cast plain lists with NumPy
# first, which rejects out-of-range ints - confirm against the installed versions.
values = np.array([0, 1, 254, 255, 123456, -1, -2])

with h5py.File(name='data/my_h5py_file.h5', mode='w') as f:
    f.create_dataset(name='my_dataset', shape=values.shape, dtype=h5py.h5t.STD_U8BE)
    # Assign all 7 elements at once (the dataset has exactly len(values) slots).
    f['my_dataset'][...] = values
    print(f["my_dataset"][:])

[  0   1 254 255 255   0   0]


### Groups

In [14]:
with h5py.File(name='data/my_h5py_file.h5', mode='w') as h5file:
    # Two groups directly under the file root ...
    h5file.create_group(name='group1')
    group2 = h5file.create_group(name='group2')
    # ... and a third one nested inside group2.
    group2.create_group(name='group3')

In [15]:
with h5py.File(name='data/my_h5py_file.h5', mode='r') as h5file:
    # Walk down to the nested group, then print the group that contains it.
    group3 = h5file['group2']['group3']
    print(group3.parent)

<HDF5 group "/group2" (1 members)>


### Attributes

In [16]:
with h5py.File(name='data/my_h5py_file.h5', mode='w') as h5file:
    # Attach a string attribute to a dataset.
    ds = h5file.create_dataset(name='my_dataset', data=[1, 2, 3, 4])
    ds.attrs['Unit'] = 'm/s'

    # Groups carry attributes too: a string one and an array-valued one.
    gr = h5file.create_group(name='my_group')
    gr.attrs['Created'] = '18/12/2017'
    versions = np.array([123, 456])
    gr.attrs.create(name='Versions', data=versions)

### Traverse an HDF5 file with h5py

In [17]:
def print_name(name):
    """Print ``name`` on its own line and return ``None``."""
    print(name)
    return None

with h5py.File(name='data/my_h5py_file.h5', mode='r') as h5file:
    # Hand the callback to visit(); the recorded output lists one name per object.
    h5file.visit(print_name)

my_dataset
my_group


### HDF5 Command Line Tools

[Here](https://support.hdfgroup.org/products/hdf5_tools/#h5dist) you can find the command line tools developed by the HDF Group. You don't need h5py or PyTables to use them.

If you are on Ubuntu, you can install them with `sudo apt install hdf5-tools`

In [18]:
# List the objects stored in the file; -r stands for 'recursive'
!h5ls -r 'data/my_h5py_file.h5'

/                        Group
/my_dataset              Dataset {4}
/my_group                Group


In [19]:
# Dump the file as text: groups, datasets, attributes and the stored values
!h5dump 'data/my_h5py_file.h5'

HDF5 "data/my_h5py_file.h5" {
GROUP "/" {
   DATASET "my_dataset" {
      DATATYPE  H5T_STD_I64LE
      DATASPACE  SIMPLE { ( 4 ) / ( 4 ) }
      DATA {
      (0): 1, 2, 3, 4
      }
      ATTRIBUTE "Unit" {
         DATATYPE  H5T_STRING {
            STRSIZE H5T_VARIABLE;
            STRPAD H5T_STR_NULLTERM;
            CSET H5T_CSET_UTF8;
            CTYPE H5T_C_S1;
         }
         DATASPACE  SCALAR
         DATA {
         (0): "m/s"
         }
      }
   }
   GROUP "my_group" {
      ATTRIBUTE "Created" {
         DATATYPE  H5T_STRING {
            STRSIZE H5T_VARIABLE;
            STRPAD H5T_STR_NULLTERM;
            CSET H5T_CSET_UTF8;
            CTYPE H5T_C_S1;
         }
         DATASPACE  SCALAR
         DATA {
         (0): "18/12/2017"
         }
      }
      ATTRIBUTE "Versions" {
         DATATYPE  H5T_STD_I64LE
         DATASPACE  SIMPLE { ( 2 ) / ( 2 ) }
         DATA {
         (0): 123, 456
         }
      }
   }
}
}


---

# PyTables

PyTables provides a higher-level abstraction over HDF5. This doesn't make it slower than h5py, though.

At the moment PyTables does **not** depend on h5py.

### Array


In [20]:
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as h5file:
    # Optional variations:
    #   name='my-array'            -> NaturalNameWarning
    #   title='My PyTables Array'  -> sets a title on the node
    h5file.create_array(where='/', name='my_array', obj=[1, 2, 3, 4])

PyTables has a feature called "Natural Naming": nodes (i.e. datasets and groups in the HDF5 file) can be accessed with the dot notation.

In [21]:
with tb.open_file(filename='data/my_pytables_file.h5', mode='r') as h5file:
    # Reach the node through attribute access on the root group.
    my_array = h5file.root.my_array
    print(my_array)

/my_array (Array(4,)) ''


### Groups

In [22]:
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as h5file:
    # Create a single group directly under the root.
    h5file.create_group(
        where='/',
        name='my_group',
    )

### Attributes

In [23]:
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as h5file:
    # An array node with a title and one custom attribute.
    h5file.create_array(
        where=h5file.root,
        name='my_array',
        obj=[1, 2, 3, 4],
        title='My PyTables Array',
    )
    h5file.set_node_attr(where='/my_array', attrname='SomeAttribute', attrvalue='SomeValue')

    # A group node with one custom (integer) attribute.
    h5file.create_group(where='/', name='my_group')
    h5file.set_node_attr(where='/my_group', attrname='SomeOtherAttribute', attrvalue=123)

### HDF5 datasets have many abstractions in PyTables

Homogeneous datasets:

- **Array**
- **CArray**
- **EArray**
- **VLArray**

Heterogeneous datasets:

- **Table**

In [24]:
# Seed NumPy's global generator so the sample data is identical on every run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

num_rows = 1000000  # 1 million
num_columns = 5
# 1-D float32 samples drawn from a normal distribution (mean 0, std 1).
gaussian = np.random.normal(loc=0, scale=1, size=num_rows).astype('float32')
# 1-D samples drawn uniformly from [100, 150) and cast to uint8.
uniform = np.random.uniform(low=100, high=150, size=num_rows).astype('uint8')
# 2-D float32 matrix of shape (num_rows, num_columns) with values in [0, 1).
matrix = np.random.random((num_rows, num_columns)).astype('float32')

### Array (again!)

[Docs](http://www.pytables.org/usersguide/libref/homogenous_storage.html#the-array-class)

- Fastest I/O speed
- Homogeneous (i.e. data has same `dtype`)
- Must fit in memory
- Not compressible
- Not enlargeable

In [25]:
%%time
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as f:
    f.create_array(where='/', name='gaussian', obj=gaussian)
    f.create_array(where='/', name='uniform', obj=uniform)
    f.create_array(where='/', name='matrix', obj=matrix)

CPU times: user 12 ms, sys: 140 ms, total: 152 ms
Wall time: 150 ms


### CArray

[Docs](http://www.pytables.org/usersguide/libref/homogenous_storage.html#carrayclassdescr)

- Chunked storage
- Data must be homogeneous
- Good speed when reading/writing
- Compressible
- Not enlargeable

In [26]:
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as h5file:
    # A CArray created straight from existing data.
    h5file.create_carray(where='/', name='my_carray', obj=[1, 2, 3, 4])

    # A CArray can also be created empty and filled later,
    # but then atom and shape have to be given explicitly.
    carray = h5file.create_carray(
        where='/',
        name='my_other_carray',
        atom=tb.Float32Atom(),
        shape=(4, 2),
    )
    # later: fill the second column
    carray[:, 1] = [5, 6, 7, 8]

In [27]:
# Compression settings reused by the cells below: zlib at level 5.
filters = tb.Filters(complib='zlib', complevel=5)

Tips on how to use compression (from the PyTables docs)

- A mid-level (5) compression is sufficient. No need to go all the way up (9)
- Use zlib if you must guarantee complete portability
- Use blosc all other times (it is optimized for HDF5)

In [28]:
%%time
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as f:
    f.create_carray(where='/', name='gaussian', obj=gaussian, filters=filters)
    f.create_carray(where='/', name='uniform', obj=uniform, filters=filters)
    f.create_carray(where='/', name='matrix', obj=matrix, filters=filters)

CPU times: user 872 ms, sys: 80 ms, total: 952 ms
Wall time: 952 ms


### EArray

[Docs](http://www.pytables.org/usersguide/libref/homogenous_storage.html#earrayclassdescr)

- Enlargeable on **one** dimension (append)
- Pretty fast at extending, very good at reading
- Data must be homogeneous
- Compressible

In [29]:
%%time
# One (and only one) of the shape dimensions *must* be 0.
# The dimension being 0 means that the resulting EArray object can be extended along it.
# Multiple enlargeable dimensions are not supported right now.
shape = (num_rows, 0)

with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as f:
    earray = f.create_earray(where='/',
                             name='my_earray',
                             atom=tb.Float32Atom(),
                             shape=shape,
                             filters=filters)
    earray.append(sequence=matrix[:, 0:1])
    earray.append(sequence=matrix[:, 1:5])

CPU times: user 840 ms, sys: 100 ms, total: 940 ms
Wall time: 942 ms


### VLArray

[Docs](http://www.pytables.org/usersguide/libref/homogenous_storage.html#the-vlarray-class)

- Supports collections of homogeneous data with a variable number of entries
- Compressible
- Enlargeable (append)
- I/O is not very fast
- Like Table datasets, variable length arrays can have only one dimension, and the elements (atoms) of their rows can be fully multidimensional

In [30]:
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as h5file:
    vlarray = h5file.create_vlarray(where=h5file.root, name='my_vlarray', atom=tb.Float32Atom())
    # Append two rows of different lengths (10 and 1000 entries).
    for chunk in (gaussian[0:10], uniform[0:1000]):
        vlarray.append(chunk)

### Table

[Docs](http://www.pytables.org/usersguide/libref/structured_storage.html?highlight=table#tableclassdescr)

- Data can be heterogeneous (i.e. different shapes and different dtypes)
- The structure of a table is declared by its description
- It supports *in-kernel* searches with `Table.where`
- It supports multi-column searches
- Non-nested columns can be *indexed*

To describe, in Python, records that map onto HDF5 C structs, PyTables provides a special class that lets you easily declare all the fields and their properties. It's called `IsDescription`.

A *description* defines the table structure (basically, the *schema* of your table).

In [31]:
class Particle(tb.IsDescription):
    # Table description: each class attribute below declares one column.
    # `dflt` is the column's default value; `pos` presumably fixes the column order - confirm in the PyTables docs.
    identity = tb.StringCol(itemsize=22, dflt=' ', pos=0)  # string column, itemsize 22
    idnumber = tb.Int16Col(dflt=1, pos=1)  # 16-bit (short) integer column

In [32]:
# Show the column definitions collected from the class attributes of the description.
print(Particle.columns)

{'identity': StringCol(itemsize=22, shape=(), dflt=b' ', pos=0), 'idnumber': Int16Col(shape=(), dflt=1, pos=1)}


In [33]:
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as f:
    table = f.create_table(where='/', name='my_table', description=Particle)

    # A dedicated name keeps the notebook-level `num_rows` (used to size the
    # sample arrays) from being overwritten when this cell runs.
    num_particles = 100
    # `table.row` is the buffer that is filled in and appended once per record.
    row = table.row
    for i in range(num_particles):
        row['identity'] = 'I am {}'.format(i)
        row['idnumber'] = i
        row.append()
    # Flush the table buffers to release memory and make sure the rows are written to disk.
    table.flush()

A description can be nested inside another description (this will look weird...).

In [34]:
class Particle(tb.IsDescription):
    # NOTE: this re-declares (and therefore shadows) the flat `Particle` description defined earlier.
    identity = tb.StringCol(itemsize=22, dflt=' ', pos=0)  # string column, itemsize 22
    idnumber = tb.Int16Col(dflt=1, pos=1)  # 16-bit integer column

    # Nested description: its columns are addressed as 'Properties/<column name>'.
    class Properties(tb.IsDescription):
        # 2-D float array (single-precision)
        pressure = tb.Float32Col(shape=(2, 3))
        # 3-D float array (double-precision)
        energy = tb.Float64Col(shape=(2, 3, 4))

In [35]:
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as f:
    table = f.create_table(where='/', name='my_table', description=Particle)

    # A dedicated name keeps the notebook-level `num_rows` (used to size the
    # sample arrays) from being overwritten when this cell runs.
    num_particles = 100
    row = table.row
    for i in range(num_particles):
        row['identity'] = 'I am {}'.format(i)
        row['idnumber'] = i
        # Columns of the nested description are addressed with a '/'-separated path.
        row['Properties/pressure'] = np.random.random(size=(2, 3))
        row['Properties/energy'] = np.random.random(size=(2, 3, 4))
        row.append()
    # Flush the table buffers so the appended rows are written to disk.
    table.flush()

### Traverse an HDF5 file with PyTables

In [36]:
with tb.open_file(filename='data/my_pytables_file.h5', mode='r') as h5file:
    # Visit only the Table nodes, starting from the root, and print each node's path.
    table_nodes = h5file.walk_nodes('/', classname='Table')
    for node in table_nodes:
        print(node._v_pathname)

/my_table


# NYC Yellow Taxi Dataset (2015)

![Big data, big deal](https://marktortorici.files.wordpress.com/2013/10/ron-burgundy-big-deal.jpg)

### A quick look

Let's have a quick look at the data - without having to load into memory an entire 2GB CSV file - with the unix `less` command.

`less yellow_tripdata_2015-01.csv`

In these CSV files there are some changes from year to year. A few things I noticed:

- in the CSV files from 2014 there is a field called `pickup_datetime`. From January 2015 onwards this field has been renamed as `tpep_pickup_datetime`;
- starting from July 2016 the columns `pickup_longitude` and `pickup_latitude` have been replaced with `PULocationID`, and the columns `dropoff_longitude` and `dropoff_latitude` with `DOLocationID`.

### Read/Store the CSV files

These CSV files are huge! Don't forget the pandas [rule of thumb](http://wesmckinney.com/blog/apache-arrow-pandas-internals/):

> Have 5 to 10 times as much RAM as the size of your dataset.

On my laptop (Thinkpad X220 i5 10GB RAM) it took roughly:

- **20 minutes** to read/store a **single CSV**.

- **4 hours** to read/store an entire **year**.

<img src="https://i.imgflip.com/20fb1g.jpg" title="made at imgflip.com"/>

In [37]:
# List the top-level objects of the taxi file (one dataset per month of 2015).
!h5ls 'data/NYC-yellow-taxis.h5'

yellow_2015_01           Dataset {12748986/Inf}
yellow_2015_02           Dataset {12450521/Inf}
yellow_2015_03           Dataset {13351609/Inf}
yellow_2015_04           Dataset {13071789/Inf}
yellow_2015_05           Dataset {13158262/Inf}
yellow_2015_06           Dataset {12324935/Inf}
yellow_2015_07           Dataset {11562783/Inf}
yellow_2015_08           Dataset {11130304/Inf}
yellow_2015_09           Dataset {11225063/Inf}
yellow_2015_10           Dataset {12315488/Inf}
yellow_2015_11           Dataset {11312676/Inf}
yellow_2015_12           Dataset {11460573/Inf}


PyTables is shipped with some useful command line utilities. These CLI utils are in `tables/utils`.

You can use them if you are working in a python environment where you installed PyTables (these CLI utils cause your python interpreter to execute a python script in `tables/scripts`).

In [38]:
# Print the file's node tree with in-memory and on-disk sizes, sorted by size.
!pttree --use-si-units --sort-by 'size' 'data/NYC-yellow-taxis.h5'


------------------------------------------------------------

/ (RootGroup)
+--yellow_2015_03 (Table)
|     mem=680.9MB, disk=312.9MB [ 9.3%]
+--yellow_2015_05 (Table)
|     mem=671.1MB, disk=307.0MB [ 9.1%]
+--yellow_2015_04 (Table)
|     mem=666.7MB, disk=305.5MB [ 9.1%]
+--yellow_2015_01 (Table)
|     mem=650.2MB, disk=294.6MB [ 8.7%]
+--yellow_2015_02 (Table)
|     mem=635.0MB, disk=290.5MB [ 8.6%]
+--yellow_2015_06 (Table)
|     mem=628.6MB, disk=287.7MB [ 8.5%]
+--yellow_2015_10 (Table)
|     mem=628.1MB, disk=280.0MB [ 8.3%]
+--yellow_2015_07 (Table)
|     mem=589.7MB, disk=262.6MB [ 7.8%]
+--yellow_2015_12 (Table)
|     mem=584.5MB, disk=260.7MB [ 7.7%]
+--yellow_2015_11 (Table)
|     mem=576.9MB, disk=257.5MB [ 7.6%]
+--yellow_2015_09 (Table)
|     mem=572.5MB, disk=255.6MB [ 7.6%]
`--yellow_2015_08 (Table)
      mem=567.6MB, disk=253.7MB [ 7.5%]

------------------------------------------------------------
Total branch leaves:    12
Total branc

![He Man I have the data](https://www.storegrowers.com/wp-content/uploads/2016/02/analytics-meme-sword-guy.png)

In [39]:
# Path of the table node that the query cells below open with `get_node`.
table_where = '/yellow_2015_01'

`table.read` reads the **entire table** into memory (so it must fit in RAM); you can then query it with NumPy.

In [346]:
%%time
with tb.open_file(filename='data/NYC-yellow-taxis.h5', mode='r') as f:
    table = f.get_node(where=table_where)
    results = [x['total_amount'] for x in table.read() 
               if 1 < x['passenger_count'] < 4 and x['trip_distance'] > 0.5]
    print('Rows that match the condition: {}'.format(len(results)))

Rows that match the condition: 2218407
CPU times: user 1min 22s, sys: 772 ms, total: 1min 23s
Wall time: 1min 23s


`table.iterrows` returns an **iterator** that iterates over all rows (so no need to load the entire table into memory).

In [40]:
%%time
with tb.open_file(filename='data/NYC-yellow-taxis.h5', mode='r') as f:
    table = f.get_node(where=table_where)
    results = [x['total_amount'] for x in table.iterrows()
               if 1 < x['passenger_count'] < 4 and x['trip_distance'] > 0.5]
    print('Rows that match the condition: {}'.format(len(results)))

Rows that match the condition: 2218407
CPU times: user 6.56 s, sys: 484 ms, total: 7.04 s
Wall time: 7.45 s


In [41]:
# Query string used by the `read_where` / `where` cells below.
condition = '((1 < passenger_count) & (passenger_count < 4)) & (trip_distance > 0.5)'
# It can be assembled from smaller pieces to make it more readable:
# cond0 = '((1 < passenger_count) & (passenger_count < 4))'
# cond1 = '(trip_distance > 0.5)'
# condition = '{} & {}'.format(cond0, cond1)
# This won't work: a chained comparison relies on Python's standard boolean
# operators, which you can't use in NumExpr expressions
# condition = """(1 < passenger_count < 4) & (trip_distance > 0.5)"""

In [374]:
%%time
with tb.open_file(filename='data/NYC-yellow-taxis.h5', mode='r') as f:
    table = f.get_node(where=table_where)
    results = [x['total_amount'] for x in table.read_where(condition)]
    print('Rows that match the condition: {}'.format(len(results)))

Rows that match the condition: 2218407
CPU times: user 16.2 s, sys: 388 ms, total: 16.6 s
Wall time: 16.4 s


`table.where` uses **NumExpr** to make an **in-kernel** query.

In [42]:
%%time
with tb.open_file(filename='data/NYC-yellow-taxis.h5', mode='r') as f:
    table = f.get_node(where=table_where)
    results = [x['total_amount'] for x in table.where(condition)]
    print('Rows that match the condition: {}'.format(len(results)))

Rows that match the condition: 2218407
CPU times: user 6.29 s, sys: 160 ms, total: 6.45 s
Wall time: 6.31 s


### Add Indexes to the table

In [43]:
# List the top-level objects of the indexed copy of the taxi file.
!h5ls 'data/NYC-yellow-taxis-indexed.h5'

_i_yellow_2015_01        Group
yellow_2015_01           Dataset {12748986/Inf}
yellow_2015_02           Dataset {12450521/Inf}
yellow_2015_03           Dataset {13351609/Inf}
yellow_2015_04           Dataset {13071789/Inf}
yellow_2015_05           Dataset {13158262/Inf}
yellow_2015_06           Dataset {12324935/Inf}
yellow_2015_07           Dataset {11562783/Inf}
yellow_2015_08           Dataset {11130304/Inf}
yellow_2015_09           Dataset {11225063/Inf}
yellow_2015_10           Dataset {12315488/Inf}
yellow_2015_11           Dataset {11312676/Inf}
yellow_2015_12           Dataset {11460573/Inf}


In [360]:
%%time
with tb.open_file(filename='data/NYC-yellow-taxis-indexed.h5', mode='r') as f:
    table = f.get_node(where=table_where)
    results = [x['total_amount'] for x in table.read() 
               if 1 < x['passenger_count'] < 4 and x['trip_distance'] > 0.5]
    print('Rows that match the condition: {}'.format(len(results)))

Rows that match the condition: 2218407
CPU times: user 1min 26s, sys: 860 ms, total: 1min 26s
Wall time: 1min 26s


In [44]:
%%time
with tb.open_file(filename='data/NYC-yellow-taxis-indexed.h5', mode='r') as f:
    table = f.get_node(where=table_where)
    results = [x['total_amount'] for x in table.iterrows()
               if 1 < x['passenger_count'] < 4 and x['trip_distance'] > 0.5]
    print('Rows that match the condition: {}'.format(len(results)))

Rows that match the condition: 2218407
CPU times: user 6.7 s, sys: 512 ms, total: 7.21 s
Wall time: 8.03 s


In [379]:
%%time
with tb.open_file(filename='data/NYC-yellow-taxis-indexed.h5', mode='r') as f:
    table = f.get_node(where=table_where)
    results = [x['total_amount'] for x in table.read_where(condition)]
    print('Rows that match the condition: {}'.format(len(results)))

Rows that match the condition: 2218407
CPU times: user 17.4 s, sys: 468 ms, total: 17.9 s
Wall time: 17.6 s


In [45]:
%%time
with tb.open_file(filename='data/NYC-yellow-taxis-indexed.h5', mode='r') as f:
    table = f.get_node(where=table_where)
    results = [x['total_amount'] for x in table.where(condition)]
    print('Rows that match the condition: {}'.format(len(results)))

Rows that match the condition: 2218407
CPU times: user 7.92 s, sys: 300 ms, total: 8.22 s
Wall time: 9.52 s


**Gotcha:** indexes might be slower if you have many results!

[search with index is slower than without index?](https://stackoverflow.com/questions/20769818/search-with-index-is-slower-than-without-index-in-pytables-when-the-result-is-la)

Let's try with a **more restrictive** condition.

In [46]:
# More selective query: more than 3 passengers and a trip distance above 30.
condition = '((passenger_count > 3)) & (trip_distance > 30)'

In [47]:
%%time
with tb.open_file(filename='data/NYC-yellow-taxis.h5', mode='r') as f:
    table = f.get_node(where=table_where)
    results = [x['total_amount'] for x in table.where(condition)]
    print('Rows that match the condition: {}'.format(len(results)))

Rows that match the condition: 265
CPU times: user 6.72 s, sys: 144 ms, total: 6.86 s
Wall time: 6.68 s


In [48]:
%%time
with tb.open_file(filename='data/NYC-yellow-taxis-indexed.h5', mode='r') as f:
    table = f.get_node(where=table_where)
    results = [x['total_amount'] for x in table.where(condition)]
    print('Rows that match the condition: {}'.format(len(results)))

Rows that match the condition: 265
CPU times: user 2.01 s, sys: 52 ms, total: 2.06 s
Wall time: 2.13 s


# Visualization

In [49]:
from dotenv import load_dotenv

# Load the variables declared in the local `.env` file into the environment,
# then print the Plotly username as a quick check.
dotenv_path = '.env'
load_dotenv(dotenv_path)
plotly_username = os.environ['PLOTLY_USERNAME']
print(plotly_username)

jackdbd


In [51]:
# Table node and query used to select the pickup locations that get plotted.
table_where = '/yellow_2015_01'
condition = '((passenger_count > 3)) & (trip_distance > 20.5)'

In [54]:
%%time
with tb.open_file(filename='data/NYC-yellow-taxis-indexed.h5', mode='r') as f:
    table = f.get_node(where=table_where)
    coordinates = [(x['pickup_latitude'], x['pickup_longitude']) for x in table.where(condition)]
    print('Matches: {}'.format(len(coordinates)))

Matches: 4472
CPU times: user 6.55 s, sys: 172 ms, total: 6.72 s
Wall time: 6.47 s


In [55]:
# Split the (latitude, longitude) pairs into two parallel tuples.
# With no matches, zip(*[]) yields nothing to unpack, so fall back to empty tuples.
if coordinates:
    latitudes, longitudes = zip(*coordinates)
else:
    latitudes, longitudes = (), ()
print(latitudes[:2])
print(longitudes[:2])

(40.64424514770508, 40.64564895629883)
(-73.7822036743164, -73.78545379638672)


In [62]:
import plotly.plotly as py
from plotly.graph_objs import Scattermapbox, Scattergl, Layout

# Credentials and tokens are read from the environment, never written in the notebook.
py.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])
mapbox_access_token = os.environ['MAPBOX_ACCESS_TOKEN']

# A plain list of traces and plain dicts for nested properties are used instead
# of the deprecated `Data` and `Marker` wrapper classes.
data = [

    Scattermapbox(
        lat=latitudes,
        lon=longitudes,
        mode='markers',
        marker=dict(
            size=10,
            color='rgb(247, 202, 24)',  # http://www.flatuicolorpicker.com/category/yellow
            opacity=0.7,
        ),
    ),

    # Alternative trace for the same points:
#     Scattergl(
#         x=longitudes,
#         y = latitudes,
#         mode='markers',
#         marker=dict(
#             line=dict(
#                 width=1, 
#                 color='#404040',
#             ),
#         ),
#     ),

]

layout = Layout(
    title='New York City yellow taxis pickup locations',
    autosize=True,
    height=600,
    hovermode='closest',
    showlegend=False,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        # Initial map view: centre coordinates and zoom level.
        center=dict(
            lat=40.75,
            lon=-74.0
        ),
        zoom=10,
        style='dark',
    ),
)

fig = dict(data=data, layout=layout)
py.iplot(fig)

---

# Reference

- [Introduction to HDF5](https://www.youtube.com/watch?v=BAjsCldRMMc) by Quincey Koziol
- [HDF5 is Eating the World](https://www.youtube.com/watch?v=nddj5OA8LJo) by Andrew Collette
- [HDF5 take 2 - h5py & PyTables](https://www.youtube.com/watch?v=ofLFhQ9yxCw) by Tom Kooij
- [SciPy 2017 notebooks](https://github.com/tomkooij/scipy2017/tree/master/notebooks) by Tom Kooij
- [h5py documentation](http://docs.h5py.org/en/latest/)
- [PyTables documentation](http://www.pytables.org/index.html)
- [The starving CPU problem (Francesc Alted)](https://python.g-node.org/python-summerschool-2013/_media/starving_cpu/starvingcpus.pdf)