Merge pull request #2629 from ericpre/use_dask_chunks_saving_file
Use dask chunks when saving lazy signal
magnunor committed Feb 16, 2021
2 parents ae419cb + 87dd3f9 commit c73b3e8
Showing 7 changed files with 132 additions and 78 deletions.
2 changes: 2 additions & 0 deletions CHANGES.rst
@@ -17,6 +17,8 @@ RELEASE_next_patch (Unreleased)
* Add update instructions to user guide (`#2621 <https://github.com/hyperspy/hyperspy/pull/2621>`_)
* Fix disconnect event when closing navigator only plot (fixes `#996 <https://github.com/hyperspy/hyperspy/issues/996>`_), (`#2631 <https://github.com/hyperspy/hyperspy/pull/2631>`_)
* Improve plotting navigator of lazy signals, add `navigator` setter to lazy signals (`#2631 <https://github.com/hyperspy/hyperspy/pull/2631>`_)
* Use dask chunking when saving lazy signals instead of rechunking, leaving the user to decide on a suitable chunking (`#2629 <https://github.com/hyperspy/hyperspy/pull/2629>`_)
* Fix incorrect chunksize when saving EMD NCEM file and specifying chunks (`#2629 <https://github.com/hyperspy/hyperspy/pull/2629>`_)


Changelog
18 changes: 11 additions & 7 deletions doc/user_guide/io.rst
@@ -345,7 +345,8 @@ This feature is particularly useful when using

The hyperspy HDF5 format supports chunking the data into smaller pieces to make it possible to load only part
of a dataset at a time. By default, the data is saved in chunks that are optimised to contain at least one
full signal shape. It is possible to customise the chunk shape using the ``chunks`` keyword.
full signal shape for non-lazy signals, while for lazy signals, the dask array chunking is used. It is possible to
customise the chunk shape using the ``chunks`` keyword.
For example, to save the data with ``(20, 20, 256)`` chunks instead of the default ``(7, 7, 2048)`` chunks
for this signal:

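The example code itself is collapsed in this view; it presumably looks something like the following minimal sketch (the signal construction is an assumption, only the ``chunks`` keyword and chunk shapes are documented above):

.. code-block:: python

    >>> import numpy as np
    >>> import hyperspy.api as hs
    >>> s = hs.signals.Signal1D(np.zeros((100, 100, 2048)))
    >>> s.save('test_chunks.hspy', chunks=(20, 20, 256))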
@@ -358,8 +359,8 @@ Note that currently it is not possible to pass different customised chunk shapes
arrays contained in a signal and its metadata. Therefore, the value of ``chunks`` provided on saving
will be applied to all arrays contained in the signal.

By passing ``True`` to ``chunks`` the chunk shape is guessed using ``h5py``'s ``guess_chunks`` function
what, for large signal spaces usually leads to smaller chunks as ``guess_chunks`` does not impose the
By passing ``True`` to ``chunks`` the chunk shape is guessed using ``h5py``'s ``guess_chunk`` function,
which, for large signal spaces, usually leads to smaller chunks as ``guess_chunk`` does not impose the
constraint of storing at least one signal per chunk. For example, for the signal in the example above
passing ``chunks=True`` results in ``(7, 7, 256)`` chunks.
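A hedged one-liner for the same (assumed) signal:

.. code-block:: python

    >>> s.save('test_chunks_true.hspy', chunks=True)  # guessed as (7, 7, 256) here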

@@ -368,7 +369,7 @@ See the `chunking section <big_data.html#Chunking>`__ under `Working with big da

Extra saving arguments
^^^^^^^^^^^^^^^^^^^^^^^
- ``compression``: One of ``None``, ``'gzip'``, ``'szip'``, ``'lzf'`` (default is ``'gzip'``).
``'szip'`` may be unavailable as it depends on the HDF5 installation including it.

.. note::
@@ -554,7 +555,7 @@ the "r+" mode are incompatible).
Images
------

HyperSpy is able to read and write data to `all the image formats
<https://imageio.readthedocs.io/en/stable/formats.html>`_ supported by
`imageio`, which uses the Python Imaging Library (PIL/pillow).
This includes png, pdf, gif etc.
@@ -888,6 +889,9 @@ Extra loading arguments
- ``stack_group`` : bool, default is True. Stack datasets of groups with common
path. Relevant for emd file version >= 0.5 where groups can be named
'group0000', 'group0001', etc.
- ``chunks`` : None, True or tuple. Determines the chunking of the dataset to save.
See the ``chunks`` argument of the ``hspy`` file format for more details; a usage
sketch follows below.

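A brief usage sketch (the file name is illustrative; the tuple follows the array order of ``s.data.shape``):

.. code-block:: python

    >>> s.save('example.emd', chunks=(50, 20, 20))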

For files containing several datasets, the `dataset_name` argument can be
used to select a specific one:
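The example is collapsed in this view; presumably something along these lines (the file and dataset names are hypothetical):

.. code-block:: python

    >>> hs.load('sample.emd', dataset_name='/experimental/science_data_1')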
Expand Down Expand Up @@ -934,14 +938,14 @@ the data size in memory.
<EDSSEMSpectrum, title: EDS, dimensions: (179, 161|4096)>]
.. note::

FFTs made in Velox are loaded as-is as a HyperSpy ComplexSignal2D object.
The FFT is not centered and only positive frequencies are stored in the file.
Lazy reading of these datasets is not supported. Making FFTs with HyperSpy
from the respective image datasets is recommended.

.. note::

DPC data is loaded in as a HyperSpy ComplexSignal2D object. Lazy reading of these
datasets is not supported.

84 changes: 31 additions & 53 deletions hyperspy/io_plugins/emd.py
@@ -40,9 +40,10 @@
from dateutil import tz
import pint

from hyperspy.exceptions import VisibleDeprecationWarning
from hyperspy.misc.elements import atomic_number2name
import hyperspy.misc.io.fei_stream_readers as stream_readers
from hyperspy.exceptions import VisibleDeprecationWarning
from hyperspy.io_plugins.hspy import get_signal_chunks


# Plugin characteristics
@@ -65,40 +66,6 @@
_logger = logging.getLogger(__name__)


def calculate_chunks(shape, dtype, chunk_size_mb=100):
"""Calculate chunks to get target chunk size.
The chunks are optimized for C-order reading speed.
Parameters
----------
shape: tuple of ints
The shape of the array
dtype: string or numpy dtype
The dtype of the array
chunk_size_mb: int
The maximum size of the resulting chunks in MB. The default is
100MB as recommended by the dask documentation.
"""

target = chunk_size_mb * 1e6
items = int(target // np.dtype(dtype).itemsize)
chunks = ()
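# dimsize[i] is the number of elements spanned by one index step along axis i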
dimsize = np.cumprod(shape[::-1])[::-1][1:]
for i, ds in enumerate(dimsize):
chunk = items // ds
if not chunk:
chunks += (1,)
elif chunk <= shape[i]:
chunks += (chunk, )
else:
chunks += (shape[i],)
# At least one signal
chunks += (shape[-1], )
return chunks
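
For reference, a quick worked example of this removed helper (the shape and dtype are illustrative; the result follows from the arithmetic above and comes to roughly 96 MB per chunk):

    >>> calculate_chunks((512, 512, 2048), 'float32')
    (23, 512, 2048)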


class EMD(object):

"""Class for storing electron microscopy datasets.
@@ -627,12 +594,15 @@ def _read_dataset(dataset):
"""Read dataset and use the h5py AsStrWrapper when the dataset is of
string type (h5py 3.0 and newer)
"""
chunks = dataset.chunks
if chunks is None:
chunks = 'auto'
if (h5py.check_string_dtype(dataset.dtype) and
hasattr(dataset, 'asstr')):
# h5py 3.0 and newer
# https://docs.h5py.org/en/3.0.0/strings.html
dataset = dataset.asstr()[:]
return dataset
return dataset, chunks

def _read_emd_version(self, group):
""" Return the group version if the group is an EMD group, otherwise
@@ -654,32 +624,27 @@ def _read_data_from_groups(self, group_path, dataset_name, stack_key=None,
if None in array_list:
raise IOError("Dataset can't be found.")

if self.lazy:
chunks = array_list[0].chunks
if chunks is None:
chunks = calculate_chunks(array_list[0].shape, array_list[0].dtype)

if len(array_list) > 1:
# Squeeze the data only when
if self.lazy:
data_list = [da.from_array(self._read_dataset(d),
chunks=chunks) for d in array_list]
data_list = [da.from_array(*self._read_dataset(d))
for d in array_list]
if transpose_required:
data_list = [da.transpose(d) for d in data_list]
data = da.stack(data_list)
data = da.squeeze(data)
else:
data_list = [np.asanyarray(self._read_dataset(d))
data_list = [np.asanyarray(self._read_dataset(d)[0])
for d in array_list]
if transpose_required:
data_list = [np.transpose(d) for d in data_list]
data = np.stack(data_list).squeeze()
else:
d = array_list[0]
if self.lazy:
data = da.from_array(self._read_dataset(array_list[0]),
chunks=chunks)
data = da.from_array(*self._read_dataset(d))
else:
data = np.asanyarray(self._read_dataset(array_list[0]))
data = np.asanyarray(self._read_dataset(d)[0])
if transpose_required:
data = data.transpose()

@@ -833,8 +798,8 @@ def write_file(self, file, signal, **kwargs):
signal : instance of hyperspy signal
The signal to save.
**kwargs : dict
Dictionary containing metadata which will be written as attribute
of the root group.
Keyword arguments are passed to the ``h5py.Group.create_dataset``
method.
"""
if isinstance(file, str):
@@ -856,10 +821,11 @@
# Write signals:
signal_group = emd_file.require_group('signals')
signal_group.attrs['emd_group_type'] = 1
self._write_signal_to_group(signal_group, signal)
self._write_signal_to_group(signal_group, signal, **kwargs)
emd_file.close()

def _write_signal_to_group(self, signal_group, signal):
def _write_signal_to_group(self, signal_group, signal, chunks=None,
**kwargs):
# Save data:
title = signal.metadata.General.title or '__unnamed__'
dataset = signal_group.require_group(title)
@@ -868,8 +834,20 @@ def _write_signal_to_group(self, signal_group, signal):
if np.issubdtype(data.dtype, np.dtype('U')):
# Saving numpy unicode type is not supported in h5py
data = data.astype(np.dtype('S'))
dataset.create_dataset('data', data=data, chunks=True,
maxshape=maxshape)
if chunks is None:
if isinstance(data, da.Array):
# For lazy dataset, by default, we use the current dask chunking
chunks = tuple([c[0] for c in data.chunks])
else:
signal_axes = signal.axes_manager.signal_indices_in_array
chunks = get_signal_chunks(data.shape, data.dtype, signal_axes)
# when chunks=True, we leave it to h5py `guess_chunk`
elif chunks is not True:
# Need to reverse since the data is transposed when saving
chunks = chunks[::-1]

dataset.create_dataset('data', data=data, maxshape=maxshape,
chunks=chunks, **kwargs)

array_indices = np.arange(0, len(data.shape))
dim_indices = (array_indices + 1)[::-1]
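
The ``tuple([c[0] for c in data.chunks])`` idiom above converts dask's per-axis tuples of block sizes into the single chunk shape that h5py expects; it assumes the dask chunking is uniform along each axis (all blocks equal, except possibly the last). A minimal sketch:

    >>> import dask.array as da
    >>> data = da.zeros((50, 100, 100), chunks=(50, 25, 25))
    >>> data.chunks
    ((50,), (25, 25, 25, 25), (25, 25, 25, 25))
    >>> tuple([c[0] for c in data.chunks])
    (50, 25, 25)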
19 changes: 11 additions & 8 deletions hyperspy/io_plugins/hspy.py
@@ -28,7 +28,7 @@
from traits.api import Undefined
from hyperspy.misc.utils import ensure_unicode, multiply, get_object_package_info
from hyperspy.axes import AxesManager
from collections import namedtuple


_logger = logging.getLogger(__name__)

@@ -502,7 +502,7 @@ def parse_structure(key, group, value, _type, **kwds):


def get_signal_chunks(shape, dtype, signal_axes=None):
"""Function that claculates chunks for the signal, preferably at least one
"""Function that calculates chunks for the signal, preferably at least one
chunk per signal space.
Parameters
@@ -549,11 +549,12 @@ def overwrite_dataset(group, data, key, signal_axes=None, chunks=None, **kwds):

def overwrite_dataset(group, data, key, signal_axes=None, chunks=None, **kwds):
if chunks is None:
if signal_axes is None:
# Use automatic h5py chunking
chunks = True
if isinstance(data, da.Array):
# For lazy dataset, by default, we use the current dask chunking
chunks = tuple([c[0] for c in data.chunks])
else:
# Optimise the chunking to contain at least one signal per chunk
# If signal_axes=None, use automatic h5py chunking, otherwise
# optimise the chunking to contain at least one signal per chunk
chunks = get_signal_chunks(data.shape, data.dtype, signal_axes)

if np.issubdtype(data.dtype, np.dtype('U')):
@@ -593,9 +594,11 @@ def overwrite_dataset(group, data, key, signal_axes=None, chunks=None, **kwds):
# just a reference to already created thing
pass
else:
_logger.info("Chunks used for saving: %s" % str(dset.chunks))
_logger.info(f"Chunks used for saving: {dset.chunks}")
if isinstance(data, da.Array):
da.store(data.rechunk(dset.chunks), dset)
if data.chunks != dset.chunks:
data = data.rechunk(dset.chunks)
da.store(data, dset)
elif data.flags.c_contiguous:
dset.write_direct(data)
else:
17 changes: 17 additions & 0 deletions hyperspy/signal.py
@@ -2745,6 +2745,23 @@ def save(self, filename=None, overwrite=None, extension=None,
i) the filename
ii) `Signal.tmp_parameters.extension`
iii) ``'hspy'`` (the default extension)
chunks : tuple or True or None (default)
HyperSpy, Nexus and EMD NCEM format only. Define chunks used when
saving. The chunk shape should follow the order of the array
(``s.data.shape``), not the shape of the ``axes_manager``.
If None and the signal is lazy, the dask array chunking is used.
If None and the signal is not lazy, the chunks are estimated automatically
to have at least one chunk per signal space.
If True, the chunking is determined by the h5py ``guess_chunk``
function.
save_original_metadata : bool, default : False
Nexus file only. Option to save hyperspy.original_metadata with
the signal. A loaded Nexus file may contain a large amount of
original metadata, which you may wish to omit on saving.
use_default : bool, default : False
Nexus file only. Define the default dataset in the file.
If set to True the signal or first signal in the list of signals
will be defined as the default (following Nexus v3 data rules).
"""
if filename is None:
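
Putting the three ``chunks`` modes together, a hedged sketch mirroring the test below (signal construction and file names are illustrative):

    >>> import dask.array as da
    >>> import hyperspy.api as hs
    >>> s = hs.signals.Signal2D(da.zeros((50, 100, 100))).as_lazy()
    >>> s.data = s.data.rechunk((50, 25, 25))
    >>> s.save('out.hspy')                        # chunks=None: reuse the dask chunking
    >>> s.save('out2.hspy', chunks=True)          # defer to h5py's guess_chunk
    >>> s.save('out3.hspy', chunks=(50, 20, 20))  # explicit chunk shape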
42 changes: 33 additions & 9 deletions hyperspy/tests/io/test_emd.py
@@ -16,22 +16,21 @@
# You should have received a copy of the GNU General Public License
# along with HyperSpy. If not, see <http://www.gnu.org/licenses/>.


# The EMD format is a hdf5 standard proposed at Lawrence Berkeley
# National Lab (see http://emdatasets.com/ for more information).
# NOT to be confused with the FEI EMD format which was developed later.


import gc
import os.path
import shutil
import tempfile
import dask.array as da
from datetime import datetime
from os import remove

from dateutil import tz
import gc
import h5py
import numpy as np
import os
import pytest
from dateutil import tz
import tempfile
import shutil

from hyperspy.io import load
from hyperspy.misc.test_utils import assert_deep_almost_equal
@@ -274,7 +273,32 @@ def test_save_and_read(self, lazy):
assert isinstance(signal, BaseSignal)

def teardown_method(self, method):
remove(os.path.join(my_path, 'emd_files', 'example_temp.emd'))
os.remove(os.path.join(my_path, 'emd_files', 'example_temp.emd'))


def test_chunking_saving_lazy():
s = Signal2D(da.zeros((50, 100, 100))).as_lazy()
s.data = s.data.rechunk([50, 25, 25])
with tempfile.TemporaryDirectory() as tmp:
filename = os.path.join(tmp, 'test_chunking_saving_lazy.emd')
filename2 = os.path.join(tmp, 'test_chunking_saving_lazy_chunks_True.emd')
filename3 = os.path.join(tmp, 'test_chunking_saving_lazy_chunks_specify.emd')
s.save(filename)
s1 = load(filename, lazy=True)
assert s.data.chunks == s1.data.chunks

# with chunks=True, use h5py chunking
s.save(filename2, chunks=True)
s2 = load(filename2, lazy=True)
assert tuple([c[0] for c in s2.data.chunks]) == (13, 25, 13)
s1.close_file()
s2.close_file()

# Specify chunks
chunks = (50, 20, 20)
s.save(filename3, chunks=chunks)
s3 = load(filename3, lazy=True)
assert tuple([c[0] for c in s3.data.chunks]) == chunks


def _generate_parameters():
