Lazy save improvement #2797

Merged: 18 commits (Oct 30, 2021)
2 changes: 2 additions & 0 deletions doc/conf.py
@@ -246,6 +246,7 @@

# Add the hyperspy website to the intersphinx domains
intersphinx_mapping = {'python': ('https://docs.python.org/3', None),
'h5py': ('https://docs.h5py.org/en/stable', None),
'hyperspyweb': ('https://hyperspy.org/', None),
'matplotlib': ('https://matplotlib.org', None),
'numpy': ('https://docs.scipy.org/doc/numpy', None),
@@ -254,6 +255,7 @@
'astroML': ('https://www.astroml.org/', None),
'sklearn': ('https://scikit-learn.org/stable', None),
'skimage': ('https://scikit-image.org/docs/stable', None),
'zarr': ('https://zarr.readthedocs.io/en/stable', None),
}

graphviz_output_format = "svg"
113 changes: 73 additions & 40 deletions doc/user_guide/io.rst
@@ -405,21 +405,31 @@ Extra saving arguments
- ``compression``: One of ``None``, ``'gzip'``, ``'szip'``, ``'lzf'`` (default is ``'gzip'``).
``'szip'`` may be unavailable as it depends on the HDF5 installation including it.

.. note::

    HyperSpy uses h5py for reading and writing HDF5 files and, therefore, it
    supports all `compression filters supported by h5py <https://docs.h5py.org/en/stable/high/dataset.html#dataset-compression>`_.
    The default is ``'gzip'``. It is possible to enable other compression filters
    such as ``blosc`` by installing e.g. `hdf5plugin <https://github.com/silx-kit/hdf5plugin>`_.
    However, be aware that loading those files will require installing the package
    providing the compression filter. If it is not available, an error will be raised.

    Compression can significantly increase the saving speed. If file size is not
    an issue, it can be disabled by setting ``compression=None``. Notice that only
    ``compression=None`` and ``compression='gzip'`` are available on all platforms,
    see the `h5py documentation <https://docs.h5py.org/en/stable/faq.html#what-compression-processing-filters-are-supported>`_
    for more details. Therefore, if you choose any other compression filter for
    saving a file, be aware that it may not be possible to load it on some platforms.

Review comment (Contributor), on "Compression can significantly increase the saving speed":

    Is this sentence correct? Or at least ambiguous? I'm guessing it should say
    that using compression can cause file saving and loading to be much slower.

Reply (Member, author):

    Yes, this is correct in many cases when the IO time is balanced against the
    CPU time: most of the time, CPUs are fast enough and compressors are
    efficient enough.

- ``chunks``: tuple of integers or None. Defines the chunking used for saving
  the dataset. If None, chunks are calculated for the signal, preferably with at
  least one chunk per signal space.
- ``close_file``: if ``False``, doesn't close the file after writing. The file
  should not be closed if the data need to be accessed lazily after saving.
  Default is ``True``.
- ``write_dataset``: if ``False``, doesn't write the dataset when writing the file.
  This can be useful to overwrite signal attributes only (for example ``axes_manager``)
  without having to write the whole dataset, which can take time. Default is ``True``.
  See the sketch after this list.
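
As an illustration only, a minimal sketch combining these arguments when saving a
signal ``s`` (the file name, chunk shape and values are hypothetical):

.. code-block:: python

    >>> s.save('test.hspy', compression=None)  # fastest write, larger file
    >>> s.save('test.hspy', overwrite=True, chunks=(16, 16, 256))  # explicit chunking
    >>> # update attributes only (e.g. after changing the axes calibration),
    >>> # without rewriting the dataset
    >>> s.save('test.hspy', overwrite=True, write_dataset=False)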


.. _zspy-format:
@@ -448,38 +458,61 @@ the inner-most structures are converted to numpy arrays when saved. This
procedure homogenizes any types of the objects inside, most notably casting
numbers as strings if any other strings are present:

By default, a :py:class:`zarr.storage.NestedDirectoryStore` is used, but other
zarr stores can be used by passing a `zarr store <https://zarr.readthedocs.io/en/stable/api/storage.html>`_
instead of a filename to the :py:meth:`~.signal.BaseSignal.save` or the
:py:func:`~.io.load` function. If a zspy file has been saved with a different
store, it will need to be loaded by passing a store of the same type:

.. code-block:: python

>>> import zarr
>>> filename = 'test.zspy'
>>> store = zarr.LMDBStore(filename)
>>> signal.save(store) # saved to LMDB

To load this file again

.. code-block:: python

>>> import zarr
>>> filename = 'test.zspy'
>>> store = zarr.LMDBStore(filename)
>>> s = hs.load(store) # load from LMDB
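
In principle, other store types can be passed in the same way; for example, a
sketch using zarr's ``ZipStore`` (the file name is hypothetical, and whether a
given store type works may depend on the zarr version):

.. code-block:: python

    >>> import zarr
    >>> store = zarr.ZipStore('test.zspy')  # hypothetical alternative store
    >>> signal.save(store)  # the store is flushed and closed by default (close_file=True)
    >>> s = hs.load(zarr.ZipStore('test.zspy'))  # load with a store of the same type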

Extra saving arguments
^^^^^^^^^^^^^^^^^^^^^^

- ``compressor``: a `Numcodecs codec <https://numcodecs.readthedocs.io/en/stable/index.html>`_
  object can be passed to the save function to compress the data efficiently. The default
  is to use a Blosc compressor.

  .. code-block:: python

      >>> from numcodecs import Blosc
      >>> compressor = Blosc(cname='zstd', clevel=1, shuffle=Blosc.SHUFFLE)  # Used by default
      >>> s.save('test.zspy', compressor=compressor)  # will save with Blosc compression

.. note::

    Lazy operations are often I/O bound: reading and writing the data creates a
    bottleneck due to the slow read/write speed of many hard disks. In these
    cases, compressing your data is often beneficial to the speed of some
    operations. Compression speeds up the process as there is less to read/write,
    with the trade-off of slightly more computational work on the CPU.
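
For instance, a sketch of a lazy workflow (file names are hypothetical) where the
default compression reduces the amount of data read back from disk:

.. code-block:: python

    >>> s = hs.load('big_dataset.hspy', lazy=True)  # hypothetical large file
    >>> s.save('big_dataset.zspy')  # written with the default Blosc compressor
    >>> s2 = hs.load('big_dataset.zspy', lazy=True)
    >>> s2.sum().compute()  # reads compressed chunks, so less I/O than without compression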

- ``write_to_storage``: The write to storage option allows you to pass the path to a
  directory (or database) and write directly to the storage container. This gives you
  access to the `different storage methods <https://zarr.readthedocs.io/en/stable/api/storage.html>`_
  available through zarr, namely SQL, MongoDB or LMDB databases. Additional packages
  may need to be installed to use these features.

  .. code-block:: python

      >>> filename = 'test.zspy/'
      >>> os.mkdir('test.zspy')
      >>> store = zarr.LMDBStore(path=filename)
      >>> signal.save(store.path, write_to_storage=True) # saved to LMDB

- ``chunks``: tuple of integers or None. Defines the chunking used for saving
  the dataset. If None, chunks are calculated for the signal, preferably with at
  least one chunk per signal space.
- ``close_file``: only relevant for some zarr stores (``ZipStore``, ``DBMStore``)
  requiring the store to flush data to disk. If ``False``, doesn't close the file
  after writing. The file should not be closed if the data need to be accessed
  lazily after saving. Default is ``True``.
- ``write_dataset``: if ``False``, doesn't write the dataset when writing the file.
  This can be useful to overwrite signal attributes only (for example ``axes_manager``)
  without having to write the whole dataset, which can take time. Default is ``True``
  (see the example below).
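
For example, a sketch mirroring the HDF5 case above (the attribute change is
hypothetical):

.. code-block:: python

    >>> s.axes_manager[0].name = 'x'  # hypothetical metadata change
    >>> # update the signal attributes in an existing file without rewriting the data
    >>> s.save('test.zspy', overwrite=True, write_dataset=False)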

.. _netcdf-format:

@@ -666,7 +699,7 @@ Extra saving arguments
scalebar. Useful to set formatting, location, etc. of the scalebar. See the
`matplotlib-scalebar <https://pypi.org/project/matplotlib-scalebar/>`_
documentation for more information.
- ``output_size`` : (int, tuple of length 2 or None, optional): the output size
of the image in pixels:

* if ``int``, defines the width of the image, the height is
@@ -1384,7 +1417,7 @@ Nexus metadata and data are stored in Hierarchical Data Format Files (HDF5) with
a .nxs extension, although standard HDF5 extensions are sometimes used.
Files must use the ``.nxs`` file extension in order to use this io plugin.
Using the ``.nxs`` extension will default to the Nexus loader. If your file has
an HDF5 extension, you can also explicitly set the Nexus file reader:

.. code-block:: python

@@ -1827,7 +1860,7 @@ Extra loading arguments
acquired last frame, which typically occurs when the acquisition was
interrupted. When loading incomplete data (``only_valid_data=False``),
the missing data are filled with zeros. If ``sum_frames=True``, this argument
will be ignored to enforce a consistent sum over the mapped area.
(default True).


19 changes: 15 additions & 4 deletions hyperspy/_signals/lazy.py
@@ -167,13 +167,21 @@ def rechunk(self,
**kwargs)
)


def close_file(self):
"""Closes the associated data file if any.

Currently it only supports closing the file associated with a dask
array created from an h5py DataSet (default HyperSpy hdf5 reader).

"""
try:
self._get_file_handle().close()
except AttributeError:
_logger.warning("Failed to close lazy signal file")

def _get_file_handle(self, warn=True):
"""Return file handle when possible; currently only hdf5 file are
supported.
"""
arrkey = None
for key in self.data.dask.keys():
@@ -182,9 +190,12 @@ def close_file(self):
break
if arrkey:
try:
self.data.dask[arrkey].file.close()
except AttributeError:
_logger.exception("Failed to close lazy Signal file")
return self.data.dask[arrkey].file
except (AttributeError, ValueError):
if warn:
_logger.warning("Failed to retrieve file handle, either "
"the file is already closed or it is not "
"an hdf5 file.")

def _get_dask_chunks(self, axis=None, dtype=None):
"""Returns dask chunks.
7 changes: 4 additions & 3 deletions hyperspy/axes.py
@@ -289,9 +289,10 @@ def __init__(self,

self.events = Events()
if '_type' in kwargs:
if kwargs.get('_type') != self.__class__.__name__:
raise ValueError('The passed `_type` of axis is inconsistent '
'with the given attributes')
_type = kwargs.get('_type')
if _type != self.__class__.__name__:
raise ValueError(f'The passed `_type` ({_type}) of axis is '
'inconsistent with the given attributes.')
_name = self.__class__.__name__
self.events.index_changed = Event("""
Event that triggers when the index of the `{}` changes
47 changes: 36 additions & 11 deletions hyperspy/io.py
@@ -26,7 +26,7 @@
from natsort import natsorted
from inspect import isgenerator
from pathlib import Path
from collections import MutableMapping
from collections.abc import MutableMapping

from hyperspy.drawing.marker import markers_metadata_dict_to_markers
from hyperspy.exceptions import VisibleDeprecationWarning
@@ -125,6 +125,17 @@ def _escape_square_brackets(text):
return pattern.sub(lambda m: rep[re.escape(m.group(0))], text)


def _parse_path(arg):
"""Convenience function to get the path from zarr store or string."""
# In case of zarr store, get the path
if isinstance(arg, MutableMapping):
fname = arg.path
else:
fname = arg

return fname


def load(filenames=None,
signal_type=None,
stack=False,
@@ -354,7 +365,7 @@
elif isgenerator(filenames):
filenames = list(filenames)

elif not isinstance(filenames, (list, tuple)):
elif not isinstance(filenames, (list, tuple, MutableMapping)):
raise ValueError(
'The filenames parameter must be a list, tuple, '
f'string or None, not {type(filenames)}'
@@ -363,9 +374,12 @@
if not filenames:
raise ValueError('No file(s) provided to reader.')

# pathlib.Path not fully supported in io_plugins,
# so convert to str here to maintain compatibility
filenames = [str(f) if isinstance(f, Path) else f for f in filenames]
if isinstance(filenames, MutableMapping):
filenames = [filenames]
else:
# pathlib.Path not fully supported in io_plugins,
# so convert to str here to maintain compatibility
filenames = [str(f) if isinstance(f, Path) else f for f in filenames]

if len(filenames) > 1:
_logger.info('Loading individual files')
@@ -427,7 +441,8 @@ def load(filenames=None,
objects.append(signal)
else:
# No stack, so simply we load all signals in all files separately
objects = [load_single_file(filename, lazy=lazy, **kwds) for filename in filenames]
objects = [load_single_file(filename, lazy=lazy, **kwds)
for filename in filenames]

if len(objects) == 1:
objects = objects[0]
@@ -456,12 +471,16 @@ def load_single_file(filename, **kwds):
Data loaded from the file.

"""
if not os.path.isfile(filename) and not (os.path.isdir(filename) or
os.path.splitext(filename)[1] == '.zspy'):
raise FileNotFoundError(f"File: {filename} not found!")
# in case filename is a zarr store, we want the path and not the store
path = _parse_path(filename)

if (not os.path.isfile(path) and
not (os.path.isdir(path) and os.path.splitext(path)[1] == '.zspy')
):
raise FileNotFoundError(f"File: {path} not found!")

# File extension without "." separator
file_ext = os.path.splitext(filename)[1][1:]
file_ext = os.path.splitext(path)[1][1:]
reader = kwds.pop("reader", None)

if reader is None:
@@ -511,11 +530,17 @@ def load_with_reader(
if signal_type is not None:
signal_dict['metadata']["Signal"]['signal_type'] = signal_type
signal = dict2signal(signal_dict, lazy=lazy)
folder, filename = os.path.split(os.path.abspath(filename))
path = _parse_path(filename)
folder, filename = os.path.split(os.path.abspath(path))
filename, extension = os.path.splitext(filename)
signal.tmp_parameters.folder = folder
signal.tmp_parameters.filename = filename
signal.tmp_parameters.extension = extension.replace('.', '')
# original_folder and original_filename are used to keep track of
# where the file which has been opened lazily is located
signal.tmp_parameters.original_folder = folder
signal.tmp_parameters.original_filename = filename
signal.tmp_parameters.original_extension = extension.replace('.', '')
# test if binned attribute is still in metadata
if signal.metadata.has_item('Signal.binned'):
for axis in signal.axes_manager.signal_axes: