hyperspy · ericpre · Oct 20, 2021 · Aug 31, 2021 · Sep 1, 2021 · Sep 3, 2021
diff --git a/conda_environment.yml b/conda_environment.yml
@@ -28,5 +28,6 @@ dependencies:
 - toolz
 - tqdm
 - traits
+- zarr
 
 
diff --git a/doc/user_guide/big_data.rst b/doc/user_guide/big_data.rst
@@ -397,6 +397,15 @@ Other minor differences
   convenience, ``nansum``, ``nanmean`` and other ``nan*`` signal methods were
   added to mimic the workflow as closely as possible.
 
+.. _big_data.saving:
+
+Saving Big Data
+^^^^^^^^^^^^^^^^^
+
+The most efficient format supported by HyperSpy to write data is the :ref:`zspy format <zspy-format>`,
+mainly because it supports writing concurrently from multiple threads or processes.
+
+This also allows for smooth interaction with dask-distributed for efficient scaling.
 
 .. _lazy_details:
 

diff --git a/doc/user_guide/io.rst b/doc/user_guide/io.rst
@@ -251,6 +251,8 @@ HyperSpy. The "lazy" column specifies if lazy evaluation is supported.
     +-----------------------------------+--------+--------+--------+
     | hspy                              |    Yes |    Yes |    Yes |
     +-----------------------------------+--------+--------+--------+
+    | zspy                              |    Yes |    Yes |    Yes |
+    +-----------------------------------+--------+--------+--------+
     | Image: e.g. jpg, png, tif, ...    |    Yes |    Yes |    Yes |
     +-----------------------------------+--------+--------+--------+
     | TIFF                              |    Yes |    Yes |    Yes |
@@ -418,6 +420,65 @@ Extra saving arguments
     saving a file, be aware that it may not be possible to load it in some platforms.
 
 
+.. _zspy-format:
+
+ZSpy - HyperSpy's Zarr Specification
+------------------------------------
+
+Similarly to the :ref:`hspy format <hspy-format>`, the zspy format guarantees that no
+information will be lost in the writing process and supports saving data
+of arbitrary dimensions. It is based on the `Zarr project <https://zarr.readthedocs.io/en/stable/index.html>`_, which exists as a drop-in
+replacement for hdf5 with the intention to fix some of the speed and scaling
+issues with the hdf5 format, and is therefore suitable for saving :ref:`big data <big_data.saving>`.
+
+
+.. code-block:: python
+
+    >>> s = hs.signals.BaseSignal([0])
+    >>> s.save('test.zspy') # will save in nested directory
+    >>> hs.load('test.zspy') # loads the directory
+
+
+When saving to `zspy <https://zarr.readthedocs.io/en/stable/index.html>`_, all supported objects in the signal's
+:py:attr:`~.signal.BaseSignal.metadata` are stored. This includes lists, tuples and signals.
+Please note that in order to increase saving efficiency and speed, if possible,
+the inner-most structures are converted to numpy arrays when saved. This
+procedure homogenizes the types of the objects inside, most notably casting
+numbers as strings if any other strings are present.
+
+Extra saving arguments
+^^^^^^^^^^^^^^^^^^^^^^
+
+- ``compressor``: A `Numcodecs Codec <https://numcodecs.readthedocs.io/en/stable/index.html>`_.
+   A compressor can be passed to the save function to compress the data efficiently. The default
+   is to call a Blosc Compressor object.
+
+.. code-block:: python
+
+    >>> from numcodecs import Blosc
+    >>> compressor=Blosc(cname='zstd', clevel=1, shuffle=Blosc.SHUFFLE)
+    >>> s.save('test.zspy', compressor = compressor) # will save with Blosc compression
+
+.. note::
+
+    Lazy operations are often I/O bound: reading and writing the data creates a bottleneck in processes
+    due to the slow read/write speed of many hard disks. In these cases, compressing your data is often
+    beneficial to the speed of some operations. Compression speeds up the process as there is less to
+    read/write, with the trade-off of slightly more computational work on the CPU.
+
+- ``write_to_storage``: The write to storage option allows you to pass the path to a directory (or database)
+   and write directly to the storage container.  This gives you access to the `different storage methods
+   <https://zarr.readthedocs.io/en/stable/api/storage.html>`_
+   available through zarr, namely a SQL, MongoDB or LMDB database.  Additional packages may need
+   to be installed and configured to use these features.
+
+.. code-block:: python
+
+    >>>  filename = 'test.zspy/'
+    >>>  os.mkdir('test.zspy')
+    >>>  store = zarr.LMDBStore(path=filename)
+    >>>  signal.save(store.path, write_to_storage=True) # saved to Lmdb
+
 .. _netcdf-format:
 
 NetCDF

diff --git a/hyperspy/io.py b/hyperspy/io.py
@@ -26,6 +26,7 @@
 from natsort import natsorted
 from inspect import isgenerator
 from pathlib import Path
+from collections.abc import MutableMapping
 
 from hyperspy.drawing.marker import markers_metadata_dict_to_markers
 from hyperspy.exceptions import VisibleDeprecationWarning
@@ -327,22 +328,23 @@ def load(filenames=None,
             lazy = load_ui.lazy
         if filenames is None:
             raise ValueError("No file provided to reader")
-
     if isinstance(filenames, str):
         pattern = filenames
         if escape_square_brackets:
             filenames = _escape_square_brackets(filenames)
 
         filenames = natsorted([f for f in glob.glob(filenames)
-                               if os.path.isfile(f)])
+                               if os.path.isfile(f) or (os.path.isdir(f) and
+                                                        os.path.splitext(f)[1] == '.zspy')])
 
         if not filenames:
             raise ValueError(f'No filename matches the pattern "{pattern}"')
 
     elif isinstance(filenames, Path):
         # Just convert to list for now, pathlib.Path not
         # fully supported in io_plugins
-        filenames = [f for f in [filenames] if f.is_file()]
+        filenames = [f for f in [filenames]
+                     if f.is_file() or (f.is_dir() and ".zspy" in f.name)]
 
     elif isgenerator(filenames):
         filenames = list(filenames)
@@ -449,7 +451,8 @@ def load_single_file(filename, **kwds):
         Data loaded from the file.
 
     """
-    if not os.path.isfile(filename):
+    if not os.path.isfile(filename) and not (os.path.isdir(filename) or
+                                             os.path.splitext(filename)[1] == '.zspy'):
         raise FileNotFoundError(f"File: {filename} not found!")
 
     # File extension without "." separator
@@ -731,11 +734,14 @@ def save(filename, signal, overwrite=None, **kwds):
     None
 
     """
-    filename = Path(filename).resolve()
-    extension = filename.suffix
-    if extension == '':
-        extension = ".hspy"
-        filename = filename.with_suffix(extension)
+    if isinstance(filename, MutableMapping):
+        extension =".zspy"
+    else:
+        filename = Path(filename).resolve()
+        extension = filename.suffix
+        if extension == '':
+            extension = ".hspy"
+            filename = filename.with_suffix(extension)
 
     writer = None
     for plugin in io_plugins:
@@ -780,24 +786,35 @@ def save(filename, signal, overwrite=None, **kwds):
         )
 
     # Create the directory if it does not exist
-    ensure_directory(filename.parent)
-    is_file = filename.is_file()
-
-    if overwrite is None:
-        write = overwrite_method(filename)  # Ask what to do
-    elif overwrite is True or (overwrite is False and not is_file):
-        write = True  # Write the file
-    elif overwrite is False and is_file:
-        write = False  # Don't write the file
+    if not isinstance(filename, MutableMapping):
+        ensure_directory(filename.parent)
+        is_file = filename.is_file() or (filename.is_dir() and
+                                         os.path.splitext(filename)[1] == '.zspy')
+
+        if overwrite is None:
+            write = overwrite_method(filename)  # Ask what to do
+        elif overwrite is True or (overwrite is False and not is_file):
+            write = True  # Write the file
+        elif overwrite is False and is_file:
+            write = False  # Don't write the file
+        else:
+            raise ValueError("`overwrite` parameter can only be None, True or "
+                             "False.")
     else:
-        raise ValueError("`overwrite` parameter can only be None, True or "
-                         "False.")
+        write = True  # file does not exist (creating it)
     if write:
         # Pass as a string for now, pathlib.Path not
         # properly supported in io_plugins
-        writer.file_writer(str(filename), signal, **kwds)
-
-        _logger.info(f'{filename} was created')
-        signal.tmp_parameters.set_item('folder', filename.parent)
-        signal.tmp_parameters.set_item('filename', filename.stem)
-        signal.tmp_parameters.set_item('extension', extension)
+        if not isinstance(filename, MutableMapping):
+            writer.file_writer(str(filename), signal, **kwds)
+            _logger.info(f'{filename} was created')
+            signal.tmp_parameters.set_item('folder', filename.parent)
+            signal.tmp_parameters.set_item('filename', filename.stem)
+            signal.tmp_parameters.set_item('extension', extension)
+        else:
+            writer.file_writer(filename, signal, **kwds)
+            if hasattr(filename, "path"):
+                file = Path(filename.path).resolve()
+                signal.tmp_parameters.set_item('folder', file.parent)
+                signal.tmp_parameters.set_item('filename', file.stem)
+                signal.tmp_parameters.set_item('extension', extension)
diff --git a/hyperspy/io_plugins/__init__.py b/hyperspy/io_plugins/__init__.py
@@ -40,6 +40,7 @@
     semper_unf,
     sur,
     tiff,
+    zspy,
 )
 
 io_plugins = [
@@ -63,6 +64,7 @@
     semper_unf,
     sur,
     tiff,
+    zspy,
 ]