## A SANDBOX TO UNDERSTAND AND PRACTICE THE ZARR FILE SYSTEMS
https://zarr.readthedocs.io/en/stable/tutorial.html#reading-and-writing-data

In [1]:
import os
import zarr
import numpy as np

In [8]:
# Working directories for this sandbox. The defaults are the original
# author's local folders; set the environment variables below to run the
# notebook on another machine without editing this cell.
GPU = os.environ.get(
    'ZARR_SANDBOX_GPU_DIR',
    r'C:\Users\Viz2\python_anaconda3\UCB ABC\demo_napari\GPU',
)
# Directory the notebook changes into (os.chdir) before the on-disk examples,
# so relative paths such as 'data/example.zarr' resolve inside it.
ZARR = os.environ.get(
    'ZARR_SANDBOX_DIR',
    r'C:\Users\Viz2\python_anaconda3\UCB ABC\git_clones\LLS_Pipeline\Exercises\Zarr',
)

In [9]:
# Create a zero-filled 10000 x 10000 int32 Zarr array, split into
# 1000 x 1000 chunks. No store is passed, so nothing is written to a
# named location on disk by this call.
z = zarr.zeros(
    shape=(10000, 10000),
    chunks=(1000, 1000),
    dtype='i4',
)
z

<zarr.core.Array (10000, 10000) int32>

In [10]:
# Zarr arrays are read and written with NumPy-style slicing.
# Assigning a scalar to the full slice fills the entire array with it.
z[:] = 42

In [11]:
# NumPy arrays can be assigned into slices of a Zarr array.
# Overwrite the first row and the first column with 0..9999.
ramp = np.arange(10000)
z[0, :] = ramp
z[:, 0] = ramp

In [12]:
# Read the whole array back with a full slice; the result shown below is a
# NumPy array with the ramp in row 0 / column 0 and 42 everywhere else.
z[:]

array([[   0,    1,    2, ..., 9997, 9998, 9999],
       [   1,   42,   42, ...,   42,   42,   42],
       [   2,   42,   42, ...,   42,   42,   42],
       ...,
       [9997,   42,   42, ...,   42,   42,   42],
       [9998,   42,   42, ...,   42,   42,   42],
       [9999,   42,   42, ...,   42,   42,   42]])

PERSISTENT ARRAYS<br>
Zarr arrays stored on disk are conventionally saved with a .zarr extension.
zarr.convenience.open() creates or opens an array that is persisted on disk.
Data written to such an array is flushed to disk automatically, so there is
no need to close the array.

In [13]:
# Switch the working directory so the relative store path below resolves
# inside the exercise folder.
os.chdir(ZARR)

# Open (mode='w') a persistent int32 array backed by 'data/example.zarr'.
# z1 stays usable afterwards: later writes through z1 go to that store.
z1 = zarr.open(
    'data/example.zarr',
    mode='w',
    shape=(10000, 10000),
    chunks=(1000, 1000),
    dtype='i4',
)

In [14]:
# Fill the persistent array with 42, then overwrite its first row and
# first column with 0..9999.
z1[:] = 42
ramp = np.arange(10000)
z1[0, :] = ramp
z1[:, 0] = ramp

In [15]:
# Re-open the same store read-only and check that what it holds matches
# the contents of the z1 handle created earlier.
z2 = zarr.open('data/example.zarr', mode='r')

# Element-wise comparison of the two fully loaded arrays, reduced to a
# single truth value.
(z1[:] == z2[:]).all()

True

If you are just looking for a fast and convenient way to save NumPy
arrays to disk then load back into memory later, the functions 
zarr.convenience.save() and zarr.convenience.load() may be useful.

In [16]:
# Round-trip a small NumPy array through zarr.save() / zarr.load().
a = np.arange(10)
# NOTE(review): this is the same path z1 and z2 were opened on earlier, so
# the 10000 x 10000 example array is presumably replaced here — confirm, or
# save to a separate path if z1/z2 are still needed.
zarr.save('data/example.zarr', a)
zarr.load('data/example.zarr')

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [17]:
# RESIZING AND APPENDING
# Zarr arrays can be resized and appended to after creation.
z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000))
z[:] = 42
# Double the first dimension; the second dimension is left unchanged.
z.resize(20000, 10000)
z.shape

# When resizing, none of the underlying data is rearranged or deleted.
# Only when an array is shrunk are the chunks that fall outside the new
# shape removed.

(20000, 10000)

In [18]:
# Grow an array with its append() method, first along the rows and then
# along the columns.
a = np.arange(10000000, dtype='i4').reshape(10000, 1000)
z = zarr.array(a, chunks=(1000, 100))
print('orig z.shape', z.shape)

# No axis given: the block is appended along the first dimension.
z.append(a)
print('inter z.shape', z.shape)

# Stack `a` on top of itself so the block has as many rows as z now has,
# then append it along the second dimension.
stacked = np.concatenate((a, a), axis=0)
z.append(stacked, axis=1)
print('new z.shape', z.shape)

orig z.shape (10000, 1000)
inter z.shape (20000, 1000)
new z.shape (20000, 2000)


In [19]:
# COMPRESSOR
# A number of different compressors can be used with Zarr.
# Choose one by passing it via the `compressor` keyword argument when
# creating an array.

In [20]:
# Compress with Blosc (which is also Zarr's default compressor),
# configured to use the Zstandard algorithm at compression level 3 with
# the bit-shuffle filter applied.
from numcodecs import Blosc

data = np.arange(100000000, dtype='i4').reshape(10000, 10000)
compressor = Blosc(
    cname='zstd',
    clevel=3,
    shuffle=Blosc.BITSHUFFLE,
)
z = zarr.array(data, chunks=(1000, 1000), compressor=compressor)
z.compressor

Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, blocksize=0)

In [21]:
# The `info` property reports diagnostics for an array: type, dtype, shape,
# chunk shape, compressor, store type and byte counts (see output below).
z.info

0,1
Type,zarr.core.Array
Data type,int32
Shape,"(10000, 10000)"
Chunk shape,"(1000, 1000)"
Order,C
Read-only,False
Compressor,"Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, blocksize=0)"
Store type,builtins.dict
No. bytes,400000000 (381.5M)
No. bytes stored,3379344 (3.2M)


Blosc is in fact a “meta-compressor”, which means that it can use
a number of different compression algorithms internally to compress
the data. Blosc also provides highly optimized implementations of 
byte- and bit-shuffle filters, which can improve compression ratios
for some data. A list of the internal compression libraries available
within Blosc can be obtained via:

In [22]:
# List the names of the compression libraries available inside Blosc.
from numcodecs import blosc
blosc.list_compressors()

['blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd']

In [23]:
# Same data, this time compressed directly with Zstandard at level 1
# instead of going through Blosc.
from numcodecs import Zstd

z = zarr.array(
    np.arange(100000000, dtype='i4').reshape(10000, 10000),
    chunks=(1000, 1000),
    compressor=Zstd(level=1),
)
z.compressor

Zstd(level=1)

In [24]:
# LZMA with a custom filter pipeline that includes LZMA's built-in
# delta filter.
import lzma

from numcodecs import LZMA

# Delta filter with a distance of 4 bytes, followed by the LZMA2
# compressor at preset 1.
lzma_filters = [dict(id=lzma.FILTER_DELTA, dist=4),
                dict(id=lzma.FILTER_LZMA2, preset=1)]
compressor = LZMA(filters=lzma_filters)
z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000),
               chunks=(1000, 1000), compressor=compressor)
# Keep this as the final expression so the cell displays the compressor
# actually attached to z. (A pasted copy of the expected output used to
# follow here; it ran as a second LZMA(...) call and was displayed instead.)
z.compressor

LZMA(format=1, check=-1, preset=None, filters=[{'dist': 4, 'id': 3}, {'id': 33, 'preset': 1}])

In [25]:
# The default compressor can be changed by setting the value of the
# zarr.storage.default_compressor variable.
import zarr.storage
from numcodecs import Zstd, Blosc

# switch to using Zstandard
zarr.storage.default_compressor = Zstd(level=1)
try:
    # No compressor argument: the array picks up the current default.
    z = zarr.zeros(100000000, chunks=1000000)
finally:
    # switch back to Blosc defaults, even if array creation failed, so the
    # changed global setting does not leak into later cells
    zarr.storage.default_compressor = Blosc()

# Final expression of the cell, so the compressor that z was created with
# is actually displayed.
z.compressor

In [26]:
# Compression is switched off by passing compressor=None when the array
# is created.
z = zarr.zeros(
    100000000,
    chunks=1000000,
    compressor=None,
)
z.compressor is None

True

In some cases, compression can be improved by transforming the data in some way. For example, if nearby values tend to be correlated, then shuffling the bytes within each numerical value or storing the difference between adjacent values may increase compression ratio. Some compressors provide built-in filters that apply transformations to the data prior to compression. For example, the Blosc compressor has built-in implementations of byte- and bit-shuffle filters, and the LZMA compressor has a built-in implementation of a delta filter. However, to provide additional flexibility for implementing and using filters in combination with different compressors, Zarr also provides a mechanism for configuring filters outside of the primary compressor.

In [27]:
# Combine a Delta filter, configured outside the compressor, with a
# Blosc compressor (Zstandard, level 1, byte shuffle).
from numcodecs import Blosc, Delta

data = np.arange(100000000, dtype='i4').reshape(10000, 10000)
compressor = Blosc(cname='zstd', clevel=1, shuffle=Blosc.SHUFFLE)
filters = [Delta(dtype='i4')]
z = zarr.array(
    data,
    chunks=(1000, 1000),
    filters=filters,
    compressor=compressor,
)
z.info

0,1
Type,zarr.core.Array
Data type,int32
Shape,"(10000, 10000)"
Chunk shape,"(1000, 1000)"
Order,C
Read-only,False
Filter [0],Delta(dtype='<i4')
Compressor,"Blosc(cname='zstd', clevel=1, shuffle=SHUFFLE, blocksize=0)"
Store type,builtins.dict
No. bytes,400000000 (381.5M)


Zarr supports hierarchical organization of arrays via groups. As with arrays, groups can be stored in memory, on disk, or via other storage systems that support a similar interface.

In [28]:
# Create a root group (no store argument is passed) and display it.
root = zarr.group()
root

<zarr.hierarchy.Group '/'>

In [29]:
# Groups expose an API similar to the Group class from h5py.
# create_group() nests a new group under the group it is called on,
# giving '/foo' and then '/foo/bar'.
foo = root.create_group('foo')
bar = foo.create_group('bar')

In [30]:
# Groups can also contain arrays: create a zero-filled int32 array named
# 'baz' inside the 'bar' group.
# NOTE(review): this rebinds z1, which earlier referred to the on-disk
# 'data/example.zarr' array.
z1 = bar.zeros('baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4')
z1

<zarr.core.Array '/foo/bar/baz' (10000, 10000) int32>

In [31]:
# Arrays are known as "datasets" in HDF5 terminology. For compatibility
# with h5py, Zarr groups also implement the create_dataset() and
# require_dataset() methods.
z = bar.create_dataset(
    'quux',
    shape=(10000, 10000),
    chunks=(1000, 1000),
    dtype='i4',
)
z

<zarr.core.Array '/foo/bar/quux' (10000, 10000) int32>

In [32]:
# Members of a group are accessed by indexing the group with the
# member's name.
root['foo']

<zarr.hierarchy.Group '/foo'>

In [33]:
# A '/'-separated path reaches through several levels of the hierarchy
# in a single lookup, for groups and arrays alike.
for member_path in ('foo/bar', 'foo/bar/baz'):
    print(root[member_path])

<zarr.hierarchy.Group '/foo/bar'>
<zarr.core.Array '/foo/bar/baz' (10000, 10000) int32>


In [34]:
# The zarr.hierarchy.Group.tree() method builds a tree representation
# of the hierarchy; print it to see every group and array under root.
hierarchy_tree = root.tree()
print(hierarchy_tree)

/
 └── foo
     └── bar
         ├── baz (10000, 10000) int32
         └── quux (10000, 10000) int32


In [35]:
# zarr.convenience.open() also provides a convenient way to create or
# re-open a group stored in a directory on the file system, with
# sub-groups stored in sub-directories.
root = zarr.open('data/group.zarr', mode='w')
print(root)

# The path 'foo/bar/baz' places the new array two groups below the root.
z = root.zeros(
    'foo/bar/baz',
    shape=(10000, 10000),
    chunks=(1000, 1000),
    dtype='i4',
)
print(z)

<zarr.hierarchy.Group '/'>
<zarr.core.Array '/foo/bar/baz' (10000, 10000) int32>


Array and group diagnostics <br>
Diagnostic information about arrays and groups is available via the ```info``` property

In [36]:
# Build a small hierarchy: a root group, one sub-group and two arrays.
root = zarr.group()
foo = root.create_group('foo')
bar = foo.zeros('bar', shape=1000000, chunks=100000, dtype='i8')
baz = foo.zeros('baz', shape=(1000, 1000), chunks=(100, 100), dtype='f4')

# Only baz is written to; bar is left exactly as created.
baz[:] = 4.2

# Print the diagnostics of every node, groups first and then arrays.
for node in (root, foo, bar, baz):
    print(node.info)

Name        : /
Type        : zarr.hierarchy.Group
Read-only   : False
Store type  : zarr.storage.MemoryStore
No. members : 1
No. arrays  : 0
No. groups  : 1
Groups      : foo

Name        : /foo
Type        : zarr.hierarchy.Group
Read-only   : False
Store type  : zarr.storage.MemoryStore
No. members : 2
No. arrays  : 2
No. groups  : 0
Arrays      : bar, baz

Name               : /foo/bar
Type               : zarr.core.Array
Data type          : int64
Shape              : (1000000,)
Chunk shape        : (100000,)
Order              : C
Read-only          : False
Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store type         : zarr.storage.MemoryStore
No. bytes          : 8000000 (7.6M)
No. bytes stored   : 320
Storage ratio      : 25000.0
Chunks initialized : 0/10

Name               : /foo/baz
Type               : zarr.core.Array
Data type          : float32
Shape              : (1000, 1000)
Chunk shape        : (100, 100)
Order              : C
Rea

### User attributes
All of the remaining documentation, starting with user attributes, can be found at the URL at the top of this notebook.<br>