### A SANDBOX TO UNDERSTAND AND PRACTICE USING ZARR ARRAY STORAGE
Reference: [Zarr tutorial: reading and writing data](https://zarr.readthedocs.io/en/stable/tutorial.html#reading-and-writing-data)

In [7]:
import os
import zarr
import numpy as np

In [13]:
# Working directories used by the demos.
# The defaults are the original author's local paths. To run the notebook on a
# different machine, set ZARR_SANDBOX_GPU_DIR / ZARR_SANDBOX_ZARR_DIR in the
# environment instead of editing this cell.
GPU = os.environ.get(
    "ZARR_SANDBOX_GPU_DIR",
    "/Users/irahorecka/Desktop/Harddrive_Desktop/UCB_ABC/demo_napari/GPU",
)
ZARR = os.environ.get(
    "ZARR_SANDBOX_ZARR_DIR",
    "/Users/irahorecka/Desktop/Harddrive_Desktop/UCB_ABC/demo_zarr",
)

In [5]:
# Allocate an in-memory Zarr array: 10000 x 10000 int32 zeros,
# split into 1000 x 1000 chunks.
z = zarr.zeros(
    shape=(10000, 10000),
    chunks=(1000, 1000),
    dtype='i4',
)
z  # trailing expression, so the notebook shows the array's repr

<zarr.core.Array (10000, 10000) int32>

In [6]:
# Reading and writing: assigning a scalar to a slice that spans both axes
# sets every element of the array to that value.
z[:, :] = 42

In [8]:
# NumPy arrays can be assigned into slices of a Zarr array:
# write the ramp 0..9999 into the first row, then into the first column.
ramp = np.arange(10000)
z[0, :] = ramp
z[:, 0] = ramp

In [11]:
# Slice across both axes to read the whole array back as a NumPy array.
z[:, :]

array([[   0,    1,    2, ..., 9997, 9998, 9999],
       [   1,   42,   42, ...,   42,   42,   42],
       [   2,   42,   42, ...,   42,   42,   42],
       ...,
       [9997,   42,   42, ...,   42,   42,   42],
       [9998,   42,   42, ...,   42,   42,   42],
       [9999,   42,   42, ...,   42,   42,   42]], dtype=int32)

In [15]:
"""
PERSISTENT ARRAYS
Zarr stores on disk are conventionally saved with a .zarr extension.
zarr.convenience.open() is another way to get a persistent Zarr array
to work with. Data written to it is flushed to disk automatically,
so there is no need to close the array.
"""
# Relative store paths used from here on resolve against the ZARR directory.
os.chdir(ZARR)
z1 = zarr.open(
    'data/example.zarr',
    mode='w',
    shape=(10000, 10000),
    chunks=(1000, 1000),
    dtype='i4',
)
# z1 appears to be a live handle: values assigned through it should end up
# in the store on disk (a later cell re-opens the store to check this).

In [17]:
# Same fill pattern as the in-memory example, written through the z1 handle:
# 42 everywhere, then the ramp 0..9999 along the first row and first column.
z1[:, :] = 42
ramp = np.arange(10000)
z1[0, :] = ramp
z1[:, 0] = ramp

In [18]:
# Re-open the store from disk in read-only mode and confirm that its contents
# match, element for element, what is visible through the z1 handle.

z2 = zarr.open('data/example.zarr', mode='r')
(z1[:, :] == z2[:, :]).all()

True

In [19]:
"""
If you are just looking for a fast and convenient way to save NumPy
arrays to disk then load back into memory later, the functions 
zarr.convenience.save() and zarr.convenience.load() may be useful.
"""
# Round-trip a small NumPy array through zarr.save() / zarr.load().
# A dedicated path is used on purpose: saving to 'data/example.zarr' would
# replace the 10000 x 10000 array that z1 and z2 still point at, leaving those
# handles out of sync with what is on disk (re-running the comparison cell
# would then fail).
a = np.arange(10)
zarr.save('data/example_saved.zarr', a)
zarr.load('data/example_saved.zarr')

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [21]:
# RESIZING AND APPENDING
# Zarr arrays can be resized and appended to easily.
z = zarr.zeros((10000, 10000), chunks=(1000, 1000))
z[:, :] = 42
z.resize(20000, 10000)  # first axis grows from 10000 to 20000
z.shape

# Resizing does not rearrange or delete any of the underlying data,
# except when shrinking: any chunk that falls outside the new shape is removed.

(20000, 10000)

In [24]:
# Grow an array with the append() method of the Zarr array itself.
a = np.arange(10000000, dtype='i4').reshape((10000, 1000))
z = zarr.array(a, chunks=(1000, 100))
print('orig z.shape', z.shape)

# Append along axis 0: a second copy of `a` goes underneath, doubling the rows.
z.append(a, axis=0)
print('inter z.shape', z.shape)

# Append along axis 1: the new block has to span all 20000 rows, so two copies
# of `a` are stacked vertically first; the columns then double.
z.append(np.concatenate((a, a), axis=0), axis=1)
print('new z.shape', z.shape)

orig z.shape (10000, 1000)
inter z.shape (20000, 1000)
new z.shape (20000, 2000)


In [25]:
# COMPRESSORS
# A number of different compressors can be used with Zarr.
# Select one by passing it via the `compressor` keyword argument.

In [26]:
# Configure the compressor explicitly: Blosc (also Zarr's default compressor)
# running the Zstandard algorithm at compression level 3, with the
# bit-shuffle filter applied.
from numcodecs import Blosc

data = np.arange(100000000, dtype='i4').reshape((10000, 10000))
compressor = Blosc(
    cname='zstd',
    clevel=3,
    shuffle=Blosc.BITSHUFFLE,
)
z = zarr.array(
    data,
    chunks=(1000, 1000),
    compressor=compressor,
)
z.compressor

Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, blocksize=0)

In [27]:
# The info property reports diagnostics for a Zarr array: type, data type,
# shape, chunk shape, memory order, read-only flag, compressor, store type,
# and the number of bytes before and after compression.
z.info

0,1
Type,zarr.core.Array
Data type,int32
Shape,"(10000, 10000)"
Chunk shape,"(1000, 1000)"
Order,C
Read-only,False
Compressor,"Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, blocksize=0)"
Store type,builtins.dict
No. bytes,400000000 (381.5M)
No. bytes stored,3379344 (3.2M)


In [28]:
"""
Blosc is in fact a “meta-compressor”, which means that it can use
a number of different compression algorithms internally to compress
the data. Blosc also provides highly optimized implementations of 
byte- and bit-shuffle filters, which can improve compression ratios
for some data. A list of the internal compression libraries available
within Blosc can be obtained via:
"""
# List the internal compression libraries that Blosc makes available.
import numcodecs.blosc as blosc
blosc.list_compressors()

['blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd']

In [30]:
# Next, a plain Zstandard compressor at level 1.
from numcodecs import Zstd

z = zarr.array(
    np.arange(100000000, dtype='i4').reshape((10000, 10000)),
    chunks=(1000, 1000),
    compressor=Zstd(level=1),
)
z.compressor

Zstd(level=1)

In [31]:
# LZMA with a custom filter pipeline: LZMA's built-in delta filter (dist=4)
# followed by the LZMA2 filter at preset 1.
import lzma

from numcodecs import LZMA

lzma_filters = [
    dict(id=lzma.FILTER_DELTA, dist=4),
    dict(id=lzma.FILTER_LZMA2, preset=1),
]
compressor = LZMA(filters=lzma_filters)
z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000),
               chunks=(1000, 1000), compressor=compressor)
# Keep z.compressor as the final expression so the cell displays the codec
# that is actually attached to the array. Do not paste the expected repr
# below it: as code it would construct a separate LZMA object and display
# that instead.
z.compressor

LZMA(format=1, check=-1, preset=None, filters=[{'dist': 4, 'id': 3}, {'id': 33, 'preset': 1}])

In [None]:
"""
The default compressor can be changed by setting the value of 
the zarr.storage.default_compressor variable, e.g.:
"""