In [1]:
import backedarray as ba
import scipy.sparse
import numpy as np
import h5py
import zarr

# Create Dataset

In [2]:
# Seed the generator so that Restart & Run All reproduces the same matrix
# (and therefore the same cell outputs) on every run.
RANDOM_SEED = 0

# 100 x 50 sparse matrix in CSR format with a density of 0.2.
csr_matrix = scipy.sparse.random(
    100, 50, format="csr", density=0.2, random_state=RANDOM_SEED
)
# Dense copy of the same values, kept in memory.
dense_array = csr_matrix.toarray()

## HDF5 Backend

In [3]:
# Persist the sparse matrix (csr or csc layout) into group "X" of a fresh HDF5 file
h5_csr_path = "csr.h5"
with h5py.File(h5_csr_path, mode="w") as h5_out:
    x_group = h5_out.create_group("X")
    ba.write_sparse(x_group, csr_matrix)

## Zarr Backend

In [4]:
# Persist the sparse matrix (csr or csc layout) into group "X" of a fresh Zarr store
zarr_csr_path = "csr.zarr"
with zarr.open(zarr_csr_path, mode="w") as zarr_out:
    x_group = zarr_out.create_group("X")
    ba.write_sparse(x_group, csr_matrix)

# Read Dataset

## HDF5 Backend

In [5]:
# Open the HDF5 file read-only and keep the handle in a variable: h5_csr_disk
# is created from the file's "X" group, and the file is closed explicitly in
# a later cell.
h5_csr_file = h5py.File(h5_csr_path, mode="r")
x_group = h5_csr_file["X"]
h5_csr_disk = ba.open(x_group)

## Zarr Backend

In [6]:
# Open the Zarr store with zarr's default mode, then hand its "X" group to backedarray.
zarr_root = zarr.open(zarr_csr_path)
zarr_csr_disk = ba.open(zarr_root["X"])

# Numpy Style Indexing

In [7]:
# Slice rows 1 and 2 out of the Zarr-backed matrix, then densify them for display.
zarr_rows = zarr_csr_disk[1:3]
zarr_rows.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.26280307,
        0.        , 0.        , 0.        , 0.        , 0.47496996,
        0.        , 0.        , 0.09703807, 0.        , 0.        ,
        0.        , 0.        , 0.80186166, 0.        , 0.80138092,
        0.51200086, 0.        , 0.        , 0.        , 0.27931344,
        0.        , 0.        , 0.19093669, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.1184287 , 0.        , 0.98470652,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.10973288, 0.        ],
       [0.26588967, 0.54672441, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.14689263, 0.        , 0.        ,
        0.        , 0.64330307, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.11684519, 0.38581068,
        0.        , 0.        , 0.        , 0. 

In [8]:
# Everything from row 2 onward, read from the HDF5-backed matrix and densified.
h5_tail = h5_csr_disk[2:]
h5_tail.toarray()

array([[0.26588967, 0.54672441, 0.        , ..., 0.        , 0.        ,
        0.77352417],
       [0.        , 0.        , 0.        , ..., 0.74254072, 0.        ,
        0.0540647 ],
       [0.81327578, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.22822103, 0.        , ..., 0.        , 0.        ,
        0.90604483],
       [0.        , 0.        , 0.81121483, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.90957914, ..., 0.14953815, 0.27138129,
        0.        ]])

In [9]:
# An Ellipsis index selects the whole HDF5-backed matrix; densify it for display.
h5_full = h5_csr_disk[...]
h5_full.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.10973288,
        0.        ],
       [0.26588967, 0.54672441, 0.        , ..., 0.        , 0.        ,
        0.77352417],
       ...,
       [0.        , 0.22822103, 0.        , ..., 0.        , 0.        ,
        0.90604483],
       [0.        , 0.        , 0.81121483, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.90957914, ..., 0.14953815, 0.27138129,
        0.        ]])

In [10]:
# Release the HDF5 file handle that was opened for reading.
# NOTE(review): h5_csr_disk was created from this file, so presumably it must
# not be indexed after this point — confirm.
h5_csr_file.close()

# Append

In [11]:
# Append the in-memory matrix to the Zarr-backed one, then check that the
# result equals two stacked copies of the original.
# NOTE(review): the check expects exactly two copies, so re-running this cell
# without rewriting the store presumably fails — confirm.
zarr_csr_disk.append(csr_matrix)
expected_stacked = scipy.sparse.vstack((csr_matrix, csr_matrix)).toarray()
np.testing.assert_array_equal(zarr_csr_disk[...].toarray(), expected_stacked)

# Read h5ad files created using [anndata](https://anndata.readthedocs.io/)

In [12]:
%%bash
# Fetch the example h5ad file once; later runs reuse the local copy.
# The download goes to a temporary name and is renamed only after wget
# succeeds, so an interrupted transfer is never mistaken for a complete file
# by the existence check below.
set -euo pipefail

url="https://raw.githubusercontent.com/chanzuckerberg/cellxgene/main/example-dataset/pbmc3k.h5ad"
target="pbmc3k.h5ad"

if [ ! -f "$target" ]; then
    partial="${target}.part"
    # Remove the partial file on exit; after a successful mv there is nothing left to remove.
    trap 'rm -f "$partial"' EXIT
    wget -q -O "$partial" "$url"
    mv "$partial" "$target"
fi

--2022-10-27 10:25:59--  https://raw.githubusercontent.com/chanzuckerberg/cellxgene/main/example-dataset/pbmc3k.h5ad
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24653425 (24M) [application/octet-stream]
Saving to: ‘pbmc3k.h5ad’

     0K .......... .......... .......... .......... ..........  0%  434K 55s
    50K .......... .......... .......... .......... ..........  0%  758K 43s
   100K .......... .......... .......... .......... ..........  0% 2.39M 32s
   150K .......... .......... .......... .......... ..........  0% 3.29M 26s
   200K .......... .......... .......... .......... ..........  1%  964K 26s
   250K .......... .......... .......... .......... ..........  1% 3.82M 22s
   300K .......... .......... .......... .......... ..........  1% 4.51M 

In [13]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
import anndata.experimental
# Open the h5ad file read-only; the handle is closed when the `with` block exits.
with h5py.File('pbmc3k.h5ad', 'r') as f:
    # Read the 'obs' and 'var' elements with anndata's element reader.
    obs = anndata.experimental.read_elem(f['obs'])
    var = anndata.experimental.read_elem(f['var'])
    # Open 'X' through backedarray rather than anndata's reader.
    # NOTE(review): X is created from a group of `f`; presumably it cannot be
    # read once the file is closed — confirm, and index X inside this block.
    X = ba.open(f['X'])