This notebook demonstrates how one would register extensions to the `h5ad` file format via `anndata`.

In [1]:
import pickle
from pathlib import Path

import h5py
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
from read_partial_registry import read, write, _REGISTRY, IOSpec

In [3]:
# Cache downloaded datasets under the current user's home directory instead of
# a machine-specific absolute path, so the notebook runs on any machine.
sc.settings.datasetdir = Path.home() / "data"

# Storing arbitrary objects

In [4]:
class Foo:
    """Toy container used to demonstrate storing an arbitrary object.

    Holds two values, ``x`` and ``y``, exactly as given; nothing is validated
    or copied.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y


@_REGISTRY.register_write(Foo, IOSpec("Foo", "0.1.0"))
def write_foo(g, k, v, dataset_kwargs={}):
    """Store ``v`` under ``g[k]`` as a pickled, opaque scalar dataset.

    Parameters
    ----------
    g
        Group-like object exposing ``create_dataset``.
    k
        Name of the dataset to create.
    v
        The object to pickle and store.
    dataset_kwargs
        Extra keyword arguments forwarded to ``create_dataset``. Options that
        scalar datasets cannot accept are dropped first. The mapping passed in
        is never modified.
    """
    # The pickled payload is written as a single scalar value, and h5py raises
    # TypeError if any chunk/filter option (not only ``compression``) is given
    # for a scalar dataset, so strip all of them before forwarding.
    scalar_unsupported = (
        "chunks",
        "compression",
        "compression_opts",
        "shuffle",
        "fletcher32",
        "scaleoffset",
        "maxshape",
    )
    scalar_kwargs = {
        key: value
        for key, value in dataset_kwargs.items()
        if key not in scalar_unsupported
    }
    g.create_dataset(k, data=np.void(pickle.dumps(v)), **scalar_kwargs)


@_REGISTRY.register_read(IOSpec("Foo", "0.1.0"))
def read_foo(v):
    """Rebuild the pickled object stored in dataset ``v``.

    SECURITY: unpickling can execute arbitrary code contained in the payload,
    so only read files that come from a trusted source.
    """
    payload = v[...].tobytes()
    return pickle.loads(payload)

In [5]:
# Load the example dataset and attach a custom object to its unstructured
# annotations.
pbmc = sc.datasets.pbmc3k_processed()
pbmc.uns["foo"] = Foo(x={"a": {"b": 1}}, y="lorem ipsum")

# Round-trip through disk, requesting compression for the written datasets.
write(
    pbmc,
    "out.h5ad",
    dataset_kwargs={"compression": "lzf"},
)
from_disk = read("out.h5ad")

The element is just a normal element on disk.

In [23]:
# List the entries stored under `uns` in the file written above.
!h5ls out.h5ad/uns

draw_graph               Group
foo                      Dataset {SCALAR}
louvain                  Group
louvain_colors           Dataset {8}
neighbors                Group
pca                      Group
rank_genes_groups        Group


The information necessary for reading and writing the element is kept in its attributes:

In [19]:
# Copy the attributes into a plain dict while the file is open, then show them.
with h5py.File("out.h5ad", "r") as h5_file:
    foo_attrs = dict(h5_file["uns/foo"].attrs)
display(foo_attrs)

{'encoding-type': 'Foo', 'encoding-version': '0.1.0'}

# Pandas extension arrays

This example is mostly to show what's possible, not exactly what should be done. I'm not sure we want to support the sparse array type directly in anndata, since reading it at all efficiently requires private pandas functions.

In [6]:
from read_partial_registry import write_elem, read_elem

In [7]:
# pd.arrays.SparseArray
@_REGISTRY.register_write(pd.arrays.SparseArray, IOSpec("pd-sparse-array", "0.1.0"))
def write_pandas_sparse(f, k, v, dataset_kwargs={}):
    """Write sparse array ``v`` as a new group ``k`` inside ``f``.

    The group receives three child elements, each written with ``write_elem``
    and the same ``dataset_kwargs``: the fill value, the sparse index, and the
    explicitly stored values.
    """
    group = f.create_group(k)
    components = (
        ("fill-value", v.fill_value),
        ("sparse-index", v.sp_index),
        ("sparse-values", v.sp_values),
    )
    for name, component in components:
        write_elem(group, name, component, dataset_kwargs=dataset_kwargs)


@_REGISTRY.register_read(IOSpec("pd-sparse-array", "0.1.0"))
def read_pandas_sparse(g):
    """Rebuild a ``pd.arrays.SparseArray`` from the child elements of group ``g``.

    Reads the "sparse-values", "sparse-index" and "fill-value" children with
    ``read_elem`` and passes them to the ``SparseArray`` constructor.
    """
    values = read_elem(g["sparse-values"])
    index = read_elem(g["sparse-index"])
    fill = read_elem(g["fill-value"])
    return pd.arrays.SparseArray(values, sparse_index=index, fill_value=fill)


@_REGISTRY.register_write(pd._libs.sparse.IntIndex, IOSpec("pd-sparse-int-index", "0.1.0"))
def write_pandas_sparse_int_index(f, k, v, dataset_kwargs={}):
    """Write the sparse ``IntIndex`` ``v`` as dataset ``k`` inside ``f``.

    The index positions (``v.indices``) become the dataset's data and the
    total length (``v.length``) is stored in the dataset's "length" attribute.

    Parameters
    ----------
    f
        Group-like object exposing ``create_dataset``.
    k
        Name of the dataset to create.
    v
        The index to store.
    dataset_kwargs
        Extra keyword arguments forwarded to ``create_dataset``. Optional, to
        match the other writers in this notebook; it is only unpacked and
        never modified.
    """
    d = f.create_dataset(k, data=v.indices, **dataset_kwargs)
    # The positions alone do not record how long the full array is.
    d.attrs["length"] = v.length

@_REGISTRY.register_read(IOSpec("pd-sparse-int-index", "0.1.0"))
def read_pandas_sparse_int_index(g):
    """Rebuild a sparse ``IntIndex`` from the stored element ``g``.

    The total length comes from the "length" attribute and the index positions
    from the element's full contents.
    """
    length = g.attrs["length"]
    indices = g[...]
    return pd._libs.sparse.IntIndex(length, indices)

### Test writing an individual element

In [8]:
from scipy import sparse

In [9]:
# Seed the generator so the round-trip check runs on the same data every time.
sa = pd.arrays.SparseArray.from_spmatrix(
    sparse.random(100, 1, density=0.5, random_state=0)
)

# Write the array and read it straight back from the still-open file.
with h5py.File("test.h5", "w") as f:
    write_elem(f, "pd-sparse", sa)
    sa_from_disk = read_elem(f["pd-sparse"])

# The value read back must match what was written.
pd.testing.assert_extension_array_equal(sa, sa_from_disk)

### Writing a dataframe with sparse columns


In [10]:
# Boolean mask of the cells labelled "B cells", stored as a sparse obs column.
is_b_cell = pbmc.obs["louvain"] == "B cells"
pbmc.obs["b-cell"] = pd.arrays.SparseArray(is_b_cell)

In [11]:
# Round-trip the whole object, now including the sparse obs column.
write(pbmc, "out.h5ad")
from_disk = read("out.h5ad")

# The sparse column must come back unchanged.
expected_b_cell = pbmc.obs["b-cell"]
actual_b_cell = from_disk.obs["b-cell"]
pd.testing.assert_series_equal(expected_b_cell, actual_b_cell)