# Vitessce Data Preparation Tutorial

This tutorial has been adapted from the data preparation examples in [the Vitessce documentation](https://vitessce.github.io/vitessce-python).

It applies the same pre-processing steps as the original examples.

In [None]:
# Shell escape: runs the `lamin` CLI to load `laminlabs/lamindata` (presumably a LaminDB
# instance — confirm) before `lamindb` is imported in the next code cell.
!lamin load laminlabs/lamindata

## 1. Import dependencies

We need to import the classes and functions that we will be using from the corresponding packages.

In [None]:
import json
from urllib.request import urlretrieve

import lamindb as ln
from anndata import read_h5ad
from vitessce import (
    VitessceConfig,
    AnnDataWrapper,
    Component as cm,
)
from vitessce.data_utils import (
    optimize_adata,
    VAR_CHUNK_SIZE,
)

## 2. Download and process data

For this example, we download a dataset from the [COVID-19 Cell Atlas](https://www.covid19cellatlas.org/index.healthy.html#habib17).

In [None]:
# Local locations: the downloaded AnnData file and, next to it, the Zarr store
# that a later cell writes.
data_dir = ln.UPath("data")
adata_filepath = data_dir / "habib17.processed.h5ad"
zarr_filepath = adata_filepath.with_suffix(".zarr")

# Download only when the file is not already on disk, so re-running the
# notebook does not fetch it again.
if not adata_filepath.exists():
    adata_filepath.parent.mkdir(parents=True, exist_ok=True)
    source_url = "https://covid19.cog.sanger.ac.uk/habib17.processed.h5ad"
    urlretrieve(source_url, adata_filepath)

adata = read_h5ad(adata_filepath)
# Flag the most variable genes by normalized dispersion.
#
# `nlargest(51)` returns the 51 highest values in descending order, so its last
# element is the 51st-largest dispersion (or the smallest value when there are
# fewer than 51 genes). Genes strictly above that threshold are the top 50 at
# most; ties at the threshold are excluded.
#
# The value is read with `.iloc` rather than `series[int]`: `adata.var` is
# presumably indexed by gene name (TODO confirm), and pandas has deprecated
# treating integer keys as positions on such a Series.
dispersions_norm = adata.var["dispersions_norm"]
top_dispersion = dispersions_norm.nlargest(51).iloc[-1]
adata.var["top_highly_variable"] = dispersions_norm > top_dispersion

In [None]:
# Pass `optimize_adata` exactly the obs / var / obsm entries that the Vitessce
# configuration further down reads from the Zarr store.
adata = optimize_adata(
    adata,
    optimize_X=True,
    obs_cols=["CellType"],
    var_cols=["top_highly_variable"],
    obsm_keys=["X_umap"],
)

# subset to run on CI
adata = adata[:100, :50].copy()

# Chunk shape: the first dimension spans every row of the subset, the second
# dimension uses VAR_CHUNK_SIZE.
zarr_chunks = [adata.shape[0], VAR_CHUNK_SIZE]
adata.write_zarr(zarr_filepath, chunks=zarr_chunks)

## 3. Create the Vitessce configuration

Set up the configuration by adding the views and datasets of interest.

In [None]:
# Build the Vitessce configuration: one dataset backed by the Zarr store written
# in the previous section, plus four linked views.
#
# `cm` is the `vitessce.Component` enum of view types; it is imported together
# with the other vitessce names in the notebook's import cell.
vc = VitessceConfig(
    schema_version="1.0.15",
    name="Habib et al",
    description="COVID-19 Healthy Donor Brain",
)

# Tell Vitessce where each piece of data lives inside the AnnData Zarr store.
dataset = vc.add_dataset(name="Brain").add_object(
    AnnDataWrapper(
        adata_path=zarr_filepath.as_posix(),
        obs_embedding_paths=["obsm/X_umap"],
        obs_embedding_names=["UMAP"],
        obs_set_paths=["obs/CellType"],
        obs_set_names=["Cell Type"],
        obs_feature_matrix_path="X",
        feature_filter_path="var/top_highly_variable",
    )
)

# One view per component, all bound to the same dataset. The scatterplot's
# `mapping` matches the embedding name ("UMAP") declared in the wrapper above.
scatterplot = vc.add_view(cm.SCATTERPLOT, dataset=dataset, mapping="UMAP")
cell_sets = vc.add_view(cm.OBS_SETS, dataset=dataset)
genes = vc.add_view(cm.FEATURE_LIST, dataset=dataset)
heatmap = vc.add_view(cm.HEATMAP, dataset=dataset)

# Arrange the views with the `|` and `/` operators (per the vitessce docs:
# horizontal and vertical concatenation). The trailing `;` suppresses the
# cell's output.
vc.layout((scatterplot | (cell_sets / genes)) / heatmap);

## 4. Ingest into LaminDB

In [None]:
from lamindb.integrations import register_vitessce_config

In [None]:
# Hand the Vitessce configuration to LaminDB and keep the returned object for
# the inspection and cleanup cells that follow.
artifact = register_vitessce_config(
    vc,
    description="A vitessce config object",
)

In [None]:
# Bare last-line expression: the notebook displays the object returned by
# `register_vitessce_config`.
artifact

In [None]:
# Clean up the object registered above. `permanent=True` is passed through to
# `delete()` (presumably this bypasses any trash / soft-delete step — confirm
# against the lamindb docs).
artifact.delete(permanent=True)