# Census release 2025-01-30 (LTS)

In [1]:
# Initialize a LaminDB instance with storage location `run-tests` and the `bionty` module.
!lamin init --storage run-tests --modules bionty

[92m→[0m initialized lamindb: zethson/run-tests
[0m

In [2]:
import lamindb as ln
import bionty as bt
from cellxgene_lamin.dev import get_datasets_from_cxg, get_collections_from_cxg

# Set lamindb's log verbosity to "hint".
ln.settings.verbosity = "hint"

[92m→[0m connected lamindb: zethson/run-tests


In [3]:
# Census release registered by this notebook; interpolated into the S3 paths below.
census_version = "2025-01-30"
# Preceding release; used below to look up prior-version artifacts and collections.
previous_release = "2024-07-01"

In [4]:
# S3 prefix holding the per-dataset .h5ad files of this release.
s3path = f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
# Display the resolved path object.
ln.UPath(s3path)

S3QueryPath('s3://cellxgene-data-public/cell-census/2025-01-30/h5ads')

In [5]:
# Print the directory tree (file count and first entries) under the h5ads prefix.
ln.UPath(s3path).view_tree()

0 sub-directories & 1573 files with suffixes '.h5ad'
s3://cellxgene-data-public/cell-census/2025-01-30/h5ads
├── 00099d5e-154f-4a7a-aa8d-fa30c8c0c43c.h5ad
├── 001b01fe-5c70-4bda-a3aa-ee09b7899b14.h5ad
├── 0041b9c3-6a49-4bf7-8514-9bc7190067a7.h5ad
├── 00476f9f-ebc1-4b72-b541-32f912ce36ea.h5ad
├── 00593d17-0693-4646-acad-89dbefba11bb.h5ad
├── 0087cde2-967d-4f7c-8e6e-40e4c9ad1891.h5ad
├── 00ba8341-48ec-4e4e-bb56-be0dd2dd7913.h5ad
├── 00e5dedd-b9b7-43be-8c28-b0e5c6414a62.h5ad
├── 00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad
├── 01209dce-3575-4bed-b1df-129f57fbc031.h5ad
├── 0129dbd9-a7d3-4f6b-96b9-1da155a93748.h5ad
├── 015c230d-650c-4527-870d-8a805849a382.h5ad
├── 019c7af2-c827-4454-9970-44d5e39ce068.h5ad
├── 01ad3cd7-3929-4654-84c0-6db05bd5fd59.h5ad
├── 01c93cf6-b695-4e30-a26e-121ae8b16a9e.h5ad
├── 023ce5da-0dbe-4ad0-a290-b6c21824b101.h5ad
├── 023d4d04-4671-4433-af8d-158a05c81d8b.h5ad
├── 02419ebc-fd75-4629-ba13-9b26e3be851c.h5ad
├── 024593dd-d237-4eaa-aff2-9c9d87be595e.h5ad
├── 02792605-4760

In [6]:
# Start run tracking for this notebook under the given transform uid.
# Uses the top-level `ln.track()` (paired with `ln.finish()` at the end of the
# notebook) instead of the deprecated `ln.context.track()` spelling.
ln.track("VzV8Ty89eQdK")

  ln.context.track("VzV8Ty89eQdK")


[94m•[0m tracked pip freeze > /home/lukas/.cache/lamindb/environments/run_42Re6s8Nmg0tFhFk/run_env_pip.txt
[92m→[0m created Transform('VzV8Ty89eQdK0000'), started new Run('42Re6s8N...') at 2025-07-18 15:35:38 UTC
[92m→[0m notebook imports: bionty==1.6.0 cellxgene_lamin==0.3.2 lamindb==1.8.0


Get all datasets and associated metadata using cellxgene REST API:

In [7]:
# Fetch the dataset metadata records from CELLxGENE and show how many were returned.
cxg_datasets = get_datasets_from_cxg()
len(cxg_datasets)

1844

In [8]:
# Inspect the metadata fields available on a dataset record.
cxg_datasets[0].keys()

dict_keys(['assay', 'assets', 'cell_count', 'cell_type', 'citation', 'collection_doi', 'collection_doi_label', 'collection_id', 'collection_name', 'collection_version_id', 'dataset_id', 'dataset_version_id', 'development_stage', 'disease', 'donor_id', 'explorer_url', 'is_primary_data', 'mean_genes_per_cell', 'organism', 'primary_cell_count', 'processing_status', 'published_at', 'revised_at', 'revision_of_collection', 'revision_of_dataset', 'schema_version', 'self_reported_ethnicity', 'sex', 'spatial', 'suspension_type', 'tissue', 'title', 'tombstone', 'visibility', 'x_approximate_distribution'])

## Register artifacts

In [10]:
# Create artifacts for the files under the release's h5ads prefix, then save them all.
artifacts = ln.Artifact.from_dir(s3path)
ln.save(artifacts)

[92m✓[0m created 1573 artifacts from directory using storage s3://cellxgene-data-public and key = cell-census/2025-01-30/h5ads/


In [11]:
# Re-fetch this release's artifacts by key as a query set; later cells call
# `.update()` and `.filter()` on it. Displays the number of matches.
artifacts = ln.Artifact.filter(key__contains=census_version).all()
len(artifacts)

1573

In [12]:
# Bulk-set the version of every artifact of this release to the census version string.
artifacts.update(version=census_version)

1573

In [13]:
# Copy the cell count and title from the CELLxGENE metadata onto the matching
# artifact; datasets without a registered artifact are skipped.
for cxg_dataset in cxg_datasets:
    artifact = artifacts.filter(key__contains=cxg_dataset["dataset_id"]).one_or_none()
    if artifact is None:
        continue
    artifact.n_observations = cxg_dataset["cell_count"]
    artifact.description = cxg_dataset["title"]
    artifact.save()

In [14]:
# Artifacts registered under the previous release's version; displays their count
# (0 in the recorded run).
artifacts_previous = ln.Artifact.filter(version=previous_release).all()
artifacts_previous.count()

0

In [21]:
# Pair each artifact with the previous-release artifact that has the same file
# name and, when one exists, put both into the same version family.
# NOTE(review): `add_to_version_family` may not exist in newer lamindb
# releases — confirm against the installed version.
for artifact in artifacts:
    dataset_id = artifact.key.rsplit("/", 1)[-1]  # last path component of the key
    artifact_previous = artifacts_previous.filter(key__endswith=dataset_id).one_or_none()
    if artifact_previous is None:
        continue
    artifact.add_to_version_family(artifact_previous, version=census_version)

## Register metadata

### Register new features and parent labels

In [22]:
# Re-query the artifacts of this release by key.
artifacts = ln.Artifact.filter(key__contains=census_version).all()

In [23]:
from cellxgene_lamin.dev._features import register_obs_featureset

ModuleNotFoundError: No module named 'lamindb.core._feature_manager'

In [None]:
# Get the CxG schema: register the "obs metadata" feature set unless the first
# artifact is already linked to a feature set with that name.
has_obs_feature_set = artifacts[0].feature_sets.filter(name="obs metadata").exists()
if not has_obs_feature_set:
    obs_feature_set = register_obs_featureset(artifacts)

In [11]:
# Lookup object over the registered features; the annotation loop further down
# reads entries from it via `getattr(features, field)`.
features = ln.Feature.lookup()

## Organisms

In [None]:
from cellxgene_lamin.dev._organism import register_organisms, curate_organisms

In [13]:
# Presumably registers the organisms referenced in the dataset metadata — see
# `register_organisms` in cellxgene_lamin for the exact behavior.
register_organisms(cxg_datasets)

Link collections and organisms to artifacts:

In [None]:
# Presumably links organism records to the artifacts based on the dataset
# metadata — see `curate_organisms` in cellxgene_lamin for the exact behavior.
curate_organisms(artifacts, cxg_datasets)

## Ontologies

Register all ontology ids:

In [None]:
from cellxgene_lamin.dev._ontology import register_ontology_ids

In [13]:
# Register the ontology terms found in the dataset metadata; the recorded log
# shows it starting with the `assay` field.
register_ontology_ids(cxg_datasets)

registering assay
✅ loaded [1;92m36 ExperimentalFactor records[0m matching [3montology_id[0m: [1;92m'EFO:0700016', 'EFO:0008722', 'EFO:0010961', 'EFO:0030003', 'EFO:0008930', 'EFO:0008992', 'EFO:0030019', 'EFO:0030027', 'EFO:0700003', 'EFO:0030002', 'EFO:0008931', 'EFO:0008720', 'EFO:0009899', 'EFO:0009901', 'EFO:0010891', 'EFO:0008796', 'EFO:0008919', 'EFO:0011025', 'EFO:0009919', 'EFO:0700010', ...[0m
✅ created [1;95m2 ExperimentalFactor records from Bionty[0m matching [3montology_id[0m: [1;95m'EFO:0022490', 'EFO:0022492'[0m
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
💡 you can switch this off via: bt.settings.auto_save_parents = False
💡 also saving parents of ExperimentalFactor(uid='NuGX0K6A', name='ScaleBio single cell RNA sequencing', ontology_id='EFO:0022490', description='Scalebio Technology That Utilizes A Plate-Based Approach With Combinatorial Indexing To Generate Single Cell Libraries.', created_by_id=1, run_id=27,

In [16]:
# clean up the celltype in Tissue:
# CL:0000307 is a Cell Ontology id, so delete the record stored under Tissue
# and save it as a CellType created from the public ontology instead.

bt.Tissue.filter(ontology_id="CL:0000307").delete()
bt.CellType.from_public(ontology_id="CL:0000307").save()

## Donors and suspension types

In [None]:
from cellxgene_lamin.dev._labels import register_ulabels

In [19]:
# Register donor ids as labels with `search_names` switched off for the
# duration of the call (presumably to skip the similar-name search while
# creating many records — confirm against the lamindb settings docs).
# try/finally guarantees the setting is switched back on even if
# registration raises, so a failure here cannot leak the disabled setting into
# later cells.
ln.settings.creation.search_names = False
try:
    register_ulabels(cxg_datasets, "donor_id")
finally:
    ln.settings.creation.search_names = True

registered 1207 donor_ids


In [20]:
# Register suspension types as labels (the recorded run registered 0 new ones).
register_ulabels(cxg_datasets, "suspension_type")

registered 0 suspension_types


## Annotate artifacts with obs metadata

In [None]:
from cellxgene_lamin.dev._features import FEATURE_TO_ACCESSOR

In [None]:
import bionty as bt

# Annotate each registered artifact with the obs-level metadata that CELLxGENE
# reports for its dataset. Only fields present in FEATURE_TO_ACCESSOR are
# handled; each maps to an (accessor name, registry) pair.
features = ln.Feature.lookup()

for idx, cxg_dataset in enumerate(cxg_datasets):
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(cxg_datasets)}")
    artifact = artifacts.filter(key__contains=cxg_dataset["dataset_id"]).one_or_none()
    if artifact is None:
        # no registered artifact whose key contains this dataset id
        continue
    for field, terms in cxg_dataset.items():
        if field not in FEATURE_TO_ACCESSOR:
            continue
        accessor, orm = FEATURE_TO_ACCESSOR[field]
        if field in ["donor_id", "suspension_type", "tissue_type"]:
            # these fields hold plain names rather than ontology terms
            records = orm.from_values(terms, field="name")
            if len(records) > 0:
                # stratify by feature so that link tables records are written
                artifact.labels.add(records, feature=getattr(features, field))
            continue
        if field == "tissue":
            records = []
            # entries typed "cell culture" are reported separately and are not
            # linked as tissues
            celltypes = [
                term["ontology_term_id"]
                for term in terms
                if term["tissue_type"] == "cell culture"
            ]
            if len(celltypes) > 0:
                # TODO: link these entries as bt.CellType records
                print(f"These tissues are not yet linked: {celltypes}")
            tissues = [
                term["ontology_term_id"]
                for term in terms
                if term["tissue_type"] != "cell culture"
            ]
            if len(tissues) > 0:
                # Look up only the non-cell-culture ids. Previously all terms
                # were passed here, so the cell-culture ids reported above
                # were still sent to the Tissue registry.
                records += bt.Tissue.from_values(tissues, field="ontology_id")
        else:
            records = orm.from_values(
                [term["ontology_term_id"] for term in terms], field="ontology_id"
            )
        if len(records) > 0:
            getattr(artifact, accessor).add(*records)

# clean up the 2 "unknowns" in DevelopmentalStage
bt.DevelopmentalStage.filter(name="unknown").exclude(ontology_id="unknown").delete()

annotating dataset 0 of 1844


## Validate and register genes

In [None]:
from cellxgene_lamin.dev._gene import register_genes

Register all genes for each organism:

In [10]:
# Register genes per organism; the recorded log shows homo_sapiens and mus_musculus.
register_genes()

registering homo_sapiens genes
✅ [1;92m62754 terms[0m (100.00%) are validated for [3mensembl_gene_id[0m
registering mus_musculus genes
✅ loaded [1;92m55281 Gene records[0m matching [3mensembl_gene_id[0m: [1;92m'ENSMUSG00000102693', 'ENSMUSG00000064842', 'ENSMUSG00000051951', 'ENSMUSG00000102851', 'ENSMUSG00000103377', 'ENSMUSG00000104017', 'ENSMUSG00000103025', 'ENSMUSG00000089699', 'ENSMUSG00000103201', 'ENSMUSG00000103147', 'ENSMUSG00000103161', 'ENSMUSG00000102331', 'ENSMUSG00000102348', 'ENSMUSG00000102592', 'ENSMUSG00000088333', 'ENSMUSG00000102343', 'ENSMUSG00000025900', 'ENSMUSG00000102948', 'ENSMUSG00000104123', 'ENSMUSG00000025902', ...[0m
✅ created [1;95m1660 Gene records from Bionty[0m matching [3mensembl_gene_id[0m: [1;95m'ENSMUSG00000119939', 'ENSMUSG00000119940', 'ENSMUSG00000119941', 'ENSMUSG00000119942', 'ENSMUSG00000119943', 'ENSMUSG00000119944', 'ENSMUSG00000119945', 'ENSMUSG00000119946', 'ENSMUSG00000119947', 'ENSMUSG00000119948', 'ENSMUSG00000119949',

## Link metadata to individual artifacts

Annotate each artifact with the genes measured in it:

In [11]:
# Organism lookup by scientific name; the gene-annotation loop below reads the
# synthetic-construct and SARS-CoV-2 entries from it.
organisms = bt.Organism.lookup(field=bt.Organism.scientific_name)

In [None]:
# For every artifact: open it in backed mode, resolve its var names to Gene
# records by Ensembl gene id, and link the resulting feature set to the
# artifact under the "var" slot. Artifacts without a linked organism are
# reported and skipped.
for idx, artifact in enumerate(artifacts):
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(artifacts)}")

    adata_backed = artifact.backed()
    var_names = adata_backed.var_names
    organism_record = artifact.organisms.first()
    if organism_record is None:
        print(f"No organism found for artifact: {artifact}")
        continue
    genes = bt.Gene.from_values(
        var_names, field=bt.Gene.ensembl_gene_id, organism=organism_record
    )

    # fall back to human when nothing matched for the linked organism but the
    # ids carry the "ENSG" prefix
    if len(genes) == 0 and var_names[0].startswith("ENSG"):
        genes += bt.Gene.from_values(
            var_names, field=bt.Gene.ensembl_gene_id, organism="human"
        )

    # Additional organisms: look up only the var names carrying the respective
    # prefix instead of validating every var name against an organism it
    # cannot belong to.
    ercc_names = var_names[var_names.str.startswith("ERCC")]
    if len(ercc_names) > 0:
        genes += bt.Gene.from_values(
            ercc_names,
            field=bt.Gene.ensembl_gene_id,
            organism=organisms.synthetic_construct,
        )
    sars_cov_2_names = var_names[var_names.str.startswith("ENSSASG")]
    if len(sars_cov_2_names) > 0:
        genes += bt.Gene.from_values(
            sars_cov_2_names,
            field=bt.Gene.ensembl_gene_id,
            organism=organisms.severe_acute_respiratory_syndrome_coronavirus_2,
        )

    var_feature_set_artifact = ln.FeatureSet(genes, dtype="number")
    var_feature_set_artifact.save()
    artifact.feature_sets.add(
        var_feature_set_artifact, through_defaults={"slot": "var"}
    )

annotating dataset 0 of 812
annotating dataset 100 of 812
annotating dataset 200 of 812
annotating dataset 300 of 812
annotating dataset 500 of 812
annotating dataset 600 of 812
annotating dataset 700 of 812
annotating dataset 800 of 812


In [16]:
# Show the metadata of `artifact` — whichever artifact the name was last bound
# to (presumably the final iteration of the loop above).
artifact.describe()

[1;92mArtifact[0m(uid='g0RcSSYe5vQKzSWYkhMc', version='2024-07-01', description='Dissection: Amygdaloid complex (AMY) - basolateral nuclear group (BLN) - basolateral nucleus (basal nucleus) - BL', key='cell-census/2024-07-01/h5ads/fe1a73ab-a203-45fd-84e9-0f7fd19efcbd.h5ad', suffix='.h5ad', type='dataset', accessor='AnnData', size=391552151, hash='1V_lPFFOF51ioRTSVWx9Mg-47', hash_type='md5-n', n_observations=35285, visibility=1, key_is_virtual=False, updated_at='2024-07-12 12:40:48 UTC')
  [3mProvenance[0m
    .created_by = 'sunnyosun'
    .storage = 's3://cellxgene-data-public'
    .transform = 'Census release 2024-07-01 (LTS)'
    .run = '2024-07-12 12:17:31 UTC'
  [3mLabels[0m
    .organisms = 'human'
    .tissues = 'cerebral nuclei'
    .cell_types = 'astrocyte', 'central nervous system macrophage', 'leukocyte', 'vascular associated smooth muscle cell', 'endothelial cell', 'ependymal cell', 'neuron', 'pericyte', 'fibroblast', 'oligodendrocyte precursor cell', ...
    .diseases

## Annotate tissue_type

In [18]:
# Register tissue types as labels.
# NOTE(review): this only registers the labels. The dataset records listed
# earlier have no top-level `tissue_type` key, so no visible cell links these
# labels to artifacts — confirm whether linking happens elsewhere.
register_ulabels(cxg_datasets, "tissue_type")

## Register collections

In [23]:
# Create the top-level "cellxgene-census" collection for this release as a new
# version of the previous release's collection, if that one exists.
# `one_or_none()` (instead of `one()`) keeps this cell from raising on an
# instance where the previous release was never registered; the collection is
# then created without a predecessor.
# NOTE(review): newer lamindb releases may expect `key=` / `revises=` instead
# of `name=` / `is_new_version_of=` — confirm against the installed version.
previous_census_collection = ln.Collection.filter(
    name="cellxgene-census", version=previous_release
).one_or_none()
if previous_census_collection is None:
    print(f"no cellxgene-census collection found for release {previous_release}")
collection = ln.Collection(
    artifacts,
    name="cellxgene-census",
    version=census_version,
    is_new_version_of=previous_census_collection,
)
collection.save()

💡 adding collection ids [456] as inputs for run 27, adding parent transform 19
✅ saved 1 feature set for slot: 'var'


In [25]:
# Fetch collection-level metadata from CELLxGENE; each entry is read below for
# its name, doi, collection_id and datasets.
cxg_collections = get_collections_from_cxg()

In [27]:
# Spot-check the key format of a registered artifact; the next cell rebuilds
# keys in this format.
artifacts.first().key

'cell-census/2024-07-01/h5ads/98e5ea9f-16d6-47ec-a529-686e76515e39.h5ad'

In [29]:
# Register one collection per CELLxGENE collection that has at least one
# dataset among this release's artifacts. `search_names` is switched off for
# the duration of the loop; try/finally guarantees it is switched back on even
# if a collection fails to register.
ln.settings.creation.search_names = False
try:
    for collection_meta in cxg_collections:
        # artifact keys of the datasets that belong to this collection
        keys = [
            f"cell-census/{census_version}/h5ads/{dataset['dataset_id']}.h5ad"
            for dataset in collection_meta["datasets"]
        ]
        collection_artifacts = artifacts.filter(key__in=keys).all()
        if collection_artifacts.count() == 0:
            # none of this collection's datasets are part of the release
            continue
        kwargs = {
            "name": collection_meta["name"],
            "description": collection_meta["doi"],
            "reference": collection_meta["collection_id"],
            "reference_type": "CELLxGENE Collection ID",
            "version": census_version,
        }
        collection_record = ln.Collection(
            collection_artifacts,
            **kwargs,
        )
        # if is needed here as .save() errors if collection is already saved
        if collection_record._state.adding:
            collection_record.save()
finally:
    ln.settings.creation.search_names = True

❗ returning existing collection with same hash: Collection(uid='5LEE4fd5yMXsRtR6WUrs', version='2024-07-01', name='Type I interferon responsive microglia shape cortical development and behavior', description='10.1016/j.cell.2024.02.020', hash='UzOU3URUb50M_5lZtThV', reference='4828d33d-fb26-42e7-bf36-18293b0eec85', reference_type='CELLxGENE Collection ID', visibility=1, created_by_id=1, transform_id=22, run_id=27, updated_at='2024-07-16 12:17:38 UTC')
❗ returning existing collection with same hash: Collection(uid='fYJ7GBbbLRwmVRHyDYu9', version='2024-07-01', name='Human breast cell atlas', description='10.1038/s41588-024-01688-9', hash='wXMzOvp8a-_nGgkwfjSM', reference='48259aa8-f168-4bf5-b797-af8e88da6637', reference_type='CELLxGENE Collection ID', visibility=1, created_by_id=1, transform_id=22, run_id=27, updated_at='2024-07-16 12:17:38 UTC')
❗ returning existing collection with same hash: Collection(uid='6Nu0TAYjj2ePrtyUe6HL', version='2024-07-01', name='SEA-AD: Seattle Alzheimer’s 

Add existing collections to their corresponding version families:

In [30]:
# Collections carrying this release's version; displays their count.
collections = ln.Collection.filter(version=census_version).all()
collections.count()

177

In [31]:
# Collections carrying the previous release's version; displays their count.
collections_previous = ln.Collection.filter(version=previous_release).all()
collections_previous.count()

165

In [32]:
# Match collections across releases by their `reference` and, when a previous
# release's collection exists, put both into the same version family.
for collection in collections:
    collection_previous = collections_previous.filter(reference=collection.reference).one_or_none()
    if collection_previous is None:
        continue
    collection.add_to_version_family(collection_previous, version=census_version)

✅ updated uid from dMyEX3NTfKOEYXyMciWu to dMyEX3NTfKOEYXyMKDD7!
✅ updated uid from fYJ7GBbbLRwmVRHyDYu9 to kAcitlx0g6C2lgacOCAS!
✅ updated uid from 6Nu0TAYjj2ePrtyUe6HL to Q2lwtI06DtUQbWUMUAs5!
✅ updated uid from iuZQ5FuxUx8djUoy0wBW to 1MYLC8DhQK3wsKtZHZgD!
✅ updated uid from mKfQgOwlCTuLFneySJGV to jMgmBjDxsNvnkgTHWWoP!
✅ updated uid from jelUsbHyfZH67CXH4Y9I to 0H2X3A2FhWOgA7i8EtaM!
✅ updated uid from Iu3eo1onLtTxVw1jhXA9 to gwCdNiXJtlYDhd6gPLLP!
✅ updated uid from kZGohNB7hv9Qu2QlfMe3 to PUJ2Tv8Ie9lUbxb0ZSfl!
✅ updated uid from Ioc0kigyLbByHgTGVEdi to Op2drQc2W4DokUc9PpQ6!
✅ updated uid from 7D6yIlnySNfVDZozoMuk to uarP82A6F0cOH8dKjpQL!
✅ updated uid from moIxw3JxMtRl9Py5Js6I to nZaV7G3xaFHYYWeJSCvZ!
✅ updated uid from hg78nvGj2jsvrdLVkJHa to IwAqdxTSCfEgrnuWWvKP!
✅ updated uid from Cnveibz2XUCqju5A6PbU to WJLbdahJcDE8E9mzsjbl!
✅ updated uid from Q4327JEqd4gLomTAK1YO to D4mQlKcQOGzGvZrWCXrh!
✅ updated uid from J9rYTZb0QfERXA6O8gnm to DI60aiNNLqOpa8t3JJPJ!
✅ updated uid from K4fEur

## Link metadata to collection

In [34]:
# Fetch this release's top-level census collection (exactly one is expected) and display it.
collection = ln.Collection.filter(name="cellxgene-census", version=census_version).one()
collection

Collection(uid='dMyEX3NTfKOEYXyMKDD7', version='2024-07-01', name='cellxgene-census', hash='nI8Ag-HANeOpZOz-8CSn', visibility=1, created_by_id=1, transform_id=22, run_id=27, updated_at='2024-07-16 12:24:38 UTC')

Feature sets:

In [35]:
# Show the collection's provenance and its linked feature sets.
collection.describe()

[1;92mCollection[0m(uid='dMyEX3NTfKOEYXyMKDD7', version='2024-07-01', name='cellxgene-census', hash='nI8Ag-HANeOpZOz-8CSn', visibility=1, updated_at='2024-07-16 12:24:38 UTC')
  [3mProvenance[0m
    .created_by = 'sunnyosun'
    .transform = 'Census release 2024-07-01 (LTS)'
    .run = '2024-07-12 12:17:31 UTC'
  [3mFeature sets[0m
    'obs' = 'assay', 'cell_type', 'development_stage', 'disease', 'donor_id', 'self_reported_ethnicity', 'sex', 'tissue', 'organism', 'tissue_type', 'suspension_type'
    'var' = 'DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2HG', 'MIR1302-2', 'FAM138A', 'OR4G4P', 'OR4G11P', 'OR4F5', 'None', 'CICP27', 'RNU6-1100P', 'DDX11L17', 'WASH9P'



## Register the soma store

In [7]:
# S3 location of this release's SOMA store; print its directory tree.
soma_path = f"s3://cellxgene-data-public/cell-census/{census_version}/soma"
ln.UPath(soma_path).view_tree()

14 sub-directories & 6 files with suffixes '', '.tdb'
s3://cellxgene-data-public/cell-census/2024-07-01/soma
├── __tiledb_group.tdb
├── __group/
│   └── __1716234740654_1716234740654_d973bcf54a2242b78bf352eaed600c4a_2
├── __meta/
│   ├── __1716234740654_1716234740654_421b60d8e8ca434a9a3685ac90643161
│   └── __1716234740654_1716234740654_42cb2a544da846dfbe555399fa25c462
├── census_data/
│   ├── __tiledb_group.tdb
│   ├── __group/
│   ├── __meta/
│   ├── homo_sapiens/
│   └── mus_musculus/
└── census_info/
    ├── __tiledb_group.tdb
    ├── __group/
    ├── __meta/
    ├── datasets/
    ├── organisms/
    ├── summary/
    └── summary_cell_counts/


In [9]:
# Register the whole SOMA directory as a single artifact, save it, and display the record.
soma_artifact = ln.Artifact(soma_path, description=f"Census {census_version}").save()
soma_artifact

💡 path in storage 's3://cellxgene-data-public' with key 'cell-census/2024-07-01/soma'


Artifact(uid='TwIR80MO51oM3CCuFail', description='Census 2024-07-01', key='cell-census/2024-07-01/soma', suffix='', type='dataset', size=870700998221, hash='bzrXBPNvitSVKvb3GG38_w', hash_type='md5-d', n_objects=330, visibility=1, key_is_virtual=False, created_by_id=1, storage_id=2, transform_id=22, run_id=27, updated_at='2024-07-16 12:52:01 UTC')

In [10]:
# Mark the tracked run as finished (counterpart of the track call near the top).
ln.finish()

❗ cells [(13, 8), (14, 12), (13, 16), (17, 19), (20, 23), (23, 28), (28, 9), (11, None), (None, 16), (16, 18), (18, 23), (23, 25), (25, 27), (27, 29), (32, 34), (35, 7), (7, 9)] were not run consecutively
💡 go to: https://lamin.ai/laminlabs/cellxgene/transform/MIbO3QeJERcy5zKv
💡 if you want to update your notebook without re-running it, use `lamin save notebook.ipynb`
