# Census release 2025-01-30 (LTS)

In [1]:
# Set up the lamindb instance for this notebook (storage "run-tests", with the bionty module).
!lamin init --storage run-tests --modules bionty

[92m→[0m connected lamindb: zethson/run-tests
[0m

In [2]:
import lamindb as ln
import bionty as bt
from cellxgene_lamin.dev import get_datasets_from_cxg, get_collections_from_cxg

# Pass the notebook uid so lamindb keeps identifying this notebook across renames
# (this is the uid lamindb itself recommended when the notebook was first tracked).
ln.track("QCFDv9VkhSpG")

[92m→[0m connected lamindb: zethson/run-tests
[92m→[0m created Transform('QCFDv9VkhSpG0000'), started new Run('JewHm8w8...') at 2025-08-01 07:46:16 UTC
[92m→[0m notebook imports: bionty==1.6.1 cellxgene_lamin==0.3.2 lamindb==1.10.0
[94m•[0m recommendation: to identify the notebook across renames, pass the uid: ln.track("QCFDv9VkhSpG")


## Set up instance

The code in this section is necessary if you're testing this notebook locally.
You need the synthetic construct and SARS-CoV-2 genes & organisms, which come from an external source.
Moreover, the remaining defaults need to be saved to the instance.

In [3]:
# Optional setup, intentionally commented out: uncomment the lines below when
# testing this notebook locally.

# from cellxgene_lamin.dev._gene import register_genes

# ln.examples.cellxgene.save_cxg_defaults()

# pin correct Gene Source (release 110)
# for organism in ["human", "mouse"]:
#     source = bt.Source.filter(version="release-110", organism=organism).one()
#     source.currently_used = True
#     source.save()

# register_genes()

## Ingest non-curated CELLxGENE Artifacts

In [12]:
# Census release being ingested; used to build the S3 paths and as the version tag below.
census_version = "2025-01-30"
# Release whose artifacts and collections are looked up as predecessors of the new records.
previous_release = "2024-07-01"

In [13]:
# S3 folder that holds the h5ad files of this Census release.
s3path = f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
ln.UPath(s3path)

S3QueryPath('s3://cellxgene-data-public/cell-census/2025-01-30/h5ads')

In [14]:
# Show the contents of the release folder as a tree.
ln.UPath(s3path).view_tree()

0 sub-directories & 1573 files with suffixes '.h5ad'
s3://cellxgene-data-public/cell-census/2025-01-30/h5ads
├── 00099d5e-154f-4a7a-aa8d-fa30c8c0c43c.h5ad
├── 001b01fe-5c70-4bda-a3aa-ee09b7899b14.h5ad
├── 0041b9c3-6a49-4bf7-8514-9bc7190067a7.h5ad
├── 00476f9f-ebc1-4b72-b541-32f912ce36ea.h5ad
├── 00593d17-0693-4646-acad-89dbefba11bb.h5ad
├── 0087cde2-967d-4f7c-8e6e-40e4c9ad1891.h5ad
├── 00ba8341-48ec-4e4e-bb56-be0dd2dd7913.h5ad
├── 00e5dedd-b9b7-43be-8c28-b0e5c6414a62.h5ad
├── 00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad
├── 01209dce-3575-4bed-b1df-129f57fbc031.h5ad
├── 0129dbd9-a7d3-4f6b-96b9-1da155a93748.h5ad
├── 015c230d-650c-4527-870d-8a805849a382.h5ad
├── 019c7af2-c827-4454-9970-44d5e39ce068.h5ad
├── 01ad3cd7-3929-4654-84c0-6db05bd5fd59.h5ad
├── 01c93cf6-b695-4e30-a26e-121ae8b16a9e.h5ad
├── 023ce5da-0dbe-4ad0-a290-b6c21824b101.h5ad
├── 023d4d04-4671-4433-af8d-158a05c81d8b.h5ad
├── 02419ebc-fd75-4629-ba13-9b26e3be851c.h5ad
├── 024593dd-d237-4eaa-aff2-9c9d87be595e.h5ad
├── 02792605-4760

Get all datasets and their associated metadata using the CELLxGENE REST API:

In [15]:
# Dataset metadata records; later cells read their "dataset_id", "cell_count"
# and "title" entries.
cxg_datasets = get_datasets_from_cxg()
len(cxg_datasets)

1850

In [16]:
# Create Artifact records from the files under the S3 folder and save them in bulk.
artifacts = ln.Artifact.from_dir(s3path)
ln.save(artifacts)

In [17]:
# Re-query the artifacts of this release by key: from here on `artifacts` is a
# query result that the cells below call .update() and .filter() on.
artifacts = ln.Artifact.filter(key__contains=census_version).all()
len(artifacts)

1573

In [18]:
# Tag all artifacts of this release with the Census version in one bulk update.
artifacts.update(version=census_version)

1573

In [19]:
# Copy cell count and title from the CELLxGENE metadata onto the matching artifacts.
# The metadata is indexed by dataset id once and the artifacts are walked in a single
# pass, instead of issuing one `key__contains` query per CELLxGENE dataset.
cxg_datasets_by_id = {
    cxg_dataset["dataset_id"]: cxg_dataset for cxg_dataset in cxg_datasets
}
for artifact in artifacts:
    # artifact keys have the form "cell-census/<version>/h5ads/<dataset_id>.h5ad"
    dataset_id = artifact.key.split("/")[-1].removesuffix(".h5ad")
    cxg_dataset = cxg_datasets_by_id.get(dataset_id)
    if cxg_dataset is None:
        # no CELLxGENE metadata for this file; leave the artifact untouched
        continue
    artifact.n_observations = cxg_dataset["cell_count"]
    artifact.description = cxg_dataset["title"]
    artifact.save()

In [20]:
# Artifacts registered under the previous release; candidates for `revises` below.
artifacts_previous = ln.Artifact.filter(version=previous_release).all()
artifacts_previous.count()

813

In [22]:
# Link each artifact to the previous-release artifact with the same file name, if any.
for artifact in artifacts:
    # last path component of the key, i.e. "<dataset_id>.h5ad"
    file_name = artifact.key.split("/")[-1]
    predecessor = artifacts_previous.filter(key__endswith=file_name).one_or_none()
    if predecessor is None:
        # dataset is new in this release: nothing to revise
        continue
    artifact.revises = predecessor
    artifact.version = census_version
    artifact.save()

## Label Artifacts

This is done in an external script, which is easier to run under tmux/SageMaker.

## Register Collections

In [None]:
# Look up the census collection of the previous release; .one() raises unless
# exactly one record matches.
previous_census_collection = ln.Collection.filter(
    key="cellxgene-census", version=previous_release
).one()

# Register all artifacts of this release as the new version of that collection.
collection = ln.Collection(
    artifacts,
    key="cellxgene-census",
    version=census_version,
    revises=previous_census_collection,
)
collection.save()

[92m→[0m mapped: Artifact(uid='1BNWhcCqu1CMSJaHxpbn')
[92m→[0m mapped: Artifact(uid='aJTH55LW2CTIWu306YiY')
[92m→[0m mapped: Artifact(uid='pnQX4jvkj3eFWGOzDxbW')
[92m→[0m mapped: Artifact(uid='2bF2gDSwbNbDsFVg2KQf')
[92m→[0m mapped: Artifact(uid='Pvhx7GAmAt4SYg03sE0M')
[92m→[0m mapped: Artifact(uid='W24OA3PL0R9an0aRh8pO')
[92m→[0m mapped: Artifact(uid='RCzyhZz9tfi6YI4F7mxb')
[92m→[0m mapped: Artifact(uid='xT6KRWIcByIyORlog2cs')
[92m→[0m mapped: Artifact(uid='24jpTvAM7ipZe2mvty5W')
[92m→[0m mapped: Artifact(uid='cs5xflc9TR0nrgpHmT4L')
[92m→[0m mapped: Artifact(uid='1pGy4sJuik81S3TndIbn')
[92m→[0m mapped: Artifact(uid='yBqxA1OBSSwMps0364FK')
[92m→[0m mapped: Artifact(uid='TUAbAHaYd7pc87WBfhIp')
[92m→[0m mapped: Artifact(uid='C7KKerpUwrbqBadAFk6k')
[92m→[0m mapped: Artifact(uid='dEP0dZ8UxLgwnkLjr5qo')
[92m→[0m mapped: Artifact(uid='ylmtPo5gptrM28Jm3iVU')
[92m→[0m mapped: Artifact(uid='C6V9dAdLXQ8N03klD2C2')
[92m→[0m mapped: Artifact(uid='V81vgcjDidJ6iCN

Collection(uid='dMyEX3NTfKOEYXyMKDD8', version='2025-01-30', is_latest=True, key='cellxgene-census', hash='NjqvY0g6hlzgyVXTYer0Ng', branch_id=1, space_id=1, created_by_id=1, run_id=1, created_at=2025-07-29 21:41:19 UTC)

In [30]:
# Collection metadata records; the next cell reads their "name", "doi",
# "collection_id" and "datasets" entries.
cxg_collections = get_collections_from_cxg()

In [None]:
# Register one Collection per CELLxGENE collection that has artifacts in this release.
# NOTE(review): `search_names = False` presumably switches off lamindb's name lookup on
# record creation while the loop runs — confirm against the lamindb settings docs.
ln.settings.creation.search_names = False
try:
    for collection_meta in cxg_collections:
        # artifact keys of all datasets that belong to this CELLxGENE collection
        keys = [
            f"cell-census/{census_version}/h5ads/{dataset['dataset_id']}.h5ad"
            for dataset in collection_meta["datasets"]
        ]
        collection_artifacts = artifacts.filter(key__in=keys).all()
        if collection_artifacts.count() == 0:
            # none of this collection's datasets are part of the release
            continue

        kwargs = {
            "key": collection_meta["name"],
            "description": collection_meta["doi"],
            "reference": collection_meta["collection_id"],
            "reference_type": "CELLxGENE Collection ID",
            "version": census_version,
        }
        # attach to the version family of the previous release when one exists
        previous_collection = ln.Collection.filter(
            reference=collection_meta["collection_id"], version=previous_release
        ).one_or_none()
        if previous_collection is not None:
            kwargs["revises"] = previous_collection

        collection_record = ln.Collection(
            collection_artifacts,
            **kwargs,
        )
        collection_record.version = census_version
        # only save new records: .save() errors if the collection is already saved
        if collection_record._state.adding:
            collection_record.save()
finally:
    # restore the setting even if registration fails part-way through the loop
    ln.settings.creation.search_names = True

Add existing collections to their corresponding version families:

In [32]:
# All collections registered under the new Census version.
collections = ln.Collection.filter(version=census_version).all()
collections.count()

248

In [33]:
# Collections registered under the previous release; candidates for `revises` below.
collections_previous = ln.Collection.filter(version=previous_release).all()
collections_previous.count()

1

In [35]:
# Link each collection to the previous-release collection that has the same reference.
for collection in collections:
    predecessor = collections_previous.filter(
        reference=collection.reference
    ).one_or_none()
    if predecessor is None:
        # no counterpart in the previous release: nothing to revise
        continue
    collection.revises = predecessor
    collection.version = census_version
    collection.save()

## Register the soma store

In [38]:
# S3 folder of the SOMA store for this Census release; display its layout.
soma_path = f"s3://cellxgene-data-public/cell-census/{census_version}/soma"
ln.UPath(soma_path).view_tree()

19 sub-directories & 7 files with suffixes '', '.tdb'
s3://cellxgene-data-public/cell-census/2025-01-30/soma
├── __tiledb_group.tdb
├── __group/
│   └── __1738275389572_1738275389572_13353233232d51777852409560eaff8d_2
├── __meta/
│   ├── __1738275389508_1738275389508_4e0e75839417b5b157905a31316816e6
│   └── __1738275389572_1738275389572_2639dfe80b470afbf96934cfd352226e
├── census_data/
│   ├── __tiledb_group.tdb
│   ├── __group/
│   ├── __meta/
│   ├── homo_sapiens/
│   └── mus_musculus/
├── census_info/
│   ├── __tiledb_group.tdb
│   ├── __group/
│   ├── __meta/
│   ├── datasets/
│   ├── organisms/
│   ├── summary/
│   └── summary_cell_counts/
└── census_spatial_sequencing/
    ├── __tiledb_group.tdb
    ├── __group/
    ├── __meta/
    ├── homo_sapiens/
    └── mus_musculus/


In [39]:
# Register the whole SOMA store folder as a single artifact and display the saved record.
soma_artifact = ln.Artifact(soma_path, description=f"Census {census_version}").save()
soma_artifact

Artifact(uid='wYEatbDRgjDrhat40000', is_latest=True, key='cell-census/2025-01-30/soma', description='Census 2025-01-30', suffix='', size=1128639499732, hash='uq_4QzGnaveTq5XinBJiCA', n_files=14217, branch_id=1, space_id=1, storage_id=2, run_id=1, created_by_id=1, created_at=2025-07-29 21:46:05 UTC)

In [None]:
# Close the run that ln.track() started at the top of the notebook.
ln.finish()