# Census release 2025-01-30 (LTS)

In [1]:
!lamin init --storage run-tests --modules bionty

[92m→[0m initialized lamindb: zethson/run-tests
[0m

In [None]:
import lamindb as ln
import bionty as bt
from cellxgene_lamin.dev import get_datasets_from_cxg, get_collections_from_cxg

ln.settings.verbosity = "hint"

[92m→[0m connected lamindb: zethson/run-tests


In [3]:
# Release being registered; used to build the S3 paths and as the version tag.
census_version = "2025-01-30"
# Preceding release; used to look up earlier artifact and collection versions.
previous_release = "2024-07-01"

In [4]:
s3path = f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
ln.UPath(s3path)

S3QueryPath('s3://cellxgene-data-public/cell-census/2025-01-30/h5ads')

In [5]:
ln.UPath(s3path).view_tree()

0 sub-directories & 1573 files with suffixes '.h5ad'
s3://cellxgene-data-public/cell-census/2025-01-30/h5ads
├── 00099d5e-154f-4a7a-aa8d-fa30c8c0c43c.h5ad
├── 001b01fe-5c70-4bda-a3aa-ee09b7899b14.h5ad
├── 0041b9c3-6a49-4bf7-8514-9bc7190067a7.h5ad
├── 00476f9f-ebc1-4b72-b541-32f912ce36ea.h5ad
├── 00593d17-0693-4646-acad-89dbefba11bb.h5ad
├── 0087cde2-967d-4f7c-8e6e-40e4c9ad1891.h5ad
├── 00ba8341-48ec-4e4e-bb56-be0dd2dd7913.h5ad
├── 00e5dedd-b9b7-43be-8c28-b0e5c6414a62.h5ad
├── 00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad
├── 01209dce-3575-4bed-b1df-129f57fbc031.h5ad
├── 0129dbd9-a7d3-4f6b-96b9-1da155a93748.h5ad
├── 015c230d-650c-4527-870d-8a805849a382.h5ad
├── 019c7af2-c827-4454-9970-44d5e39ce068.h5ad
├── 01ad3cd7-3929-4654-84c0-6db05bd5fd59.h5ad
├── 01c93cf6-b695-4e30-a26e-121ae8b16a9e.h5ad
├── 023ce5da-0dbe-4ad0-a290-b6c21824b101.h5ad
├── 023d4d04-4671-4433-af8d-158a05c81d8b.h5ad
├── 02419ebc-fd75-4629-ba13-9b26e3be851c.h5ad
├── 024593dd-d237-4eaa-aff2-9c9d87be595e.h5ad
├── 02792605-4760

In [6]:
ln.track("VzV8Ty89eQdK")

[94m•[0m tracked pip freeze > /home/lukas/.cache/lamindb/environments/run_xkjTYBy2Vsc82WJw/run_env_pip.txt
[92m→[0m created Transform('VzV8Ty89eQdK0000'), started new Run('xkjTYBy2...') at 2025-07-23 09:25:03 UTC
[92m→[0m notebook imports: bionty==1.6.1 cellxgene_lamin==0.3.2 lamindb==1.9.0


Get all datasets and their associated metadata using the CELLxGENE REST API:

In [7]:
cxg_datasets = get_datasets_from_cxg()
len(cxg_datasets)

1844

In [8]:
cxg_datasets[0].keys()

dict_keys(['assay', 'assets', 'cell_count', 'cell_type', 'citation', 'collection_doi', 'collection_doi_label', 'collection_id', 'collection_name', 'collection_version_id', 'dataset_id', 'dataset_version_id', 'development_stage', 'disease', 'donor_id', 'explorer_url', 'is_primary_data', 'mean_genes_per_cell', 'organism', 'primary_cell_count', 'processing_status', 'published_at', 'revised_at', 'revision_of_collection', 'revision_of_dataset', 'schema_version', 'self_reported_ethnicity', 'sex', 'spatial', 'suspension_type', 'tissue', 'title', 'tombstone', 'visibility', 'x_approximate_distribution'])

## Register artifacts

In [9]:
# Build one artifact per file found under the release's h5ads prefix, then
# save them all in one call.
artifacts = ln.Artifact.from_dir(s3path)
ln.save(artifacts)

[92m→[0m due to lack of write access, LaminDB won't manage this storage location: s3://cellxgene-data-public
[92m→[0m referenced read-only storage location at s3://cellxgene-data-public
[92m✓[0m created 1573 artifacts from directory using storage s3://cellxgene-data-public and key = cell-census/2025-01-30/h5ads/


In [10]:
artifacts = ln.Artifact.filter(key__contains=census_version).all()
len(artifacts)

1573

In [11]:
artifacts.update(version=census_version)

1573

In [12]:
# Copy the cell count and title from the CELLxGENE metadata onto the artifact
# whose key contains the dataset ID.
for cxg_dataset in cxg_datasets:
    artifact = artifacts.filter(key__contains=cxg_dataset["dataset_id"]).one_or_none()
    if artifact is None:
        # the REST API lists this dataset, but no artifact of this release matches it
        continue
    artifact.n_observations = cxg_dataset["cell_count"]
    artifact.description = cxg_dataset["title"]
    artifact.save()

In [13]:
artifacts_previous = ln.Artifact.filter(version=previous_release).all()
artifacts_previous.count()

0

In [14]:
# Attach each artifact to the version family of the previous-release artifact
# that shares the same file name.
for artifact in artifacts:
    # last segment of the key, i.e. the file name
    dataset_id = artifact.key.rsplit("/", 1)[-1]
    artifact_previous = artifacts_previous.filter(key__endswith=dataset_id).one_or_none()
    if artifact_previous is None:
        # no counterpart in the previous release
        continue
    artifact.add_to_version_family(artifact_previous, version=census_version)

## Register metadata

### Register new features and parent labels

In [15]:
artifacts = ln.Artifact.filter(key__contains=census_version).all()

In [16]:
from cellxgene_lamin.dev._features import register_obs_schema

In [17]:
# Register the obs schema only when the first artifact is not yet linked to a
# feature set named "obs metadata" (presumably so re-runs do not register it twice).
# NOTE(review): `obs_feature_set` stays undefined when the registration is skipped.
if not artifacts[0].feature_sets.filter(name="obs metadata").exists():
    obs_feature_set = register_obs_schema(artifacts)

In [18]:
features = ln.Feature.lookup()

## organisms

In [19]:
from cellxgene_lamin.dev._organism import register_organisms, curate_organisms

In [20]:
register_organisms(cxg_datasets)

[92m✓[0m created [1;95m7 Organism records from Bionty[0m matching [3montology_id[0m: [1;95m'NCBITaxon:9825', 'NCBITaxon:7955', 'NCBITaxon:9598', 'NCBITaxon:9483', 'NCBITaxon:9544', 'NCBITaxon:9606', 'NCBITaxon:10090'[0m


Link collections and organisms to artifacts:

In [21]:
curate_organisms(artifacts, cxg_datasets)

## ontologies

Register all ontology ids:

In [22]:
from cellxgene_lamin.dev._ontology import register_ontology_ids

In [23]:
register_ontology_ids(cxg_datasets)

[92m✓[0m created [1;95m43 ExperimentalFactor records from Bionty[0m matching [3montology_id[0m: [1;95m'EFO:0008919', 'EFO:0030007', 'EFO:0700016', 'EFO:0008780', 'EFO:0030074', 'EFO:0030060', 'EFO:0030027', 'EFO:0011025', 'EFO:0010891', 'EFO:0009919', 'EFO:0700003', 'EFO:0030003', 'EFO:0030002', 'EFO:0008877', 'EFO:0030028', 'EFO:0008953', 'EFO:0700004', 'EFO:0009922', 'EFO:0700010', 'EFO:0008931', ...[0m
[94m•[0m [1;91mdid not create[0m ExperimentalFactor records for [1;93m2 non-validated[0m [3montology_ids[0m: [1;93m'EFO:0022845', 'EFO:0022857'[0m


[92m✓[0m created [1;95m1019 CellType records from Bionty[0m matching [3montology_id[0m: [1;95m'CL:1000497', 'CL:0008036', 'CL:4023111', 'CL:0000221', 'CL:0002488', 'CL:0000216', 'CL:0001072', 'CL:0000586', 'CL:0002543', 'CL:0002280', 'CL:1000398', 'CL:4033057', 'CL:0002677', 'CL:0000017', 'CL:0002131', 'CL:1000349', 'CL:0000650', 'CL:0000038', 'CL:4023018', 'CL:4030064', ...[0m
[94m•[0m [1;91mdid not create[0m CellType records for [1;93m13 non-validated[0m [3montology_ids[0m: [1;93m'CL:4033085', 'CL:4033095', 'CL:4033096', 'CL:4042021', 'CL:4052001', 'CL:4052010', 'CL:4052024', 'CL:4052025', 'CL:4052026', 'CL:4052030', 'CL:4052048', 'CL:4052049', 'unknown'[0m


[92m✓[0m created [1;95m189 DevelopmentalStage records from Bionty[0m matching [3montology_id[0m: [1;95m'HsapDv:0000024', 'HsapDv:0000214', 'HsapDv:0000057', 'HsapDv:0000192', 'HsapDv:0000183', 'HsapDv:0000152', 'HsapDv:0000058', 'HsapDv:0000071', 'HsapDv:0000163', 'HsapDv:0000244', 'HsapDv:0000030', 'HsapDv:0000108', 'HsapDv:0000063', 'HsapDv:0000154', 'HsapDv:0000201', 'HsapDv:0000050', 'HsapDv:0000229', 'HsapDv:0000262', 'HsapDv:0000208', 'HsapDv:0000101', ...[0m
[94m•[0m [1;91mdid not create[0m DevelopmentalStage records for [1;93m88 non-validated[0m [3montology_ids[0m: [1;93m'HsapDv:0000274', 'MmusDv:0000019', 'MmusDv:0000020', 'MmusDv:0000021', 'MmusDv:0000022', 'MmusDv:0000023', 'MmusDv:0000024', 'MmusDv:0000025', 'MmusDv:0000026', 'MmusDv:0000027', 'MmusDv:0000028', 'MmusDv:0000029', 'MmusDv:0000032', 'MmusDv:0000033', 'MmusDv:0000034', 'MmusDv:0000035', 'MmusDv:0000036', 'MmusDv:0000062', 'MmusDv:0000063', 'MmusDv:0000064', ...[0m
[92m✓[0m created [1;95m71 

[92m✓[0m created [1;95m168 Disease records from Bionty[0m matching [3montology_id[0m: [1;95m'MONDO:0005109', 'MONDO:0009831', 'MONDO:0005005', 'MONDO:0000265', 'MONDO:0016468', 'MONDO:0005087', 'MONDO:0600025', 'MONDO:0003050', 'MONDO:0004781', 'MONDO:0003573', 'MONDO:0003004', 'MONDO:0001150', 'MONDO:0005180', 'MONDO:0005828', 'MONDO:0005565', 'MONDO:0005453', 'MONDO:0018874', 'MONDO:0004970', 'MONDO:0007763', 'MONDO:0011705', ...[0m
[94m•[0m [1;91mdid not create[0m Disease records for [1;93m4 non-validated[0m [3montology_ids[0m: [1;93m'MONDO:0004981 || MONDO:1030008', 'MONDO:0005109 || MONDO:0005445', 'MONDO:0005109 || MONDO:0011989', 'PATO:0000461'[0m
[92m✓[0m created [1;95m1 Phenotype record from Bionty[0m matching [3montology_id[0m: [1;95m'PATO:0000461'[0m


[92m✓[0m created [1;95m35 Ethnicity records from Bionty[0m matching [3montology_id[0m: [1;95m'HANCESTRO:0005', 'HANCESTRO:0025', 'HANCESTRO:0016', 'HANCESTRO:0598', 'HANCESTRO:0009', 'HANCESTRO:0027', 'HANCESTRO:0364', 'HANCESTRO:0500', 'HANCESTRO:0463', 'HANCESTRO:0014', 'HANCESTRO:0485', 'HANCESTRO:0352', 'HANCESTRO:0597', 'HANCESTRO:0487', 'HANCESTRO:0595', 'HANCESTRO:0439', 'HANCESTRO:0021', 'HANCESTRO:0019', 'HANCESTRO:0022', 'HANCESTRO:0007', ...[0m
[94m•[0m [1;91mdid not create[0m Ethnicity records for [1;93m5 non-validated[0m [3montology_ids[0m: [1;93m'HANCESTRO:0005 || HANCESTRO:0008', 'HANCESTRO:0013 || HANCESTRO:0014', 'HANCESTRO:0014 || HANCESTRO:0590', 'na', 'unknown'[0m


[92m✓[0m created [1;95m2 Phenotype records from Bionty[0m matching [3montology_id[0m: [1;95m'PATO:0000383', 'PATO:0000384'[0m
[94m•[0m [1;91mdid not create[0m Phenotype record for [1;93m1 non-validated[0m [3montology_id[0m: [1;93m'unknown'[0m


[92m✓[0m created [1;95m592 Tissue records from Bionty[0m matching [3montology_id[0m: [1;95m'UBERON:0004167', 'UBERON:0003017', 'UBERON:0002113', 'UBERON:0002116', 'UBERON:0002550', 'UBERON:0005406', 'UBERON:0001809', 'UBERON:0001985', 'UBERON:0002060', 'UBERON:0002657', 'UBERON:0001238', 'UBERON:0001295', 'UBERON:0004262', 'UBERON:0003059', 'UBERON:0013706', 'UBERON:0002351', 'UBERON:0035886', 'UBERON:0004499', 'UBERON:0002328', 'UBERON:0001830', ...[0m
[94m•[0m [1;91mdid not create[0m Tissue records for [1;93m14 non-validated[0m [3montology_ids[0m: [1;93m'CL:0000010', 'CL:0000082', 'CL:0000084', 'CL:0000115', 'CL:0000307', 'CL:0000322', 'CL:0000351', 'CL:0002322', 'CL:0002327', 'CL:0002328', 'CL:0002334', 'CL:0002335', 'CL:0002633', 'CL:4052001'[0m


In [24]:
# CL:0000307 is a cell type ID that the metadata lists under tissues: delete any
# Tissue record carrying it and register the term as a CellType from the source instead.

bt.Tissue.filter(ontology_id="CL:0000307").delete()
bt.CellType.from_source(ontology_id="CL:0000307").save()

CellType(uid='3bgY2Yz8', name='tracheal epithelial cell', ontology_id='CL:0000307', synonyms='tracheocyte', description='An Epithelial Cell Found In The Trachea.', branch_id=1, space_id=1, created_by_id=1, run_id=1, source_id=16, created_at=2025-07-23 09:26:55 UTC)

## donors and suspension_types

In [25]:
from cellxgene_lamin.dev._labels import register_ulabels

In [26]:
# Turn off `search_names` while bulk-creating the donor labels (presumably to
# skip the similar-name lookup for every new record). The try/finally restores
# the setting even if registration fails part-way, so later cells do not run
# with it silently disabled.
ln.settings.creation.search_names = False
try:
    register_ulabels(cxg_datasets, "donor_id")
finally:
    ln.settings.creation.search_names = True

In [27]:
# Turn off `search_names` while bulk-creating the suspension type labels
# (presumably to skip the similar-name lookup for every new record). The
# try/finally restores the setting even if registration fails part-way, so
# later cells do not run with it silently disabled.
ln.settings.creation.search_names = False
try:
    register_ulabels(cxg_datasets, "suspension_type")
finally:
    ln.settings.creation.search_names = True

## Annotate artifacts with obs metadata

In [28]:
from cellxgene_lamin.dev._features import FEATURE_TO_ACCESSOR

In [29]:
import bionty as bt

features = ln.Feature.lookup()

# Link every dataset's obs metadata (labels and ontology terms) to its artifact.
for idx, cxg_dataset in enumerate(cxg_datasets):
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(cxg_datasets)}")
    artifact = artifacts.filter(key__contains=cxg_dataset["dataset_id"]).one_or_none()
    if artifact is None:
        # dataset is not part of this census release
        continue
    for field, terms in cxg_dataset.items():
        if field not in FEATURE_TO_ACCESSOR:
            continue
        accessor, orm = FEATURE_TO_ACCESSOR[field]
        if field in ["donor_id", "suspension_type", "tissue_type"]:
            # these fields are looked up by name rather than by ontology ID
            records = orm.from_values(terms, field="name")
        elif field == "tissue":
            # terms with tissue_type "cell culture" are reported but not linked
            celltypes = [
                term["ontology_term_id"]
                for term in terms
                if term["tissue_type"] == "cell culture"
            ]
            if len(celltypes) > 0:
                print(f"These tissues are not yet linked: {celltypes}")
            tissues = [
                term["ontology_term_id"]
                for term in terms
                if term["tissue_type"] != "cell culture"
            ]
            # query only the filtered IDs; passing every term would also send
            # the cell-culture IDs through the Tissue lookup
            records = (
                bt.Tissue.from_values(tissues, field="ontology_id")
                if len(tissues) > 0
                else []
            )
        else:
            records = orm.from_values(
                [term["ontology_term_id"] for term in terms], field="ontology_id"
            )
        if len(records) > 0:
            # stratify by feature so that link tables records are written
            getattr(artifact, accessor).add(*records)

# clean up the 2 "unknowns" in DevelopmentalStage
bt.DevelopmentalStage.filter(name="unknown").exclude(ontology_id="unknown").delete()

annotating dataset 0 of 1844
annotating dataset 100 of 1844
[92m✓[0m loaded [1;92m1 Disease record[0m matching [3montology_id[0m: [1;92m'MONDO:0004981 || MONDO:1030008'[0m
[94m•[0m [1;91mdid not create[0m Disease record for [1;93m1 non-validated[0m [3montology_id[0m: [1;93m'PATO:0000461'[0m
[92m✓[0m loaded [1;92m3 Disease records[0m matching [3montology_id[0m: [1;92m'MONDO:0005570', 'MONDO:0018881', 'MONDO:0020077'[0m
[94m•[0m [1;91mdid not create[0m Disease record for [1;93m1 non-validated[0m [3montology_id[0m: [1;93m'PATO:0000461'[0m
[92m✓[0m loaded [1;92m3 Disease records[0m matching [3montology_id[0m: [1;92m'MONDO:0005570', 'MONDO:0018881', 'MONDO:0020077'[0m
[94m•[0m [1;91mdid not create[0m Disease record for [1;93m1 non-validated[0m [3montology_id[0m: [1;93m'PATO:0000461'[0m
[94m•[0m [1;91mdid not create[0m Disease record for [1;93m1 non-validated[0m [3montology_id[0m: [1;93m'PATO:0000461'[0m
[94m•[0m [1;91mdid no

## Validate and register genes

In [30]:
from cellxgene_lamin.dev._gene import register_genes

Register all genes for each organism:

In [31]:
register_genes()

[92m✓[0m created [1;95m1 Organism record from Bionty[0m matching [3mname[0m: [1;95m'synthetic construct'[0m
[92m✓[0m created [1;95m1 Organism record from Bionty[0m matching [3montology_id[0m: [1;95m'NCBITaxon:81077'[0m
[92m✓[0m created [1;95m1 Organism record from Bionty[0m matching [3mname[0m: [1;95m'sars-2'[0m
[92m✓[0m created [1;95m1 Organism record from Bionty[0m matching [3montology_id[0m: [1;95m'NCBITaxon:694009'[0m
[96m•[0m key has more than one suffix (path.suffixes), using only last suffix: '.parquet' - if you want your composite suffix to be recognized add it to lamindb.core.storage.VALID_SIMPLE_SUFFIXES.add()
[96m•[0m path content will be copied to default storage upon `save()` with key 'df_synthetic construct__gencode_ercc__1.0.0__Gene.parquet'
[92m✓[0m storing artifact 'e5fnB0arQ8P1q0Bp0000' at '/home/lukas/code/cellxgene-lamin/docs/notebooks/run-tests/.lamindb/e5fnB0arQ8P1q0Bp0000.parquet'
[92m→[0m source added!
[96m•[0m key has m

[92m✓[0m created [1;95m62595 Gene records from Bionty[0m matching [3mensembl_gene_id[0m: [1;95m'ENSG00000290825', 'ENSG00000223972', 'ENSG00000227232', 'ENSG00000278267', 'ENSG00000243485', 'ENSG00000284332', 'ENSG00000237613', 'ENSG00000268020', 'ENSG00000290826', 'ENSG00000240361', 'ENSG00000186092', 'ENSG00000238009', 'ENSG00000239945', 'ENSG00000233750', 'ENSG00000268903', 'ENSG00000269981', 'ENSG00000239906', 'ENSG00000241860', 'ENSG00000222623', 'ENSG00000241599', ...[0m
[94m•[0m [1;91mdid not create[0m Gene records for [1;93m159 non-validated[0m [3mensembl_gene_ids[0m: [1;93m'ENSG00000203441', 'ENSG00000214783', 'ENSG00000214970', 'ENSG00000215067', 'ENSG00000223458', 'ENSG00000223797', 'ENSG00000224167', 'ENSG00000224247', 'ENSG00000225205', 'ENSG00000226032', 'ENSG00000226277', 'ENSG00000226362', 'ENSG00000226747', 'ENSG00000226822', 'ENSG00000226849', 'ENSG00000227925', 'ENSG00000228135', 'ENSG00000228434', 'ENSG00000228890', 'ENSG00000229611', ...[0m
[92m✓

[92m✓[0m created [1;95m56867 Gene records from Bionty[0m matching [3mensembl_gene_id[0m: [1;95m'ENSMUSG00000102693', 'ENSMUSG00000064842', 'ENSMUSG00000051951', 'ENSMUSG00000102851', 'ENSMUSG00000103377', 'ENSMUSG00000104017', 'ENSMUSG00000103025', 'ENSMUSG00000089699', 'ENSMUSG00000103201', 'ENSMUSG00000103147', 'ENSMUSG00000103161', 'ENSMUSG00000102331', 'ENSMUSG00000102348', 'ENSMUSG00000102592', 'ENSMUSG00000088333', 'ENSMUSG00000102343', 'ENSMUSG00000025900', 'ENSMUSG00000102948', 'ENSMUSG00000104123', 'ENSMUSG00000025902', ...[0m
[94m•[0m [1;91mdid not create[0m Gene records for [1;93m74 non-validated[0m [3mensembl_gene_ids[0m: [1;93m'ENSMUSG00000043623', 'ENSMUSG00000046145', 'ENSMUSG00000046388', 'ENSMUSG00000046747', 'ENSMUSG00000047189', 'ENSMUSG00000048316', 'ENSMUSG00000048406', 'ENSMUSG00000049202', 'ENSMUSG00000052005', 'ENSMUSG00000052241', 'ENSMUSG00000052426', 'ENSMUSG00000052779', 'ENSMUSG00000054304', 'ENSMUSG00000054379', 'ENSMUSG00000054957', 'ENSM

[92m✓[0m created [1;95m92 Gene records from Bionty[0m matching [3mensembl_gene_id[0m: [1;95m'ERCC-00002', 'ERCC-00003', 'ERCC-00004', 'ERCC-00009', 'ERCC-00012', 'ERCC-00013', 'ERCC-00014', 'ERCC-00016', 'ERCC-00017', 'ERCC-00019', 'ERCC-00022', 'ERCC-00024', 'ERCC-00025', 'ERCC-00028', 'ERCC-00031', 'ERCC-00033', 'ERCC-00034', 'ERCC-00035', 'ERCC-00039', 'ERCC-00040', ...[0m
[92m✓[0m [1;92m92 unique terms[0m (100.00%) are validated for [3mensembl_gene_id[0m


[92m✓[0m created [1;95m12 Gene records from Bionty[0m matching [3mensembl_gene_id[0m: [1;95m'ENSSASG00005000002', 'ENSSASG00005000003', 'ENSSASG00005000004', 'ENSSASG00005000006', 'ENSSASG00005000010', 'ENSSASG00005000007', 'ENSSASG00005000011', 'ENSSASG00005000009', 'ENSSASG00005000012', 'ENSSASG00005000008', 'ENSSASG00005000005', 'ENSSASG00005000013'[0m
[92m✓[0m [1;92m12 unique terms[0m (100.00%) are validated for [3mensembl_gene_id[0m


## Link metadata to individual artifacts

Annotate each artifact with the genes it measures:

In [32]:
organisms = bt.Organism.lookup(field=bt.Organism.scientific_name)

In [33]:
# TODO discuss whether we even want to keep doing this
# For every artifact, resolve its var names to Gene records and link them to the
# artifact as a schema in the "var" slot.
for idx, artifact in enumerate(artifacts):
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(artifacts)}")

    adata = artifact.open()
    var_names = adata.var_names
    organism_record = artifact.organisms.first()
    if organism_record is None:
        print(f"No organism found for artifact: {artifact}")
        continue

    genes = bt.Gene.from_values(
        var_names, field=bt.Gene.ensembl_gene_id, organism=organism_record
    )

    # fall back to human when nothing matched but the IDs look like human
    # Ensembl IDs; the length check avoids indexing into an empty index
    if len(genes) == 0 and len(var_names) > 0 and var_names[0].startswith("ENSG"):
        genes = bt.Gene.from_values(
            var_names, field=bt.Gene.ensembl_gene_id, organism="human"
        )

    # NOTE(review): the two branches below replace, rather than extend, the genes
    # resolved above — confirm that this is intended for mixed var names.
    if len(var_names[var_names.str.startswith("ERCC")]) > 0:
        genes = bt.Gene.from_values(
            var_names,
            field=bt.Gene.ensembl_gene_id,
            organism=organisms.synthetic_construct,
        )
    if len(var_names[var_names.str.startswith("ENSSASG")]) > 0:
        genes = bt.Gene.from_values(
            var_names,
            field=bt.Gene.ensembl_gene_id,
            organism=organisms.severe_acute_respiratory_syndrome_coronavirus_2,
        )

    # drop records still flagged as "adding" (i.e. not saved), then build the
    # schema only if something is left, so a schema is never built from an empty list
    genes = [g for g in genes if not g._state.adding]
    if genes:
        var_schema = ln.Schema(genes, dtype="number").save()
        artifact.feature_sets.add(var_schema, through_defaults={"slot": "var"})

annotating dataset 0 of 1573
[92m✓[0m loaded [1;92m36402 Gene records[0m matching [3mensembl_gene_id[0m: [1;92m'ENSG00000243485', 'ENSG00000237613', 'ENSG00000186092', 'ENSG00000238009', 'ENSG00000239945', 'ENSG00000239906', 'ENSG00000241860', 'ENSG00000241599', 'ENSG00000286448', 'ENSG00000236601', 'ENSG00000284733', 'ENSG00000235146', 'ENSG00000284662', 'ENSG00000229905', 'ENSG00000237491', 'ENSG00000177757', 'ENSG00000228794', 'ENSG00000225880', 'ENSG00000230368', 'ENSG00000272438', ...[0m
[92m✓[0m created [1;95m4 Gene records from Bionty[0m matching [3mensembl_gene_id[0m: [1;95m'ENSG00000224516', 'ENSG00000225489', 'ENSG00000285517', 'ENSG00000286061'[0m
[92m✓[0m loaded [1;92m61752 Gene records[0m matching [3mensembl_gene_id[0m: [1;92m'ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419', 'ENSG00000000457', 'ENSG00000000460', 'ENSG00000000938', 'ENSG00000000971', 'ENSG00000001036', 'ENSG00000001084', 'ENSG00000001167', 'ENSG00000001460', 'ENSG00000001461', 

KeyboardInterrupt: 

In [34]:
artifact.describe()

## Annotate tissue_type

In [35]:
register_ulabels(cxg_datasets, "tissue_type")

[93m![0m you are trying to create a record with name='tissue' but a record with similar name exists: 'is_tissue_type'. Did you mean to load it?
[93m![0m you are trying to create a record with name='organoid' but records with similar names exist: 'Chan_NatCommun_2022_bronchial_organoids', 'Lim_CellStemCell_2023_Lung_organoid_epithelium', 'Wesley_NatCellBio_2022_Hepatoblast_organoids_treated_with_hepatozyme'. Did you mean to load one of them?


## Register collections

In [None]:
# Look up the previous release's census collection, if there is one, so the new
# collection can be registered as its next version. `.one_or_none()` is used
# because `.one()` raises when the instance holds no earlier release.
previous_collection = ln.Collection.filter(
    key="cellxgene-census", version=previous_release
).one_or_none()
collection = ln.Collection(
    artifacts,
    key="cellxgene-census",
    version=census_version,
    is_new_version_of=previous_collection,
)
collection.save()

Collection(uid='1eshvBBOwC0IdzGZ0000', version='2025-01-30', is_latest=True, key='cellxgene-census', hash='NjqvY0g6hlzgyVXTYer0Ng', branch_id=1, space_id=1, created_by_id=1, run_id=1, created_at=2025-07-23 13:29:50 UTC)

In [37]:
cxg_collections = get_collections_from_cxg()

In [39]:
artifacts.first().key

'cell-census/2025-01-30/h5ads/00099d5e-154f-4a7a-aa8d-fa30c8c0c43c.h5ad'

In [40]:
# Create one collection per CELLxGENE collection that has at least one artifact
# in this release. `search_names` is switched off for the duration; the
# try/finally restores it even if a collection fails to register.
ln.settings.creation.search_names = False
try:
    for collection_meta in cxg_collections:
        # artifact keys this collection's datasets would have in this release
        keys = [
            f"cell-census/{census_version}/h5ads/{dataset['dataset_id']}.h5ad"
            for dataset in collection_meta["datasets"]
        ]
        collection_artifacts = artifacts.filter(key__in=keys).all()
        if collection_artifacts.count() == 0:
            # none of the collection's datasets are part of this release
            continue
        kwargs = {
            "key": collection_meta["name"],
            "description": collection_meta["doi"],
            "reference": collection_meta["collection_id"],
            "reference_type": "CELLxGENE Collection ID",
            "version": census_version,
        }
        collection_record = ln.Collection(
            collection_artifacts,
            **kwargs,
        )
        # if is needed here as .save() errors if collection is already saved
        if collection_record._state.adding:
            collection_record.save()
finally:
    ln.settings.creation.search_names = True

Add existing collections to their corresponding version families:

In [41]:
collections = ln.Collection.filter(version=census_version).all()
collections.count()

248

In [42]:
collections_previous = ln.Collection.filter(version=previous_release).all()
collections_previous.count()

0

In [43]:
# Attach each collection of this release to the version family of the
# previous-release collection that has the same reference.
for collection in collections:
    collection_previous = collections_previous.filter(reference=collection.reference).one_or_none()
    if collection_previous is None:
        # nothing from the previous release shares this reference
        continue
    collection.add_to_version_family(collection_previous, version=census_version)

## Link metadata to collection

In [44]:
collection = ln.Collection.filter(key="cellxgene-census", version=census_version).one()
collection

Collection(uid='1eshvBBOwC0IdzGZ0000', version='2025-01-30', is_latest=True, key='cellxgene-census', hash='NjqvY0g6hlzgyVXTYer0Ng', branch_id=1, space_id=1, created_by_id=1, run_id=1, created_at=2025-07-23 13:29:50 UTC)

feature sets:

In [45]:
collection.describe()

''

## Register the soma store

In [46]:
soma_path = f"s3://cellxgene-data-public/cell-census/{census_version}/soma"
ln.UPath(soma_path).view_tree()

19 sub-directories & 7 files with suffixes '', '.tdb'
s3://cellxgene-data-public/cell-census/2025-01-30/soma
├── __tiledb_group.tdb
├── __group/
│   └── __1738275389572_1738275389572_13353233232d51777852409560eaff8d_2
├── __meta/
│   ├── __1738275389508_1738275389508_4e0e75839417b5b157905a31316816e6
│   └── __1738275389572_1738275389572_2639dfe80b470afbf96934cfd352226e
├── census_data/
│   ├── __tiledb_group.tdb
│   ├── __group/
│   ├── __meta/
│   ├── homo_sapiens/
│   └── mus_musculus/
├── census_info/
│   ├── __tiledb_group.tdb
│   ├── __group/
│   ├── __meta/
│   ├── datasets/
│   ├── organisms/
│   ├── summary/
│   └── summary_cell_counts/
└── census_spatial_sequencing/
    ├── __tiledb_group.tdb
    ├── __group/
    ├── __meta/
    ├── homo_sapiens/
    └── mus_musculus/


In [47]:
# Register the SOMA directory as a single artifact, described by the census
# version, and display the saved record.
soma_artifact = ln.Artifact(soma_path, description=f"Census {census_version}").save()
soma_artifact

[96m•[0m path in storage 's3://cellxgene-data-public' with key 'cell-census/2025-01-30/soma'


Artifact(uid='DstoXzgJPW2PrnYr0000', is_latest=True, key='cell-census/2025-01-30/soma', description='Census 2025-01-30', suffix='', size=1128639499732, hash='uq_4QzGnaveTq5XinBJiCA', n_files=14217, branch_id=1, space_id=1, storage_id=2, run_id=1, created_by_id=1, created_at=2025-07-23 13:32:09 UTC)

In [48]:
ln.finish()

[94m•[0m please hit CTRL + s to save the notebook in your editor . [92m✓[0m
[93m![0m cells [(35, None), (None, 37), (37, 39)] were not run consecutively
[92m→[0m finished Run('xkjTYBy2') after 4h at 2025-07-23 13:32:12 UTC
