# Census release 2023-12-15 (LTS)

In [None]:
import lamindb as ln
import lnschema_bionty as lb
from cellxgene_lamin import get_datasets_from_cxg, get_collections_from_cxg

# Set lamindb's verbosity setting to "hint" for the rest of the notebook.
ln.settings.verbosity = "hint"

In [None]:
# Census release handled by this notebook; reused below to build the S3 path
# and to version the artifacts and collections.
census_version = "2023-12-15"

In [None]:
# S3 location of the h5ad files belonging to this census release.
s3path = f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
ln.UPath(s3path)

In [None]:
# Display the tree view of the S3 directory.
ln.UPath(s3path).view_tree()

In [None]:
# NOTE(review): presumably registers this notebook run so that the records
# created below are linked to it — confirm against the lamindb docs.
ln.track()

Get all datasets and their associated metadata using the CELLxGENE REST API:

In [None]:
# Fetch the dataset metadata and show how many datasets were returned.
cxg_datasets = get_datasets_from_cxg()
len(cxg_datasets)

In [None]:
# Inspect which metadata fields the first dataset entry provides.
cxg_datasets[0].keys()

## Register artifacts

In [None]:
# Create artifact records from the S3 directory and save them.
artifacts = ln.Artifact.from_dir(s3path)
ln.save(artifacts)

In [None]:
# Re-query the artifacts whose key contains the census version; later cells
# call `.filter()` and `.update()` on this result.
artifacts = ln.Artifact.filter(key__contains=census_version).all()
len(artifacts)

In [None]:
# Set `version` to the census version on the selected artifacts via `.update()`.
artifacts.update(version=census_version)

In [None]:
# Copy the cell count and title reported by CELLxGENE onto the artifact whose
# key contains the dataset id.
for cxg_dataset in cxg_datasets:
    dataset_id = cxg_dataset["dataset_id"]
    artifact = artifacts.filter(key__contains=dataset_id).one_or_none()
    if artifact is None:
        # No artifact of this release matches the dataset; nothing to update.
        continue
    artifact.n_observations = cxg_dataset["cell_count"]
    artifact.description = cxg_dataset["title"]
    artifact.save()

In [None]:
# Artifacts whose key contains "2023-07-25" (the release that this notebook
# versions against), and how many there are.
artifacts_20230725 = ln.Artifact.filter(key__contains="2023-07-25").all()
artifacts_20230725.count()

In [None]:
# Pair each artifact with the 2023-07-25 artifact whose key ends with the same
# file name and, when one exists, put both into the same version family.
for artifact in artifacts:
    file_name = artifact.key.rsplit("/", 1)[-1]
    matches_20230725 = artifacts_20230725.filter(key__endswith=file_name)
    artifact_20230725 = matches_20230725.one_or_none()
    if artifact_20230725 is None:
        continue
    artifact.add_to_version_family(artifact_20230725, version=census_version)

## Register metadata

### Register new features and parent labels

In [None]:
from cellxgene_lamin._features import OBS_FEATURES, EXT_FEATURES, register_feature_set

In [None]:
# Reuse the "obs features" feature set if one with that name exists;
# otherwise register it from the artifacts.
obs_feature_set = ln.FeatureSet.filter(name="obs features").one_or_none()
if obs_feature_set is None:
    obs_feature_set = register_feature_set(artifacts, "obs")

# Same for the "external metadata" feature set.
ext_feature_set = ln.FeatureSet.filter(name="external metadata").one_or_none()
if ext_feature_set is None:
    ext_feature_set = register_feature_set(artifacts, "ext")

In [None]:
# Lookup object for features; accessed by attribute in later cells
# (e.g. `features.tissue_type`).
features = ln.Feature.lookup()

## Organisms

In [None]:
from cellxgene_lamin._organism import register_organisms, annotate_organisms

In [None]:
# Register organisms from the dataset metadata (helper from
# cellxgene_lamin._organism; its exact behavior is not visible here).
register_organisms(cxg_datasets)

Link organisms to artifacts:

In [None]:
# Annotate the artifacts with organisms based on the dataset metadata (helper
# from cellxgene_lamin._organism; its exact behavior is not visible here).
annotate_organisms(artifacts, cxg_datasets)

## Ontologies

Register all ontology ids:

In [None]:
from cellxgene_lamin._ontology import register_ontology_ids

In [None]:
# Register the ontology ids found in the dataset metadata (helper from
# cellxgene_lamin._ontology; its exact behavior is not visible here).
register_ontology_ids(cxg_datasets)

## Donors and suspension types

In [None]:
from cellxgene_lamin._labels import register_ulabels

In [None]:
# Register labels for the "donor_id" field of the dataset metadata.
register_ulabels(cxg_datasets, "donor_id")

In [None]:
# Register labels for the "suspension_type" field of the dataset metadata.
register_ulabels(cxg_datasets, "suspension_type")

## Annotate artifacts with obs metadata

In [None]:
from cellxgene_lamin._features import FEATURE_TO_ACCESSOR

In [None]:
features = ln.Feature.lookup()

# Fields whose terms are looked up by name; every other annotated field carries
# a list of dicts with an "ontology_term_id" entry.
# NOTE(review): "tissue_type" labels are registered by a later cell
# (`register_ulabels(cxg_datasets, "tissue_type")`) — confirm they already
# exist when this loop runs.
name_fields = ("donor_id", "suspension_type", "tissue_type")

for idx, cxg_dataset in enumerate(cxg_datasets):
    # progress message every 100 datasets
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(cxg_datasets)}")
    artifact = artifacts.filter(key__contains=cxg_dataset["dataset_id"]).one_or_none()
    if artifact is None:
        continue
    for field, terms in cxg_dataset.items():
        if field not in FEATURE_TO_ACCESSOR:
            continue
        accessor, orm = FEATURE_TO_ACCESSOR[field]
        if field not in name_fields:
            # ontology-backed field: link the records through the artifact's
            # dedicated accessor
            ontology_ids = [term["ontology_term_id"] for term in terms]
            records = orm.from_values(ontology_ids, field="ontology_id")
            if len(records) > 0:
                getattr(artifact, accessor).add(*records)
            continue
        records = orm.from_values(terms, field="name")
        if len(records) > 0:
            # stratify by feature so that link tables records are written
            artifact.labels.add(records, feature=getattr(features, field))

# clean up the 2 "unknowns" in DevelopmentalStage
lb.DevelopmentalStage.filter(name="unknown").exclude(ontology_id="unknown").delete()

## Validate and register genes

In [None]:
from cellxgene_lamin._gene import register_genes

Register all genes for each organism:

In [None]:
# Register genes (helper from cellxgene_lamin._gene; takes no arguments here).
register_genes()

## Link metadata to individual artifacts

Annotate each artifact with the genes measured in it:

In [None]:
# Lookup of organisms by scientific name; used below to pick the organisms for
# ERCC and ENSSASG gene ids.
organisms = lb.Organism.lookup(field=lb.Organism.scientific_name)

In [None]:
# Gene-id prefixes that are looked up under a dedicated organism, given as
# (prefix, attribute name on the `organisms` lookup). The attribute is resolved
# only when a matching id is present.
extra_gene_sources = (
    ("ERCC", "synthetic_construct"),
    ("ENSSASG", "severe_acute_respiratory_syndrome_coronavirus_2"),
)

for idx, artifact in enumerate(artifacts):
    # progress message every 100 artifacts
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(artifacts)}")

    # read the variable names through the backed accessor
    adata_backed = artifact.backed()
    var_names = adata_backed.var_names

    organism_record = artifact.organism.first()
    if organism_record is None:
        print(f"No organism found for artifact: {artifact}")
        continue

    gene_field = lb.Gene.ensembl_gene_id
    genes = lb.Gene.from_values(var_names, field=gene_field, organism=organism_record)

    # nothing matched for the annotated organism but the ids look human
    if len(genes) == 0 and var_names[0].startswith("ENSG"):
        genes += lb.Gene.from_values(var_names, field=gene_field, organism="human")

    for prefix, organism_attr in extra_gene_sources:
        if len(var_names[var_names.str.startswith(prefix)]) == 0:
            continue
        genes += lb.Gene.from_values(
            var_names,
            field=gene_field,
            organism=getattr(organisms, organism_attr),
        )

    # store the genes as a feature set linked to the artifact's "var" slot
    var_feature_set_artifact = ln.FeatureSet(genes, type="number")
    var_feature_set_artifact.save()
    artifact.feature_sets.add(
        var_feature_set_artifact, through_defaults={"slot": "var"}
    )

In [None]:
# Inspect the last artifact iterated by the loop above.
artifact.describe()

## Annotate tissue_type

Before CxG schema 4.0, the `tissue_type` column was not annotated; instead, "cell culture" or "organoid" was appended to the record's `ontology_id`.

In [None]:
# Register labels for the "tissue_type" field of the dataset metadata.
register_ulabels(cxg_datasets, "tissue_type")

In [None]:
# Parent label "is_tissue_type"; its children are exposed as a lookup so they
# can be accessed by attribute (`tissue_types.organoid`, `tissue_types.cell_culture`).
is_tissue_type = ln.ULabel.filter(name="is_tissue_type").one()
tissue_types = is_tissue_type.children.lookup()
features = ln.Feature.lookup()

In [None]:
# Tissue records whose ontology_id contains "organoid"; display them via `.df()`.
organoids = lb.Tissue.filter(ontology_id__contains="organoid").all()
organoids.df()

In [None]:
# For every tissue record with an "organoid" ontology_id, take the part of the
# id before the first space, make sure a Tissue record exists for it, and label
# that record's artifacts with the "organoid" tissue type.
# NOTE(review): the labelled artifacts are those linked to `tissue_record`, not
# to the original `record` — confirm this is intended.
for record in organoids:
    print(record.name)
    ontology_id = record.ontology_id.partition(" ")[0]
    tissue_record = lb.Tissue.from_bionty(ontology_id=ontology_id)
    # only save records that are not yet in the database
    if tissue_record._state.adding:
        tissue_record.save()
    for linked_artifact in tissue_record.artifacts.all():
        linked_artifact.labels.add(tissue_types.organoid, features.tissue_type)

In [None]:
# Delete the tissue records selected above (those with "organoid" in their ontology_id).
organoids.delete()

In [None]:
# Tissue records whose ontology_id contains "cell culture"; display them via `.df()`.
cell_cultures = lb.Tissue.filter(ontology_id__contains="cell culture").all()
cell_cultures.df()

In [None]:
# For every tissue record with a "cell culture" ontology_id, take the part of
# the id before the first space, make sure a CellType record exists for it, and
# label that record's artifacts with the "cell_culture" tissue type.
# NOTE(review): the labelled artifacts are those linked to the CellType record,
# not to the original `record` — confirm this is intended.
for record in cell_cultures:
    print(record.name)
    ontology_id = record.ontology_id.partition(" ")[0]
    cell_type_record = lb.CellType.from_bionty(ontology_id=ontology_id)
    # only save records that are not yet in the database
    if cell_type_record._state.adding:
        cell_type_record.save()
    for linked_artifact in cell_type_record.artifacts.all():
        linked_artifact.labels.add(tissue_types.cell_culture, features.tissue_type)

In [None]:
# Delete the tissue records selected above (those with "cell culture" in their ontology_id).
cell_cultures.delete()

## Register collections

In [None]:
# The 2023-07-25 "cellxgene-census" collection is the predecessor of the
# collection created for this release.
previous_census = ln.Collection.filter(
    name="cellxgene-census", version="2023-07-25"
).one()

# One collection holding all artifacts of this census release.
collection = ln.Collection(
    artifacts,
    name="cellxgene-census",
    version=census_version,
    is_new_version_of=previous_census,
)
collection.save()

In [None]:
# Fetch the collection metadata; each entry is read below for its "datasets",
# "name", "doi" and "collection_id" fields.
cxg_collections = get_collections_from_cxg()

In [None]:
# Create one collection per CELLxGENE collection that has at least one artifact
# in this census release.
for collection_meta in cxg_collections:
    # artifact keys expected for the datasets of this collection
    keys = [
        f'cell-census/{census_version}/h5ads/{dataset["dataset_id"]}.h5ad'
        for dataset in collection_meta["datasets"]
    ]
    collection_artifacts = artifacts.filter(key__in=keys).all()
    if collection_artifacts.count() == 0:
        # none of the collection's datasets are part of this release
        continue
    collection_record = ln.Collection(
        collection_artifacts,
        name=collection_meta["name"],
        description=collection_meta["doi"],
        reference=collection_meta["collection_id"],
        reference_type="CELLxGENE Collection ID",
        version=census_version,
    )
    # if is needed here as .save() errors if collection is already saved
    if collection_record._state.adding:
        collection_record.save()

Add existing collections to their corresponding version families:

In [None]:
# Collections carrying the current census version, and how many there are.
collections = ln.Collection.filter(version=census_version).all()
collections.count()

In [None]:
# Collections carrying version "2023-07-25", and how many there are.
collections_20230725 = ln.Collection.filter(version="2023-07-25").all()
collections_20230725.count()

In [None]:
# Match collections across releases by their `reference` and, when a
# 2023-07-25 counterpart exists, put both into the same version family.
for collection in collections:
    matches_20230725 = collections_20230725.filter(reference=collection.reference)
    collection_20230725 = matches_20230725.one_or_none()
    if collection_20230725 is None:
        continue
    collection.add_to_version_family(collection_20230725, version=census_version)

## Link metadata to collection

In [None]:
# Fetch the "cellxgene-census" collection of this release; `.one()` is used, so
# exactly one match is expected.
collection = ln.Collection.filter(name="cellxgene-census", version=census_version).one()

Feature sets:

In [None]:
# Link the feature set whose name contains "obs" under the "obs" slot.
obs_set = ln.FeatureSet.filter(name__contains="obs").one()
collection.feature_sets.add(obs_set, through_defaults={"slot": "obs"})

# Link the feature set whose name contains "ext" under the "external" slot.
ext_set = ln.FeatureSet.filter(name__contains="ext").one()
collection.feature_sets.add(ext_set, through_defaults={"slot": "external"})

In [None]:
# Inspect the census collection, including the feature sets linked above.
collection.describe()