# Register the cellxgene metadata

In this notebook, we show how to register all CELLxGENE datasets and their metadata with LaminDB in under 10 minutes.

Registered metadata can be readily used for querying, validating, annotating and integrating data, see {doc}`./cellxgene`.

For background, see the [CELLxGENE Discover API](https://api.cellxgene.cziscience.com/curation/ui/#/) and the [cellxgene-census tutorials](https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_query_extract.html).

## Setup

In [None]:
# Initialize a LaminDB instance with local storage at ./test-cellxgene-registries and the bionty schema.
!lamin init --storage ./test-cellxgene-registries --schema bionty

In [None]:
import lamindb as ln
import bionty as bt
import cellxgene_census
import pandas as pd

In [None]:
# NOTE(review): presumably registers this notebook run with LaminDB so that records
# created below are linked to it — confirm against the lamindb documentation.
ln.track()

## Register datasets

In [None]:
# Census release to register (an LTS release); also used as the version of the Collection created below.
census_version = "2023-07-25"  # LTS release of Census

Get the h5ad artifacts directory on s3 from Census:

In [None]:
# Look up the h5ad directory of the pinned `census_version` release instead of
# whatever "stable" currently resolves to, so the displayed location matches
# the release whose files are registered in this notebook.
# Indexing (rather than chained .get) raises a clear KeyError if the release,
# or one of the expected keys, is missing from the directory.
census_directory = cellxgene_census.get_census_version_directory()
h5ad_dir = census_directory[census_version]["h5ads"]["uri"]
h5ad_dir

In [None]:
# Build the path from `census_version` so the listing always shows the pinned
# release rather than a separately hard-coded date.
ln.UPath(f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads").view_tree()

In [None]:
# Create artifact records from the h5ad directory of the pinned release and save them.
# The path is derived from `census_version` so that changing the version in one
# place changes what gets registered.
artifacts = ln.Artifact.from_dir(
    f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
)
ln.save(artifacts)

In [None]:
# Group the registered artifacts into a single collection versioned by the Census release.
collection = ln.Collection(
    artifacts,
    name="cellxgene-census",
    version=census_version,
)
collection.save()

## Register metadata

Get all datasets and their associated metadata using the CELLxGENE REST API:

In [None]:
import requests


def get_metadata_from_cxg(timeout: float = 60.0):
    """Fetch the metadata of all datasets from the CELLxGENE curation API.

    Args:
        timeout: Seconds to wait for the server to connect / send data before
            giving up. Without a timeout, `requests` may block indefinitely.

    Returns:
        The decoded JSON body of the `/curation/v1/datasets` response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    api_url_base = "https://api.cellxgene.cziscience.com"
    datasets_path = "/curation/v1/datasets"
    datasets_url = f"{api_url_base}{datasets_path}"
    headers = {"Content-Type": "application/json"}
    res = requests.get(url=datasets_url, headers=headers, timeout=timeout)
    # fail loudly on 4xx/5xx instead of trying to parse an error payload
    res.raise_for_status()
    return res.json()

In [None]:
# Fetch the metadata of all datasets and show how many entries were returned.
cellxgene_meta = get_metadata_from_cxg()
len(cellxgene_meta)

In [None]:
# Inspect which metadata fields are available on the first dataset entry.
cellxgene_meta[0].keys()

### features

In [None]:
# Maps each obs-level feature name to the registry its categories are stored in.
obs_features = dict(
    assay="bionty.ExperimentalFactor",
    cell_type="bionty.CellType",
    development_stage="bionty.DevelopmentalStage",
    disease="bionty.Disease",
    donor_id="core.ULabel",
    self_reported_ethnicity="bionty.Ethnicity",
    sex="bionty.Phenotype",
    suspension_type="core.ULabel",
    tissue="bionty.Tissue",
)

# One categorical Feature record per entry above.
obs_features_records = [
    ln.Feature(name=feature_name, type="category", registries=registry_name)
    for feature_name, registry_name in obs_features.items()
]
ln.save(obs_features_records)

# Bundle the features into a feature set and link it to every artifact under the "obs" slot.
obs_feature_set = ln.FeatureSet(name="obs features", features=obs_features_records)
obs_feature_set.save()
obs_feature_set.artifacts.set(artifacts, through_defaults={"slot": "obs"})

In [None]:
# Dataset-level ("external") features and the registries holding their categories.
ext_features = dict(organism="bionty.Organism", collection="core.ULabel")

# One categorical Feature record per entry above.
ext_features_records = [
    ln.Feature(name=feature_name, type="category", registries=registry_name)
    for feature_name, registry_name in ext_features.items()
]
ln.save(ext_features_records)

# Bundle the features into a feature set and link it to every artifact under the "external" slot.
ext_feature_set = ln.FeatureSet(name="external features", features=ext_features_records)
ext_feature_set.save()
ext_feature_set.artifacts.set(artifacts, through_defaults={"slot": "external"})

### collections, organisms

Register collections:

In [None]:
# Parent label under which all CELLxGENE collections are grouped.
is_collection = ln.ULabel(name="is_collection")
is_collection.save()

# Unique (name, doi, id) triples across all datasets; a set removes the
# repetition caused by several datasets sharing one collection.
collections_meta = {
    (
        dataset_meta["collection_name"],
        dataset_meta["collection_doi"],
        dataset_meta["collection_id"],
    )
    for dataset_meta in cellxgene_meta
}

# One ULabel per collection: the DOI goes into `description`, the collection id into `reference`.
collections_records = [
    ln.ULabel(
        name=collection_name,
        description=collection_doi,
        reference=collection_id,
        reference_type="collection_id",
    )
    for collection_name, collection_doi, collection_id in collections_meta
]
ln.save(collections_records)
is_collection.children.set(collections_records)

Register organisms:

In [None]:
ncbitaxon_source = bt.PublicSource.filter(source="ncbitaxon").one()

# Unique organism ontology ids referenced by any dataset.
organisms_meta = {
    organism["ontology_term_id"]
    for dataset_meta in cellxgene_meta
    for organism in dataset_meta["organism"]
}

organisms_records = bt.Organism.from_values(
    organisms_meta,
    field=bt.Organism.ontology_id,
    public_source=ncbitaxon_source,
)

# rename house mouse to mouse
for organism_record in organisms_records:
    if organism_record.name == "house mouse":
        organism_record.name = "mouse"

ln.save(organisms_records, parents=False)

Annotate artifacts with collections and organisms:

In [None]:
# Lookup object for the external features; kept under its own name so the
# `ext_features` dict defined earlier is not overwritten.
ext_features_lookup = ext_feature_set.members.lookup()
artifacts = collection.artifacts.all()

for dataset_meta in cellxgene_meta:
    # get registered file record based on dataset_id
    file = artifacts.filter(key__contains=dataset_meta["dataset_id"]).one_or_none()
    if file is None:
        # no registered artifact matches this dataset
        continue

    # register collection
    # NOTE: use a dedicated name here. Rebinding `collection` would replace the
    # ln.Collection used above with a ULabel and break a re-run of this cell.
    collection_label = ln.ULabel.filter(reference=dataset_meta["collection_id"]).one()
    file.labels.add(collection_label, feature=ext_features_lookup.collection)

    # register organism
    organism_ontology_ids = [i["ontology_term_id"] for i in dataset_meta["organism"]]
    organism_records = bt.Organism.filter(ontology_id__in=organism_ontology_ids).list()
    file.labels.add(organism_records, feature=ext_features_lookup.organism)

### ontologies

Register all ontology ids:

In [None]:
from typing import Optional
from lnschema_bionty.models import Registry
from lamindb.dev._feature_manager import get_accessor_by_orm

obs_features_records = obs_feature_set.members.lookup()
ACCESSORS = get_accessor_by_orm(ln.Artifact)

# feature name -> (accessor name on ln.Artifact, model class behind that accessor)
FEATURE_TO_ACCESSOR = {}
for name in obs_features:
    registries = getattr(obs_features_records, name).registries
    accessor = ACCESSORS.get(registries)
    accessor_field = getattr(ln.Artifact, accessor).field
    # TODO: ulabels are defined in the Artifact model, improve this in LaminDB
    # When the field's model is Artifact itself, the target is the related model instead.
    orm = (
        accessor_field.related_model
        if accessor_field.model == ln.Artifact
        else accessor_field.model
    )
    FEATURE_TO_ACCESSOR[name] = (accessor, orm)


def create_ontology_record_from_source(
    ontology_id: str,
    from_orm: Registry,
    target_orm: Registry,
    public_source: Optional[bt.PublicSource] = None,
):
    """Build a `target_orm` record that mirrors a term looked up via `from_orm`.

    The term is first instantiated with `from_orm` (from `ontology_id` and
    `public_source`); its name, description, ontology id and public source id
    are then copied into a new `target_orm` record. The record is not saved here.

    Args:
        ontology_id: Ontology id of the term to copy.
        from_orm: Registry class used to look the term up.
        target_orm: Registry class of the record to create.
        public_source: Optional public source passed to `from_orm`.

    Returns:
        The new `target_orm` record, or `None` if building it raised an
        exception. Callers must be prepared to skip `None`.
    """
    from_record = from_orm(ontology_id=ontology_id, public_source=public_source)
    try:
        return target_orm(
            name=from_record.name,
            description=from_record.description,
            ontology_id=from_record.ontology_id,
            public_source_id=from_record.public_source_id,
        )
    except Exception as error:
        # Best-effort: keep going, but report the failure instead of hiding it.
        target_name = getattr(target_orm, "__name__", target_orm)
        print(f"could not create {target_name} record for {ontology_id}: {error!r}")
        return None

In [None]:
# add parents for several cell types here for CI
for cell_type_ontology_id in (
    "CL:0000911",
    "CL:0000910",
    "CL:0001044",
    "CL:0001050",
    "CL:0011025",
):
    bt.CellType(ontology_id=cell_type_ontology_id).save()

In [None]:
ln.settings.upon_create_search_names = False

# Collect, per ontology-backed feature, the (label, ontology_term_id) pairs
# used by any dataset. donor_id and suspension_type are skipped here.
ontology_ids = {}
for name in obs_features.keys():
    if name in ["donor_id", "suspension_type"]:
        continue
    allids = set()
    for i in cellxgene_meta:
        if name in i:
            allids.update([(j["label"], j["ontology_term_id"]) for j in i[name]])

    ontology_ids[name] = allids

public_source_ds_mouse = bt.PublicSource.filter(
    entity="DevelopmentalStage", organism="mouse"
).one()
public_source_pato = bt.PublicSource.filter(source="pato").one()

# register all ontology ids
for name, terms in ontology_ids.items():
    print(f"registering {name}")
    accessor, orm = FEATURE_TO_ACCESSOR.get(name)
    terms_ids = [i[1] for i in terms]
    records = orm.from_values(terms_ids, field="ontology_id")
    if len(records) > 0:
        ln.save(records, parents=False)  # not saving parents for CI
    # anything still not validated after the first pass needs special handling
    inspect_result = orm.inspect(terms_ids, field="ontology_id", mute=True)
    if len(inspect_result.non_validated) > 0:
        if name == "development_stage":
            # retry against the mouse developmental stage source
            records = orm.from_values(
                inspect_result.non_validated,
                field="ontology_id",
                public_source=public_source_ds_mouse,
            )
            # UBERON terms are copied over from the Tissue registry
            records += [
                create_ontology_record_from_source(
                    ontology_id=term_id, from_orm=bt.Tissue, target_orm=orm
                )
                for term_id in inspect_result.non_validated
                if term_id.startswith("UBERON:")
            ]
            # "unknown" is registered as-is
            records += [
                orm(name=term_id, ontology_id=term_id)
                for term_id in inspect_result.non_validated
                if term_id == "unknown"
            ]
        else:
            # non-PATO terms are created directly from the label/id in the metadata
            records = [
                orm(name=term[0], ontology_id=term[1])
                for term in terms
                if not term[1].startswith("PATO:")
                and term[1] in inspect_result.non_validated
            ]
            # PATO terms are copied over from the Phenotype registry
            records += [
                create_ontology_record_from_source(
                    ontology_id=term_id,
                    from_orm=bt.Phenotype,
                    target_orm=orm,
                    public_source=public_source_pato,
                )
                for term_id in inspect_result.non_validated
                if term_id.startswith("PATO:")
            ]

        # create_ontology_record_from_source returns None when it could not
        # build a record; drop those so they are never handed to ln.save
        records = [record for record in records if record is not None]

        if len(records) > 0:
            print(f"registered {len(records)} records: {records}")
            ln.save(records, parents=False)  # not saving parents for CI

### donors and suspension_types

In [None]:
# Unique donor ids across all datasets that carry a "donor_id" entry.
donor_ids = {
    donor_id
    for dataset_meta in cellxgene_meta
    if "donor_id" in dataset_meta
    for donor_id in dataset_meta["donor_id"]
}

# Unique suspension types across all datasets that carry a "suspension_type" entry.
suspension_types = {
    suspension_type
    for dataset_meta in cellxgene_meta
    if "suspension_type" in dataset_meta
    for suspension_type in dataset_meta["suspension_type"]
}

# Parent labels under which the individual donors / suspension types are grouped.
is_donor = ln.ULabel(description="parent of donor ids", name="is_donor")
is_donor.save()

is_suspension_type = ln.ULabel(
    description="parent of suspension types",
    name="is_suspension_type",
)
is_suspension_type.save()

In [None]:
def _register_child_labels(parent_name, names):
    """Create ULabels for `names` not yet among the children of a parent label.

    Args:
        parent_name: Name of the parent ULabel; exactly one such label must exist.
        names: Candidate child label names.

    Returns:
        Tuple of (parent label, list of newly created child labels).
    """
    parent = ln.ULabel.filter(name=parent_name).one()
    # names that are not validated against the existing children still need a label
    inspection = parent.children.all().inspect(names, mute=True)
    new_labels = [ln.ULabel(name=name) for name in inspection.non_validated]
    ln.save(new_labels)
    parent.children.add(*new_labels)
    return parent, new_labels


is_donor, new_donors = _register_child_labels("is_donor", donor_ids)
is_suspension_type, new_stypes = _register_child_labels(
    "is_suspension_type", suspension_types
)

## Annotate artifacts with metadata

In [None]:
features = ln.Feature.lookup()

for dataset_meta in cellxgene_meta:
    # only runs 1 file
    if dataset_meta["dataset_id"] != "8f98c236-43f0-4dc4-985b-c304499f7b44":
        continue
    file = artifacts.filter(key__contains=dataset_meta["dataset_id"]).one_or_none()
    if file is None:
        continue
    for field, terms in dataset_meta.items():
        # skip metadata fields that are not registered obs features
        if field not in FEATURE_TO_ACCESSOR:
            continue
        accessor, orm = FEATURE_TO_ACCESSOR[field]
        if field in ["donor_id", "suspension_type"]:
            # these values are plain names, not ontology terms
            records = orm.from_values(terms, field="name")
            if len(records) > 0:
                # stratify by feature so that link tables records are written
                file.labels.add(records, feature=getattr(features, field))
        else:
            records = orm.from_values(
                [i["ontology_term_id"] for i in terms], field="ontology_id"
            )
            if len(records) > 0:
                getattr(file, accessor).add(*records)

## Validate and register genes

In [None]:
# genes artifacts: gene table URL per organism, keyed by the organism's
# attribute name on the scientific-name lookup created below
genes_files = dict(
    homo_sapiens="https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_homo_sapiens.csv.gz",
    mus_musculus="https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_mus_musculus.csv.gz",
    synthetic_construct="https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_ercc.csv.gz",
)

# register synthetic constructs as a new organism
synthetic_construct_record = bt.Organism.from_public(
    ontology_id="NCBITaxon:32630",
    public_source=ncbitaxon_source,
)
synthetic_construct_record.save(parents=False)

# build the lookup only after the new organism has been saved
organisms = bt.Organism.lookup(field=bt.Organism.scientific_name)

Register all genes for each organism:

In [None]:
for organism_name, genes_file in genes_files.items():
    print(f"registering {organism_name} genes")
    organism_record = getattr(organisms, organism_name)

    # headerless table; the first column becomes the index and is used as the Ensembl gene id
    df = pd.read_csv(genes_file, header=None, index_col=0)
    gene_ids = df.index

    gene_records = bt.Gene.from_values(
        gene_ids, field=bt.Gene.ensembl_gene_id, organism=organism_record
    )
    ln.save(gene_records)

    # boolean mask over gene_ids: which ids now validate against the registry
    validated = bt.Gene.validate(
        gene_ids, field=bt.Gene.ensembl_gene_id, organism=organism_record
    )

    # register legacy genes manually, taking the symbol from column 1 of the table
    new_records = [
        bt.Gene(
            ensembl_gene_id=gene_id,
            symbol=df.loc[gene_id][1],
            organism=organism_record,
        )
        for gene_id in gene_ids[~validated]
    ]
    ln.save(new_records)

    genes_feature_set = ln.FeatureSet(
        features=gene_records + new_records,
        name=f"all {organism_record.name} genes",
    )
    genes_feature_set.save()

## Link metadata to individual artifacts

Here we show how to link a file to its validated metadata records:

In [None]:
feature_sets = {}
# take the file that corresponds to one dataset
file = artifacts.filter(
    key__contains="8f98c236-43f0-4dc4-985b-c304499f7b44.h5ad"
).one()

Genes measured in the dataset:

In [None]:
adata_backed = file.backed()
var_names = adata_backed.var_names

# genes of the organism linked to the file
genes = bt.Gene.from_values(
    var_names, field=bt.Gene.ensembl_gene_id, organism=file.organism.first()
)

# ERCC ids are registered under the synthetic construct organism; look up only
# those ids there instead of re-submitting every var name
ercc_names = var_names[var_names.str.startswith("ERCC")]
if len(ercc_names) > 0:
    genes += bt.Gene.from_values(
        ercc_names, field=bt.Gene.ensembl_gene_id, organism=organisms.synthetic_construct
    )

# link the measured genes to the file under the "var" slot
var_feature_set_file = ln.FeatureSet(genes, type="number")
var_feature_set_file.save()
file.feature_sets.add(var_feature_set_file, through_defaults={"slot": "var"})

In [None]:
# Show a description of the file record.
# NOTE(review): presumably this includes the labels and feature sets linked above — confirm.
file.describe()

In [None]:
# clean up test instance: delete the LaminDB instance, then remove the local
# storage directory that was passed to `lamin init --storage` at the top
!lamin delete --force test-cellxgene-registries
!rm -r ./test-cellxgene-registries