![](https://img.shields.io/badge/3/4-lightgrey)

# Register cellxgene-census metadata - mouse

In [None]:
import lamindb as ln
import lnschema_bionty as lb
import cellxgene_census

In [None]:
# Set the lnschema_bionty organism setting to mouse.
lb.settings.organism = "mouse"
# Scientific name of the configured organism; used below as the key into
# census["census_data"].
mouse = lb.settings.organism.scientific_name
# Name of the measurement accessed below via census_data.ms[rna].
rna = "RNA"

In [None]:
# Call lamindb's tracking entry point before any data is touched
# (presumably registers this notebook run - confirm against the lamindb docs).
ln.track()

In [None]:
# Open the census pinned to version "2023-07-25"; the handle is released
# again with census.close() at the end of the notebook.
census = cellxgene_census.open_soma(census_version="2023-07-25")

In [None]:
# Select the mouse entry of the census data, keyed by the scientific name
# stored in `mouse`.
census_data = census["census_data"][mouse]

## Validate and register genes

Gene metadata:

In [None]:
# Show the keys of the RNA measurement's `var` (gene metadata) table.
census_data.ms[rna].var.keys()

In [None]:
# Read the whole `var` table of the RNA measurement and convert it to pandas.
gene_metadata = (
    census_data.ms[rna]
    .var.read()
    .concat()
    .to_pandas()
)

In [None]:
# (rows, columns) of the gene metadata dataframe.
gene_metadata.shape

In [None]:
# Preview the first rows of the gene metadata.
gene_metadata.head()

In [None]:
# Inspect the census `feature_id` values against the `ensembl_gene_id` field
# of lb.Gene; the trailing semicolon suppresses the cell's return value.
lb.Gene.inspect(gene_metadata["feature_id"], field=lb.Gene.ensembl_gene_id);

In [None]:
# Step 1: build gene records from the census Ensembl IDs via bionty and
# save them.
gene_records = lb.Gene.from_values(
    gene_metadata["feature_id"],
    field=lb.Gene.ensembl_gene_id,
)
ln.save(gene_records)

# Step 2: boolean mask of the feature IDs that validate against the registry.
validated = lb.Gene.validate(
    gene_metadata["feature_id"],
    field=lb.Gene.ensembl_gene_id,
)

# Step 3: the remaining (legacy) genes are registered by hand, taking the
# symbol from the census `feature_name` column.
gene_metadata_id = gene_metadata.set_index("feature_id")
records = [
    lb.Gene(
        ensembl_gene_id=gene_id,
        symbol=gene_metadata_id.loc[gene_id].feature_name,
    )
    for gene_id in gene_metadata["feature_id"][~validated]
]
ln.save(records)

In [None]:
# Build gene records again for every census feature ID.
gene_records = lb.Gene.from_values(
    gene_metadata["feature_id"],
    field=lb.Gene.ensembl_gene_id,
)

# Bundle all of them into one named feature set and save it.
feature_set = ln.FeatureSet(
    name="all mouse genes in cellxgene-census RNA measurement",
    features=gene_records,
)
feature_set.save()

## Observational metadata

All available metadata columns:

In [None]:
# Show the keys of the observation (`obs`) metadata table.
census_data.obs.keys()

In [None]:
# Lookup over registered features that returns the `name` field; its
# attributes are used below as census column names.
features = ln.Feature.lookup(return_field=ln.Feature.name)
# Lookup without `return_field` (presumably yields the Feature records
# themselves - confirm); not referenced in the cells visible here.
features_records = ln.Feature.lookup()

### Validate and register ontologies

Fetch all terms used in census for each ontology:

In [None]:
# For every ontology-backed feature, read its label column together with its
# ontology term id column from `obs` and keep only the distinct rows.
# The resulting dataframes are keyed by the label column name.
dfs = {
    label_column: (
        census_data.obs.read(column_names=[label_column, term_id_column])
        .concat()
        .to_pandas()
        .drop_duplicates()
    )
    for label_column, term_id_column in [
        (features.assay, features.assay_ontology_term_id),
        (features.cell_type, features.cell_type_ontology_term_id),
        (features.development_stage, features.development_stage_ontology_term_id),
        (features.disease, features.disease_ontology_term_id),
        (
            features.self_reported_ethnicity,
            features.self_reported_ethnicity_ontology_term_id,
        ),
        (features.sex, features.sex_ontology_term_id),
        (features.tissue, features.tissue_ontology_term_id),
        (features.tissue_general, features.tissue_general_ontology_term_id),
    ]
}

In [None]:
# Preview the distinct (label, term id) rows collected for development stage.
dfs[features.development_stage].head()

In [None]:
def register_ontology(orm, name: str, parents: bool = True, **kwargs):
    """Register the ontology terms the census uses for one feature.

    Takes the de-duplicated (label, term id) dataframe ``dfs[name]``, creates
    ``orm`` records from its ``{name}_ontology_term_id`` column, stores the
    census label as abbreviation on every record whose ontology name differs
    from it, links the two ``ln.Feature`` records ``name`` and
    ``{name}_ontology_term_id`` to the ``orm`` registry, and finally saves
    the term records.

    Args:
        orm: Registry class providing ``from_values``, ``ontology_id`` and
            ``__get_name_with_schema__``.
        name: Feature name; key into the module-level ``dfs`` and prefix of
            the term id column.
        parents: Passed through to ``ln.save``.
        **kwargs: Passed through to ``orm.from_values``
            (e.g. ``bionty_source``).
    """
    from lamin_utils import logger

    df = dfs[name]
    term_id_column = f"{name}_ontology_term_id"
    records = orm.from_values(df[term_id_column], field=orm.ontology_id, **kwargs)

    # Build the term id -> census label mapping once instead of filtering the
    # whole dataframe for every record. `drop_duplicates` keeps the first row
    # per term id, i.e. the same row a per-record filter would return first.
    census_names = (
        df.drop_duplicates(subset=term_id_column)
        .set_index(term_id_column)[name]
        .to_dict()
    )
    for record in records:
        census_name = census_names[record.ontology_id]
        if census_name != record.name:
            logger.warning(
                f"census name '{census_name}' doesn't match ontology name"
                f" '{record.name}', adding census name as abbr\n"
            )
            record.set_abbr(census_name)

    # Point both the label feature and the term id feature at this registry.
    name_with_schema = orm.__get_name_with_schema__()
    for feature_name in (name, term_id_column):
        feature = ln.Feature.filter(name=feature_name).one()
        feature.registries = name_with_schema
        feature.save()

    ln.save(records, parents=parents)

In [None]:
# Assay terms are registered in the ExperimentalFactor registry.
register_ontology(orm=lb.ExperimentalFactor, name=features.assay)

In [None]:
# Cell type terms are registered in the CellType registry.
register_ontology(orm=lb.CellType, name=features.cell_type)

In [None]:
# Pick the single DevelopmentalStage source registered for mouse and create
# the development stage terms from it.
mouse_dv = lb.BiontySource.filter(
    entity="DevelopmentalStage",
    organism="mouse",
).one()
register_ontology(
    orm=lb.DevelopmentalStage,
    name=features.development_stage,
    parents=True,
    bionty_source=mouse_dv,
)

In [None]:
# Disease terms are registered in the Disease registry.
register_ontology(orm=lb.Disease, name=features.disease)

In [None]:
# Self-reported ethnicity terms are registered in the Ethnicity registry.
register_ontology(orm=lb.Ethnicity, name=features.self_reported_ethnicity)

In [None]:
# Sex terms are registered as Phenotype records created from the "pato"
# source; parents are not saved for them.
pato = lb.BiontySource.filter(source="pato").one()
register_ontology(
    orm=lb.Phenotype,
    name=features.sex,
    parents=False,
    bionty_source=pato,
)

In [None]:
# General tissue terms are registered in the Tissue registry.
register_ontology(orm=lb.Tissue, name=features.tissue_general)

In [None]:
# Specific tissue terms go into the same Tissue registry.
register_ontology(orm=lb.Tissue, name=features.tissue)

### Validate and register non-ontological metadata

"donor_id" and "suspension_type" are two fields without public ontologies, let's register them using "ULabel":

In [None]:
# Read only the donor id column from `obs` and reduce it to its distinct rows.
donor_ids = (
    census_data.obs.read(column_names=[features.donor_id])
    .concat()
    .to_pandas()
    .drop_duplicates()
)

Group these ulabels under the existing parent "is_donor":

In [None]:
# One ULabel per distinct donor id found in the census.
records = [
    ln.ULabel(name=donor_id, description=f"mouse {features.donor_id}: {donor_id}")
    for donor_id in donor_ids[features.donor_id].unique()
]
ln.save(records)

# "is_donor" is fetched as an existing record, so it may already have
# children. `children.set()` would replace them with the mouse donors;
# `children.add()` only appends and leaves existing children in place.
is_donor = ln.ULabel.filter(name="is_donor").one()
is_donor.children.add(*records)

We do the same for "suspension_type":

In [None]:
# Read only the suspension type column from `obs` and keep its distinct rows.
suspension_types = (
    census_data.obs.read(column_names=[features.suspension_type])
    .concat()
    .to_pandas()
    .drop_duplicates()
)

# One ULabel per distinct suspension type.
records = [
    ln.ULabel(
        name=suspension_type,
        description=f"{features.suspension_type}: {suspension_type}",
    )
    for suspension_type in suspension_types[features.suspension_type].unique()
]
ln.save(records)

# Parent label that groups all suspension types.
is_suspension_type = ln.ULabel(
    name="is_suspension_type", description="parent of suspension types"
)
is_suspension_type.save()
# Use `add()` rather than `set()`: if this parent already has children
# (NOTE(review): possible when a label of this name was registered before),
# `set()` would replace them, whereas `add()` only appends.
is_suspension_type.children.add(*records)

In [None]:
# Display the hierarchy around "is_suspension_type", children included.
is_suspension_type.view_parents(with_children=True)

In [None]:
# Release the census handle opened with cellxgene_census.open_soma above.
census.close()