# Register cellxgene-census metadata

Also see [cellxgene-census tutorials](https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_query_extract.html).

## Setup

In [None]:
!lamin init --storage ./test-census --schema bionty

In [None]:
import lamindb as ln
import lnschema_bionty as lb
import cellxgene_census

In [None]:
# Set the species on the lnschema_bionty settings to human.
lb.settings.species = "human"
# Keep the scientific name; it is the key used to select the species in
# `census["census_data"]` further down.
species = lb.settings.species.scientific_name

## Register modalities

Register "RNA" measurement as a modality:

In [None]:
# Create and save the "RNA" modality record.
modality = ln.Modality(name="RNA", description="RNA measurements")
modality.save()
# Keep the modality name; it indexes the census measurements (`census_data.ms[rna]`).
rna = modality.name

## Validate and register genes

In [None]:
# Open the census and select the entry for the configured species.
# NOTE(review): the handle returned by `open_soma()` is not closed in the
# visible cells — confirm it is released (e.g. `census.close()`) at the end.
census = cellxgene_census.open_soma()
census_data = census["census_data"][species]

Gene metadata:

In [None]:
census_data.ms[rna].var.keys()

In [None]:
# Read the `var` table of the RNA measurement and convert it to a pandas DataFrame.
gene_metadata = census_data.ms[rna].var.read().concat().to_pandas()

In [None]:
gene_metadata.shape

In [None]:
gene_metadata.head()

In [None]:
lb.Gene.inspect(gene_metadata["feature_id"], field=lb.Gene.ensembl_gene_id);

In [None]:
# Ensembl gene IDs as listed in the census `var` table.
ensembl_ids = gene_metadata["feature_id"]

# Genes that bionty can resolve: create the records from their IDs and save them.
gene_records = lb.Gene.from_values(ensembl_ids, field=lb.Gene.ensembl_gene_id)
ln.save(gene_records)

# IDs that still fail validation are legacy genes; register each one manually
# from its Ensembl ID alone.
validated = lb.Gene.validate(ensembl_ids, field=lb.Gene.ensembl_gene_id)
records = [lb.Gene(ensembl_gene_id=gene_id) for gene_id in ensembl_ids[~validated]]
ln.save(records)

## `obs` metadata in Census

All available metadata columns:

In [None]:
census_data.obs.keys()

## Register features

Register `obs` column names as features:

In [None]:
# Register every `obs` column as a Feature. Only "soma_id" is typed as "int";
# all other columns are typed as "category".
# The type is computed inline instead of being assigned to a variable named
# `type`, which would shadow the builtin `type()` for every later cell.
# NOTE(review): confirm "soma_id" matches an actual `obs` column name; if it
# does not, every column is registered as "category".
features = [
    ln.Feature(name=col, type="int" if col == "soma_id" else "category")
    for col in census_data.obs.keys()
]

ln.save(features)

In [None]:
# Replace the list of Feature records with a lookup object. With
# `return_field=ln.Feature.name`, attributes such as `features.assay` are used
# below as plain column names (presumably the feature's name string — confirm).
features = ln.Feature.lookup(return_field=ln.Feature.name)

## Validate and register ontologies

Fetch all terms used in census for each ontology:

In [None]:
# One (label column, ontology-term-id column) pair per ontology-backed field.
ontology_columns = [
    (features.assay, features.assay_ontology_term_id),
    (features.cell_type, features.cell_type_ontology_term_id),
    (features.development_stage, features.development_stage_ontology_term_id),
    (features.disease, features.disease_ontology_term_id),
    (
        features.self_reported_ethnicity,
        features.self_reported_ethnicity_ontology_term_id,
    ),
    (features.sex, features.sex_ontology_term_id),
    (features.tissue, features.tissue_ontology_term_id),
    (features.tissue_general, features.tissue_general_ontology_term_id),
]

# For each pair, read just those two `obs` columns and keep the distinct rows,
# keyed by the label column.
dfs = {
    label_col: (
        census_data.obs.read(column_names=[label_col, term_id_col])
        .concat()
        .to_pandas()
        .drop_duplicates()
    )
    for label_col, term_id_col in ontology_columns
}

In [None]:
dfs["assay"].head()

In [None]:
def register_ontology(orm, name: str, **kwargs) -> None:
    """Register the ontology terms of one census `obs` field in a registry.

    Takes the de-duplicated table stored under ``name`` in the module-level
    ``dfs``, creates records from its ontology term ids, adds the census label
    as a synonym wherever it differs from the record's name, points the two
    matching features at the registry, and finally saves the records.

    Args:
        orm: Registry class providing ``from_values``, ``ontology_id`` and
            ``__get_name_with_schema__`` (called below with e.g. ``lb.CellType``).
        name: Name of the label column; the term-id column is
            ``f"{name}_ontology_term_id"``.
        **kwargs: Forwarded unchanged to ``orm.from_values``.

    Raises:
        KeyError: If ``name`` is not a key of ``dfs``, or a record's ontology
            id does not occur in the term-id column.
    """
    from lamin_utils import logger

    df = dfs[name]
    term_id_col = f"{name}_ontology_term_id"
    records = orm.from_values(df[term_id_col], field=orm.ontology_id, **kwargs)

    # Map each term id to the first census label listed for it. Building the
    # mapping once avoids re-filtering the whole frame for every record.
    census_names = {}
    for term_id, label in zip(df[term_id_col], df[name]):
        census_names.setdefault(term_id, label)

    for record in records:
        census_name = census_names[record.ontology_id]
        if census_name != record.name:
            logger.warning(
                f"census name '{census_name}' doesn't match ontology name"
                f" '{record.name}', adding census name as a synonym\n"
            )
            record.add_synonym(census_name)

    # Both the label feature and its term-id feature point at the same registry.
    name_with_schema = orm.__get_name_with_schema__()
    for feature_name in (name, term_id_col):
        feature = ln.Feature.filter(name=feature_name).one()
        feature.registries = name_with_schema
        feature.save()

    ln.save(records, parents=False)

In [None]:
register_ontology(lb.ExperimentalFactor, features.assay)

In [None]:
register_ontology(lb.CellType, features.cell_type)

In [None]:
register_ontology(lb.Disease, features.disease)

'PATO:0000461' is a term for "normal" which can be typed with `Phenotype`:

In [None]:
# Fetch the single BiontySource whose source is "pato"; it is reused below
# when registering the `sex` terms.
pato = lb.BiontySource.filter(source="pato").one()
# Create the Phenotype record for "PATO:0000461" from that source and save it.
lb.Phenotype.from_bionty(ontology_id="PATO:0000461", bionty_source=pato).save()

In [None]:
register_ontology(lb.Ethnicity, features.self_reported_ethnicity)

Let's manually add two terms to the Ethnicity registry:

In [None]:
lb.Ethnicity(name="multiethnic").save()
lb.Ethnicity(name="unknown").save()

In [None]:
register_ontology(lb.Phenotype, features.sex, bionty_source=pato)

In [None]:
lb.Phenotype(name="unknown").save()

In [None]:
register_ontology(lb.Tissue, features.tissue)

In [None]:
register_ontology(lb.Tissue, features.tissue_general)

## Validate and register non-ontological metadata

"donor_id" and "suspension_type" are two fields without public ontologies; let's register them using "ULabel":

In [None]:
# Link the "donor_id" and "suspension_type" features to the ULabel registry.
# `features` was built with `return_field=ln.Feature.name` and its attributes
# are used as column names elsewhere, so the Feature records are fetched by
# name here (the same way `register_ontology` does) before being modified.
ulabel_registry = ln.ULabel.__get_name_with_schema__()
for feature_name in (features.donor_id, features.suspension_type):
    feature = ln.Feature.filter(name=feature_name).one()
    feature.registries = ulabel_registry
    feature.save()

In [None]:
# Read only the donor-id column from `obs` and keep its distinct rows as a
# pandas DataFrame.
donor_ids = (
    census_data.obs.read(column_names=[features.donor_id])
    .concat()
    .to_pandas()
    .drop_duplicates()
)

In [None]:
# NOTE(review): presumably turns off the name search that runs when a record
# is created — confirm against the lamindb settings documentation.
ln.settings.upon_create_search_names = False

# One ULabel per distinct donor id, described as "<feature name>: <donor id>".
records = [
    ln.ULabel(name=donor_id, description=f"{features.donor_id}: {donor_id}")
    for donor_id in donor_ids[features.donor_id].unique()
]

ln.save(records)

Construct a parent "is_donor" to group these ulabels:

In [None]:
# Create and save the parent label, then attach the donor ULabels held in
# `records` (built in the previous cell) as its children.
is_donor = ln.ULabel(name="is_donor", description="parent of donor ids")
is_donor.save()
is_donor.children.set(records)

We do the same for "suspension_type":

In [None]:
# Distinct rows of the suspension-type column from `obs`.
suspension_types = (
    census_data.obs.read(column_names=[features.suspension_type])
    .concat()
    .to_pandas()
    .drop_duplicates()
)

# One ULabel per distinct suspension type, described as
# "<feature name>: <suspension type>".
records = [
    ln.ULabel(
        name=suspension_type,
        description=f"{features.suspension_type}: {suspension_type}",
    )
    for suspension_type in suspension_types[features.suspension_type].unique()
]
ln.save(records)

# Group the new labels under a common parent label.
is_suspension_type = ln.ULabel(
    name="is_suspension_type", description="parent of suspension types"
)
is_suspension_type.save()
is_suspension_type.children.set(records)

Now we have validated all ontological terms in Census metadata! 🎉 Let's see how they can be useful when {doc}`./query-census`.