# Register the cellxgene-census metadata

In this notebook, we show how to register all census datasets and their metadata using LaminDB in under 10 minutes.

Registered metadata can be readily used for querying, validating, annotating and integrating data, see {doc}`./cellxgene`.

For background, see [cellxgene-census tutorials](https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_query_extract.html).

## Setup

In [None]:
!lamin init --storage ./test-cellxgene --schema bionty

In [None]:
import lamindb as ln
import lnschema_bionty as lb
import cellxgene_census

In [None]:
lb.settings.organism = "human"  # "mouse" for registering metadata of mouse datasets
# Scientific name of the configured organism; used below as the key into
# census["census_data"] and as the `organism` argument of get_presence_matrix.
human = lb.settings.organism.scientific_name
# Measurement name used to index `census_data.ms` and passed to get_presence_matrix.
rna = "RNA"

In [None]:
ln.track()

## Register datasets

In [None]:
# Pin the census release: the same string is reused for the S3 path of the h5ad
# files and for the version of the dataset record registered below.
census_version = "2023-07-25"
census = cellxgene_census.open_soma(census_version=census_version)

In [None]:
datasets_df = census["census_info"]["datasets"].read().concat().to_pandas()
datasets_df.shape

In [None]:
datasets_df.head()

In [None]:
# S3 prefix holding the h5ad files of the pinned census release
h5ad_prefix = f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
files = ln.File.from_dir(h5ad_prefix)
ln.save(files)

In [None]:
# a dataset record groups the registered files and carries the census version
census_dataset = ln.Dataset(files, name="cellxgene-census", version=census_version)
census_dataset.save()

In [None]:
# One row per collection, indexed by its id
collections_df = (
    datasets_df[["collection_id", "collection_name", "collection_doi"]]
    .drop_duplicates()
    .set_index("collection_id")
)

# One ULabel per collection: the DOI goes into the description and the
# collection id (the frame's index) into the reference field
collections = [
    ln.ULabel(
        name=entry.collection_name,
        description=entry.collection_doi,
        reference=entry.Index,
        reference_type="collection_id",
    )
    for entry in collections_df.itertuples()
]
ln.save(collections)

# Group all collection labels under a common parent label
is_collection = ln.ULabel(name="is_collection")
is_collection.save()
is_collection.children.set(collections)

In [None]:
# Rebind both names to queryable objects: the per-dataset loop below calls
# `collections.get(...)` and `files.filter(...)` on them.
# NOTE(review): both names previously held plain lists of records.
collections = is_collection.children
files = ln.File.filter()

In [None]:
# Feature record that is passed to `file.labels.add(...)` in the per-dataset loop below
feature = ln.Feature(name="organism", type="category")
feature.save()

In [None]:
# Annotate every registered h5ad file with its dataset title, collection and organism
for dataset_row in datasets_df.itertuples(index=False):
    dataset_id = dataset_row.dataset_id
    h5ad_file = files.filter(key__endswith=f"{dataset_id}.h5ad").one()
    h5ad_file.description = f"{dataset_row.dataset_title}|{dataset_id}"
    h5ad_file.save()
    collection_label = collections.get(reference=dataset_row.collection_id)
    # NOTE(review): the collection label is attached under the "organism"
    # feature defined above — confirm that this pairing is intended
    h5ad_file.labels.add(collection_label, feature)
    h5ad_file.organism.add(lb.settings.organism)

## Validate and register genes

In [None]:
census_data = census["census_data"][human]

Gene metadata:

In [None]:
census_data.ms[rna].var.keys()

In [None]:
gene_metadata = census_data.ms[rna].var.read().concat().to_pandas()

In [None]:
gene_metadata.shape

In [None]:
gene_metadata.head()

In [None]:
lb.Gene.inspect(gene_metadata["feature_id"], field=lb.Gene.ensembl_gene_id);

In [None]:
# Genes that bionty knows by Ensembl id are created from the public source
gene_records = lb.Gene.from_values(
    gene_metadata["feature_id"], field=lb.Gene.ensembl_gene_id
)
ln.save(gene_records)

# Whatever still fails validation is a legacy gene: register it by hand,
# taking the symbol from the census `feature_name` column
validated = lb.Gene.validate(gene_metadata["feature_id"], field=lb.Gene.ensembl_gene_id)
gene_metadata_id = gene_metadata.set_index("feature_id")
legacy_gene_ids = gene_metadata["feature_id"][~validated]
records = [
    lb.Gene(
        ensembl_gene_id=legacy_id,
        symbol=gene_metadata_id.loc[legacy_id].feature_name,
    )
    for legacy_id in legacy_gene_ids
]
ln.save(records)

In [None]:
lb.Gene.validate(gene_metadata["feature_id"], field=lb.Gene.ensembl_gene_id);

## Observational metadata

All available metadata columns:

In [None]:
census_data.obs.keys()

### Register features

Register `obs` column names as features:

In [None]:
# Types for the obs columns that are not categorical; every other column
# falls back to "category".
# A lookup table is used instead of assigning to a variable called `type`,
# which would rebind the builtin `type` for all later cells of the notebook.
obs_feature_types = {"soma_joinid": "int", "is_primary_data": "bool"}

features = [
    ln.Feature(name=col, type=obs_feature_types.get(col, "category"))
    for col in census_data.obs.keys()
]

ln.save(features)

In [None]:
# Lookup restricted to the `name` field: `features.<col>` is used below wherever
# a column name is needed (e.g. `column_names=[features.donor_id]`).
# NOTE(review): this rebinds `features`, which held the list of Feature records above.
features = ln.Feature.lookup(return_field=ln.Feature.name)
# Lookup without a return field: `features_records.<col>` is used below to set
# `.registries` on a feature and save it.
features_records = ln.Feature.lookup()

### Validate and register ontologies

Fetch all terms used in census for each ontology:

In [None]:
# (label column, ontology term id column) pairs of the obs table
ontology_columns = [
    (features.assay, features.assay_ontology_term_id),
    (features.cell_type, features.cell_type_ontology_term_id),
    (features.development_stage, features.development_stage_ontology_term_id),
    (features.disease, features.disease_ontology_term_id),
    (
        features.self_reported_ethnicity,
        features.self_reported_ethnicity_ontology_term_id,
    ),
    (features.sex, features.sex_ontology_term_id),
    (features.tissue, features.tissue_ontology_term_id),
    (features.tissue_general, features.tissue_general_ontology_term_id),
]

# For each pair, read just those two columns and keep the distinct rows;
# the resulting frames are keyed by the label column name
dfs = {
    label_col: (
        census_data.obs.read(column_names=[label_col, term_id_col])
        .concat()
        .to_pandas()
        .drop_duplicates()
    )
    for label_col, term_id_col in ontology_columns
}

In [None]:
dfs[features.assay].head()

In [None]:
# Note: set parents=True in real-world case
def register_ontology(orm, name: str, parents: bool = False, **kwargs):
    """Register the ontology terms that one census `obs` column uses.

    Reads the frame stored under ``dfs[name]`` (a notebook-level dict), which
    must contain the columns ``name`` and ``f"{name}_ontology_term_id"``.
    Records are created from the term ids via ``orm.from_values``. Whenever the
    census label of a term differs from the record's name, a warning is logged
    and the census label is stored through ``record.set_abbr``. The features
    named ``name`` and ``f"{name}_ontology_term_id"`` get their ``registries``
    set to ``orm.__get_name_with_schema__()`` and are saved, and finally the
    records themselves are saved.

    Args:
        orm: Registry class providing ``from_values``, ``ontology_id`` and
            ``__get_name_with_schema__``.
        name: Name of the label column, also the key into ``dfs``.
        parents: Forwarded to ``ln.save(records, parents=...)``.
        **kwargs: Forwarded to ``orm.from_values``.

    Raises:
        KeyError: If ``name`` is not in ``dfs`` or a record's ontology id does
            not occur in the frame.
    """
    from lamin_utils import logger

    term_id_col = f"{name}_ontology_term_id"
    df = dfs[name]
    records = orm.from_values(df[term_id_col], field=orm.ontology_id, **kwargs)

    # Build the id -> census label map once rather than filtering the whole
    # frame for every record. `setdefault` keeps the first label seen for an
    # id, which is the label the former `...tolist()[0]` lookup returned.
    census_names = {}
    for term_id, census_label in zip(df[term_id_col], df[name]):
        census_names.setdefault(term_id, census_label)

    for record in records:
        census_name = census_names[record.ontology_id]
        if census_name != record.name:
            logger.warning(
                f"census name '{census_name}' doesn't match ontology name"
                f" '{record.name}', adding census name as abbr\n"
            )
            record.set_abbr(census_name)

    # Both the label feature and its ontology-term-id feature point to the
    # same registry.
    name_with_schema = orm.__get_name_with_schema__()
    for feature_name in (name, term_id_col):
        feature = ln.Feature.filter(name=feature_name).one()
        feature.registries = name_with_schema
        feature.save()
    ln.save(records, parents=parents)

In [None]:
register_ontology(lb.ExperimentalFactor, features.assay)

In [None]:
# add parents for several cell types here
for parent_ontology_id in (
    "CL:0000911",
    "CL:0000910",
    "CL:0001044",
    "CL:0001050",
    "CL:0011025",
):
    lb.CellType.from_bionty(ontology_id=parent_ontology_id).save()

In [None]:
register_ontology(lb.CellType, features.cell_type)

In [None]:
register_ontology(lb.DevelopmentalStage, features.development_stage)

In [None]:
lb.DevelopmentalStage(name="unknown").save()

In [None]:
register_ontology(lb.Disease, features.disease)

'PATO:0000461' is the term for "normal" and comes from the `Phenotype` ontology (PATO), so we look it up there and register it as a `Disease` record:

In [None]:
pato = lb.BiontySource.filter(source="pato").one()
normal_record = lb.Phenotype.from_bionty(ontology_id="PATO:0000461", bionty_source=pato)

# Copy the identifying fields of the PATO record into a Disease record
copied_fields = ("name", "ontology_id", "description", "bionty_source_id")
normal_disease = lb.Disease(
    **{field: getattr(normal_record, field) for field in copied_fields}
)
normal_disease.save()

In [None]:
register_ontology(lb.Ethnicity, features.self_reported_ethnicity)

Let's manually add two terms to the Ethnicity registry:

In [None]:
lb.Ethnicity(name="multiethnic").save()
lb.Ethnicity(name="unknown").save()

In [None]:
register_ontology(lb.Phenotype, features.sex, bionty_source=pato)

In [None]:
lb.Phenotype(name="unknown").save()

In [None]:
register_ontology(lb.Tissue, features.tissue)
register_ontology(lb.Tissue, features.tissue_general)

### Validate and register non-ontological metadata

"donor_id" and "suspension_type" are two fields without public ontologies, let's register them using "ULabel":

In [None]:
# Both features draw their values from the ULabel registry
ulabel_registry = ln.ULabel.__get_name_with_schema__()
for ulabel_feature in (features_records.donor_id, features_records.suspension_type):
    ulabel_feature.registries = ulabel_registry
    ulabel_feature.save()

In [None]:
# Read only the donor id column of the obs table and keep its distinct rows
donor_ids = (
    census_data.obs.read(column_names=[features.donor_id])
    .concat()
    .to_pandas()
    .drop_duplicates()
)

In [None]:
# One ULabel per distinct donor id, described as "<column name>: <donor id>"
donor_column = features.donor_id
records = [
    ln.ULabel(name=donor, description=f"{donor_column}: {donor}")
    for donor in donor_ids[donor_column].unique()
]

ln.save(records)

Construct a parent "is_donor" to group these ulabels:

In [None]:
# `records` is the list of donor ULabels built in the previous cell. The same
# name is reassigned for suspension types further down, so this cell has to run
# before that one.
is_donor = ln.ULabel(name="is_donor", description="parent of donor ids")
is_donor.save()
is_donor.children.set(records)

We do the same for "suspension_type":

In [None]:
# Distinct rows of the suspension type column
suspension_column = features.suspension_type
suspension_types = (
    census_data.obs.read(column_names=[suspension_column])
    .concat()
    .to_pandas()
    .drop_duplicates()
)

# One ULabel per distinct suspension type
records = [
    ln.ULabel(
        name=suspension,
        description=f"{suspension_column}: {suspension}",
    )
    for suspension in suspension_types[suspension_column].unique()
]
ln.save(records)

# Parent label grouping all suspension types
is_suspension_type = ln.ULabel(
    name="is_suspension_type", description="parent of suspension types"
)
is_suspension_type.save()
is_suspension_type.children.set(records)

In [None]:
is_suspension_type.view_parents(with_children=True)

Now we have validated all ontological terms in Census metadata!

## Link metadata to individual files

Here we show how to link a file to its validated metadata records:

In [None]:
# pick one dataset and fetch the file that corresponds to it
example_dataset_id = "20d87640-4be8-487f-93d4-dce38378d00f"
dataset = datasets_df.loc[datasets_df["dataset_id"] == example_dataset_id].iloc[0]
file = files.filter(description__contains=dataset.dataset_id).one()
# filled with the "obs" and "var" feature sets in the next cells
feature_sets = {}

Observational (`obs`) metadata in the dataset:

In [None]:
# Restrict the obs table to the cells of the selected dataset
dataset_filter = f"dataset_id == '{dataset.dataset_id}'"
obs = census_data.obs.read(value_filter=dataset_filter).concat().to_pandas()
print(obs.shape)

# Columns whose name ends with "_id" are left out of the feature set
is_id_column = obs.columns.str.endswith("_id")
feature_set_obs = ln.FeatureSet.from_df(obs.loc[:, ~is_id_column])
feature_sets["obs"] = feature_set_obs

Genes measured in the dataset:

In [None]:
presence_matrix = cellxgene_census.get_presence_matrix(
    census, organism=human, measurement_name=rna
)

# Row of the presence matrix for this dataset; its column indices are matched
# against the gene table's soma_joinid
dataset_presence = presence_matrix[dataset.soma_joinid, :]
var_joinid = dataset_presence.tocoo().col
is_measured = gene_metadata.soma_joinid.isin(var_joinid)
var = gene_metadata.loc[is_measured]
print(var.shape)

feature_set_var = ln.FeatureSet.from_values(
    var.feature_id,
    lb.Gene.ensembl_gene_id,
    type="number",
)
feature_sets["var"] = feature_set_var

In [None]:
# Attach the "obs" and "var" feature sets collected above to the file, then save it.
# NOTE(review): `_feature_sets` is a private attribute — presumably picked up by
# `file.save()`; confirm against the lamindb version in use.
file._feature_sets = feature_sets
file.save()

In [None]:
file.feature_sets.df()

In [None]:
# Link the values of every categorical obs feature to the file as labels
for obs_feature in feature_set_obs.members:
    if obs_feature.type != "category":
        continue
    file.labels.add(obs[obs_feature.name], obs_feature)

In [None]:
file.describe()

In [None]:
census.close()