# Register files from Census release 2023-12-06

In [None]:
import lamindb as ln
import lnschema_bionty as lb
import pandas as pd

In [None]:
# Census release to register; reused for the S3 prefix, the collection version
# and the artifact key filter.
census_version = "2023-12-06"

In [None]:
# S3 prefix holding the h5ad files of the selected Census release.
s3path = f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
# Display what is stored under that prefix before registering anything.
ln.UPath(s3path).view_tree()

In [None]:
# Start lamindb tracking for this notebook.
# NOTE(review): the exact side effects depend on the installed lamindb version.
ln.track()

## Register artifacts (files)

In [None]:
# Build artifact records from the contents of the release prefix, then persist
# them in one call.
artifacts = ln.Artifact.from_dir(s3path)
ln.save(artifacts)

In [None]:
# Group all registered artifacts into one versioned collection.
# `ln.Collection` is used instead of the legacy `ln.Dataset` name so that the
# record lives in the registry that is queried later in this notebook with
# `ln.Collection.filter(name="cellxgene-census", version=census_version)`.
dataset = ln.Collection(artifacts, name="cellxgene-census", version=census_version)
dataset.save()

## Register metadata

Get all datasets and their associated metadata using the CELLxGENE REST API:

In [None]:
import requests


def get_datasets_df_from_cxg(timeout: float = 300.0):
    """Fetch the metadata of all datasets from the CELLxGENE curation API.

    Despite the ``_df`` in its name, this function returns the decoded JSON
    payload as-is, not a DataFrame; callers in this notebook index it like a
    list of per-dataset dicts.

    Args:
        timeout: Seconds ``requests`` may wait on the connection or on the
            server sending data before giving up. Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        The decoded JSON body of the ``/curation/v1/datasets`` response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    api_url_base = "https://api.cellxgene.cziscience.com"
    datasets_path = "/curation/v1/datasets"
    datasets_url = f"{api_url_base}{datasets_path}"
    headers = {"Content-Type": "application/json"}
    res = requests.get(url=datasets_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    res.raise_for_status()
    return res.json()

In [None]:
# Download the dataset metadata once; the cells below iterate over this payload.
cellxgene_meta = get_datasets_df_from_cxg()
# Number of dataset entries returned by the API.
len(cellxgene_meta)

In [None]:
# Inspect which metadata fields the first dataset entry provides.
cellxgene_meta[0].keys()

### Register new features and parent labels

In [None]:
# Each obs-level metadata field, mapped to the registry that holds its labels.
obs_features = {
    "assay": "bionty.ExperimentalFactor",
    "cell_type": "bionty.CellType",
    "development_stage": "bionty.DevelopmentalStage",
    "disease": "bionty.Disease",
    "donor_id": "core.ULabel",
    "self_reported_ethnicity": "bionty.Ethnicity",
    "sex": "bionty.Phenotype",
    "suspension_type": "core.ULabel",
    "tissue": "bionty.Tissue",
    "tissue_type": "core.ULabel",
}

# One categorical feature per obs field. The features are saved first, then
# bundled into a feature set that is attached to every artifact under "obs".
obs_features_records = [
    ln.Feature(name=feature_name, type="category", registries=registry_name)
    for feature_name, registry_name in obs_features.items()
]
ln.save(obs_features_records)
obs_feature_set = ln.FeatureSet(name="obs features", features=obs_features_records)
obs_feature_set.save()
obs_feature_set.artifacts.set(artifacts, through_defaults={"slot": "obs"})

In [None]:
# Dataset-level ("external") fields, mapped to the registry holding their labels.
ext_features = {"organism": "bionty.Organism", "collection": "core.ULabel"}

# One categorical feature per external field, bundled into a feature set that
# is attached to every artifact under the "external" slot.
ext_features_records = [
    ln.Feature(name=feature_name, type="category", registries=registry_name)
    for feature_name, registry_name in ext_features.items()
]
ln.save(ext_features_records)
ext_feature_set = ln.FeatureSet(name="external features", features=ext_features_records)
ext_feature_set.save()
ext_feature_set.artifacts.set(artifacts, through_defaults={"slot": "external"})

In [None]:
# Parent labels under which the ULabel-based categories are grouped later on.
for parent_name, parent_description in (
    ("is_collection", "parents of collections"),
    ("is_donor", "parents of donors"),
    ("is_suspension_type", "parents of suspension types"),
    ("is_tissue_type", "parents of tissue types"),
):
    ln.ULabel(name=parent_name, description=parent_description).save()

In [None]:
# Lookup object over the registered features.
features = ln.Feature.lookup()
# Re-query the artifacts of this release as a queryset so the cells below can
# narrow it further with `.filter(...)`. `ln.Artifact` replaces the legacy
# `ln.File` name, matching the registry used by `ln.Artifact.from_dir` when the
# artifacts were created.
artifacts = ln.Artifact.filter(key__contains=census_version).all()

## collections, organisms

In [None]:
# Register every CELLxGENE collection as a ULabel below "is_collection".
is_collection = ln.ULabel.filter(name="is_collection").one()

# A set removes the duplicates that arise because several datasets can carry
# the same (name, doi, id) collection triple.
collections_meta = {
    (meta["collection_name"], meta["collection_doi"], meta["collection_id"])
    for meta in cellxgene_meta
}

# The DOI goes into the description and the collection id into the reference.
collections_records = [
    ln.ULabel(
        name=coll_name,
        description=coll_doi,
        reference=coll_id,
        reference_type="collection_id",
    )
    for coll_name, coll_doi, coll_id in collections_meta
]
ln.save(collections_records)
is_collection.children.add(*collections_records)

In [None]:
# Register every organism referenced by at least one dataset.
ncbitaxon_source = lb.BiontySource.filter(source="ncbitaxon").one()

# Unique ontology term ids across all datasets.
organisms_meta = {
    organism["ontology_term_id"]
    for meta in cellxgene_meta
    for organism in meta["organism"]
}

organisms_records = lb.Organism.from_values(
    organisms_meta, field=lb.Organism.ontology_id, bionty_source=ncbitaxon_source
)
# Store "house mouse" under the shorter name "mouse".
for org in organisms_records:
    if org.name == "house mouse":
        org.name = "mouse"
ln.save(organisms_records, parents=False)

Link collections and organisms to artifacts:

In [None]:
ext_feature_set = ln.FeatureSet.filter(name="external features").one()
ext_features = ext_feature_set.members.lookup()
collections = is_collection.children.all()
organisms = lb.Organism.filter().all()

for dataset_meta in cellxgene_meta:
    # Match the dataset to its registered artifact through the dataset id in
    # the artifact key; datasets without a registered artifact are skipped.
    dataset_id = dataset_meta["dataset_id"]
    file = artifacts.filter(key__contains=dataset_id).one_or_none()
    if file is None:
        continue

    # Label the artifact with the collection it belongs to.
    collection = ln.ULabel.filter(reference=dataset_meta["collection_id"]).one()
    file.labels.add(collection, feature=ext_features.collection)

    # Label the artifact with every organism listed for the dataset.
    organism_records = lb.Organism.filter(
        ontology_id__in=[org["ontology_term_id"] for org in dataset_meta["organism"]]
    ).list()
    file.labels.add(organism_records, feature=ext_features.organism)

## ontologies

Register all ontology ids:

In [None]:
from typing import Optional
from lnschema_bionty.models import Registry
from lamindb.dev._feature_manager import get_accessor_by_orm

obs_feature_set = ln.FeatureSet.filter(name="obs features").one()
obs_features_records = obs_feature_set.members.lookup()
# `ln.Artifact` replaces the legacy `ln.File` name so this cell addresses the
# registry that the artifacts were created in (`ln.Artifact.from_dir`).
ACCESSORS = get_accessor_by_orm(ln.Artifact)
# Maps each obs feature name to (accessor name on an artifact, label registry).
FEATURE_TO_ACCESSOR = {}
for name in obs_features:
    feature = getattr(obs_features_records, name)
    accessor = ACCESSORS.get(feature.registries)
    orm = getattr(ln.Artifact, accessor).field.model
    # TODO: ulabels are defined in the Artifact model, improve this in LaminDB
    # When the relation is declared on the artifact itself, `.field.model`
    # points back at the artifact class, so take the related model instead.
    if orm is ln.Artifact:
        orm = getattr(ln.Artifact, accessor).field.related_model
    FEATURE_TO_ACCESSOR[name] = (accessor, orm)


def create_ontology_record_from_source(
    ontology_id: str,
    from_orm: Registry,
    target_orm: Registry,
    bionty_source: Optional[lb.BiontySource] = None,
):
    """Build an unsaved ``target_orm`` record that mirrors a term of ``from_orm``.

    The term is looked up with ``from_orm.from_bionty`` and its name,
    description, ontology id and bionty source id are copied into a new
    ``target_orm`` instance. This lets a term from one ontology registry be
    stored in a different registry.

    Args:
        ontology_id: Ontology id of the term to copy.
        from_orm: Registry whose ``from_bionty`` resolves the term.
        target_orm: Registry class to instantiate with the copied fields.
        bionty_source: Optional source passed through to ``from_bionty``.

    Returns:
        The new, unsaved ``target_orm`` record, or ``None`` if it could not be
        built. Callers must be prepared to skip ``None``.
    """
    from_record = from_orm.from_bionty(
        ontology_id=ontology_id, bionty_source=bionty_source
    )
    try:
        return target_orm(
            name=from_record.name,
            description=from_record.description,
            ontology_id=from_record.ontology_id,
            bionty_source_id=from_record.bionty_source_id,
        )
    except Exception as error:
        # Creation stays best-effort so one bad term does not abort a whole
        # registration run, but the failure is reported instead of being
        # swallowed silently.
        target_name = getattr(target_orm, "__name__", target_orm)
        print(f"could not create {target_name} record for {ontology_id}: {error!r}")
        return None

In [None]:
# Show the obs feature names that the registration below iterates over.
obs_features.keys()

In [None]:
ln.settings.upon_create_search_names = False

# For each ontology-backed obs feature, collect the unique
# (label, ontology_term_id) pairs across all datasets. The three features
# skipped here are handled separately in this notebook.
ontology_ids = {}
for name in obs_features:
    if name in {"donor_id", "suspension_type", "tissue_type"}:
        continue
    ontology_ids[name] = {
        (term["label"], term["ontology_term_id"])
        for meta in cellxgene_meta
        if name in meta
        for term in meta[name]
    }

# Sources needed for terms that the default lookup does not cover.
bionty_source_pato = lb.BiontySource.filter(source="pato").one()
bionty_source_ds_mouse = lb.BiontySource.filter(
    entity="DevelopmentalStage", organism="mouse"
).one()

# Register all ontology ids, one obs feature at a time.
for name, terms in ontology_ids.items():
    print(f"registering {name}")
    # Index directly: an unmapped feature should fail with a KeyError naming
    # it, not with a TypeError from unpacking None.
    accessor, orm = FEATURE_TO_ACCESSOR[name]
    terms_ids = [term[1] for term in terms]

    # First pass: save everything `from_values` returns for the default lookup.
    records = orm.from_values(terms_ids, field="ontology_id")
    if len(records) > 0:
        ln.save(records)

    # Second pass: handle the ids that are still reported as non-validated.
    inspect_result = orm.inspect(terms_ids, field="ontology_id", mute=True)
    non_validated = inspect_result.non_validated
    if len(non_validated) > 0:
        if name == "development_stage":
            # Retry against the mouse developmental stage source ...
            records = orm.from_values(
                non_validated,
                field="ontology_id",
                bionty_source=bionty_source_ds_mouse,
            )
            # ... copy UBERON terms over from the Tissue registry ...
            records += [
                create_ontology_record_from_source(
                    ontology_id=term_id, from_orm=lb.Tissue, target_orm=orm
                )
                for term_id in non_validated
                if term_id.startswith("UBERON:")
            ]
            # ... and keep the literal "unknown" as its own record.
            records += [
                orm(name=term_id, ontology_id=term_id)
                for term_id in non_validated
                if term_id == "unknown"
            ]
        else:
            # A set makes the membership test below O(1) per term.
            non_validated_ids = set(non_validated)
            # Non-PATO leftovers are registered under the label CELLxGENE gives.
            records = [
                orm(name=label, ontology_id=term_id)
                for label, term_id in terms
                if not term_id.startswith("PATO:") and term_id in non_validated_ids
            ]
            # PATO terms are copied over from the Phenotype registry.
            records += [
                create_ontology_record_from_source(
                    ontology_id=term_id,
                    from_orm=lb.Phenotype,
                    target_orm=orm,
                    bionty_source=bionty_source_pato,
                )
                for term_id in non_validated
                if term_id.startswith("PATO:")
            ]

        # `create_ontology_record_from_source` returns None when a record
        # cannot be built; drop those so they never reach `ln.save`.
        records = [record for record in records if record is not None]
        if len(records) > 0:
            print(f"registered {len(records)} records: {records}")
            ln.save(records)

## donors and suspension_types

In [None]:
# Unique donor ids and suspension types across all datasets; a dataset that
# lacks one of the fields contributes nothing to the corresponding set.
donor_ids = set()
suspension_types = set()

for meta in cellxgene_meta:
    donor_ids.update(meta.get("donor_id", ()))
    suspension_types.update(meta.get("suspension_type", ()))

In [None]:
def _register_missing_children(parent, names):
    """Create ULabels for the names not validated among ``parent``'s children.

    The new labels are saved, attached to ``parent`` as children and returned.
    """
    missing = parent.children.all().inspect(names, mute=True).non_validated
    created = [ln.ULabel(name=name) for name in missing]
    ln.save(created)
    parent.children.add(*created)
    return created


is_donor = ln.ULabel.filter(name="is_donor").one()
new_donors = _register_missing_children(is_donor, donor_ids)

is_suspension_type = ln.ULabel.filter(name="is_suspension_type").one()
new_stypes = _register_missing_children(is_suspension_type, suspension_types)

## Annotate artifacts with obs metadata

In [None]:
# Display the feature -> (accessor, registry) mapping used for annotation.
FEATURE_TO_ACCESSOR

In [None]:
features = ln.Feature.lookup()

for idx, dataset_meta in enumerate(cellxgene_meta):
    # Progress output every 100 datasets.
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(cellxgene_meta)}")

    dataset_id = dataset_meta["dataset_id"]
    file = artifacts.filter(key__contains=dataset_id).one_or_none()
    if file is None:
        continue

    for field, terms in dataset_meta.items():
        # Only fields that were registered as obs features are annotated.
        accessor_and_orm = FEATURE_TO_ACCESSOR.get(field)
        if accessor_and_orm is None:
            continue
        accessor, orm = accessor_and_orm

        if field in ("donor_id", "suspension_type", "tissue_type"):
            # These fields hold plain names rather than ontology term dicts.
            records = orm.from_values(terms, field="name")
            if len(records) > 0:
                # stratify by feature so that link tables records are written
                file.labels.add(records, feature=getattr(features, field))
            continue

        term_ids = [term["ontology_term_id"] for term in terms]
        records = orm.from_values(term_ids, field="ontology_id")
        if len(records) > 0:
            getattr(file, accessor).add(*records)

Clean up the 2 "unknowns" in DevelopmentalStage:

In [None]:
# Delete the DevelopmentalStage records named "unknown" whose ontology_id is not
# "unknown"; the record with ontology_id "unknown" is kept.
lb.DevelopmentalStage.filter(name="unknown").exclude(ontology_id="unknown").delete()

## Validate and register genes

In [None]:
# register synthetic constructs and sars_cov_2 as new organisms
# (they are referenced by the ERCC and SARS-CoV-2 gene files listed below)
new_organisms = lb.Organism.from_values(
    ["NCBITaxon:32630", "NCBITaxon:2697049"],
    field=lb.Organism.ontology_id,
    bionty_source=ncbitaxon_source,
)
ln.save(new_organisms, parents=False)

# genes files
# Keys are attribute names on the scientific-name lookup `organisms`; values
# are the gzipped CSV gene lists from the single-cell-curation repository.
organisms = lb.Organism.lookup(field=lb.Organism.scientific_name)
genes_files = {
    "homo_sapiens": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_homo_sapiens.csv.gz",
    "mus_musculus": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_mus_musculus.csv.gz",
    "synthetic_construct": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_ercc.csv.gz",
    "severe_acute_respiratory_syndrome_coronavirus_2": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_sars_cov_2.csv.gz",
}

Register all genes for each organism:

In [None]:
for organism_name, genes_file in genes_files.items():
    print(f"registering {organism_name} genes")
    # The files have no header row; the first column (gene id) is the index.
    df = pd.read_csv(genes_file, header=None, index_col=0)
    organism_record = getattr(organisms, organism_name)
    gene_lookup = {"field": lb.Gene.ensembl_gene_id, "organism": organism_record}

    gene_records = lb.Gene.from_values(df.index, **gene_lookup)
    ln.save(gene_records)

    # register legacy genes manually: ids that still fail validation after the
    # save above get a record built from the id and column 1 of the file.
    validated = lb.Gene.validate(df.index, **gene_lookup)
    new_records = [
        lb.Gene(
            ensembl_gene_id=gene_id,
            symbol=df.loc[gene_id][1],
            organism=organism_record,
        )
        for gene_id in df.index[~validated]
    ]
    ln.save(new_records)

    # One feature set per organism containing all of its registered genes.
    genes_feature_set = ln.FeatureSet(
        name=f"all {organism_record.name} genes", features=gene_records + new_records
    )
    genes_feature_set.save()

## Link metadata to individual artifacts

Annotate each file with the genes measured in it:

In [None]:
# Lookup of organisms by scientific name; used below as
# `organisms.synthetic_construct` and
# `organisms.severe_acute_respiratory_syndrome_coronavirus_2`.
organisms = lb.Organism.lookup(field=lb.Organism.scientific_name)

In [None]:
# Gene id prefixes for genes that can appear next to the artifact's own
# organism, paired with the attribute name of their organism on `organisms`.
extra_gene_sources = (
    ("ERCC", "synthetic_construct"),
    ("ENSSASG", "severe_acute_respiratory_syndrome_coronavirus_2"),
)

for idx, file in enumerate(artifacts):
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(artifacts)}")

    adata_backed = file.backed()
    var_names = adata_backed.var_names

    organism_record = file.organism.first()
    if organism_record is None:
        print(f"No organism found for file: {file}")
        continue

    gene_field = lb.Gene.ensembl_gene_id
    genes = lb.Gene.from_values(var_names, field=gene_field, organism=organism_record)

    # Nothing matched for the artifact's organism although the first id has
    # the "ENSG" prefix: retry against human.
    if len(genes) == 0 and var_names[0].startswith("ENSG"):
        genes += lb.Gene.from_values(var_names, field=gene_field, organism="human")

    # Add genes of the extra sources when ids with their prefix are present.
    for prefix, organism_attr in extra_gene_sources:
        if len(var_names[var_names.str.startswith(prefix)]) > 0:
            genes += lb.Gene.from_values(
                var_names,
                field=gene_field,
                organism=getattr(organisms, organism_attr),
            )

    # Attach the measured genes to the artifact as its "var" feature set.
    var_feature_set_file = ln.FeatureSet(genes, type="number")
    var_feature_set_file.save()
    file.feature_sets.add(var_feature_set_file, through_defaults={"slot": "var"})

In [None]:
# `file` still refers to the last artifact visited by the preceding loop;
# describe it as a spot check of the annotations.
file.describe()

## Annotate tissue_type

Before CxG schema 4.0, the `tissue_type` column was not annotated; instead, "cell culture" or "organoid" was appended to the `ontology_id` of the tissue record.

In [None]:
# Register the three tissue types as ULabels and group them below the
# "is_tissue_type" parent label.
tissue_type_names = ("tissue", "organoid", "cell culture")
tissue_types = [ln.ULabel(name=type_name) for type_name in tissue_type_names]
ln.save(tissue_types)

is_tissue_type = ln.ULabel.filter(name="is_tissue_type").one()
is_tissue_type.children.add(*tissue_types)

In [None]:
# Lookup objects accessed by attribute in the cells below
# (`tissue_types.organoid`, `tissue_types.cell_culture`, `features.tissue_type`).
is_tissue_type = ln.ULabel.filter(name="is_tissue_type").one()
tissue_types = is_tissue_type.children.lookup()
features = ln.Feature.lookup()

In [None]:
# Tissue records whose ontology_id contains the marker "organoid".
organoids = lb.Tissue.filter(ontology_id__contains="organoid").all()
organoids.df()

In [None]:
for record in organoids:
    print(record.name)
    # The placeholder's ontology_id is the base id followed by a space and the
    # "organoid" marker; keep only the base id.
    ontology_id = record.ontology_id.split(" ")[0]
    tissue_record = lb.Tissue.from_bionty(ontology_id=ontology_id)
    # Only save the base tissue if it is not in the registry yet.
    if tissue_record._state.adding:
        tissue_record.save()
    # Tag the artifacts linked to the organoid placeholder itself. Iterating
    # `tissue_record.artifacts` would instead tag the artifacts annotated with
    # the plain tissue and leave the organoid artifacts untagged.
    # NOTE(review): `tissue_record` is not linked to these artifacts here;
    # confirm whether that link should be added before the placeholders are
    # deleted.
    for f in record.artifacts.all():
        f.labels.add(tissue_types.organoid, features.tissue_type)

In [None]:
# Remove the organoid placeholder tissue records selected above.
organoids.delete()

In [None]:
# Tissue records whose ontology_id contains the marker "cell culture".
cell_cultures = lb.Tissue.filter(ontology_id__contains="cell culture").all()
cell_cultures.df()

In [None]:
for record in cell_cultures:
    print(record.name)
    # The placeholder's ontology_id is the base id followed by a space and the
    # "cell culture" marker; keep only the base id.
    ontology_id = record.ontology_id.split(" ")[0]
    # The base id is resolved in the CellType registry.
    tissue_record = lb.CellType.from_bionty(ontology_id=ontology_id)
    # Only save the base record if it is not in the registry yet.
    if tissue_record._state.adding:
        tissue_record.save()
    # Tag the artifacts linked to the cell-culture placeholder itself.
    # Iterating `tissue_record.artifacts` would instead tag the artifacts
    # annotated with that cell type and leave the cell-culture artifacts
    # untagged.
    # NOTE(review): `tissue_record` is not linked to these artifacts here;
    # confirm whether that link should be added before the placeholders are
    # deleted.
    for f in record.artifacts.all():
        f.labels.add(tissue_types.cell_culture, features.tissue_type)

In [None]:
# Remove the cell-culture placeholder tissue records selected above.
cell_cultures.delete()

## Link metadata to collection

In [None]:
# Fetch the collection registered for this Census release by name and version.
collection = ln.Collection.filter(name="cellxgene-census", version=census_version).one()

Feature sets:

In [None]:
# Attach each feature set to the collection under its slot. A feature set is
# identified by a fragment of its name, which must match exactly one record.
for name_fragment, slot in (
    ("obs", "obs"),
    ("ext", "external"),
    ("human", "var-human"),
    ("mouse", "var-mouse"),
    ("sars-2", "var-sars-cov-2"),
    ("synthetic construct", "var-ercc"),
):
    feature_set = ln.FeatureSet.filter(name__contains=name_fragment).one()
    collection.feature_sets.add(feature_set, through_defaults={"slot": slot})

In [None]:
def _children_with_artifacts(parent):
    """Return the children of ``parent`` that are linked to an artifact."""
    return parent.children.all().filter().exclude(artifacts=None).all()


# ULabel-based categories: take the children of each parent label.
is_donor = ln.ULabel.filter(name="is_donor").one()
donors = _children_with_artifacts(is_donor)
is_collection = ln.ULabel.filter(name="is_collection").one()
collections = _children_with_artifacts(is_collection)
is_suspension_type = ln.ULabel.filter(name="is_suspension_type").one()
stypes = _children_with_artifacts(is_suspension_type)
is_tissue_type = ln.ULabel.filter(name="is_tissue_type").one()
tissue_types = _children_with_artifacts(is_tissue_type)

for ulabels, feature in (
    (donors, features.donor_id),
    (collections, features.collection),
    (stypes, features.suspension_type),
    (tissue_types, features.tissue_type),
):
    collection.labels.add(ulabels, feature)

# Ontology-backed categories: every record that is linked to an artifact.
for registry, feature in (
    (lb.ExperimentalFactor, features.assay),
    (lb.CellType, features.cell_type),
    (lb.DevelopmentalStage, features.development_stage),
    (lb.Disease, features.disease),
    (lb.Ethnicity, features.self_reported_ethnicity),
    (lb.Phenotype, features.sex),
    (lb.Tissue, features.tissue),
):
    collection.labels.add(registry.filter().exclude(artifacts=None).all(), feature)

In [None]:
# Review the feature sets and labels now linked to the Census collection.
collection.describe()

## Register collections

In [None]:
# Create one lamindb collection per CELLxGENE collection label that has
# artifacts linked to it.
for i, ulabel in enumerate(is_collection.children.all()):
    if i % 20 == 0:
        print(i)

    artifacts = ulabel.artifacts.all()
    n_artifacts = artifacts.count()
    if n_artifacts == 0:
        # No artifact in this release belongs to the collection.
        continue
    if n_artifacts == 1:
        # A single artifact is passed as a record, not as a queryset.
        artifacts = artifacts[0]

    collection = ln.Collection(
        artifacts,
        name=ulabel.name,
        description=ulabel.description,
        reference=ulabel.reference,
        reference_type="CELLxGENE Collection ID",
    )
    collection.save()