# Register files from Census release 2023-12-06

In [1]:
import lamindb as ln
import lnschema_bionty as lb
import pandas as pd

2023-12-13 12:03:04,521:INFO - NumExpr defaulting to 2 threads.


💡 lamindb instance: laminlabs/cellxgene-latest


In [2]:
# Census release to register; used below to build the S3 path and to version the dataset.
census_version = "2023-12-06"

In [3]:
# Folder in the public bucket that holds the h5ad files of this Census release.
s3path = f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
# Display the folder listing as a sanity check before registering anything.
ln.UPath(s3path).view_tree()

h5ads (0 sub-directories & 1139 files with suffixes '.h5ad'): 
├── 00099d5e-154f-4a7a-aa8d-fa30c8c0c43c.h5ad
├── 0041b9c3-6a49-4bf7-8514-9bc7190067a7.h5ad
├── 00476f9f-ebc1-4b72-b541-32f912ce36ea.h5ad
├── 00e5dedd-b9b7-43be-8c28-b0e5c6414a62.h5ad
├── 00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad
├── 01209dce-3575-4bed-b1df-129f57fbc031.h5ad
...


In [4]:
# Track this notebook run; the log below shows the Transform and Run records that were loaded.
ln.context.track()

💡 notebook imports: lamindb==0.64.0 lnschema_bionty==0.36.1 pandas==1.4.4 requests==2.31.0
💡 loaded: Transform(uid='4hbVFAINneXCz8', name='Register files from Census release 2023-12-06', short_name='census-release-2023-12-06', version='0', type='notebook', updated_at=2023-12-11 15:39:44 UTC, created_by_id=2)
💡 loaded: Run(uid='yq2FEOYiiNwTV6HJRReE', run_at=2023-12-13 12:03:11 UTC, transform_id=1, created_by_id=2)


## Register artifacts (files)

In [5]:
# Build artifact records from the files under s3path, then save them in one call.
artifacts = ln.Artifact.from_dir(s3path)
ln.save(artifacts)

In [6]:
# Group the registered artifacts into one dataset, versioned by the Census release.
dataset = ln.Dataset(artifacts, name="cellxgene-census", version=census_version)
dataset.save()

## Register metadata

Get all datasets and their associated metadata using the CELLxGENE REST API:

In [5]:
import requests


def get_datasets_df_from_cxg(timeout: float = 60):
    """Fetch the metadata of all datasets from the CELLxGENE curation API.

    Despite the ``_df`` in the name, the decoded JSON payload is returned
    as-is; callers in this notebook index it like a list of dicts.

    Args:
        timeout: Seconds to wait for the server to connect or send data
            before giving up. Without a timeout the request could block the
            notebook indefinitely.

    Returns:
        The decoded JSON body of the ``/curation/v1/datasets`` endpoint.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    api_url_base = "https://api.cellxgene.cziscience.com"
    datasets_path = "/curation/v1/datasets"
    datasets_url = f"{api_url_base}{datasets_path}"
    headers = {"Content-Type": "application/json"}
    response = requests.get(url=datasets_url, headers=headers, timeout=timeout)
    # fail loudly on 4xx/5xx instead of trying to decode an error page
    response.raise_for_status()
    return response.json()

In [6]:
# Fetch the dataset metadata once; the last expression displays the number of entries.
cellxgene_meta = get_datasets_df_from_cxg()
len(cellxgene_meta)

1152

In [7]:
# Inspect which metadata fields the first dataset entry carries.
cellxgene_meta[0].keys()

dict_keys(['assay', 'assets', 'batch_condition', 'cell_count', 'cell_type', 'citation', 'collection_doi', 'collection_id', 'collection_name', 'collection_version_id', 'dataset_id', 'dataset_version_id', 'development_stage', 'disease', 'donor_id', 'explorer_url', 'is_primary_data', 'mean_genes_per_cell', 'organism', 'primary_cell_count', 'processing_status', 'published_at', 'revised_at', 'schema_version', 'self_reported_ethnicity', 'sex', 'suspension_type', 'tissue', 'title', 'tombstone', 'x_approximate_distribution'])

### Register new features and parent labels

In [10]:
# Each obs-level metadata field mapped to the registry that holds its categories.
obs_features = {
    "assay": "bionty.ExperimentalFactor",
    "cell_type": "bionty.CellType",
    "development_stage": "bionty.DevelopmentalStage",
    "disease": "bionty.Disease",
    "donor_id": "core.ULabel",
    "self_reported_ethnicity": "bionty.Ethnicity",
    "sex": "bionty.Phenotype",
    "suspension_type": "core.ULabel",
    "tissue": "bionty.Tissue",
    "tissue_type": "core.ULabel",
}

# One categorical Feature per field, saved in bulk.
obs_features_records = [
    ln.Feature(name=feature_name, type="category", registries=registry_name)
    for feature_name, registry_name in obs_features.items()
]
ln.save(obs_features_records)

# Bundle the features into a feature set and link it to every artifact's "obs" slot.
obs_feature_set = ln.FeatureSet(name="obs features", features=obs_features_records)
obs_feature_set.save()
obs_feature_set.artifacts.set(artifacts, through_defaults={"slot": "obs"})

In [11]:
# Dataset-level (non-obs) fields mapped to the registry that holds their categories.
ext_features = {"organism": "bionty.Organism", "collection": "core.ULabel"}

# One categorical Feature per field, saved in bulk.
ext_features_records = [
    ln.Feature(name=feature_name, type="category", registries=registry_name)
    for feature_name, registry_name in ext_features.items()
]
ln.save(ext_features_records)

# Bundle the features into a feature set and link it to every artifact's "external" slot.
ext_feature_set = ln.FeatureSet(name="external features", features=ext_features_records)
ext_feature_set.save()
ext_feature_set.artifacts.set(artifacts, through_defaults={"slot": "external"})

In [12]:
# Parent labels under which the individual collections, donors, suspension
# types and tissue types are grouped as children later in the notebook.
for parent_name, parent_description in (
    ("is_collection", "parents of collections"),
    ("is_donor", "parents of donors"),
    ("is_suspension_type", "parents of suspension types"),
    ("is_tissue_type", "parents of tissue types"),
):
    ln.ULabel(name=parent_name, description=parent_description).save()

In [8]:
# Lookup object for registered features (accessed by attribute, e.g. features.tissue_type).
features = ln.Feature.lookup()
# Query set of the files whose key contains the release version; later cells filter it per dataset_id.
artifacts = ln.File.filter(key__contains=census_version).all()

## collections, organisms

In [12]:
# register all collections
is_collection = ln.ULabel.filter(name="is_collection").one()

# A set de-duplicates the (name, doi, id) triples shared by datasets of one collection.
collections_meta = {
    (meta["collection_name"], meta["collection_doi"], meta["collection_id"])
    for meta in cellxgene_meta
}

# One ULabel per collection: the DOI goes into the description, the CELLxGENE id into the reference.
collections_records = [
    ln.ULabel(
        name=name_,
        description=doi_,
        reference=id_,
        reference_type="collection_id",
    )
    for name_, doi_, id_ in collections_meta
]
ln.save(collections_records)
is_collection.children.add(*collections_records)

In [9]:
# register all organisms
ncbitaxon_source = lb.BiontySource.filter(source="ncbitaxon").one()

# Unique organism ontology ids across all datasets.
organisms_meta = {
    organism_entry["ontology_term_id"]
    for meta in cellxgene_meta
    for organism_entry in meta["organism"]
}

organisms_records = lb.Organism.from_values(
    organisms_meta, field=lb.Organism.ontology_id, bionty_source=ncbitaxon_source
)
# rename house mouse to mouse
for record in organisms_records:
    if record.name == "house mouse":
        record.name = "mouse"
ln.save(organisms_records, parents=False)

Link collections and organisms to artifacts:

In [15]:
ext_feature_set = ln.FeatureSet.filter(name="external features").one()
ext_features = ext_feature_set.members.lookup()
collections = is_collection.children.all()
organisms = lb.Organism.filter().all()

for dataset_meta in cellxgene_meta:
    # The file key contains the dataset_id; datasets without a registered file are skipped.
    dataset_id = dataset_meta["dataset_id"]
    file = artifacts.filter(key__contains=dataset_id).one_or_none()
    if file is not None:
        # Link the collection label, looked up by its CELLxGENE collection id.
        collection = ln.ULabel.filter(reference=dataset_meta["collection_id"]).one()
        file.labels.add(collection, feature=ext_features.collection)

        # Link every organism listed for the dataset.
        organism_ontology_ids = [
            entry["ontology_term_id"] for entry in dataset_meta["organism"]
        ]
        organism_records = lb.Organism.filter(
            ontology_id__in=organism_ontology_ids
        ).list()
        file.labels.add(organism_records, feature=ext_features.organism)

## ontologies

Register all ontology ids:

In [20]:
from typing import Optional
from lnschema_bionty.models import Registry
from lamindb.dev._feature_manager import get_accessor_by_orm

obs_feature_set = ln.FeatureSet.filter(name="obs features").one()
obs_features_records = obs_feature_set.members.lookup()
ACCESSORS = get_accessor_by_orm(ln.File)


def _accessor_and_registry(feature_name):
    """Return the (accessor name on ln.File, registry class) pair for an obs feature."""
    feature_record = getattr(obs_features_records, feature_name)
    accessor_name = ACCESSORS.get(feature_record.registries)
    relation_field = getattr(ln.File, accessor_name).field
    registry = relation_field.model
    # TODO: ulabels are defined in the File model, improve this in LaminDB
    if registry == ln.File:
        registry = relation_field.related_model
    return accessor_name, registry


# feature name -> (accessor name, registry class), used when registering and linking labels
FEATURE_TO_ACCESSOR = {
    feature_name: _accessor_and_registry(feature_name)
    for feature_name in obs_features.keys()
}


def create_ontology_record_from_source(
    ontology_id: str,
    from_orm: "type[Registry]",
    target_orm: "type[Registry]",
    bionty_source: Optional["lb.BiontySource"] = None,
):
    """Build a ``target_orm`` record from a term found in another registry.

    The term is looked up with ``from_orm.from_bionty`` and its name,
    description, ontology id and bionty source id are copied into a new,
    unsaved ``target_orm`` record.

    Args:
        ontology_id: Ontology id of the term to copy.
        from_orm: Registry class in which the term is looked up.
        target_orm: Registry class of the record to create.
        bionty_source: Optional source passed through to ``from_bionty``.

    Returns:
        The unsaved ``target_orm`` record, or ``None`` when the term was not
        found or the record could not be constructed. Callers must drop
        ``None`` results before saving.
    """
    from_record = from_orm.from_bionty(
        ontology_id=ontology_id, bionty_source=bionty_source
    )
    if from_record is None:
        # best effort: report the miss instead of failing the whole registration
        source_name = getattr(from_orm, "__name__", from_orm)
        print(f"could not find {ontology_id} in {source_name}, skipping")
        return None
    try:
        return target_orm(
            name=from_record.name,
            description=from_record.description,
            ontology_id=from_record.ontology_id,
            bionty_source_id=from_record.bionty_source_id,
        )
    except Exception as error:
        # keep the original best-effort behaviour, but make the failure visible
        print(f"could not create a record for {ontology_id}: {error!r}")
        return None

In [22]:
# Show the obs feature names that the ontology registration loop iterates over.
obs_features.keys()

dict_keys(['assay', 'cell_type', 'development_stage', 'disease', 'donor_id', 'self_reported_ethnicity', 'sex', 'suspension_type', 'tissue', 'tissue_type'])

In [21]:
# NOTE(review): presumably disables the name search that runs when a record is created — confirm
ln.settings.upon_create_search_names = False

# Collect the (label, ontology_term_id) pairs used by any dataset, per feature.
# donor_id, suspension_type and tissue_type are skipped here; they are handled in later cells.
ontology_ids = {}
for name in obs_features.keys():
    if name in ["donor_id", "suspension_type", "tissue_type"]:
        continue
    allids = set()
    for i in cellxgene_meta:
        if name in i:
            allids.update([(j["label"], j["ontology_term_id"]) for j in i[name]])

    ontology_ids[name] = allids

bionty_source_ds_mouse = lb.BiontySource.filter(
    entity="DevelopmentalStage", organism="mouse"
).one()
bionty_source_pato = lb.BiontySource.filter(source="pato").one()

# register all ontology ids
for name, terms in ontology_ids.items():
    print(f"registering {name}")
    # index instead of .get(): a missing feature now raises a KeyError naming it
    _, orm = FEATURE_TO_ACCESSOR[name]
    terms_ids = [i[1] for i in terms]
    # first pass: everything that can be created from the registry's default source
    records = orm.from_values(terms_ids, field="ontology_id")
    if len(records) > 0:
        ln.save(records)
    # second pass: handle the ids that are still not validated
    inspect_result = orm.inspect(terms_ids, field="ontology_id", mute=True)
    if len(inspect_result.non_validated) > 0:
        if name == "development_stage":
            # retry with the mouse developmental stage source
            records = orm.from_values(
                inspect_result.non_validated,
                field="ontology_id",
                bionty_source=bionty_source_ds_mouse,
            )
            # UBERON terms are copied over from the Tissue registry
            records += [
                create_ontology_record_from_source(
                    ontology_id=term_id, from_orm=lb.Tissue, target_orm=orm
                )
                for term_id in inspect_result.non_validated
                if term_id.startswith("UBERON:")
            ]
            # "unknown" is registered verbatim as both name and ontology id
            records += [
                orm(name=term_id, ontology_id=term_id)
                for term_id in inspect_result.non_validated
                if term_id == "unknown"
            ]
        else:
            # non-PATO leftovers are registered with the label given in the metadata
            records = [
                orm(name=term[0], ontology_id=term[1])
                for term in terms
                if (not term[1].startswith("PATO:"))
                and (term[1] in inspect_result.non_validated)
            ]
            # PATO terms are copied over from the Phenotype registry
            records += [
                create_ontology_record_from_source(
                    ontology_id=term_id,
                    from_orm=lb.Phenotype,
                    target_orm=orm,
                    bionty_source=bionty_source_pato,
                )
                for term_id in inspect_result.non_validated
                if term_id.startswith("PATO:")
            ]

        # create_ontology_record_from_source returns None when it cannot build
        # a record; such entries must not be counted or passed to ln.save
        records = [record for record in records if record is not None]

        if len(records) > 0:
            print(f"registered {len(records)} records: {records}")
            ln.save(records)

registering assay
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registering cell_type
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registering development_stage
❗ [1;91mdid not create[0m DevelopmentalStage records for [1;93m57 non-validated[0m [3montology_ids[0m: [1;93m'MmusDv:0000021', 'MmusDv:0000024', 'MmusDv:0000025', 'MmusDv:0000026', 'MmusDv:0000027', 'MmusDv:0000028', 'MmusDv:0000029', 'MmusDv:0000032', 'MmusDv:0000033', 'MmusDv:0000034', 'MmusDv:0000035', 'MmusDv:0000036', 'MmusDv:0000037', 'MmusDv:0000041', 'MmusDv:0000046', 'MmusDv:0000048', 'MmusDv:0000049', 'MmusDv:0000050', 'MmusDv:0000051', 'MmusDv:0000052', ...[0m
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
❗ [1;91mdid not create[0m DevelopmentalStage records for [1;93m6 non-validated[0m [3montology_ids[0m: [1;93m'UBERON:0000113', 'UBERON:0007220', 'UBERON:0007222'

## donors and suspension_types

In [23]:
# Unique donor ids and suspension types across all datasets; either field may be absent.
donor_ids = set()
suspension_types = set()

for meta in cellxgene_meta:
    for field_name, collected in (
        ("donor_id", donor_ids),
        ("suspension_type", suspension_types),
    ):
        if field_name in meta:
            collected.update(meta[field_name])

In [24]:
def _register_missing_children(parent, names):
    """Create ULabels for the names not yet among ``parent``'s children and attach them.

    Returns the query set of existing children and the list of new records.
    """
    existing = parent.children.all()
    inspection = existing.inspect(names, mute=True)
    created = [ln.ULabel(name=name) for name in inspection.non_validated]
    ln.save(created)
    parent.children.add(*created)
    return existing, created


is_donor = ln.ULabel.filter(name="is_donor").one()
donors, new_donors = _register_missing_children(is_donor, donor_ids)

is_suspension_type = ln.ULabel.filter(name="is_suspension_type").one()
stypes, new_stypes = _register_missing_children(is_suspension_type, suspension_types)

## Annotate artifacts with obs metadata

In [25]:
# Display the feature -> (accessor name, registry class) mapping used by the annotation loop.
FEATURE_TO_ACCESSOR

{'assay': ('experimental_factors', lnschema_bionty.models.ExperimentalFactor),
 'cell_type': ('cell_types', lnschema_bionty.models.CellType),
 'development_stage': ('developmental_stages',
  lnschema_bionty.models.DevelopmentalStage),
 'disease': ('diseases', lnschema_bionty.models.Disease),
 'donor_id': ('ulabels', lnschema_core.models.ULabel),
 'self_reported_ethnicity': ('ethnicities', lnschema_bionty.models.Ethnicity),
 'sex': ('phenotypes', lnschema_bionty.models.Phenotype),
 'suspension_type': ('ulabels', lnschema_core.models.ULabel),
 'tissue': ('tissues', lnschema_bionty.models.Tissue),
 'tissue_type': ('ulabels', lnschema_core.models.ULabel)}

In [29]:
features = ln.Feature.lookup()

n_datasets = len(cellxgene_meta)
for idx, dataset_meta in enumerate(cellxgene_meta):
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {n_datasets}")
    # datasets without a registered file are skipped
    file = artifacts.filter(key__contains=dataset_meta["dataset_id"]).one_or_none()
    if file is None:
        continue
    for field, terms in dataset_meta.items():
        # only fields that were registered as obs features are linked
        if field in FEATURE_TO_ACCESSOR:
            accessor, orm = FEATURE_TO_ACCESSOR.get(field)
            if field not in ["donor_id", "suspension_type", "tissue_type"]:
                # ontology-backed fields: link the records through the registry accessor
                term_ids = [term["ontology_term_id"] for term in terms]
                records = orm.from_values(term_ids, field="ontology_id")
                if len(records) > 0:
                    getattr(file, accessor).add(*records)
            else:
                # plain labels are matched by name
                records = orm.from_values(terms, field="name")
                if len(records) > 0:
                    # stratify by feature so that link tables records are written
                    file.labels.add(records, feature=getattr(features, field))

annotating dataset 0 of 1152
annotating dataset 100 of 1152
annotating dataset 200 of 1152
annotating dataset 300 of 1152
annotating dataset 400 of 1152
annotating dataset 500 of 1152
annotating dataset 600 of 1152
annotating dataset 700 of 1152
annotating dataset 800 of 1152
annotating dataset 900 of 1152
annotating dataset 1000 of 1152
annotating dataset 1100 of 1152


Clean up the 2 "unknowns" in DevelopmentalStage:

In [None]:
# Delete DevelopmentalStage records that are named "unknown" but whose ontology_id is not "unknown".
lb.DevelopmentalStage.filter(name="unknown").exclude(ontology_id="unknown").delete()

## Validate and register genes

In [30]:
# register synthetic constructs and sars_cov_2 as new organisms
new_organisms = lb.Organism.from_values(
    ["NCBITaxon:32630", "NCBITaxon:2697049"],
    field=lb.Organism.ontology_id,
    bionty_source=ncbitaxon_source,
)
ln.save(new_organisms, parents=False)

# genes files
# Lookup of organisms by scientific name; the keys of genes_files below are
# attribute names on this lookup (see getattr(organisms, organism_name) in the next cell).
organisms = lb.Organism.lookup(field=lb.Organism.scientific_name)
# Gene reference CSVs shipped with the cellxgene schema CLI, one per organism.
genes_files = {
    "homo_sapiens": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_homo_sapiens.csv.gz",
    "mus_musculus": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_mus_musculus.csv.gz",
    "synthetic_construct": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_ercc.csv.gz",
    "severe_acute_respiratory_syndrome_coronavirus_2": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_sars_cov_2.csv.gz",
}

Register all genes for each organism:

In [34]:
for organism_name, genes_file in genes_files.items():
    print(f"registering {organism_name} genes")
    # the CSV has no header; the first column (gene ids) becomes the index
    df = pd.read_csv(genes_file, header=None, index_col=0)
    organism_record = getattr(organisms, organism_name)
    gene_lookup = dict(field=lb.Gene.ensembl_gene_id, organism=organism_record)

    # genes that can be created from the registry's source
    gene_records = lb.Gene.from_values(df.index, **gene_lookup)
    ln.save(gene_records)

    # register legacy genes manually
    validated = lb.Gene.validate(df.index, **gene_lookup)
    new_records = [
        lb.Gene(
            ensembl_gene_id=gene_id,
            symbol=df.loc[gene_id][1],  # column 1 is read as the gene symbol
            organism=organism_record,
        )
        for gene_id in df.index[~validated]
    ]
    ln.save(new_records)

    # one feature set holding every gene of the organism
    genes_feature_set = ln.FeatureSet(
        name=f"all {organism_record.name} genes", features=gene_records + new_records
    )
    genes_feature_set.save()

registering homo_sapiens genes
❗ [1;91mdid not create[0m Gene records for [1;93m147 non-validated[0m [3mensembl_gene_ids[0m: [1;93m'ENSG00000112096', 'ENSG00000137808', 'ENSG00000161149', 'ENSG00000182230', 'ENSG00000203812', 'ENSG00000204092', 'ENSG00000205485', 'ENSG00000212951', 'ENSG00000215271', 'ENSG00000221995', 'ENSG00000224739', 'ENSG00000224745', 'ENSG00000225178', 'ENSG00000225932', 'ENSG00000226377', 'ENSG00000226380', 'ENSG00000226403', 'ENSG00000227021', 'ENSG00000227220', 'ENSG00000227902', ...[0m
❗ [1;93m147 terms[0m (0.20%) are not validated for [3mensembl_gene_id[0m: [1;93mENSG00000269933, ENSG00000261737, ENSG00000259834, ENSG00000256374, ENSG00000263464, ENSG00000203812, ENSG00000272196, ENSG00000272880, ENSG00000284299, ENSG00000270188, ENSG00000287116, ENSG00000237133, ENSG00000224739, ENSG00000227902, ENSG00000239467, ENSG00000272551, ENSG00000280374, ENSG00000284741, ENSG00000236886, ENSG00000229352, ...[0m
registering mus_musculus genes
❗ [1;91md

## Link metadata to individual artifacts

Annotate each file with the genes measured in it:

In [26]:
# Lookup of organisms by scientific name (e.g. organisms.synthetic_construct in the next cell).
organisms = lb.Organism.lookup(field=lb.Organism.scientific_name)

In [27]:
for idx, file in enumerate(artifacts):
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(artifacts)}")

    # read the var names through the backed accessor
    adata_backed = file.backed()
    var_names = adata_backed.var_names
    organism_record = file.organism.first()
    if organism_record is None:
        print(f"No organism found for file: {file}")
        continue

    gene_field = lb.Gene.ensembl_gene_id
    genes = lb.Gene.from_values(var_names, field=gene_field, organism=organism_record)

    # fall back to human when nothing matched but the ids look like human Ensembl ids
    if len(genes) == 0 and var_names[0].startswith("ENSG"):
        genes += lb.Gene.from_values(var_names, field=gene_field, organism="human")

    # spike-in (ERCC) and SARS-CoV-2 genes live under their own organisms
    for prefix, organism_attr in (
        ("ERCC", "synthetic_construct"),
        ("ENSSASG", "severe_acute_respiratory_syndrome_coronavirus_2"),
    ):
        prefixed = var_names[var_names.str.startswith(prefix)]
        if len(prefixed) > 0:
            genes += lb.Gene.from_values(
                var_names,
                field=gene_field,
                organism=getattr(organisms, organism_attr),
            )

    # link the genes measured in this file as its "var" feature set
    var_feature_set_file = ln.FeatureSet(genes, type="number")
    var_feature_set_file.save()
    file.feature_sets.add(var_feature_set_file, through_defaults={"slot": "var"})

annotating dataset 0 of 12


In [30]:
# Show what is now attached to `file`, the last file visited by the loop above.
file.describe()

[1;92mFile[0m(uid='8aIkAQpSXAWvebiuOT53', key='cell-census/2023-12-06/h5ads/ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded.h5ad', suffix='.h5ad', accessor='AnnData', size=339098252, hash='wk4aVyHI7iZWNq2n99_s4w-41', hash_type='md5-n', visibility=1, key_is_virtual=False, updated_at=2023-12-11 15:46:45 UTC)

[1;92mProvenance[0m:
  🗃️ storage: Storage(uid='vm6fiuHv', root='s3://cellxgene-data-public', type='s3', region='us-west-2', updated_at=2023-12-11 15:39:59 UTC, created_by_id=2)
  📔 transform: Transform(uid='4hbVFAINneXCz8', name='Register files from Census release 2023-12-06', short_name='census-release-2023-12-06', version='0', type='notebook', updated_at=2023-12-11 15:39:44 UTC, created_by_id=2)
  👣 run: Run(uid='yq2FEOYiiNwTV6HJRReE', run_at=2023-12-13 10:17:40 UTC, transform_id=1, created_by_id=2)
  👤 created_by: User(uid='kmvZDIX9', handle='sunnyosun', name='Sunny Sun', updated_at=2023-12-12 10:05:51 UTC)
[1;92mFeatures[0m:
  [1mobs[0m: FeatureSet(uid='VOhmBdxtNgUpMiUUMR56', name

## Annotate tissue_type

Before CxG schema 4.0, the tissue_type column was not annotated. Instead, "cell culture" or "organoid" was appended to the ontology_id of the tissue record, e.g. "UBERON:0000088 (organoid)".

In [19]:
# The tissue type labels, grouped under the "is_tissue_type" parent.
tissue_types = [
    ln.ULabel(name=type_name) for type_name in ("tissue", "organoid", "cell culture")
]
ln.save(tissue_types)

is_tissue_type = ln.ULabel.filter(name="is_tissue_type").one()
is_tissue_type.children.add(*tissue_types)

In [10]:
# Re-fetch the parent label and replace `tissue_types` with an attribute lookup
# over its children (used as tissue_types.organoid / tissue_types.cell_culture).
is_tissue_type = ln.ULabel.filter(name="is_tissue_type").one()
tissue_types = is_tissue_type.children.lookup()
# Lookup of features by name (used as features.tissue_type).
features = ln.Feature.lookup()

In [6]:
# Tissue records whose ontology_id contains "organoid", e.g. "UBERON:0000088 (organoid)".
organoids = lb.Tissue.filter(ontology_id__contains="organoid").all()
organoids.df()

Unnamed: 0_level_0,uid,name,ontology_id,abbr,synonyms,description,bionty_source_id,updated_at,created_by_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
692,x3tRcugV,trophoblast (organoid),UBERON:0000088 (organoid),,,,,2023-12-11 19:11:55.832890+00:00,2
693,uS0Cw8zN,retina (organoid),UBERON:0000966 (organoid),,,,,2023-12-11 19:11:55.832921+00:00,2
697,RkE6D8y1,endometrium (organoid),UBERON:0001295 (organoid),,,,,2023-12-11 19:11:55.833155+00:00,2
699,K4RSNRBc,thymus (organoid),UBERON:0002370 (organoid),,,,,2023-12-11 19:11:55.833223+00:00,2
701,WSs6UA9e,lung (organoid),UBERON:0002048 (organoid),,,,,2023-12-11 19:11:55.833293+00:00,2


In [27]:
# Replace each "<tissue> (organoid)" placeholder by the clean tissue record plus
# a tissue_type=organoid label on the files that carried the placeholder.
for record in organoids:
    print(record.name)
    # "UBERON:0000088 (organoid)" -> "UBERON:0000088"
    ontology_id = record.ontology_id.split(" ")[0]
    tissue_record = lb.Tissue.from_bionty(ontology_id=ontology_id)
    if tissue_record._state.adding:
        tissue_record.save()
    # The files to relabel are the ones linked to the placeholder record. Files
    # linked to the clean tissue record are real tissue samples and must not be
    # tagged as organoids.
    for f in record.artifacts.all():
        # keep the tissue annotation once the placeholder record is deleted
        f.tissues.add(tissue_record)
        f.labels.add(tissue_types.organoid, features.tissue_type)

trophoblast (organoid)
retina (organoid)
endometrium (organoid)
thymus (organoid)
lung (organoid)


In [29]:
# Delete the "(organoid)" tissue records; the output lists the rows removed per table.
organoids.delete()

(10, {'lnschema_bionty.Tissue_files': 5, 'lnschema_bionty.Tissue': 5})

In [6]:
# Tissue records whose ontology_id contains "cell culture", e.g. "CL:0000084 (cell culture)".
cell_cultures = lb.Tissue.filter(ontology_id__contains="cell culture").all()
cell_cultures.df()

Unnamed: 0_level_0,uid,name,ontology_id,abbr,synonyms,description,bionty_source_id,updated_at,created_by_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
691,rIPA0OEl,T cell (cell culture),CL:0000084 (cell culture),,,,,2023-12-11 19:11:55.832859+00:00,2
689,vg9s890t,respiratory basal cell (cell culture),CL:0002633 (cell culture),,,,,2023-12-11 19:11:55.832782+00:00,2
690,lfIFQFR5,epithelial cell of lung (cell culture),CL:0000082 (cell culture),,,,,2023-12-11 19:11:55.832827+00:00,2
694,kWD0kb5x,brown preadipocyte (cell culture),CL:0002335 (cell culture),,,,,2023-12-11 19:11:55.833031+00:00,2
695,UoElNxsj,endothelial cell (cell culture),CL:0000115 (cell culture),,,,,2023-12-11 19:11:55.833064+00:00,2
696,7MzqN14b,bronchial epithelial cell (cell culture),CL:0002328 (cell culture),,,,,2023-12-11 19:11:55.833122+00:00,2
698,yPk6E1V8,epithelial cell of alveolus of lung (cell cult...,CL:0010003 (cell culture),,,,,2023-12-11 19:11:55.833189+00:00,2
700,9ICArUMH,embryonic stem cell (cell culture),CL:0002322 (cell culture),,,,,2023-12-11 19:11:55.833256+00:00,2
702,w6gzNa8D,mammary gland epithelial cell (cell culture),CL:0002327 (cell culture),,,,,2023-12-11 19:11:55.833349+00:00,2
703,Ash8pGf8,trophoblast cell (cell culture),CL:0000351 (cell culture),,,,,2023-12-11 19:11:55.833382+00:00,2


In [11]:
# Tag the files that carried a "<cell type> (cell culture)" placeholder tissue
# with tissue_type=cell culture and make sure the clean cell type is registered.
for record in cell_cultures:
    print(record.name)
    # "CL:0000084 (cell culture)" -> "CL:0000084"
    ontology_id = record.ontology_id.split(" ")[0]
    cell_type_record = lb.CellType.from_bionty(ontology_id=ontology_id)
    if cell_type_record._state.adding:
        cell_type_record.save()
    # The files to relabel are the ones linked to the placeholder tissue record.
    # Files linked to the clean cell type record are annotated with it as a
    # cell_type and are not necessarily cell cultures.
    # NOTE(review): the clean CL term is not linked to the file as its tissue
    # here; decide how to keep that annotation before deleting the placeholders.
    for f in record.artifacts.all():
        f.labels.add(tissue_types.cell_culture, features.tissue_type)

T cell (cell culture)
respiratory basal cell (cell culture)
epithelial cell of lung (cell culture)
brown preadipocyte (cell culture)
endothelial cell (cell culture)
bronchial epithelial cell (cell culture)
epithelial cell of alveolus of lung (cell culture)
embryonic stem cell (cell culture)
mammary gland epithelial cell (cell culture)
trophoblast cell (cell culture)
preadipocyte (cell culture)
cultured cell (cell culture)


In [14]:
# Delete the "(cell culture)" tissue records; the output lists the rows removed per table.
cell_cultures.delete()

(0, {})

## Link metadata to collection

In [15]:
# Fetch the collection named "cellxgene-census" for this release version.
collection = ln.Collection.filter(name="cellxgene-census", version=census_version).one()

feature sets:

In [16]:
# Link one feature set per slot; each pair is
# (fragment that must match exactly one feature set name, slot on the collection).
for name_fragment, slot in (
    ("obs", "obs"),
    ("ext", "external"),
    ("human", "var-human"),
    ("mouse", "var-mouse"),
    ("sars-2", "var-sars-cov-2"),
    ("synthetic construct", "var-ercc"),
):
    matching_feature_set = ln.FeatureSet.filter(name__contains=name_fragment).one()
    collection.feature_sets.add(matching_feature_set, through_defaults={"slot": slot})

In [17]:
def _parent_and_used_children(parent_name):
    """Return the parent ULabel and those of its children that are linked to an artifact."""
    parent = ln.ULabel.filter(name=parent_name).one()
    used_children = parent.children.all().filter().exclude(artifacts=None).all()
    return parent, used_children


is_donor, donors = _parent_and_used_children("is_donor")
is_collection, collections = _parent_and_used_children("is_collection")
is_suspension_type, stypes = _parent_and_used_children("is_suspension_type")
is_tissue_type, tissue_types = _parent_and_used_children("is_tissue_type")

# ULabel-backed features
for used_labels, feature_name in (
    (donors, "donor_id"),
    (collections, "collection"),
    (stypes, "suspension_type"),
    (tissue_types, "tissue_type"),
):
    collection.labels.add(used_labels, getattr(features, feature_name))

# ontology-backed features: every record that is linked to at least one artifact
for registry, feature_name in (
    (lb.ExperimentalFactor, "assay"),
    (lb.CellType, "cell_type"),
    (lb.DevelopmentalStage, "development_stage"),
    (lb.Disease, "disease"),
    (lb.Ethnicity, "self_reported_ethnicity"),
    (lb.Phenotype, "sex"),
    (lb.Tissue, "tissue"),
):
    collection.labels.add(
        registry.filter().exclude(artifacts=None).all(),
        getattr(features, feature_name),
    )

In [18]:
# Display the collection together with its provenance and linked features.
collection.describe()

[1;92mDataset[0m(uid='vAGS2R54eJGhRV6VWCYb', name='cellxgene-census', version='2023-12-06', hash='ak5599uHQCLwQNFgRusr', visibility=1, updated_at=2023-12-11 18:32:57 UTC)

[1;92mProvenance[0m:
  📔 transform: Transform(uid='4hbVFAINneXCz8', name='Register files from Census release 2023-12-06', short_name='census-release-2023-12-06', version='0', type='notebook', updated_at=2023-12-11 15:39:44 UTC, created_by_id=2)
  👣 run: Run(uid='yq2FEOYiiNwTV6HJRReE', run_at=2023-12-13 12:03:11 UTC, transform_id=1, created_by_id=2)
  👤 created_by: User(uid='kmvZDIX9', handle='sunnyosun', name='Sunny Sun', updated_at=2023-12-12 10:05:51 UTC)
[1;92mFeatures[0m:
  [1mobs[0m: FeatureSet(uid='VOhmBdxtNgUpMiUUMR56', name='obs features', n=10, registry='core.Feature', hash='vRtez9Dl4oTSutrbWK13', updated_at=2023-12-11 18:45:36 UTC, created_by_id=2)
    🔗 assay (36, [3mbionty.ExperimentalFactor[0m): 'BD Rhapsody Targeted mRNA', 'sci-RNA-seq3', '10x multiome', 'mCT-seq', 'DroNc-seq', 'MARS-seq', '10

## Register collections

In [None]:
# Register one Collection per CELLxGENE collection label that has artifacts.
# Loop-local names are used so that the notebook-level `artifacts` and
# `collection` variables are not overwritten by this cell.
for i, ulabel in enumerate(is_collection.children.all()):
    if i % 20 == 0:
        print(i)
    label_artifacts = ulabel.artifacts.all()
    # count once; every .count() call is evaluated against the query set again
    n_artifacts = label_artifacts.count()
    if n_artifacts == 0:
        continue
    # a lone artifact is passed as a record rather than as a query set
    data = label_artifacts[0] if n_artifacts == 1 else label_artifacts
    cxg_collection = ln.Collection(
        data,
        name=ulabel.name,
        description=ulabel.description,
        reference=ulabel.reference,
        reference_type="CELLxGENE Collection ID",
    )
    cxg_collection.save()