# Register files from Census release 2023-11-13

In [1]:
import lamindb as ln
import lnschema_bionty as lb

2023-11-22 14:45:12,929:INFO - NumExpr defaulting to 2 threads.


💡 lamindb instance: theislab/nicheformer


In [18]:
# Census release tag: used to build the S3 path, to select the registered files,
# and as the version of the lamindb Dataset.
census_version = "2023-11-13"

In [4]:
# Public bucket folder holding one .h5ad file per dataset of this release.
s3path = f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
# Display the folder content as a tree.
ln.UPath(s3path).view_tree()

h5ads (0 sub-directories & 1122 files with suffixes '.h5ad'): 
├── 00099d5e-154f-4a7a-aa8d-fa30c8c0c43c.h5ad
├── 0041b9c3-6a49-4bf7-8514-9bc7190067a7.h5ad
├── 00476f9f-ebc1-4b72-b541-32f912ce36ea.h5ad
├── 00e5dedd-b9b7-43be-8c28-b0e5c6414a62.h5ad
├── 00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad
├── 01209dce-3575-4bed-b1df-129f57fbc031.h5ad
...


In [4]:
# Track this notebook run in lamindb; the output shows the Transform and Run
# records it loads.
ln.context.track()

💡 notebook imports: lamindb==0.61.0 lnschema_bionty==0.34.0 requests==2.31.0
💡 loaded: Transform(uid='4hbVFAINneXCz8', name='Register files from Census release 2023-11-13', short_name='census-release-2023-11', version='0', type='notebook', updated_at=2023-11-20 21:25:20 UTC, created_by_id=1)
💡 loaded: Run(uid='6ZcVntFiofyv226ywRDL', run_at=2023-11-21 20:58:33 UTC, transform_id=18, created_by_id=1)


## Register files

In [5]:
# Create one File record per object under the S3 folder, then persist them all.
files = ln.File.from_dir(s3path)
ln.save(files)

✅ created 1122 files from directory using storage s3://cellxgene-data-public and key = cell-census/2023-11-13/h5ads/


In [6]:
# Group all registered files into a single Dataset versioned by the Census release tag.
dataset = ln.Dataset(files, name="cellxgene-census", version=census_version)
dataset.save()

💡 initializing versioning for this dataset! create future versions of it using ln.Dataset(..., is_new_version_of=old_dataset)


## Register metadata

Get all datasets and associated metadata using cellxgene REST API:

In [5]:
import requests


def get_datasets_df_from_cxg(timeout: float = 60.0):
    """Fetch the metadata of all datasets from the CELLxGENE curation REST API.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests` can block forever on a stalled connection.

    Returns:
        The decoded JSON body of the response. Despite the function name this
        is not a DataFrame: the notebook uses it as a list of per-dataset dicts.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    api_url_base = "https://api.cellxgene.cziscience.com"
    datasets_path = "/curation/v1/datasets"
    datasets_url = f"{api_url_base}{datasets_path}"
    headers = {"Content-Type": "application/json"}
    res = requests.get(url=datasets_url, headers=headers, timeout=timeout)
    res.raise_for_status()
    res_content = res.json()
    return res_content

In [6]:
# Download the metadata of all datasets once; later cells iterate over `res_content`.
res_content = get_datasets_df_from_cxg()
# Number of datasets returned by the API.
len(res_content)

1130

In [7]:
res_content[0].keys()

dict_keys(['assay', 'assets', 'cell_count', 'cell_type', 'collection_doi', 'collection_id', 'collection_name', 'collection_version_id', 'dataset_id', 'dataset_version_id', 'development_stage', 'disease', 'donor_id', 'explorer_url', 'is_primary_data', 'mean_genes_per_cell', 'organism', 'primary_cell_count', 'processing_status', 'published_at', 'revised_at', 'schema_version', 'self_reported_ethnicity', 'sex', 'suspension_type', 'tissue', 'title', 'tombstone', 'x_approximate_distribution'])

In [19]:
# Lookup object giving attribute access to registered features (e.g. `features.organism`).
features = ln.Feature.lookup()
# Re-select the files of this release from the registry; their keys contain the release tag.
# NOTE: this rebinds `files` from the list created at registration to a query set.
files = ln.File.filter(key__contains=census_version).all()

### collections, organisms

In [20]:
# Parent label of all collection labels, and the collections already registered under it.
is_collection = ln.ULabel.filter(name="is_collection").one()
collections = is_collection.children.all()
organisms = lb.Organism.filter().all()

for dataset_meta in res_content:
    # Only datasets that have a registered file in this release are annotated.
    file = files.filter(key__contains=dataset_meta["dataset_id"]).one_or_none()
    if file is None:
        continue
    # register collection
    collection = collections.filter(
        reference=dataset_meta["collection_id"]
    ).one_or_none()
    if collection is None:
        collection = ln.ULabel(
            name=dataset_meta["collection_name"],
            description=dataset_meta["collection_doi"],
            reference=dataset_meta["collection_id"],
            reference_type="collection_id",
        )
        collection.save()
        collection.parents.add(is_collection)
    file.labels.add(collection, feature=features.collection)

    # annotate with organism
    organism_ontology_ids = [i["ontology_term_id"] for i in dataset_meta["organism"]]
    organism_records = organisms.filter(ontology_id__in=organism_ontology_ids).list()
    # register new organisms: every id that is missing from the registry, not only
    # when *no* organism matched (a dataset may mix known and unknown organisms)
    known_ids = {record.ontology_id for record in organism_records}
    missing_ids = [i for i in organism_ontology_ids if i not in known_ids]
    if len(missing_ids) > 0:
        bionty_source = lb.BiontySource.filter(entity="Organism", organism="all").one()
        for i in missing_ids:
            record = lb.Organism.from_bionty(ontology_id=i, bionty_source=bionty_source)
            record.save(parents=False)
            print(f"registered organism: {record}")
        # query again so the newly saved organisms are included
        organism_records = organisms.filter(
            ontology_id__in=organism_ontology_ids
        ).list()
    file.labels.add(organism_records, feature=features.organism)

❗ loading non-default source inside a LaminDB instance
registered organism: Organism(uid='sMq3BQz0', name='rhesus macaque', ontology_id='NCBITaxon:9544', scientific_name='Macaca mulatta', updated_at=2023-11-21 12:30:09 UTC, bionty_source_id=50, created_by_id=1)
❗ loading non-default source inside a LaminDB instance
registered organism: Organism(uid='aYpsHRNc', name='white-tufted-ear marmoset', ontology_id='NCBITaxon:9483', scientific_name='Callithrix jacchus', updated_at=2023-11-21 12:30:18 UTC, bionty_source_id=50, created_by_id=1)
❗ loading non-default source inside a LaminDB instance
registered organism: Organism(uid='Ihbc7tQx', name='chimpanzee', ontology_id='NCBITaxon:9598', scientific_name='Pan troglodytes', updated_at=2023-11-21 12:30:26 UTC, bionty_source_id=50, created_by_id=1)
❗ loading non-default source inside a LaminDB instance
registered organism: Organism(uid='Z0wGZNxo', name='domestic pig', ontology_id='NCBITaxon:9825', scientific_name='Sus scrofa domesticus', updated_a

### obs ontologies

In [9]:
# Names of the obs-level metadata fields of the API response that get annotated.
feature_names = [
    "self_reported_ethnicity",
    "development_stage",
    "cell_type",
    "assay",
    "tissue",
    "disease",
    "sex",
    "donor_id",
    "suspension_type",
]

# NOTE(review): private lamindb helper; may change between lamindb versions.
from lamindb.dev._feature_manager import get_accessor_by_orm

# Mapping looked up below with `feature.registries` to get the name of the
# related-records accessor on ln.File.
ACCESSORS = get_accessor_by_orm(ln.File)
# feature name -> (accessor name on ln.File, registry class holding the labels)
FEATURE_TO_ACCESSOR = {}
for name in feature_names:
    feature = getattr(features, name)
    accessor = ACCESSORS.get(feature.registries)
    orm = getattr(ln.File, accessor).field.model
    # TODO: ulabels are defined in the File model, improve this in LaminDB
    # When the relation field lives on File itself, the label registry is the
    # field's related model rather than its model.
    if orm == ln.File:
        orm = getattr(ln.File, accessor).field.related_model
    FEATURE_TO_ACCESSOR[name] = (accessor, orm)

In [37]:
# One feature set containing all obs-level features listed in `feature_names`.
obs_featureset = ln.FeatureSet(features=[getattr(features, i) for i in feature_names])
obs_featureset.save()

# Link the feature set to every file of the release under the "obs" slot.
obs_featureset.files.set(files, through_defaults={"slot": "obs"})

In [10]:
# extra step to register uberon ontologies as developmental stages
def create_dv_record_from_uberon(ontology_id: str):
    """Register an UBERON term as a DevelopmentalStage record.

    The term is looked up in the Tissue ontology and its name, description,
    ontology id and bionty source are copied into a new DevelopmentalStage
    record, which is then saved.

    Args:
        ontology_id: UBERON ontology id, e.g. "UBERON:0000113".
    """
    tissue_record = lb.Tissue.from_bionty(ontology_id=ontology_id)
    dvs_record = lb.DevelopmentalStage(
        name=tissue_record.name,
        description=tissue_record.description,
        ontology_id=tissue_record.ontology_id,
        bionty_source_id=tissue_record.bionty_source_id,
    )
    dvs_record.save()


# NOTE(review): the original list contained "UBERON:0007222" twice, which
# registered that term twice; confirm that no other term was intended.
for uberon_id in [
    "UBERON:0018241",
    "UBERON:0034919",
    "UBERON:0007222",
    "UBERON:0000113",
    "UBERON:0007220",
]:
    create_dv_record_from_uberon(uberon_id)

In [10]:
# feature name -> set of (label, ontology_term_id) pairs seen across all datasets;
# donors and suspension types carry no ontology ids and are handled separately
ontology_ids = {
    name: {
        (term["label"], term["ontology_term_id"])
        for dataset_meta in res_content
        if name in dataset_meta
        for term in dataset_meta[name]
    }
    for name in feature_names
    if name not in ("donor_id", "suspension_type")
}

# register all ontology ids
for name, terms in ontology_ids.items():
    accessor, orm = FEATURE_TO_ACCESSOR.get(name)
    terms_ids = [term_id for _label, term_id in terms]
    records = orm.from_values(terms_ids, field="ontology_id")
    if len(records) > 0:
        ln.save(records)

❗ now recursing through parents: this only happens once, but is much slower than bulk saving
❗ [1;91mdid not create[0m DevelopmentalStage records for [1;93m19 non-validated[0m [3montology_ids[0m: [1;93m'MmusDv:0000057', 'MmusDv:0000058', 'MmusDv:0000059', 'MmusDv:0000065', 'MmusDv:0000066', 'MmusDv:0000067', 'MmusDv:0000068', 'MmusDv:0000070', 'MmusDv:0000071', 'MmusDv:0000072', 'MmusDv:0000073', 'MmusDv:0000074', 'MmusDv:0000079', 'MmusDv:0000098', 'MmusDv:0000099', 'MmusDv:0000102', 'UBERON:0000113', 'UBERON:0007220', 'UBERON:0007222'[0m
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
❗ [1;91mdid not create[0m Tissue records for [1;93m18 non-validated[0m [3montology_ids[0m: [1;93m'CL:0000010 (cell culture)', 'CL:0000082 (cell culture)', 'CL:0000084 (cell culture)', 'CL:0000115 (cell culture)', 'CL:0000351 (cell culture)', 'CL:0002322

In [16]:
# register the non-validated terms
bionty_source_ds_mouse = lb.BiontySource.filter(
    entity="DevelopmentalStage", organism="mouse"
).one()

# Start from empty lists so that the saves below do not fail with a NameError
# when every term already validates (e.g. when this cell is re-run).
dv_records = []
ts_records = []

for name, terms in ontology_ids.items():
    accessor, orm = FEATURE_TO_ACCESSOR.get(name)
    terms_ids = [i[1] for i in terms]
    result = orm.inspect(terms_ids, field="ontology_id")
    if len(result.non_validated) > 0:
        if name == "development_stage":
            # non-validated developmental stages are created from the mouse source
            dv_records = orm.from_values(
                result.non_validated,
                field="ontology_id",
                bionty_source=bionty_source_ds_mouse,
            )
        elif name == "tissue":
            # non-validated tissue terms are registered with the label reported
            # by the API as their name
            ts_records = [
                orm(name=term[0], ontology_id=term[1])
                for term in terms
                if term[1] in result.non_validated
            ]

if len(dv_records) > 0:
    ln.save(dv_records)
if len(ts_records) > 0:
    ln.save(ts_records)

❗ [1;93m16 terms[0m (7.00%) are not validated for [3montology_id[0m: [1;93mMmusDv:0000066, MmusDv:0000057, MmusDv:0000072, MmusDv:0000067, MmusDv:0000099, MmusDv:0000070, MmusDv:0000065, MmusDv:0000071, MmusDv:0000058, MmusDv:0000102, MmusDv:0000098, MmusDv:0000068, MmusDv:0000079, MmusDv:0000059, MmusDv:0000073, MmusDv:0000074[0m
   couldn't validate [1;91m16 terms[0m: [1;91m'MmusDv:0000059', 'MmusDv:0000068', 'MmusDv:0000058', 'MmusDv:0000066', 'MmusDv:0000057', 'MmusDv:0000102', 'MmusDv:0000098', 'MmusDv:0000071', 'MmusDv:0000074', 'MmusDv:0000065', 'MmusDv:0000067', 'MmusDv:0000073', 'MmusDv:0000070', 'MmusDv:0000072', 'MmusDv:0000079', 'MmusDv:0000099'[0m
→  if you are sure, create new records via [3mln.DevelopmentalStage()[0m and save to your registry
❗ [1;93m18 terms[0m (4.80%) are not validated for [3montology_id[0m: [1;93mUBERON:0001295 (organoid), UBERON:0002370 (organoid), CL:0002327 (cell culture), CL:0000115 (cell culture), CL:0000082 (cell culture), CL:00

### donors and suspension types

In [31]:
# Collect the distinct donor ids and suspension types over all datasets;
# a dataset that lacks one of the keys contributes nothing for it.
donor_ids = set()
suspension_types = set()

for i in res_content:
    donor_ids.update(i.get("donor_id", ()))
    suspension_types.update(i.get("suspension_type", ()))

In [18]:
# Parent label of all donor labels and the donors already registered under it.
is_donor = ln.ULabel.filter(name="is_donor").one()
donors = is_donor.children.all()
# Create and save a label for every donor id that is not registered yet,
# then attach the new labels to the parent.
result = donors.inspect(donor_ids)
new_donors = []
for donor_name in result.non_validated:
    new_donors.append(ln.ULabel(name=donor_name))
ln.save(new_donors)
is_donor.children.add(*new_donors)

❗ [1;93m1216 terms[0m (16.60%) are not validated for [3mname[0m: [1;93mmouse_YUYDJ, COVID19_Participant12, COVID19_Participant3, 328806, 364368, Human-BRCA1-B, RL2105, COVID19_Participant6, 1864, mouse_LVGJZ, Control_Participant14, 424707, 291740, 377001, mouse_VBJMY, G19.32.002, mouse_EUEJJ, mouse_RZFCS, mouse003, HTA4_11, ...[0m
   couldn't validate [1;91m1216 terms[0m: [1;91m'Control_Participant16', 'C19.32.004', 'mouse_YUYDJ', '14104', 'Williams_GSE164241_GSM5177040', '5976', 'H18.29.134', 'COVID19_Participant12', 'mouse_VTQGY', 'ac45', 'HTA11_99999974143_84620', '779', 'AH2', 'Kydar05', 'GW20-12-7-18', 'Human-WT-C', 'Q19.26.003', 'SPECTRUM-OV-031', '380739', '514', ...[0m
→  if you are sure, create new records via [3mln.ULabel()[0m and save to your registry


In [36]:
# Parent label of all suspension-type labels and the types already registered under it.
is_suspension_type = ln.ULabel.filter(name="is_suspension_type").one()
stypes = is_suspension_type.children.all()
# Create and save a label for every suspension type that is not registered yet,
# then attach the new labels to the parent.
result = stypes.inspect(suspension_types)
new_stypes = []
for stype_name in result.non_validated:
    new_stypes.append(ln.ULabel(name=stype_name))
ln.save(new_stypes)
is_suspension_type.children.add(*new_stypes)

❗ [1;93m1 term[0m (33.30%) is not validated for [3mname[0m: [1;93mna[0m
   couldn't validate [1;91m1 term[0m: [1;91m'na'[0m
→  if you are sure, create new record via [3mln.ULabel()[0m and save to your registry
❗ records with similar names exist! did you mean to load one of them?


Unnamed: 0_level_0,uid,score
name,Unnamed: 1_level_1,Unnamed: 2_level_1
,etXMfuAG,90.0
mouse_NASMM,H1NNCpCh,90.0


## Annotate files

In [45]:
# Link every file to the label records named in its dataset metadata.
for idx, dataset_meta in enumerate(res_content):
    if idx % 50 == 0:
        print(f"annotating dataset {idx} of {len(res_content)}")
    file = files.filter(key__contains=dataset_meta["dataset_id"]).one_or_none()
    if file is None:
        continue
    for field, terms in dataset_meta.items():
        # only metadata fields that map to a registry are annotated
        if field in FEATURE_TO_ACCESSOR:
            accessor, orm = FEATURE_TO_ACCESSOR.get(field)
            if field in ["donor_id", "suspension_type"]:
                # plain strings, matched by label name
                records = orm.from_values(terms, field="name")
            else:
                # ontology terms, matched by ontology id
                term_ids = [i["ontology_term_id"] for i in terms]
                records = orm.from_values(term_ids, field="ontology_id")
            if len(records) > 0:
                getattr(file, accessor).add(*records)

annotating dataset 0 of 1130
annotating dataset 50 of 1130
annotating dataset 100 of 1130
annotating dataset 150 of 1130
annotating dataset 200 of 1130
annotating dataset 250 of 1130
annotating dataset 300 of 1130
annotating dataset 350 of 1130
annotating dataset 400 of 1130
annotating dataset 450 of 1130
annotating dataset 500 of 1130
annotating dataset 550 of 1130
annotating dataset 600 of 1130
annotating dataset 650 of 1130
annotating dataset 700 of 1130
annotating dataset 750 of 1130
annotating dataset 800 of 1130
annotating dataset 850 of 1130
annotating dataset 900 of 1130
annotating dataset 950 of 1130
annotating dataset 1000 of 1130
annotating dataset 1050 of 1130
annotating dataset 1100 of 1130


In [46]:
files.last().describe()

[1;92mFile[0m(uid='aEKsguSJF6kAoeXcICKt', key='cell-census/2023-11-13/h5ads/ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded.h5ad', suffix='.h5ad', accessor='AnnData', size=339098252, hash='wk4aVyHI7iZWNq2n99_s4w-41', hash_type='md5-n', visibility=0, key_is_virtual=False, updated_at=2023-11-20 21:45:35 UTC)

[1;92mProvenance[0m:
  🗃️ storage: Storage(uid='oIYGbD74', root='s3://cellxgene-data-public', type='s3', region='us-west-2', updated_at=2023-10-16 15:04:08 UTC, created_by_id=1)
  📔 transform: Transform(uid='4hbVFAINneXCz8', name='Register files from Census release 2023-11-13', short_name='census-release-2023-11', version='0', type='notebook', updated_at=2023-11-20 21:25:20 UTC, created_by_id=1)
  👣 run: Run(uid='6ZcVntFiofyv226ywRDL', run_at=2023-11-21 19:43:44 UTC, transform_id=18, created_by_id=1)
  👤 created_by: User(uid='kmvZDIX9', handle='sunnyosun', name='Sunny Sun', updated_at=2023-11-21 09:23:50 UTC)
[1;92mFeatures[0m:
  [1mexternal[0m: FeatureSet(uid='qgAXuV6LwRitejVy0ZY9', n

## Register genes

In [23]:
# NOTE(review): presumably disabled so that opening each file with `.backed()`
# is not recorded as an input of this run; confirm.
ln.settings.track_run_inputs = False
ln.settings.verbosity = "hint"

# For every file, build (or load) the feature set of its genes and link it
# to the file under the "var" slot.
for idx, file in enumerate(files):
    if idx % 50 == 0:
        print(f"annotating file {idx} of {len(files)}")
    # open the h5ad in backed mode to read the var names
    adata_backed = file.backed()
    genes = adata_backed.var_names
    # first organism annotated on the file (set in the organisms section)
    organism = file.organism.first()
    featureset = ln.FeatureSet.from_values(
        genes, field=lb.Gene.ensembl_gene_id, organism=organism
    )
    # from_values returns None when no gene id validates (see the
    # "no validated features, skip creating feature set" log); skip those files
    if featureset is None:
        continue
    # only save feature sets that are new, i.e. not loaded from the registry
    if featureset._state.adding:
        featureset.save()
    # not sure why some feature sets are not linked with any genes
    if featureset.genes.count() == 0:
        records = lb.Gene.from_values(
            genes, field=lb.Gene.ensembl_gene_id, organism=organism
        )
        featureset.genes.set(records)
    file.feature_sets.add(featureset, through_defaults={"slot": "var"})

❗ [1;93m19149 terms[0m (100.00%) are not validated for [3mensembl_gene_id[0m: [1;93mENSG00000162063, ENSG00000162062, ENSG00000198951, ENSG00000163121, ENSG00000105829, ENSG00000081237, ENSG00000134153, ENSG00000134152, ENSG00000173728, ENSG00000035687, ENSG00000166961, ENSG00000172689, ENSG00000110104, ENSG00000179397, ENSG00000149506, ENSG00000125384, ENSG00000168229, ENSG00000183579, ENSG00000166548, ENSG00000181234, ...[0m
❗ no validated features, skip creating feature set
✅ [1;92m22266 terms[0m (100.00%) are validated for [3mensembl_gene_id[0m
✅ [1;92m33234 terms[0m (100.00%) are validated for [3mensembl_gene_id[0m
✅ loaded: FeatureSet(uid='scbiqEmIlbbqMbh8rVkX', n=33234, type='number', registry='bionty.Gene', hash='gR384waDKLteDyGkj59I', updated_at=2023-11-21 21:51:08 UTC, created_by_id=1)
✅ [1;92m33234 terms[0m (100.00%) are validated for [3mensembl_gene_id[0m
✅ loaded: FeatureSet(uid='scbiqEmIlbbqMbh8rVkX', n=33234, type='number', registry='bionty.Gene', hash=

### Datasets with human or mouse genes but annotated as other organisms

These files don't have a 'var' featureset:

In [38]:
from django.db.models import Count

# Files linked to exactly two feature sets.
# NOTE(review): presumably these are the "external" and "obs" sets, so a count
# of 2 means the "var" set is missing; confirm.
novar_files = files.annotate(c=Count("feature_sets")).filter(c=2).all()
len(novar_files)

43

In [46]:
# Files without a "var" feature set: try to resolve their var names as human
# genes first and as mouse genes otherwise.
for idx, file in enumerate(novar_files):
    if idx % 5 == 0:
        print(f"annotating file {idx} of {len(novar_files)}")
    adata_backed = file.backed()
    genes = lb.Gene.from_values(
        adata_backed.var_names, field=lb.Gene.ensembl_gene_id, organism="human"
    )
    if len(genes) == 0:
        genes = lb.Gene.from_values(
            adata_backed.var_names, field=lb.Gene.ensembl_gene_id, organism="mouse"
        )
    # only link a feature set when every var name resolved to a gene record
    if len(genes) == len(adata_backed.var_names):
        # persist gene records that were newly created, as done for the other
        # gene-registration cells of this notebook
        ln.save(genes)
        feature_set = ln.FeatureSet(genes, type="number")
        # save only if the feature set is new, i.e. not loaded from the registry
        if feature_set._state.adding:
            feature_set.save()
        # link the feature set built for THIS file; the original linked the
        # stale `featureset` variable left over from the previous loop
        file.feature_sets.add(feature_set, through_defaults={"slot": "var"})

annotating file 0 of 43
annotating file 5 of 43
annotating file 10 of 43
annotating file 15 of 43
annotating file 20 of 43
annotating file 25 of 43
annotating file 30 of 43
annotating file 35 of 43
annotating file 40 of 43


### Datasets with unregistered genes

In [62]:
# Replace the incomplete feature set of size `n` by one that contains all genes
# of the file, registering the missing mouse genes first.
n = 52126
# keep a handle on the old feature set so exactly this record is deleted below
old_feature_set = ln.FeatureSet.filter(n=n).one()
file = old_feature_set.files.all().one()
adata_backed = file.backed()
genes = lb.Gene.from_values(
    adata_backed.var_names, field=lb.Gene.ensembl_gene_id, organism="mouse"
)
ln.save(genes)

feature_set = ln.FeatureSet(genes, type="number")
# save only if the feature set is new, i.e. not loaded from the registry
if feature_set._state.adding:
    feature_set.save()
# link the new feature set; the original linked the stale `featureset` variable
file.feature_sets.add(feature_set, through_defaults={"slot": "var"})

old_feature_set.delete()

✅ loaded [1;92m52126 Gene records[0m matching [3mensembl_gene_id[0m: [1;92m'ENSMUSG00000050732', 'ENSMUSG00000101535', 'ENSMUSG00000101517', 'ENSMUSG00000103873', 'ENSMUSG00000099414', 'ENSMUSG00000101362', 'ENSMUSG00000065433', 'ENSMUSG00000117183', 'ENSMUSG00000077799', 'ENSMUSG00000092514', 'ENSMUSG00000116818', 'ENSMUSG00000021548', 'ENSMUSG00000074037', 'ENSMUSG00000079505', 'ENSMUSG00000064653', 'ENSMUSG00000001472', 'ENSMUSG00000111435', 'ENSMUSG00000117082', 'ENSMUSG00000104205', 'ENSMUSG00000079602', ...[0m
✅ created [1;95m43 Gene records from Bionty[0m matching [3mensembl_gene_id[0m: [1;95m'ENSMUSG00000061852', 'ENSMUSG00000064400', 'ENSMUSG00000064742', 'ENSMUSG00000080465', 'ENSMUSG00000080542', 'ENSMUSG00000081849', 'ENSMUSG00000083237', 'ENSMUSG00000086513', 'ENSMUSG00000087239', 'ENSMUSG00000090186', 'ENSMUSG00000094162', 'ENSMUSG00000098137', 'ENSMUSG00000098212', 'ENSMUSG00000101890', 'ENSMUSG00000103759', 'ENSMUSG00000103863', 'ENSMUSG00000105705', 'ENSMUSG

In [90]:
# Replace the incomplete feature set of size `n` by one that contains all genes
# of the file, registering the missing mouse genes first.
n = 52127
# keep a handle on the old feature set so exactly this record is deleted below
old_feature_set = ln.FeatureSet.filter(n=n).one()
file = old_feature_set.files.all().one()
adata_backed = file.backed()
genes = lb.Gene.from_values(
    adata_backed.var_names, field=lb.Gene.ensembl_gene_id, organism="mouse"
)
ln.save(genes)
feature_set = ln.FeatureSet(genes, type="number")
# save only if the feature set is new, i.e. not loaded from the registry
if feature_set._state.adding:
    feature_set.save()
# link the new feature set; the original linked the stale `featureset` variable
file.feature_sets.add(feature_set, through_defaults={"slot": "var"})

old_feature_set.delete()

In [101]:
# Replace the incomplete feature set of size `n` for every file linked to it,
# registering the missing mouse genes first.
n = 39091
# keep a handle on the old feature set so exactly this record is deleted below
old_feature_set = ln.FeatureSet.filter(n=n).one()
for file in old_feature_set.files.all():
    adata_backed = file.backed()
    genes = lb.Gene.from_values(
        adata_backed.var_names, field=lb.Gene.ensembl_gene_id, organism="mouse"
    )
    ln.save(genes)
    feature_set = ln.FeatureSet(genes, type="number")
    # save only if the feature set is new, i.e. not loaded from the registry
    if feature_set._state.adding:
        feature_set.save()
    # link the new feature set; the original linked the stale `featureset` variable
    file.feature_sets.add(feature_set, through_defaults={"slot": "var"})

old_feature_set.delete()

(39094,
 {'lnschema_core.FileFeatureSet': 2,
  'lnschema_bionty.Gene_feature_sets': 39091,
  'lnschema_core.FeatureSet': 1})

In [105]:
# Replace the incomplete feature set of size `n` by one that contains all genes
# of the file, registering the missing mouse genes first.
n = 35412
# keep a handle on the old feature set so exactly this record is deleted below
old_feature_set = ln.FeatureSet.filter(n=n).one()
file = old_feature_set.files.all().one()
adata_backed = file.backed()
genes = lb.Gene.from_values(
    adata_backed.var_names, field=lb.Gene.ensembl_gene_id, organism="mouse"
)
ln.save(genes)
feature_set = ln.FeatureSet(genes, type="number")
# save only if the feature set is new, i.e. not loaded from the registry
if feature_set._state.adding:
    feature_set.save()
# link the new feature set; the original linked the stale `featureset` variable
file.feature_sets.add(feature_set, through_defaults={"slot": "var"})

old_feature_set.delete()

✅ loaded [1;92m35419 Gene records[0m matching [3mensembl_gene_id[0m: [1;92m'ENSMUSG00000109644', 'ENSMUSG00000108652', 'ENSMUSG00000021252', 'ENSMUSG00000007777', 'ENSMUSG00000086714', 'ENSMUSG00000043644', 'ENSMUSG00000024442', 'ENSMUSG00000078886', 'ENSMUSG00000042208', 'ENSMUSG00000020831', 'ENSMUSG00000025731', 'ENSMUSG00000107002', 'ENSMUSG00000046683', 'ENSMUSG00000058706', 'ENSMUSG00000099146', 'ENSMUSG00000028608', 'ENSMUSG00000097882', 'ENSMUSG00000058812', 'ENSMUSG00000089889', 'ENSMUSG00000087341', ...[0m
✅ created [1;95m18 Gene records from Bionty[0m matching [3mensembl_gene_id[0m: [1;95m'ENSMUSG00002074853', 'ENSMUSG00002074875', 'ENSMUSG00002074955', 'ENSMUSG00002075146', 'ENSMUSG00002075525', 'ENSMUSG00002075746', 'ENSMUSG00002075931', 'ENSMUSG00002075991', 'ENSMUSG00002076083', 'ENSMUSG00002076155', 'ENSMUSG00002076161', 'ENSMUSG00002076250', 'ENSMUSG00002076288', 'ENSMUSG00002076601', 'ENSMUSG00002076650', 'ENSMUSG00002076766', 'ENSMUSG00002076818', 'ENSMUSG

(35414,
 {'lnschema_core.FileFeatureSet': 1,
  'lnschema_bionty.Gene_feature_sets': 35412,
  'lnschema_core.FeatureSet': 1})

### Register ERCC genes

Register the organism:

In [3]:
# Source covering all organisms; the same lookup is used when registering the
# other non-default organisms in this notebook. The original cell relied on an
# `ncbitaxon_bs` variable that is not defined anywhere in the notebook.
ncbitaxon_bs = lb.BiontySource.filter(entity="Organism", organism="all").one()
# ERCC spike-ins are registered under the "synthetic construct" organism.
organism_ercc = lb.Organism.from_bionty(
    ontology_id="NCBITaxon:32630",
    bionty_source=ncbitaxon_bs,
)
organism_ercc.save(parents=False)
organism_ercc

Organism(uid='pwt0w3kH', name='synthetic construct', ontology_id='NCBITaxon:32630', scientific_name='synthetic construct', updated_at=2023-11-22 14:44:47 UTC, bionty_source_id=50, created_by_id=1)

Get the gene table from cellxgene:

In [7]:
import pandas as pd

# ERCC gene table from the cellxgene schema repository. With header=None the
# columns are labelled by integer position (0..3), which is how they are
# accessed when the gene records are built.
df_ercc = pd.read_csv(
    "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_ercc.csv.gz",
    header=None,
)

In [8]:
df_ercc

Unnamed: 0,0,1,2,3
0,ERCC-00002,ERCC-00002 (spike-in control),1,1061
1,ERCC-00003,ERCC-00003 (spike-in control),1,1023
2,ERCC-00004,ERCC-00004 (spike-in control),1,523
3,ERCC-00009,ERCC-00009 (spike-in control),1,984
4,ERCC-00012,ERCC-00012 (spike-in control),1,994
...,...,...,...,...
87,ERCC-00164,ERCC-00164 (spike-in control),1,1022
88,ERCC-00165,ERCC-00165 (spike-in control),1,872
89,ERCC-00168,ERCC-00168 (spike-in control),1,1024
90,ERCC-00170,ERCC-00170 (spike-in control),1,1023


In [10]:
# One Gene record per ERCC spike-in: column 0 is used both as symbol and as
# stable id, column 1 as description.
ercc_genes = [
    lb.Gene(
        symbol=ercc_id,
        stable_id=ercc_id,
        description=ercc_description,
        organism=organism_ercc,
    )
    for ercc_id, ercc_description in zip(df_ercc[0], df_ercc[1])
]

In [12]:
ln.save(ercc_genes)

### Datasets with multi-organism genes

Five files contain ERCC genes. The loop below finds them by checking each file's `var_names` for the `ERCC` prefix:

In [106]:
# Print every file that has at least one var name starting with "ERCC".
for file in files.all():
    has_ercc = file.backed().var_names.str.startswith("ERCC").any()
    if has_ercc:
        print(file)

File(uid='KuIP3MrldEjc7dPantpo', key='cell-census/2023-11-13/h5ads/341a0702-9d26-4d8a-b047-ab475f3b492e.h5ad', suffix='.h5ad', accessor='AnnData', size=23851451, hash='BVEUkS6Wd19ZHZgoSu0wgQ-3', hash_type='md5-n', visibility=0, key_is_virtual=False, updated_at=2023-11-20 21:45:24 UTC, storage_id=2, transform_id=18, run_id=18, created_by_id=1)
File(uid='MeByXegdYg3sdk9vueHh', key='cell-census/2023-11-13/h5ads/66d15835-5dc8-4e96-b0eb-f48971cb65e8.h5ad', suffix='.h5ad', accessor='AnnData', size=29517127, hash='WF8sHBv3hCki9bAf_f3uPw-4', hash_type='md5-n', visibility=0, key_is_virtual=False, updated_at=2023-11-20 21:45:27 UTC, storage_id=2, transform_id=18, run_id=18, created_by_id=1)
File(uid='m0OxWMCH24t2qmZHjUu8', key='cell-census/2023-11-13/h5ads/8f98c236-43f0-4dc4-985b-c304499f7b44.h5ad', suffix='.h5ad', accessor='AnnData', size=23760963, hash='8N-PX25XG5w0AAdMC2SzyQ-3', hash_type='md5-n', visibility=0, key_is_virtual=False, updated_at=2023-11-20 21:45:29 UTC, storage_id=2, transform_

In [108]:
# The registered ERCC genes do not change inside the loop: query them once.
genes_ercc = lb.Gene.filter(organism=organism_ercc).all()

# For each file with ERCC spike-ins, link a feature set of the ERCC genes found
# in its var names under a dedicated "var-ercc" slot.
for uid in [
    "KuIP3MrldEjc7dPantpo",
    "MeByXegdYg3sdk9vueHh",
    "m0OxWMCH24t2qmZHjUu8",
    "rmrJahYS2l4lUuHyK1hh",
    "5e0KFuZGuR4YMMOePdOT",
]:
    file = files.get(uid=uid)
    adata_backed = file.backed()
    # keep only the ERCC genes that are present in this file
    genes = [i for i in genes_ercc if i.symbol in adata_backed.var_names]
    feature_set_ercc = ln.FeatureSet(genes, type="number")
    feature_set_ercc.save()
    file.feature_sets.add(feature_set_ercc, through_defaults={"slot": "var-ercc"})

In [148]:
# Sanity check: list the files that are now linked to four feature sets.
files.annotate(c=Count("feature_sets")).filter(c=4).df()

Unnamed: 0_level_0,uid,storage_id,key,suffix,accessor,description,version,size,hash,hash_type,transform_id,run_id,initial_version_id,visibility,key_is_virtual,updated_at,created_by_id,c
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2243,KuIP3MrldEjc7dPantpo,2,cell-census/2023-11-13/h5ads/341a0702-9d26-4d8...,.h5ad,AnnData,,,23851451,BVEUkS6Wd19ZHZgoSu0wgQ-3,md5-n,18,18,,0,False,2023-11-20 21:45:24.572279+00:00,1,4
2775,rmrJahYS2l4lUuHyK1hh,2,cell-census/2023-11-13/h5ads/b07e5164-baf6-43d...,.h5ad,AnnData,,,41494181,2ghddwTagONe1cfHAjqXLg-5,md5-n,18,18,,0,False,2023-11-20 21:45:30.991571+00:00,1,4
2470,MeByXegdYg3sdk9vueHh,2,cell-census/2023-11-13/h5ads/66d15835-5dc8-4e9...,.h5ad,AnnData,,,29517127,WF8sHBv3hCki9bAf_f3uPw-4,md5-n,18,18,,0,False,2023-11-20 21:45:27.285195+00:00,1,4
2625,m0OxWMCH24t2qmZHjUu8,2,cell-census/2023-11-13/h5ads/8f98c236-43f0-4dc...,.h5ad,AnnData,,,23760963,8N-PX25XG5w0AAdMC2SzyQ-3,md5-n,18,18,,0,False,2023-11-20 21:45:29.147256+00:00,1,4
2832,5e0KFuZGuR4YMMOePdOT,2,cell-census/2023-11-13/h5ads/bf6a5c78-5a2e-4e3...,.h5ad,AnnData,,,41782864,rKD1K5dtkGcShQFsR1W6ug-5,md5-n,18,18,,0,False,2023-11-20 21:45:31.653346+00:00,1,4
