![](https://img.shields.io/badge/4/4-lightgrey)

# Link cellxgene-census files to registered metadata

In [1]:
import lamindb as ln
import lnschema_bionty as lb
import cellxgene_census

💡 lamindb instance: laminlabs/cellxgene-census


In [2]:
ln.context.track()

💡 notebook imports: cellxgene-census==1.3.0 lamindb==0.55.0 lnschema_bionty==0.31.1
💡 Transform(id='xE0COcgUvwwtz8', name='Link cellxgene-census files to registered metadata', short_name='files-meta', version='0', type=notebook, updated_at=2023-10-05 14:51:36, created_by_id='kmvZDIX9')
💡 Run(id='8a3C3UexMSvG4UcVeXLD', run_at=2023-10-05 14:51:36, transform_id='xE0COcgUvwwtz8', created_by_id='kmvZDIX9')


In [3]:
census = cellxgene_census.open_soma(census_version="2023-07-25")

The "stable" release is currently 2023-07-25. Specify 'census_version="2023-07-25"' in future calls to open_soma() to ensure data consistency.
2023-10-05 16:53:04,682:INFO - The "stable" release is currently 2023-07-25. Specify 'census_version="2023-07-25"' in future calls to open_soma() to ensure data consistency.


In [4]:
datasets_df = census["census_info"]["datasets"].read().concat().to_pandas()

In [5]:
datasets_df

Unnamed: 0,soma_joinid,collection_id,collection_name,collection_doi,dataset_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
0,0,e2c257e7-6f79-487c-b81c-39451cd4ab3c,Spatial multiomics map of trophoblast developm...,10.1038/s41586-023-05869-0,f171db61-e57e-4535-a06a-35d8b6ef8f2b,donor_p13_trophoblasts,f171db61-e57e-4535-a06a-35d8b6ef8f2b.h5ad,31497
1,1,e2c257e7-6f79-487c-b81c-39451cd4ab3c,Spatial multiomics map of trophoblast developm...,10.1038/s41586-023-05869-0,ecf2e08e-2032-4a9e-b466-b65b395f4a02,All donors trophoblasts,ecf2e08e-2032-4a9e-b466-b65b395f4a02.h5ad,67070
2,2,e2c257e7-6f79-487c-b81c-39451cd4ab3c,Spatial multiomics map of trophoblast developm...,10.1038/s41586-023-05869-0,74cff64f-9da9-4b2a-9b3b-8a04a1598040,All donors all cell states (in vivo),74cff64f-9da9-4b2a-9b3b-8a04a1598040.h5ad,286326
3,3,f7cecffa-00b4-4560-a29a-8ad626b8ee08,Mapping single-cell transcriptomes in the intr...,10.1016/j.ccell.2022.11.001,5af90777-6760-4003-9dba-8f945fec6fdf,Single-cell transcriptomic datasets of Renal c...,5af90777-6760-4003-9dba-8f945fec6fdf.h5ad,270855
4,4,3f50314f-bdc9-40c6-8e4a-b0901ebfbe4c,Single-cell sequencing links multiregional imm...,10.1016/j.ccell.2021.03.007,bd65a70f-b274-4133-b9dd-0d1431b6af34,Single-cell sequencing links multiregional imm...,bd65a70f-b274-4133-b9dd-0d1431b6af34.h5ad,167283
...,...,...,...,...,...,...,...,...
588,588,180bff9c-c8a5-4539-b13b-ddbc00d643e6,Molecular characterization of selectively vuln...,10.1038/s41593-020-00764-7,f9ad5649-f372-43e1-a3a8-423383e5a8a2,Molecular characterization of selectively vuln...,f9ad5649-f372-43e1-a3a8-423383e5a8a2.h5ad,8168
589,589,a72afd53-ab92-4511-88da-252fb0e26b9a,Single-cell atlas of peripheral immune respons...,10.1038/s41591-020-0944-y,456e8b9b-f872-488b-871d-94534090a865,Single-cell atlas of peripheral immune respons...,456e8b9b-f872-488b-871d-94534090a865.h5ad,44721
590,590,38833785-fac5-48fd-944a-0f62a4c23ed1,Construction of a human cell landscape at sing...,10.1038/s41586-020-2157-4,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,Construction of a human cell landscape at sing...,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf.h5ad,598266
591,591,5d445965-6f1a-4b68-ba3a-b8f765155d3a,A molecular cell atlas of the human lung from ...,10.1038/s41586-020-2922-4,e04daea4-4412-45b5-989e-76a9be070a89,"Krasnow Lab Human Lung Cell Atlas, Smart-seq2",e04daea4-4412-45b5-989e-76a9be070a89.h5ad,9409


In [49]:
files = ln.File.filter()

In [None]:
presence_matrix_dict = {}
gene_metadata_dict = {}
no_data = []

In [97]:
for _, row in datasets_df.iloc[499:].iterrows():
    print(f"dataset: {row.dataset_id}")

    file = files.filter(description__contains=row.dataset_id).one()

    # get organism
    organism_record = files.first().organism.all().one()
    lb.settings.organism = organism_record.name
    organism = organism_record.scientific_name
    census_data = census["census_data"][organism]

    feature_sets = {}

    # obs feature set
    obs = (
        census_data.obs.read(value_filter=f"dataset_id == '{row.dataset_id}'")
        .concat()
        .to_pandas()
    )
    if obs.shape[0] == 0:
        no_data.append(row.dataset_id)
        continue
    feature_set_obs = ln.FeatureSet.from_df(
        obs.loc[:, ~obs.columns.str.endswith("_id")],
    )
    feature_sets["obs"] = feature_set_obs

    # var feature set
    if organism not in presence_matrix_dict:
        presence_matrix_dict[organism] = cellxgene_census.get_presence_matrix(
            census, organism=organism, measurement_name="RNA"
        )
    presence_matrix = presence_matrix_dict.get(organism)
    var_joinid = presence_matrix[row.soma_joinid, :].tocoo().col
    if organism not in gene_metadata_dict:
        gene_metadata_dict[organism] = (
            census_data.ms["RNA"].var.read().concat().to_pandas()
        )
    gene_metadata = gene_metadata_dict.get(organism)
    var = gene_metadata.loc[gene_metadata.soma_joinid.isin(var_joinid)]
    feature_set_var = ln.FeatureSet.from_values(
        var.feature_id,
        lb.Gene.ensembl_gene_id,
        type="number",
    )
    feature_sets["var"] = feature_set_var

    # link two feature sets to file
    file.feature_sets = feature_sets
    file.save()

    # add labels to file
    for feature in feature_set_obs.members:
        if feature.type == "category":
            file.labels.add(obs[feature.name], feature)

dataset: 1009f384-b12d-448e-ba9f-1b7d2ecfbb4e
dataset: ed852810-a003-4386-9846-1638362cee39
dataset: 9d584fcb-a28a-4b91-a886-ceb66a88ef81
dataset: 78fd69d2-75e4-4207-819a-563139f273c6
dataset: 84f1a631-910b-4fbb-9f76-d915a07316d2
dataset: f75f2ff4-2884-4c2d-b375-70de37a34507
dataset: d4e69e01-3ba2-4d6b-a15d-e7048f78f22e
dataset: 26ae14da-9e5f-4d18-abae-18a5a328feef
dataset: cfa3c355-ee77-4fc8-9a00-78e61d23024c
dataset: 30cd5311-6c09-46c9-94f1-71fe4b91813c
dataset: 21d3e683-80a4-4d9b-bc89-ebb2df513dde
dataset: 774c18c5-efa1-4dc5-9e5e-2c824bab2e34
dataset: 37b21763-7f0f-41ae-9001-60bad6e2841d
dataset: 98e5ea9f-16d6-47ec-a529-686e76515e39
dataset: 48b37086-25f7-4ecd-be66-f5bb378e3aea
dataset: 3f4fe86f-aced-4d10-b174-ee35b9f46b9d
dataset: c9096ac4-ea44-4cf9-82f4-af05cb83eb24
dataset: 170ce19f-7a2f-4926-a1cc-adcad99e7474
dataset: e80d4e1c-672f-496a-8f32-37eab34f727d
dataset: 1d29fd10-c8b3-4611-b0ac-3c578125adbf
dataset: c2878000-d3f0-4d30-9a8a-2139a13c72f8
dataset: e3b8c485-7811-407e-99ed-c

In [100]:
file.describe()

[1;92mFile[0m(id='0sbCRBKbqkEuSjhzfp42', key='cell-census/2023-07-25/h5ads/8c42cfd0-0b0a-46d5-910c-fc833d83c45e.h5ad', suffix='.h5ad', accessor='AnnData', description='Krasnow Lab Human Lung Cell Atlas, 10X|8c42cfd0-0b0a-46d5-910c-fc833d83c45e', size=588959280, hash='N0yW4Iksvgw93PzdE_4M0w-71', hash_type='md5-n', updated_at=2023-10-05 20:55:51)

[1;92mProvenance[0m:
  🗃️ storage: Storage(id='oIYGbD74', root='s3://cellxgene-data-public', type='s3', region='us-west-2', updated_at=2023-09-19 13:17:56, created_by_id='kmvZDIX9')
  📔 transform: Transform(id='nhGTqlIHEyn7z8', name='Register h5ad files of cellxgene-census', short_name='files', version='0', type='notebook', reference='https://github.com/laminlabs/cellxgene-census-lamin/blob/2553c2690909976efe380ca96d9e4d6b9a6c6749/docs/notebooks/datasets.ipynb', reference_type='github', updated_at=2023-10-05 14:04:28, created_by_id='kmvZDIX9')
  👣 run: Run(id='60jqKpxivkwkpEFZr8mp', run_at=2023-10-05 18:56:43, transform_id='nhGTqlIHEyn7z8',

## no data datasets

Querying the following datasets from census didn't result any data:

In [98]:
no_data

['d7291f04-fbbb-4d65-990a-f01fa44e915b',
 'be46dfdc-0f99-4731-8957-64ca37364985',
 'a13bda79-9134-46c9-9ed1-a2858be9aafe',
 '5695d556-974e-4d92-9e99-5f61b8695313',
 '12967895-3d58-4e93-be2c-4e1bcf4388d5',
 '49e4ffcc-5444-406d-bdee-577127404ba8',
 '3bbb6cf9-72b9-41be-b568-656de6eb18b5',
 '58b01044-c5e5-4b0f-8a2d-6ebf951e01ff',
 'a539c7af-fb65-44b1-8812-000b097eac99',
 '6347cc90-f284-41d8-a131-db4a37bd796f',
 'dbb4e1ed-d820-4e83-981f-88ef7eb55a35',
 'f71f046d-ea2d-4470-ac38-572cbb06ee12',
 'a14a995b-2f1d-40df-8f61-ab2754cac630',
 '8d8411b2-eb8c-4d8b-9cf1-c605de44d1e6',
 '899e2906-cdb1-4dcc-8dd8-3e3fead6c910',
 '79a2344d-eddd-45b1-b376-39eddfab1899',
 '76b604f3-bb29-4db7-be2b-6500dc5fbdb7',
 '6de332e1-465e-4243-9412-6fdc7497e99d',
 '07f14e26-ff0d-43c4-bfe3-bf1a94dc73c3',
 '42bb7f78-cef8-4b0d-9bba-50037d64d8c1',
 'cb5efdb0-f91c-4cbd-9ad4-9d4fa41c572d',
 '9bb9596d-f23f-4558-912f-d4dc7d52721b',
 '42ff5b55-b848-4f4c-b7cb-b8aac107841c',
 '047d57f2-4d14-45de-aa98-336c6f583750',
 'a810e511-c18b-

In [99]:
len(no_data)

82