# Register files from Census release 2023-11-13

In [1]:
import lamindb as ln
import lnschema_bionty as lb

ln.settings.verbosity = "hint"

2023-11-21 09:29:02,200:INFO - NumExpr defaulting to 2 threads.


💡 lamindb instance: theislab/nicheformer


In [2]:
census_version = "2023-11-13"

In [3]:
s3path = f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
ln.UPath(s3path).view_tree()

h5ads (0 sub-directories & 1122 files with suffixes '.h5ad'): 
├── 00099d5e-154f-4a7a-aa8d-fa30c8c0c43c.h5ad
├── 0041b9c3-6a49-4bf7-8514-9bc7190067a7.h5ad
├── 00476f9f-ebc1-4b72-b541-32f912ce36ea.h5ad
├── 00e5dedd-b9b7-43be-8c28-b0e5c6414a62.h5ad
├── 00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad
├── 01209dce-3575-4bed-b1df-129f57fbc031.h5ad
...


In [5]:
# on sagemaker
ln.track(notebook_path="/root/cellxgene-census-lamin/docs/notebooks/census-release-2023-11.ipynb")

💡 Assuming editor is Jupyter Lab.
💡 notebook imports: lamindb==0.62.0 lnschema_bionty==0.35.1 requests==2.31.0
💡 loaded: Transform(uid='4hbVFAINneXCz8', name='Register files from Census release 2023-11-13', short_name='census-release-2023-11', version='0', type='notebook', updated_at=2023-11-20 21:25:20 UTC, created_by_id=1)
💡 loaded: Run(uid='6ZcVntFiofyv226ywRDL', run_at=2023-11-21 09:29:07 UTC, transform_id=18, created_by_id=1)


## Register files

In [5]:
files = ln.File.from_dir(s3path)
ln.save(files)

✅ created 1122 files from directory using storage s3://cellxgene-data-public and key = cell-census/2023-11-13/h5ads/


In [6]:
dataset = ln.Dataset(files, name="cellxgene-census", version=census_version)
dataset.save()

💡 initializing versioning for this dataset! create future versions of it using ln.Dataset(..., is_new_version_of=old_dataset)


## Register metadata

Get all datasets and their associated metadata using the cellxgene curation REST API:

In [7]:
import requests

def get_datasets_df_from_cxg(timeout=60):
    """Fetch the metadata of all datasets from the cellxgene curation API.

    Sends a GET request to
    ``https://api.cellxgene.cziscience.com/curation/v1/datasets``.

    Args:
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so that a stalled connection cannot block the
            notebook indefinitely.

    Returns:
        The decoded JSON body of the response. Despite the ``_df`` in the
        function name, no DataFrame is built here.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    domain_name = "cellxgene.cziscience.com"
    api_url_base = f"https://api.{domain_name}"
    datasets_path = "/curation/v1/datasets"
    datasets_url = f"{api_url_base}{datasets_path}"
    headers = {"Content-Type": "application/json"}
    res = requests.get(url=datasets_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page.
    res.raise_for_status()
    return res.json()

In [8]:
res_content = get_datasets_df_from_cxg()

In [10]:
len(res_content)

1130

In [33]:
features = ln.Feature.lookup()
files = ln.File.filter(key__contains=census_version).all()

In [35]:
files[0]

File(uid='STPpBkpPyWE2Dw6hUNMn', key='cell-census/2023-11-13/h5ads/00099d5e-154f-4a7a-aa8d-fa30c8c0c43c.h5ad', suffix='.h5ad', accessor='AnnData', size=15445978, hash='-fz6gvPyTayFcsmM5cEqOw-2', hash_type='md5-n', visibility=0, key_is_virtual=False, updated_at=2023-11-20 21:45:21 UTC, storage_id=2, transform_id=18, run_id=18, created_by_id=1)

In [None]:
# TODO(review): unfinished cell — the original content was the incomplete
# statement `for dataset`, which is a SyntaxError. Complete the loop or delete
# this cell.

In [11]:
res_content[0].keys()

dict_keys(['assay', 'assets', 'cell_count', 'cell_type', 'collection_doi', 'collection_id', 'collection_name', 'collection_version_id', 'dataset_id', 'dataset_version_id', 'development_stage', 'disease', 'donor_id', 'explorer_url', 'is_primary_data', 'mean_genes_per_cell', 'organism', 'primary_cell_count', 'processing_status', 'published_at', 'revised_at', 'schema_version', 'self_reported_ethnicity', 'sex', 'suspension_type', 'tissue', 'title', 'tombstone', 'x_approximate_distribution'])

In [31]:
res_content[0]["self_reported_ethnicity"]

[{'label': 'European', 'ontology_term_id': 'HANCESTRO:0005'},
 {'label': 'Hispanic or Latin American', 'ontology_term_id': 'HANCESTRO:0014'}]

In [5]:
dataset = ln.Dataset.filter(version=census_version).one()

In [6]:
file = dataset.files.first()

In [7]:
adata_backed = file.backed()

In [8]:
features = ln.Feature.lookup()

In [9]:
adata_backed.uns["title"]

'Spatial transcriptomics in mouse: Puck_191206_04'

In [13]:
import requests

In [None]:
requests.get("")

In [6]:
lb.CellType.search("gamma delta T cell")

Unnamed: 0_level_0,uid,synonyms,score
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gamma-delta T cell,64kIG7So,gamma-delta T lymphocyte|gamma-delta T-lymphoc...,100.0
mature gamma-delta T cell,ZcvAPnLl,mature gamma-delta T-cell|mature gamma-delta T...,83.7
alpha-beta T cell,nkJI55X0,alpha-beta T-cell|alpha-beta T-lymphocyte|alph...,74.3
gamma-delta intraepithelial T cell,dMbVABW2,gamma-delta intraepithelial T-cell|gamma-delta...,69.2
immature T cell,7yftEdeR,immature T-cell,66.7
pancreatic D cell,RE9i1A9O,delta cell of pancreatic islet|D-cell of pancr...,66.7
vasa recta cell,wp4PbtJ0,,66.7
stem cell,p0pUUB75,animal stem cell,64.7
mature T cell,2C5PhwrW,mature T-cell|CD3e-positive T cell,64.5


In [7]:
lookup = lb.CellType.lookup()
lookup.hema

In [11]:
ln.ULabel.filter(name="Spatial transcriptomics in mouse: Puck_191206_04")

<QuerySet []>

In [10]:
adata_backed.obs.columns

Index(['assay_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'is_primary_data', 'organism_ontology_term_id', 'sample',
       'tissue_ontology_term_id', 'disease_state', 'sex_ontology_term_id',
       'genotype', 'development_stage_ontology_term_id', 'author_cell_type',
       'cell_type_ontology_term_id', 'disease_ontology_term_id', 'donor_id',
       'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex',
       'tissue', 'self_reported_ethnicity', 'development_stage'],
      dtype='object')

In [17]:
adata_backed.obs.suspension_type.cat.categories

Index(['na'], dtype='object')

In [12]:
adata_backed.obs

Unnamed: 0,assay_ontology_term_id,self_reported_ethnicity_ontology_term_id,is_primary_data,organism_ontology_term_id,sample,tissue_ontology_term_id,disease_state,sex_ontology_term_id,genotype,development_stage_ontology_term_id,...,donor_id,suspension_type,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage
AAAAAAGCGCTTCG,EFO:0030062,na,True,NCBITaxon:10090,BTBR-ob/ob-3g,UBERON:0002113,early diabetic kidney disease,PATO:0000384,BTBR-ob/ob,MmusDv:0000057,...,BTBR-ob/ob-3,na,kidney proximal convoluted tubule epithelial cell,Slide-seqV2,diabetic kidney disease,Mus musculus,male,kidney,na,13 weeks
AAAAACCCGCGGTG,EFO:0030062,na,True,NCBITaxon:10090,BTBR-ob/ob-3g,UBERON:0002113,early diabetic kidney disease,PATO:0000384,BTBR-ob/ob,MmusDv:0000057,...,BTBR-ob/ob-3,na,kidney proximal convoluted tubule epithelial cell,Slide-seqV2,diabetic kidney disease,Mus musculus,male,kidney,na,13 weeks
AAAAACGAATGCGA,EFO:0030062,na,True,NCBITaxon:10090,BTBR-ob/ob-3g,UBERON:0002113,early diabetic kidney disease,PATO:0000384,BTBR-ob/ob,MmusDv:0000057,...,BTBR-ob/ob-3,na,kidney proximal convoluted tubule epithelial cell,Slide-seqV2,diabetic kidney disease,Mus musculus,male,kidney,na,13 weeks
AAAAACGTGCCGCG,EFO:0030062,na,True,NCBITaxon:10090,BTBR-ob/ob-3g,UBERON:0002113,early diabetic kidney disease,PATO:0000384,BTBR-ob/ob,MmusDv:0000057,...,BTBR-ob/ob-3,na,kidney proximal convoluted tubule epithelial cell,Slide-seqV2,diabetic kidney disease,Mus musculus,male,kidney,na,13 weeks
AAAAACTTTGTGTG,EFO:0030062,na,True,NCBITaxon:10090,BTBR-ob/ob-3g,UBERON:0002113,early diabetic kidney disease,PATO:0000384,BTBR-ob/ob,MmusDv:0000057,...,BTBR-ob/ob-3,na,kidney proximal convoluted tubule epithelial cell,Slide-seqV2,diabetic kidney disease,Mus musculus,male,kidney,na,13 weeks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTCGGAACCCGAG,EFO:0030062,na,True,NCBITaxon:10090,BTBR-ob/ob-3g,UBERON:0002113,early diabetic kidney disease,PATO:0000384,BTBR-ob/ob,MmusDv:0000057,...,BTBR-ob/ob-3,na,leukocyte,Slide-seqV2,diabetic kidney disease,Mus musculus,male,kidney,na,13 weeks
TTTCGTCTCCAACA,EFO:0030062,na,True,NCBITaxon:10090,BTBR-ob/ob-3g,UBERON:0002113,early diabetic kidney disease,PATO:0000384,BTBR-ob/ob,MmusDv:0000057,...,BTBR-ob/ob-3,na,leukocyte,Slide-seqV2,diabetic kidney disease,Mus musculus,male,kidney,na,13 weeks
TTTGGACTACAACA,EFO:0030062,na,True,NCBITaxon:10090,BTBR-ob/ob-3g,UBERON:0002113,early diabetic kidney disease,PATO:0000384,BTBR-ob/ob,MmusDv:0000057,...,BTBR-ob/ob-3,na,leukocyte,Slide-seqV2,diabetic kidney disease,Mus musculus,male,kidney,na,13 weeks
TTTTCGGCATGGGC,EFO:0030062,na,True,NCBITaxon:10090,BTBR-ob/ob-3g,UBERON:0002113,early diabetic kidney disease,PATO:0000384,BTBR-ob/ob,MmusDv:0000057,...,BTBR-ob/ob-3,na,leukocyte,Slide-seqV2,diabetic kidney disease,Mus musculus,male,kidney,na,13 weeks


In [None]:
def validate_register_obs_metadata(file):
    """Call ``file.backed()`` and read the ``obs`` attribute of the result.

    NOTE(review): unfinished stub — nothing is validated or registered yet,
    ``obs`` is not used further and the function returns ``None``.
    """
    obs = file.backed().obs

## Couldn't get metadata from Census (TileDB format version mismatch)

In [14]:
# `cellxgene_census` was never imported in this notebook, so this cell raised
# a NameError on a fresh kernel.
import cellxgene_census

# Open the Census release pinned by `census_version`.
census = cellxgene_census.open_soma(census_version=census_version)

In [19]:
census["census_info"]

<Collection 's3://cellxgene-data-public/cell-census/2023-11-13/soma/census_info' (open for 'r') (3 items)
    'summary': 's3://cellxgene-data-public/cell-census/2023-11-13/soma/census_info/summary' (unopened)
    'summary_cell_counts': 's3://cellxgene-data-public/cell-census/2023-11-13/soma/census_info/summary_cell_counts' (unopened)
    'datasets': 's3://cellxgene-data-public/cell-census/2023-11-13/soma/census_info/datasets' (unopened)>

In [34]:
# Read the Census "datasets" table into a pandas DataFrame.
# NOTE(review): in the recorded run this read failed with
# "TileDBError: ... incompatible format version: got 19; expected <= 18";
# presumably the installed TileDB/tiledbsoma is older than the one that wrote
# this release — TODO confirm and upgrade cellxgene-census.
datasets_df = census["census_info"]["datasets"].read().concat().to_pandas()

TileDBError: [TileDB::Array] Error: [TileDB::StorageManager] Error: ArraySchema: Failed to deserialize array schema: incompatible format version: got 19; expected <= 18

In [None]:
# One row per unique collection, indexed by its collection id.
collection_columns = ["collection_id", "collection_name", "collection_doi"]
collections_df = (
    datasets_df[collection_columns].drop_duplicates().set_index("collection_id")
)

# Build one ULabel per collection: the DOI goes into `description`, the
# collection id into `reference`.
collections = [
    ln.ULabel(
        name=record.collection_name,
        description=record.collection_doi,
        reference=collection_id,
        reference_type="collection_id",
    )
    for collection_id, record in collections_df.iterrows()
]
ln.save(collections)

# Parent label that gets all collection labels as its children.
is_collection = ln.ULabel(name="is_collection")
is_collection.save()
is_collection.children.set(collections)

In [8]:
census["census_data"][lb.settings.organism.scientific_name]

<Experiment 's3://cellxgene-data-public/cell-census/2023-11-13/soma/census_data/homo_sapiens' (open for 'r') (2 items)
    'obs': 's3://cellxgene-data-public/cell-census/2023-11-13/soma/census_data/homo_sapiens/obs' (unopened)
    'ms': 's3://cellxgene-data-public/cell-census/2023-11-13/soma/census_data/homo_sapiens/ms' (unopened)>

In [7]:
# Select human as the organism before resolving the experiment name.
lb.settings.organism = "human"

# Read only the `dataset_id` column of obs and keep the unique rows.
organism_name = lb.settings.organism.scientific_name
obs_reader = census["census_data"][organism_name].obs.read(
    column_names=["dataset_id"]
)
human_datasets = obs_reader.concat().to_pandas().drop_duplicates()
print(human_datasets.shape)

TileDBError: [TileDB::Array] Error: [TileDB::StorageManager] Error: ArraySchema: Failed to deserialize array schema: incompatible format version: got 19; expected <= 18