In [16]:
import cellxgene_census
import tiledbsoma as soma
import cellxgene_census.experimental.ml as census_ml

import torch
import pandas as pd
import numpy as np
from scipy import sparse

# Pin the Census release so every run reads the same data. Without an explicit
# version, open_soma() resolves "stable" at call time and warns about it.
CENSUS_VERSION = "2023-07-25"

# Endpoint for loading data. The handle is read from by cells throughout the notebook.
census = cellxgene_census.open_soma(census_version=CENSUS_VERSION)

  from .autonotebook import tqdm as notebook_tqdm
The "stable" release is currently 2023-07-25. Specify 'census_version="2023-07-25"' in future calls to open_soma() to ensure data consistency.


# Exploration of the cellxgene dataset

In this initial dive, we show how to (1) list the different studies available, (2) subset to particular studies of interest, and (3) obtain raw and processed gene expression levels.

This initial exploration of the cellxgene dataset is based on the following materials:

- https://chanzuckerberg.github.io/cellxgene-census/examples.html (tutorials)
- https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_schema.html (schema for the dataset)

### Table of contents

1. [Accessing list of studies available](#Accessing-list-of-studies-available)
2. [Subset to particular studies](#Subset-to-particular-studies)
3. [Obtain raw and processed gene expression levels](#Obtain-raw-and-processed-gene-expression-levels)


## Accessing list of studies available

In cellxgene, the `census` object returned by `open_soma()` serves as the endpoint for all data access: it is used to load the data and metadata associated with different studies. It can also be used as a context manager (`with census:`), in which case it is closed when the block exits. Below we use it to list all the studies available to us.

In [2]:
# List of all studies available.
# NOTE: this read is deliberately not wrapped in `with census:`. Leaving a `with`
# block closes the handle, and cells further down still read from `census`.
all_studies = census["census_info"]["datasets"].read().concat().to_pandas()

In [3]:
# Preview of the metadata available for each dataset. Some of the columns:
# - collection_name: name of the study; seems to correspond to the name of a published paper
# - collection_id: unique identifier of the collection
# - dataset_id: unique identifier of the dataset
# A collection contains datasets plus some other info (e.g. author information):
# https://api.cellxgene.cziscience.com/curation/ui/
all_studies.head()

Unnamed: 0,soma_joinid,collection_id,collection_name,collection_doi,dataset_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
0,0,e2c257e7-6f79-487c-b81c-39451cd4ab3c,Spatial multiomics map of trophoblast developm...,10.1038/s41586-023-05869-0,f171db61-e57e-4535-a06a-35d8b6ef8f2b,donor_p13_trophoblasts,f171db61-e57e-4535-a06a-35d8b6ef8f2b.h5ad,31497
1,1,e2c257e7-6f79-487c-b81c-39451cd4ab3c,Spatial multiomics map of trophoblast developm...,10.1038/s41586-023-05869-0,ecf2e08e-2032-4a9e-b466-b65b395f4a02,All donors trophoblasts,ecf2e08e-2032-4a9e-b466-b65b395f4a02.h5ad,67070
2,2,e2c257e7-6f79-487c-b81c-39451cd4ab3c,Spatial multiomics map of trophoblast developm...,10.1038/s41586-023-05869-0,74cff64f-9da9-4b2a-9b3b-8a04a1598040,All donors all cell states (in vivo),74cff64f-9da9-4b2a-9b3b-8a04a1598040.h5ad,286326
3,3,f7cecffa-00b4-4560-a29a-8ad626b8ee08,Mapping single-cell transcriptomes in the intr...,10.1016/j.ccell.2022.11.001,5af90777-6760-4003-9dba-8f945fec6fdf,Single-cell transcriptomic datasets of Renal c...,5af90777-6760-4003-9dba-8f945fec6fdf.h5ad,270855
4,4,3f50314f-bdc9-40c6-8e4a-b0901ebfbe4c,Single-cell sequencing links multiregional imm...,10.1016/j.ccell.2021.03.007,bd65a70f-b274-4133-b9dd-0d1431b6af34,Single-cell sequencing links multiregional imm...,bd65a70f-b274-4133-b9dd-0d1431b6af34.h5ad,167283


In [4]:
# Number of cells summed across all datasets.
#
# IMPORTANT: this is an overcount because studies sometimes use data from previous studies.
# For details see
# https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_dataset_presence.html#Identifying-genes-measured-in-a-specific-dataset
all_studies['dataset_total_cell_count'].sum()

61656118

In [5]:
# Distribution of the number of cells sequenced by study
# (broken into bins for easier inspection).
#
# The vast majority of studies have 10k-100k cells sequenced. However, some have millions
# of cells sequenced. This large skew in cell counts might bias downstream analyses (i.e.,
# a particular study might be weighted more heavily than others).
pd.cut(
    all_studies['dataset_total_cell_count'],
    # Open-ended last bin rather than an arbitrary huge sentinel value.
    bins=[0, 1_000, 10_000, 100_000, 1_000_000, 10_000_000, np.inf],
    labels=['<1k', '1k-10k', '10k-100k', '100k-1M', '1M-10M', '10M+'],
    # Left-closed bins, so an edge value lands in the bin its label names
    # (e.g. exactly 1000 cells is counted under '1k-10k', not '<1k').
    right=False,
).value_counts()

10k-100k    331
1k-10k      143
100k-1M     102
1M-10M       12
<1k           5
10M+          0
Name: dataset_total_cell_count, dtype: int64

## Subset to particular studies

Below we show how to access cell-level information for each cell sequenced in a particular study. This includes information such as the cell type, the tissue, etc.

In [6]:
# Fetch the cell-level metadata (obs) rows belonging to one dataset.
# The dataset_id used here is the first one listed in the studies table above.
human_experiment = census["census_data"]["homo_sapiens"]
dataset_filter = "dataset_id == 'f171db61-e57e-4535-a06a-35d8b6ef8f2b'"

obs_reader = human_experiment.obs.read(value_filter=dataset_filter)
example_study = obs_reader.concat().to_pandas()

example_study.head()


Unnamed: 0,soma_joinid,dataset_id,assay,assay_ontology_term_id,cell_type,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease,disease_ontology_term_id,...,is_primary_data,self_reported_ethnicity,self_reported_ethnicity_ontology_term_id,sex,sex_ontology_term_id,suspension_type,tissue,tissue_ontology_term_id,tissue_general,tissue_general_ontology_term_id
0,0,f171db61-e57e-4535-a06a-35d8b6ef8f2b,10x 3' v3,EFO:0009922,syncytiotrophoblast cell,CL:0000525,9th week post-fertilization human stage,HsapDv:0000046,normal,PATO:0000461,...,False,unknown,unknown,unknown,unknown,nucleus,decidua basalis,UBERON:0000453,placenta,UBERON:0001987
1,1,f171db61-e57e-4535-a06a-35d8b6ef8f2b,10x 3' v3,EFO:0009922,placental villous trophoblast,CL:2000060,9th week post-fertilization human stage,HsapDv:0000046,normal,PATO:0000461,...,False,unknown,unknown,unknown,unknown,nucleus,decidua basalis,UBERON:0000453,placenta,UBERON:0001987
2,2,f171db61-e57e-4535-a06a-35d8b6ef8f2b,10x 3' v3,EFO:0009922,syncytiotrophoblast cell,CL:0000525,9th week post-fertilization human stage,HsapDv:0000046,normal,PATO:0000461,...,False,unknown,unknown,unknown,unknown,nucleus,decidua basalis,UBERON:0000453,placenta,UBERON:0001987
3,3,f171db61-e57e-4535-a06a-35d8b6ef8f2b,10x 3' v3,EFO:0009922,syncytiotrophoblast cell,CL:0000525,9th week post-fertilization human stage,HsapDv:0000046,normal,PATO:0000461,...,False,unknown,unknown,unknown,unknown,nucleus,decidua basalis,UBERON:0000453,placenta,UBERON:0001987
4,4,f171db61-e57e-4535-a06a-35d8b6ef8f2b,10x 3' v3,EFO:0009922,extravillous trophoblast,CL:0008036,9th week post-fertilization human stage,HsapDv:0000046,normal,PATO:0000461,...,False,unknown,unknown,unknown,unknown,nucleus,decidua basalis,UBERON:0000453,placenta,UBERON:0001987


In [7]:
# Names of the cell-level metadata columns available for this study
example_study.columns

Index(['soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id',
       'cell_type', 'cell_type_ontology_term_id', 'development_stage',
       'development_stage_ontology_term_id', 'disease',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id',
       'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue',
       'tissue_ontology_term_id', 'tissue_general',
       'tissue_general_ontology_term_id'],
      dtype='object')

In [8]:
# Number of cells per cell type in this study
example_study['cell_type'].value_counts()

syncytiotrophoblast cell         18471
placental villous trophoblast    10038
extravillous trophoblast          2966
trophoblast giant cell              22
Name: cell_type, dtype: int64

In [9]:
# Number of cells per tissue in this study
example_study['tissue'].value_counts()

decidua basalis    31497
Name: tissue, dtype: int64

In [10]:
# If is_primary_data = False, this dataset is not the primary source for the cell,
# i.e. the data is sourced from another dataset. Here every cell is non-primary.
# NOTE(review): the original reading was that this paper analyzed someone else's
# data; it may instead be that this per-donor dataset duplicates cells from a
# combined dataset in the same collection -- confirm against the Census schema.
example_study['is_primary_data'].value_counts()

False    31497
Name: is_primary_data, dtype: int64

In [11]:
# Sex is recorded as 'unknown' for the vast majority of cells in this study
example_study['sex'].value_counts()

unknown    31472
female        25
Name: sex, dtype: int64

### We can also subset to cells that satisfy certain criteria using the `value_filter` argument, and restrict the returned columns using the `column_names` argument.

In [12]:
# Select lung B cells across all datasets. Only `disease` is requested via
# `column_names`; the output below also shows the columns used in the filter.
b_cell_lung_filter = "cell_type == 'B cell' and tissue_general == 'lung'"
human_obs = census["census_data"]["homo_sapiens"].obs

example_multiple_studies = (
    human_obs
    .read(value_filter=b_cell_lung_filter, column_names=["disease"])
    .concat()
    .to_pandas()
)

In [13]:
# There are over 100k cells that meet the criteria
example_multiple_studies

Unnamed: 0,disease,cell_type,tissue_general
0,normal,B cell,lung
1,normal,B cell,lung
2,normal,B cell,lung
3,normal,B cell,lung
4,normal,B cell,lung
...,...,...,...
138794,normal,B cell,lung
138795,normal,B cell,lung
138796,normal,B cell,lung
138797,normal,B cell,lung


In [14]:
# Number of matching cells for each combination of the returned columns
# (only `disease` varies, since cell_type and tissue_general are fixed by the filter)
example_multiple_studies.value_counts()

disease                                cell_type  tissue_general
lung adenocarcinoma                    B cell     lung              62351
normal                                 B cell     lung              25461
non-small cell lung carcinoma          B cell     lung              17484
squamous cell lung carcinoma           B cell     lung              11920
pulmonary fibrosis                     B cell     lung               6798
chronic obstructive pulmonary disease  B cell     lung               6369
COVID-19                               B cell     lung               2729
lung large cell carcinoma              B cell     lung               1534
pulmonary emphysema                    B cell     lung               1512
pleomorphic carcinoma                  B cell     lung               1210
small cell lung carcinoma              B cell     lung                583
interstitial lung disease              B cell     lung                376
non-specific interstitial pneumonia    B cell  

## Obtain raw and processed gene expression levels

Now, we show how to obtain raw and processed gene expression levels.

In [17]:
# `ExperimentDataPipe` serves the data in batches to avoid memory issues.
# The obs query below restricts it to primary-data cells from tongue tissue;
# other obs columns (cell type, tissue, etc.) can be filtered the same way.
experiment = census["census_data"]["homo_sapiens"]
tongue_primary_cells = soma.AxisQuery(
    value_filter="tissue_general == 'tongue' and is_primary_data == True"
)
experiment_datapipe = census_ml.ExperimentDataPipe(
    experiment,
    measurement_name="RNA",
    X_name="raw",
    obs_query=tongue_primary_cells,
    obs_column_names=["cell_type"],
    batch_size=16,
)

In [18]:
# Shape is (cells, genes): roughly 15k cells and 60k genes
experiment_datapipe.shape

(15020, 60664)