In [1]:
# Adapted from the CELLxGENE Census tutorial on duplicated cells:
# https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_duplicated_cells.html


In [5]:
import cellxgene_census

tabula_muris_dataset_id = "48b37086-25f7-4ecd-be66-f5bb378e3aea"

# Pin the Census release so that re-running the notebook reads the same data.
# An unpinned open_soma() resolves to whatever the "stable" release currently
# is and emits a warning asking for an explicit census_version.
CENSUS_VERSION = "2025-01-30"
census = cellxgene_census.open_soma(census_version=CENSUS_VERSION)

# Fetch per-cell metadata (obs) for one dataset, selected by its dataset_id.
# Only the columns needed for the duplicate-cell comparison are requested.
tabula_muris_obs = cellxgene_census.get_obs(
    census,
    "mus_musculus",
    value_filter=f"dataset_id == '{tabula_muris_dataset_id}'",
    column_names=["tissue", "is_primary_data"],
)

The "stable" release is currently 2025-01-30. Specify 'census_version="2025-01-30"' in future calls to open_soma() to ensure data consistency.


In [21]:
# Count how many cells share each distinct combination of the fetched obs columns.
tabula_muris_obs.value_counts()

tissue           is_primary_data  dataset_id                          
bone marrow      True             48b37086-25f7-4ecd-be66-f5bb378e3aea    40220
spleen           True             48b37086-25f7-4ecd-be66-f5bb378e3aea    35718
limb muscle      True             48b37086-25f7-4ecd-be66-f5bb378e3aea    28867
lung             True             48b37086-25f7-4ecd-be66-f5bb378e3aea    24540
kidney           True             48b37086-25f7-4ecd-be66-f5bb378e3aea    21647
tongue           True             48b37086-25f7-4ecd-be66-f5bb378e3aea    20680
mammary gland    True             48b37086-25f7-4ecd-be66-f5bb378e3aea    12295
thymus           True             48b37086-25f7-4ecd-be66-f5bb378e3aea     9275
bladder lumen    True             48b37086-25f7-4ecd-be66-f5bb378e3aea     8945
heart            True             48b37086-25f7-4ecd-be66-f5bb378e3aea     8613
trachea          True             48b37086-25f7-4ecd-be66-f5bb378e3aea     7976
liver            True             48b37086-25f7-4

In [22]:
"""
But what if we select cells from the dataset that only contains cells from
the liver: "Liver - A single-cell transcriptomic atlas characterizes ageing
tissues in the mouse - 10x"
"""

# GET BY dataset_id
tabula_muris_liver_dataset_id = "6202a243-b713-4e12-9ced-c387f8483dea"
tabula_muris_liver_obs = cellxgene_census.get_obs(
    census,
    "mus_musculus",
    value_filter=f"dataset_id == '{tabula_muris_liver_dataset_id}'",
    column_names=["tissue", "is_primary_data"],
)

In [23]:
# Count cells per distinct combination of the fetched obs columns for the
# liver-only dataset, to compare its is_primary_data flag with the query above.
tabula_muris_liver_obs.value_counts()

tissue  is_primary_data  dataset_id                          
liver   False            6202a243-b713-4e12-9ced-c387f8483dea    7294
Name: count, dtype: int64

In [49]:
"""
Filtering out duplicate cells when reading the obs dataframe
"""

# GET BY value_filter QUERY
nk_cells = cellxgene_census.get_obs(
    census,
    "mus_musculus",
    value_filter="cell_type == 'natural killer cell' "
    "and sex == 'female'"
    "and is_primary_data == True"
    ,
)

In [50]:
# (rows, columns) of the filtered obs table.
nk_cells.shape

(5494, 28)

In [51]:
# List the obs columns that come back when the query does not pass column_names.
nk_cells.keys()

Index(['soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id',
       'cell_type', 'cell_type_ontology_term_id', 'development_stage',
       'development_stage_ontology_term_id', 'disease',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'observation_joinid', 'self_reported_ethnicity',
       'self_reported_ethnicity_ontology_term_id', 'sex',
       'sex_ontology_term_id', 'suspension_type', 'tissue',
       'tissue_ontology_term_id', 'tissue_type', 'tissue_general',
       'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz',
       'raw_variance_nnz', 'n_measured_vars'],
      dtype='object')

In [52]:
# Distinct tissue_general values present among the selected natural killer cells.
# A cell only displays its last expression, so exactly one result is shown here;
# put nk_cells["tissue"].unique() in its own cell to inspect that column as well.
nk_cells["tissue_general"].unique()

['brain', 'central nervous system', 'bone marrow', 'spleen', 'adipose tissue', 'liver', 'lung', 'endocrine gland', 'kidney', 'embryo']
Categories (36, object): ['adipose tissue', 'blood', 'bone marrow', 'brain', ..., 'tongue', 'urethra', 'urinary bladder', 'vasculature']

In [53]:
"""
Filtering out duplicate cells when creating an AnnData
"""
adata = cellxgene_census.get_anndata(
    census,
    organism="Homo sapiens",
    var_value_filter="feature_name == 'AQP5'",
    obs_value_filter="cell_type == 'natural killer cell' "
    "and sex == 'female'"
    "and tissue_general == 'blood'",
)

In [55]:
# Number of cells (obs rows) matched by the query.
len(adata.obs)

335619

In [56]:
# Show the AnnData summary: its dimensions plus the obs and var column names.
adata

AnnData object with n_obs × n_vars = 335619 × 1
    obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'observation_joinid', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_type', 'feature_length', 'nnz', 'n_measured_obs'