# Setup

This notebook is intended to be run on Google Colab.

## Installs

In [8]:
pip install -U cellxgene-census

Collecting cellxgene-census
  Downloading cellxgene_census-1.17.0-py3-none-any.whl.metadata (5.2 kB)
Collecting tiledbsoma>=1.15.3 (from cellxgene-census)
  Downloading tiledbsoma-1.17.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (7.0 kB)
Collecting anndata (from cellxgene-census)
  Downloading anndata-0.12.2-py3-none-any.whl.metadata (9.6 kB)
Collecting s3fs>=2021.06.1 (from cellxgene-census)
  Downloading s3fs-2025.7.0-py3-none-any.whl.metadata (1.4 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs>=2021.06.1->cellxgene-census)
  Downloading aiobotocore-2.24.1-py3-none-any.whl.metadata (25 kB)
Collecting fsspec==2025.7.0 (from s3fs>=2021.06.1->cellxgene-census)
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
Collecting scanpy>=1.9.2 (from tiledbsoma>=1.15.3->cellxgene-census)
  Downloading scanpy-1.11.4-py3-none-any.whl.metadata (9.2 kB)
Collecting somacore==1.0.28 (from tiledbsoma>=1.15.3->cellxgene-census)
  Downloading somacore-1.0.28-py3-none-any.whl.metadat

## Imports

In [None]:
import sys

import cellxgene_census
import plotly.express as px
import plotly.io as pio
import scanpy as sc
# Plotly's "notebook" renderer typically shows blank figures on Colab, so use
# the Colab-specific renderer there and keep "notebook" everywhere else.
pio.renderers.default = "colab" if "google.colab" in sys.modules else "notebook"

# Data

In [None]:
# Query parameters for the Census slice: protein-coding human genes measured in
# normal hepatocytes whose donor sex is recorded as male or female.
census_config = dict(
    organism="Homo sapiens",
    var_value_filter="feature_type in ['protein_coding']",
    obs_value_filter="sex in ['male', 'female'] and cell_type == 'hepatocyte' and disease == 'normal'",
    var_column_names=["feature_id", "feature_name", "feature_type", "feature_length"],
    obs_column_names=["cell_type", "sex", "assay", "suspension_type"],
)

In [None]:
# Open the pinned Census release and materialize the configured slice as an
# AnnData object. The query parameters come from `census_config`, so the
# organism, filters and column lists are defined in exactly one place.
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    adata = cellxgene_census.get_anndata(census=census, **census_config)

In [44]:
# Display the AnnData summary (dimensions and obs/var column names) right after loading.
adata

AnnData object with n_obs × n_vars = 78776 × 20045
    obs: 'cell_type', 'sex', 'disease'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_type', 'feature_length', 'nnz', 'n_measured_obs'

In [47]:
# Drop genes detected in fewer than 1000 cells (min_cells=1000). The call's
# return value is not captured: `adata` itself is filtered, then re-displayed.
sc.pp.filter_genes(adata, min_cells=1000)
adata

AnnData object with n_obs × n_vars = 78776 × 12851
    obs: 'cell_type', 'sex', 'disease'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_type', 'feature_length', 'nnz', 'n_measured_obs', 'n_cells'

In [48]:
# Drop cells with fewer than 200 detected genes (min_genes=200). As with the
# gene filter, `adata` is modified directly and then re-displayed.
sc.pp.filter_cells(adata, min_genes=200)
adata

AnnData object with n_obs × n_vars = 78339 × 12851
    obs: 'cell_type', 'sex', 'disease', 'n_genes'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_type', 'feature_length', 'nnz', 'n_measured_obs', 'n_cells'

In [49]:
# Normalize each cell's total counts (scanpy's default target), then apply
# log(1 + x). Both calls update `adata` directly, so re-running this cell
# would transform data that has already been transformed.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

In [50]:
# Display the AnnData summary again after normalization and log-transform.
adata

AnnData object with n_obs × n_vars = 78339 × 12851
    obs: 'cell_type', 'sex', 'disease', 'n_genes'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_type', 'feature_length', 'nnz', 'n_measured_obs', 'n_cells'
    uns: 'log1p'

In [58]:
# Compute the first 5 principal components; the coordinates are read back
# from adata.obsm["X_pca"] when plotting.
sc.pp.pca(adata, n_comps=5)

In [60]:
# Plot the cells on the first two principal components, colored by sex.
# The data is passed as bare arrays rather than a data frame, so Plotly
# presumably falls back to the argument names ("x", "y", "color") for titles;
# `labels` maps those to readable axis and legend titles.
px.scatter(
    x=adata.obsm["X_pca"][:, 0],
    y=adata.obsm["X_pca"][:, 1],
    color=adata.obs["sex"],
    title="PCA of Gene Expression Data",
    labels={"x": "PC1", "y": "PC2", "color": "Sex"},
    width=600,
    height=600,
)

Next: add Census wrapper to datasets.py