# Setup

Using Colab

## Installs

In [1]:
!pip install -U cellxgene-census



## Imports

In [2]:
# from scripts.datasets import census_classes

import scanpy as sc
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"

In [3]:
# import cellxgene_census
# census_version = "2025-01-30"

# Data

In [4]:
census_config = {
    "organism" : "Homo sapiens",
    "var_value_filter" : "feature_type in ['protein_coding']",
    "obs_value_filter" : "sex in ['male', 'female'] and cell_type == 'hepatocyte' and disease == 'normal'",
    "var_column_names" : ["feature_id", "feature_name", "feature_type", "feature_length"],
    "obs_column_names" : ["cell_type", "sex", "assay", "suspension_type"],
}
class_key = "sex"

In [5]:
# with cellxgene_census.open_soma(census_version=census_version) as census:
#     adata = cellxgene_census.get_anndata(
#         census=census,
#         organism=census_config["organism"],
#         var_value_filter=census_config["var_value_filter"],
#         obs_value_filter=census_config["obs_value_filter"],
#         var_column_names=census_config["var_column_names"],
#         obs_column_names=census_config["obs_column_names"],
#     )

In [6]:
adata = sc.read_h5ad("/content/drive/MyDrive/census_test_data.h5ad")

In [7]:
adata

AnnData object with n_obs × n_vars = 78776 × 20045
    obs: 'cell_type', 'sex', 'assay', 'suspension_type', 'disease'
    var: 'feature_id', 'feature_name', 'feature_type', 'feature_length'

In [8]:
total_counts = adata.X.toarray().sum(axis=1)
px.histogram(
    x=total_counts,
)

In [9]:
min_counts_per_cell = 100
max_counts_per_cell = 50_000
sc.pp.filter_cells(adata, min_counts=min_counts_per_cell)
sc.pp.filter_cells(adata, max_counts=max_counts_per_cell)

In [10]:
adata

AnnData object with n_obs × n_vars = 78000 × 20045
    obs: 'cell_type', 'sex', 'assay', 'suspension_type', 'disease', 'n_counts'
    var: 'feature_id', 'feature_name', 'feature_type', 'feature_length'

In [11]:
total_counts = adata.X.toarray().sum(axis=1)
px.histogram(
    x=total_counts,
)

In [12]:
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

In [13]:
sc.pp.highly_variable_genes(adata, n_top_genes=5000)
adata.raw = adata
adata = adata[:, adata.var.highly_variable]

In [14]:
adata

View of AnnData object with n_obs × n_vars = 78000 × 5000
    obs: 'cell_type', 'sex', 'assay', 'suspension_type', 'disease', 'n_counts'
    var: 'feature_id', 'feature_name', 'feature_type', 'feature_length', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'

In [15]:
sc.pp.pca(adata, n_comps=5)


Setting element `.obsm['X_pca']` of view, initializing view as actual.



In [16]:
px.scatter(
    x=adata.obsm["X_pca"][:, 1],
    y=adata.obsm["X_pca"][:, 2],
    color=adata.obs["sex"],
    title="PCA of Gene Expression Data",
    labels={"x": "PC1", "y": "PC2"},
    width=600,
    height=600,
)

In [17]:
!pip install git+https://github.com/kmaherx/ScBMLP.git

Collecting git+https://github.com/kmaherx/ScBMLP.git
  Cloning https://github.com/kmaherx/ScBMLP.git to /tmp/pip-req-build-gmilc1yl
  Running command git clone --filter=blob:none --quiet https://github.com/kmaherx/ScBMLP.git /tmp/pip-req-build-gmilc1yl
  Resolved https://github.com/kmaherx/ScBMLP.git to commit 37e73b935b2c66b1107e7fa1f05548a7e1b85183
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
from scripts.datasets import get_split_idxs, get_type_datasets

val_split = 0.15
random_state = 0
DEVICE = "cuda"

train_indices, val_indices, test_indices = get_split_idxs(
    adata, val_split=val_split, random_state=random_state,
)
# use cpu for dataset creation to avoid sparse cuda issues, model will move to gpu
train_dataset, val_dataset, test_dataset = get_type_datasets(
    adata, train_indices, val_indices, test_indices, class_key, device="cpu",
)

# Train

In [19]:
from scripts.bmlp import ScBMLPClassifier, Config

In [20]:
d_hidden = 64
n_epochs = 100
lr = 1e-4
DEVICE = "cuda"

n_cells, n_genes = adata.shape
class_key = "sex"
n_classes = adata.obs[class_key].nunique()

n_epochs = 100

In [21]:
cfg = Config(
    d_input=n_genes,
    d_hidden=d_hidden,
    d_output=n_classes,
    n_epochs=n_epochs,
    lr=lr,
    device=DEVICE,
)
model = ScBMLPClassifier(cfg)
train_losses, val_losses = model.fit(train_dataset, val_dataset)

Training for 100 epochs:   0%|          | 0/100 [01:33<?, ?it/s]


NotImplementedError: Could not run 'aten::as_strided' with arguments from the 'SparseCUDA' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::as_strided' is only available for these backends: [CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMTIA, AutogradMAIA, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradMeta, AutogradNestedTensor, Tracer, AutocastCPU, AutocastMTIA, AutocastMAIA, AutocastXPU, AutocastMPS, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

CPU: registered at /pytorch/build/aten/src/ATen/RegisterCPU_0.cpp:1823 [kernel]
CUDA: registered at /pytorch/build/aten/src/ATen/RegisterCUDA_0.cpp:4146 [kernel]
Meta: registered at /pytorch/build/aten/src/ATen/RegisterMeta_0.cpp:2431 [kernel]
QuantizedCPU: registered at /pytorch/build/aten/src/ATen/RegisterQuantizedCPU_0.cpp:194 [kernel]
QuantizedCUDA: registered at /pytorch/build/aten/src/ATen/RegisterQuantizedCUDA_0.cpp:159 [kernel]
BackendSelect: fallthrough registered at /pytorch/aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:194 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at /pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:479 [backend fallback]
Functionalize: registered at /pytorch/build/aten/src/ATen/RegisterFunctionalization_0.cpp:23451 [kernel]
Named: fallthrough registered at /pytorch/aten/src/ATen/core/NamedRegistrations.cpp:11 [kernel]
Conjugate: fallthrough registered at /pytorch/aten/src/ATen/ConjugateFallback.cpp:21 [kernel]
Negative: fallthrough registered at /pytorch/aten/src/ATen/native/NegateFallback.cpp:22 [kernel]
ZeroTensor: registered at /pytorch/build/aten/src/ATen/RegisterZeroTensor_0.cpp:139 [kernel]
ADInplaceOrView: registered at /pytorch/torch/csrc/autograd/generated/ADInplaceOrViewType_0.cpp:4969 [kernel]
AutogradOther: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradCPU: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradCUDA: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradHIP: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradXLA: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradMPS: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradIPU: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradXPU: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradHPU: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradVE: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradLazy: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradMTIA: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradMAIA: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradPrivateUse1: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradPrivateUse2: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradPrivateUse3: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradMeta: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
AutogradNestedTensor: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18192 [autograd kernel]
Tracer: registered at /pytorch/torch/csrc/autograd/generated/TraceType_0.cpp:17232 [kernel]
AutocastCPU: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:322 [backend fallback]
AutocastMTIA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:466 [backend fallback]
AutocastMAIA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:504 [backend fallback]
AutocastXPU: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:542 [backend fallback]
AutocastMPS: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:209 [backend fallback]
AutocastCUDA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:165 [backend fallback]
FuncTorchBatched: registered at /pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:735 [kernel]
BatchedNestedTensor: registered at /pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:758 [backend fallback]
FuncTorchVmapMode: fallthrough registered at /pytorch/aten/src/ATen/functorch/VmapModeRegistrations.cpp:27 [backend fallback]
Batched: registered at /pytorch/aten/src/ATen/LegacyBatchingRegistrations.cpp:1079 [kernel]
VmapMode: fallthrough registered at /pytorch/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at /pytorch/aten/src/ATen/functorch/TensorWrapper.cpp:210 [backend fallback]
PythonTLSSnapshot: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:202 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at /pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:475 [backend fallback]
PreDispatch: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:206 [backend fallback]
PythonDispatcher: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:198 [backend fallback]


In [None]:
# Combine train and val losses into a single plot
loss_df = pd.DataFrame({
    'Epoch': list(range(len(train_losses))) + list(range(len(val_losses))),
    'Loss': train_losses + val_losses,
    'Type': ['Train'] * len(train_losses) + ['Validation'] * len(val_losses)
})

px.line(loss_df, x='Epoch', y='Loss', color='Type', 
        title='Training and Validation Loss', 
        labels={'Loss': 'Loss', 'Epoch': 'Epoch'}).show()

In [None]:
train_dataset[0][0].shape

torch.Size([20045])