# Setup

This notebook is written to run on Google Colab.

## Installs

In [1]:
!pip install -U cellxgene-census



## Imports

In [2]:
# from scripts.datasets import census_classes

import sys

import pandas as pd
import plotly.express as px
import plotly.io as pio
import scanpy as sc
# Pick Plotly's "colab" renderer when running under Google Colab (the
# `google.colab` module is loaded there); keep "notebook" everywhere else.
pio.renderers.default = "colab" if "google.colab" in sys.modules else "notebook"

In [6]:
import cellxgene_census
# Census release tag handed to `cellxgene_census.open_soma` when the data is loaded.
census_version = "2025-01-30"

# Data

In [7]:
# Column of `obs` used as the classification label.
class_key = "sex"

# Query parameters for the Census download; the keys mirror the keyword
# arguments that the loading cell forwards to `cellxgene_census.get_anndata`.
census_config = dict(
    organism="Homo sapiens",
    # Gene-level filter.
    var_value_filter="feature_type in ['protein_coding']",
    # Cell-level filter.
    obs_value_filter=(
        "sex in ['male', 'female'] "
        "and cell_type == 'hepatocyte' "
        "and disease == 'normal'"
    ),
    var_column_names=[
        "feature_id",
        "feature_name",
        "feature_type",
        "feature_length",
    ],
    obs_column_names=[
        "cell_type",
        "sex",
        "assay",
        "suspension_type",
    ],
)

In [8]:
# Open the pinned Census release and fetch the slice described by
# `census_config`; its keys match the keyword names of `get_anndata`,
# so the dict is forwarded as-is.
with cellxgene_census.open_soma(census_version=census_version) as census:
    adata = cellxgene_census.get_anndata(census=census, **census_config)

In [9]:
# Display the AnnData summary (shape plus the obs/var columns that were loaded).
adata

AnnData object with n_obs × n_vars = 78776 × 20045
    obs: 'cell_type', 'sex', 'assay', 'suspension_type', 'disease'
    var: 'feature_id', 'feature_name', 'feature_type', 'feature_length'

In [14]:
# Normalise per-cell totals, then apply a log1p transform.
# NOTE(review): neither return value is kept, so both calls presumably modify
# `adata` in place; re-running this cell would then transform the data a
# second time. Confirm, and reload `adata` before re-running if so.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

In [15]:
# Run PCA with 5 components; the scatter plot below reads the result from `adata.obsm["X_pca"]`.
sc.pp.pca(adata, n_comps=5)

In [16]:
# Scatter of the first two principal components, coloured by the label column.
# The colour column follows `class_key` (set in the data-config cell) instead
# of a hard-coded "sex", so the plot stays in sync with the classification target.
px.scatter(
    x=adata.obsm["X_pca"][:, 0],
    y=adata.obsm["X_pca"][:, 1],
    color=adata.obs[class_key],
    title="PCA of Gene Expression Data",
    labels={"x": "PC1", "y": "PC2"},
    width=600,
    height=600,
)

In [21]:
!pip install git+https://github.com/kmaherx/ScBMLP.git

Collecting git+https://github.com/kmaherx/ScBMLP.git
  Cloning https://github.com/kmaherx/ScBMLP.git to /tmp/pip-req-build-mjd8jda7
  Running command git clone --filter=blob:none --quiet https://github.com/kmaherx/ScBMLP.git /tmp/pip-req-build-mjd8jda7
  Resolved https://github.com/kmaherx/ScBMLP.git to commit 8c216965c8fc522d83ded3a4914be5cd68deed84
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [25]:
from scripts.datasets import get_split_idxs, get_type_datasets

# Settings forwarded to the two dataset helpers below.
val_split = 0.15
random_state = 0
device = "cpu"

# Presumably the row indices of the train / validation / test partitions of `adata` (verify against the helper).
train_indices, val_indices, test_indices = get_split_idxs(
    adata, val_split=val_split, random_state=random_state,
)
# Build one dataset per partition, labelled by the `class_key` column.
# NOTE(review): the saved output of this cell is
# `ValueError: Cannot cast object dtype to int64`. Presumably the string-valued
# `class_key` column has to be integer/categorical-encoded before it reaches
# `get_type_datasets`; confirm against the helper's implementation.
train_dataset, val_dataset, test_dataset = get_type_datasets(
    adata, train_indices, val_indices, test_indices, class_key, device=device,
)

ValueError: Cannot cast object dtype to int64

# Train

In [13]:
from scripts.bmlp import ScBMLPClassifier, Config

In [19]:
# Training hyperparameters (each is set once; the duplicate `n_epochs`
# assignment that used to end this cell has been removed).
d_hidden = 64
n_epochs = 100
lr = 1e-4
DEVICE = "cpu"

# Problem dimensions read from the loaded data: input width is the number of
# genes, output width is the number of distinct labels in `class_key`.
n_cells, n_genes = adata.shape
class_key = "sex"
n_classes = adata.obs[class_key].nunique()

In [20]:
# Assemble the model configuration from the hyperparameters defined in the previous cell.
cfg = Config(
    d_input=n_genes,
    d_hidden=d_hidden,
    d_output=n_classes,
    n_epochs=n_epochs,
    lr=lr,
    device=DEVICE,
)
model = ScBMLPClassifier(cfg)
# Fit on the train/validation datasets; the two returned loss sequences are plotted in the next cell.
# NOTE(review): the saved output of this cell is
# `NameError: name 'train_dataset' is not defined`, and the execution counts
# are out of order, so the dataset-building cell had apparently not completed
# when this ran. Re-check with Restart & Run All.
train_losses, val_losses = model.fit(train_dataset, val_dataset)

NameError: name 'train_dataset' is not defined

In [None]:
# Combine train and validation losses into one long-format frame and plot them together.
# `pd` is provided by the notebook's import cell (`import pandas as pd`).
# Both loss sequences are coerced to lists so that `+` always concatenates
# (array-like inputs would otherwise be added element-wise).
loss_df = pd.DataFrame({
    "Epoch": [*range(len(train_losses)), *range(len(val_losses))],
    "Loss": list(train_losses) + list(val_losses),
    "Type": ["Train"] * len(train_losses) + ["Validation"] * len(val_losses),
})

px.line(
    loss_df,
    x="Epoch",
    y="Loss",
    color="Type",
    title="Training and Validation Loss",
    labels={"Loss": "Loss", "Epoch": "Epoch"},
).show()