# Setup

This notebook is written to run on Google Colab.

## Installs

In [1]:
!pip install pertpy "git+https://github.com/kmaherx/ScBMLP.git"

Collecting git+https://github.com/kmaherx/ScBMLP.git
  Cloning https://github.com/kmaherx/ScBMLP.git to /tmp/pip-req-build-wl_3t6bs
  Running command git clone --filter=blob:none --quiet https://github.com/kmaherx/ScBMLP.git /tmp/pip-req-build-wl_3t6bs
  Resolved https://github.com/kmaherx/ScBMLP.git to commit 77fd3cdd0fe17a2d2a295dd327362581a3c79d35
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


## Imports

In [1]:
from typing import Tuple, List, Dict, Any

import pertpy as pt
import scanpy as sc
import numpy as np
import plotly.express as px
import pandas as pd
import einops
import gseapy as gp
from gseapy import enrichr
import torch
import gc
import psutil

from scripts.datasets import get_classification_datasets
from scripts.bmlp import ScBMLPClassifier, Config

In [2]:
# For Colab: choose the plotly renderer that fig.show() uses by default so figures
# display inside the notebook's output cells.
import plotly.io as pio
pio.renderers.default = "notebook_connected"

# Set params

In [3]:
# Prefer CUDA (faster than mps...), but fall back to whatever backend is actually present
# so the notebook does not fail later on a host without an NVIDIA GPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

In [4]:
# Memory optimization settings
import gc
import psutil
import torch

def get_memory_usage():
    """Report how much memory this process and PyTorch's CUDA allocator are using.

    Returns:
        tuple: ``(cpu_mem, gpu_mem)`` where ``cpu_mem`` is the resident set size of the
        current process and ``gpu_mem`` is ``torch.cuda.memory_allocated()``, both divided
        by 1024**3. ``gpu_mem`` is ``0`` when CUDA is not available.
    """
    bytes_per_gb = 1024**3

    # Resident memory of the current Python process
    rss_bytes = psutil.Process().memory_info().rss
    cpu_mem = rss_bytes / bytes_per_gb

    # Memory currently occupied by tensors on the CUDA device (not the allocator's cache)
    if torch.cuda.is_available():
        gpu_mem = torch.cuda.memory_allocated() / bytes_per_gb
    else:
        gpu_mem = 0

    return cpu_mem, gpu_mem

def calculate_optimal_batch_size(n_samples, n_features, available_gpu_ram_gb=15):
    """Estimate a power-of-two batch size that should fit in GPU memory.

    The estimate assumes float32 inputs (4 bytes per feature) with a 3x overhead factor
    for gradients, activations, etc., and budgets 80% of ``available_gpu_ram_gb``.
    The result is rounded down to a power of two and clamped to the range [32, 1024].

    Args:
        n_samples: Number of training samples. Currently not used by the heuristic; kept
            so existing callers continue to work.
        n_features: Number of input features per sample. Must be positive.
        available_gpu_ram_gb: GPU memory budget in GB (1024**3 bytes).

    Returns:
        int: A power-of-two batch size between 32 and 1024 inclusive.

    Raises:
        ValueError: If ``n_features`` is not positive.
    """
    if n_features <= 0:
        raise ValueError(f"n_features must be positive, got {n_features}")

    min_batch_size, max_batch_size = 32, 1024

    # Rough estimate: 4 bytes per float32, plus overhead
    bytes_per_sample = n_features * 4  # float32
    overhead_factor = 3  # Account for gradients, activations, etc.
    total_bytes_per_sample = bytes_per_sample * overhead_factor

    # Use 80% of available GPU RAM
    usable_ram_bytes = available_gpu_ram_gb * 0.8 * 1024**3

    optimal_batch_size = int(usable_ram_bytes / total_bytes_per_sample)

    # Not even one sample fits in the budget: log2 would be undefined here, so return
    # the lower bound instead of crashing.
    if optimal_batch_size < 1:
        return min_batch_size

    # Round down to nearest power of 2 (exact integer arithmetic, no float log2)
    batch_size = 1 << (optimal_batch_size.bit_length() - 1)

    # Ensure reasonable bounds
    return max(min_batch_size, min(batch_size, max_batch_size))

# Release cached, currently unused CUDA blocks held by PyTorch's allocator
# (this does not delete live tensors).
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Take a single snapshot so the CPU and GPU figures come from the same measurement
cpu_mem, gpu_mem = get_memory_usage()
print(f"Initial memory - CPU: {cpu_mem:.1f}GB, GPU: {gpu_mem:.1f}GB")

Initial memory - CPU: 1.1GB, GPU: 0.0GB


# Load data

## Download, preprocess, and save

In [None]:
# Fetch the sci-Plex 3 dataset (Srivatsan et al., 2020) through pertpy's data loader.
adata = pt.data.srivatsan_2020_sciplex3()

Output()

In [None]:
# Show the AnnData summary (dimensions and annotation columns) of the raw download.
adata

AnnData object with n_obs × n_vars = 799317 × 110983
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type'
    var: 'ensembl_id'

In [None]:
# Save the unprocessed data to Google Drive so the download does not have to be repeated.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount cell is
# visible in this notebook; confirm.
adata.write("/content/drive/MyDrive/srivatsan_2020_sciplex3_raw.h5ad")

In [None]:
# Sanity check on the value range of X before normalization (a large integer maximum
# indicates raw counts).
adata.X.max()

np.int64(27373)

In [None]:
# Quality filtering (modifies adata in place): keep cells with at least 500 total counts,
# then keep genes that are detected in at least 10,000 cells.
sc.pp.filter_cells(adata, min_counts=500)
sc.pp.filter_genes(adata, min_cells=10_000)

In [None]:
# Inspect the dimensions after filtering.
adata

AnnData object with n_obs × n_vars = 799317 × 12207
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'n_counts'
    var: 'ensembl_id', 'n_cells'

In [None]:
# Standard scanpy preprocessing (in place): normalize per-cell totals with scanpy's default
# target, apply log1p, then subset to the 10,000 most highly variable genes.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=10_000, subset=True)

In [None]:
# Inspect the dimensions after normalization and gene selection.
adata

AnnData object with n_obs × n_vars = 799317 × 10000
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'n_counts'
    var: 'ensembl_id', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'

In [None]:
# Preview the per-cell metadata table.
adata.obs

Unnamed: 0,ncounts,well,plate,cell_line,replicate,time,dose_value,pathway_level_1,pathway_level_2,perturbation,target,pathway,dose_unit,celltype,disease,cancer,tissue_type,organism,perturbation_type,n_counts
A01_E09_RT_BC_100_Lig_BC_147,2957,plate6_A9,plate44,MCF7,rep2,24.0,10000.0,Tyrosine kinase signaling,RTK activity,TAK-901,Aurora Kinase,Cell Cycle,nM,mammary epithelial cells,breast adenocarcinoma,True,cell_line,human,drug,2957
A01_E09_RT_BC_100_Lig_BC_186,1528,plate8_H3,plate46,MCF7,rep2,24.0,10.0,Tyrosine kinase signaling,RTK activity,AG-490 (Tyrphostin B42),EGFR,Protein Tyrosine Kinase,nM,mammary epithelial cells,breast adenocarcinoma,True,cell_line,human,drug,1528
A01_E09_RT_BC_100_Lig_BC_196,1881,plate3_C2,plate41,MCF7,rep2,24.0,1000.0,Epigenetic regulation,Histone deacetylation,Abexinostat (PCI-24781),HDAC,Cytoskeletal Signaling,nM,mammary epithelial cells,breast adenocarcinoma,True,cell_line,human,drug,1881
A01_E09_RT_BC_100_Lig_BC_213,1700,plate9_E3,plate51,A549,rep2,72.0,1000.0,Cell cycle regulation,Aurora kinase activity,Alisertib (MLN8237),Aurora Kinase,Cell Cycle,nM,alveolar basal epithelial cells,lung adenocarcinoma,True,cell_line,human,drug,1700
A01_E09_RT_BC_100_Lig_BC_220,1430,plate8_H10,plate30,K562,rep2,24.0,10000.0,DNA damage & DNA repair,Alkylating agent,Busulfan,DNA alkylator,DNA Damage,nM,lymphoblasts,chronic myelogenous leukemia,True,cell_line,human,drug,1430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H12_F10_RT_BC_9_Lig_BC_237,759,plate5_C2,plate43,MCF7,rep2,24.0,1000.0,Antioxidant,Antioxidant,Quercetin,"Src,Sirtuin,PKC,PI3K",Epigenetics,nM,mammary epithelial cells,breast adenocarcinoma,True,cell_line,human,drug,759
H12_F10_RT_BC_9_Lig_BC_274,1108,plate8_B9,plate6,K562,rep1,24.0,10.0,Nuclear receptor signaling,Nuclear receptor activity,Andarine,Androgen Receptor,Endocrinology & Hormones,nM,lymphoblasts,chronic myelogenous leukemia,True,cell_line,human,drug,1108
H12_F10_RT_BC_9_Lig_BC_291,714,plate3_B8,plate41,MCF7,rep2,24.0,100.0,DNA damage & DNA repair,ADP-rybosilation,Iniparib (BSI-201),PARP,DNA Damage,nM,mammary epithelial cells,breast adenocarcinoma,True,cell_line,human,drug,714
H12_F10_RT_BC_9_Lig_BC_298,1659,plate4_D4,plate18,MCF7,rep1,24.0,100.0,PKC signaling,PKC activitiy,Bisindolylmaleimide IX (Ro 31-8220 Mesylate),PKC,TGF-beta/Smad,nM,mammary epithelial cells,breast adenocarcinoma,True,cell_line,human,drug,1659


In [None]:
# Number of cells per perturbation (this column is later used as the class label).
adata.obs["perturbation"].value_counts()

Unnamed: 0_level_0,count
perturbation,Unnamed: 1_level_1
control,17578
Ellagic acid,6257
Divalproex Sodium,6203
Ruxolitinib (INCB018424),6143
MC1568,6126
...,...
Alvespimycin (17-DMAG) HCl,2089
"Patupilone (EPO906, Epothilone B)",1822
Flavopiridol HCl,1729
Epothilone A,1426


In [None]:
# Number of cells per dose level.
adata.obs["dose_value"].value_counts()

Unnamed: 0_level_0,count
dose_value,Unnamed: 1_level_1
10.0,202725
100.0,192858
1000.0,183356
10000.0,166278
0.0,17578


In [None]:
# Check which dose units occur, to see whether dose values are directly comparable.
adata.obs["dose_unit"].value_counts()

Unnamed: 0_level_0,count
dose_unit,Unnamed: 1_level_1
nM,799317


In [None]:
# Count how many cells have a missing perturbation label.
adata.obs["perturbation"].isna().value_counts()

Unnamed: 0_level_0,count
perturbation,Unnamed: 1_level_1
False,762795
True,36522


In [None]:
# Keep only cells that have a perturbation label; unlabelled cells cannot be used as
# classification examples.
# NOTE(review): boolean indexing of an AnnData typically yields a view rather than a copy —
# add .copy() if later code mutates this object; confirm.
adata = adata[adata.obs["perturbation"].notna()]

In [None]:
# # Create perturbation column based on Group and Treatment
# # If Group is pre-treatment, use "pre"
# # If Group is post-treatment, use the corresponding Treatment value
# adata.obs["perturbation"] = adata.obs.apply(
#     lambda row: row["Group"] if "Pre" in row["Group"] else row["Treatment"], 
#     axis=1
# )

In [None]:
# Save the filtered, normalized, gene-subset data; the "Format" section reloads this file.
adata.write("/content/drive/MyDrive/srivatsan_2020_sciplex3_preprocessed.h5ad")

## Visualize

In [None]:
# Quick look at the data: 2-component PCA of a 10% subsample, colored by class label
class_key = "perturbation"

pdata = sc.pp.subsample(adata, fraction=0.1, copy=True)
sc.pp.pca(pdata, n_comps=2)

pca_coords = pdata.obsm["X_pca"]
fig = px.scatter(
    x=pca_coords[:, 0],
    y=pca_coords[:, 1],
    color=pdata.obs[class_key],
    width=600,
    height=600,
)
fig.update_traces(marker={"size": 5})
fig.show()

Note: the color assigned to each class is not consistent across plots.

## Format

In [5]:
# Start from the preprocessed file written in the previous section, so the download and
# preprocessing cells do not need to be re-run.
adata = sc.read("/content/drive/MyDrive/srivatsan_2020_sciplex3_preprocessed.h5ad")

In [6]:
# Reduce further to the 5,000 most highly variable genes (in place) to shrink the model input.
sc.pp.highly_variable_genes(adata, n_top_genes=5_000, subset=True)

In [7]:
# Snapshot memory before the dataset tensors are built
cpu_mem, gpu_mem = get_memory_usage()
print(f"Before dataset creation - CPU: {cpu_mem:.1f}GB, GPU: {gpu_mem:.1f}GB")

# Describe the expression matrix
n_cells, n_genes = adata.shape
print(f"AnnData shape: {adata.shape}")
print(f"Data type: {type(adata.X)}")
if hasattr(adata.X, 'nnz'):
    # Fraction of stored (non-zero) entries among all cells x genes positions
    density = adata.X.nnz / (n_cells * n_genes)
    print(f"Matrix density: {density:.2%}")

# Size of the matrix if it were stored densely as float32 (4 bytes per entry)
dense_size_gb = (n_cells * n_genes * 4) / (1024**3)
print(f"Estimated dense matrix size: {dense_size_gb:.1f}GB")

# Warn above 40GB to leave some headroom
if dense_size_gb > 40:
    print(
        "⚠️  WARNING: Dense matrix may be too large for available RAM!",
        "   Consider subsampling the data or using a machine with more RAM.",
        sep="\n",
    )

Before dataset creation - CPU: 4.3GB, GPU: 0.0GB
AnnData shape: (762795, 5000)
Data type: <class 'scipy.sparse._csr.csr_matrix'>
Matrix density: 10.23%
Estimated dense matrix size: 14.2GB


In [None]:
# Split settings: fixed seed for reproducibility, perturbation as the class label
random_state = 0
class_key = "perturbation"

# Build the train/val/test datasets, asking for CPU storage so that data is only moved
# to the GPU for training.
# NOTE(review): the saved GPU-memory summary further down lists three large CUDA tensors
# whose row counts add up to the full dataset — confirm that device="cpu" is honored.
train_dataset, val_dataset, test_dataset = get_classification_datasets(
    adata,
    class_key=class_key,
    random_state=random_state,
    device="cpu",
)

print("Dataset creation complete!")
print(f"Train: {len(train_dataset):,}, Val: {len(val_dataset):,}, Test: {len(test_dataset):,}")

# Memory snapshot once the datasets exist
cpu_mem, gpu_mem = get_memory_usage()
print(f"Memory after dataset creation - CPU: {cpu_mem:.1f}GB, GPU: {gpu_mem:.1f}GB")

In [9]:
def find_cuda_tensors():
    """Scan garbage-collector-tracked objects for tensors that live on a CUDA device.

    Only metadata is returned, so the caller never ends up holding references to the
    tensors themselves (or to every other live object).

    Returns:
        list[tuple]: One ``(type, shape, size_mb)`` entry per CUDA tensor found, where
        ``size_mb`` is ``nelement() * element_size() / 1024**2``. Tensors that share
        storage (e.g. views) are each counted separately.
    """
    found = []
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj) and obj.is_cuda:
                size_mb = obj.nelement() * obj.element_size() / 1024**2
                found.append((type(obj), obj.shape, size_mb))
        except Exception:
            # Some tracked objects raise on type/attribute inspection; they are not
            # tensors we can report on, so skip them.
            continue
    return found


# Debug GPU memory usage after dataset creation
if torch.cuda.is_available():
    print("GPU Memory Summary:")
    print(torch.cuda.memory_summary())
    print(f"Current GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f}GB")
    print(f"Peak GPU memory: {torch.cuda.max_memory_allocated() / 1024**3:.2f}GB")

    # Check if any tensors are on GPU (single pass over the tracked objects)
    cuda_tensor_info = find_cuda_tensors()
    total_gpu_memory = sum(size_mb for _, _, size_mb in cuda_tensor_info)

    print(f"Found {len(cuda_tensor_info)} GPU tensors")
    print(f"Total tensor memory: {total_gpu_memory/1024:.2f}GB")

    # Only show tensors > 100MB, largest first
    gpu_tensors = sorted(
        (entry for entry in cuda_tensor_info if entry[2] > 100),
        key=lambda entry: entry[2],
        reverse=True,
    )
    if gpu_tensors:
        print("Large GPU tensors:")
        for tensor_type, shape, size_mb in gpu_tensors[:5]:  # Show top 5
            tensor_info = (tensor_type, shape, f"{size_mb:.1f}MB")
            print(f"  {tensor_info}")

    # Release cached, unused CUDA blocks back to the driver
    torch.cuda.empty_cache()

GPU Memory Summary:
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  14554 MiB |  14554 MiB |  14554 MiB |      0 B   |
|       from large pool |  14553 MiB |  14553 MiB |  14553 MiB |      0 B   |
|       from small pool |      1 MiB |      1 MiB |      1 MiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |  14554 MiB |  14554 MiB |  14554 MiB |      0 B   |
|       from large pool |  14553 MiB |  14553 MiB |  14553 MiB |      0 B   |
|       from small pool |      1 MiB |      1 MiB |      1 MiB |      0 B   |
|-------------------------------------------


`torch.distributed.reduce_op` is deprecated, please use `torch.distributed.ReduceOp` instead



Found 6 GPU tensors
Total tensor memory: 14.21GB
Large GPU tensors:
  (<class 'torch.Tensor'>, torch.Size([533956, 5000]), '10184.4MB')
  (<class 'torch.Tensor'>, torch.Size([114419, 5000]), '2182.4MB')
  (<class 'torch.Tensor'>, torch.Size([114420, 5000]), '2182.4MB')


# Train model

In [None]:
# Model / optimisation hyperparameters
d_hidden = 258
n_epochs = 50
lr = 1e-5

# Move all three splits to the training device
train_dataset.set_device(DEVICE)
val_dataset.set_device(DEVICE)
test_dataset.set_device(DEVICE)

# Input and output sizes are read off the training split
n_genes = train_dataset.adata.shape[1]
n_classes = train_dataset.adata.obs[class_key].nunique()

# Batch size from the memory heuristic (15 GB budget, i.e. a T4 GPU)
optimal_batch_size = calculate_optimal_batch_size(
    n_samples=len(train_dataset),
    n_features=n_genes,
    available_gpu_ram_gb=15,
)
batch_size = optimal_batch_size

print(
    "Dataset info:",
    f"  - Samples: {len(train_dataset):,}",
    f"  - Features: {n_genes:,}",
    f"  - Classes: {n_classes}",
    f"  - Optimal batch size: {batch_size}",
    sep="\n",
)

# Memory snapshot now that the data has been moved to the device
cpu_mem, gpu_mem = get_memory_usage()
print(f"Memory after GPU setup - CPU: {cpu_mem:.1f}GB, GPU: {gpu_mem:.1f}GB")

NameError: name 'train_dataset' is not defined

In [None]:
# Assemble the model configuration from the dimensions and hyperparameters defined in the
# previous cell, then train the classifier on the train split with the val split for
# validation. fit() returns the train and validation loss histories used for plotting below.
cfg = Config(
    d_input=n_genes,
    d_hidden=d_hidden,
    d_output=n_classes,
    n_epochs=n_epochs,
    lr=lr,
    device=DEVICE,
    batch_size=batch_size,  # Use calculated optimal batch size
)
# model = ScBMLPRegressor(cfg, loss_fn="l1")
model = ScBMLPClassifier(cfg)
train_losses, val_losses = model.fit(train_dataset, val_dataset)

Training for 50 epochs: 100%|██████████| 50/50 [23:35<00:00, 28.31s/it, train_acc=0.7422, train_loss=0.5558, val_acc=0.7247, val_loss=0.6255]


In [None]:
# Check memory after training
cpu_mem, gpu_mem = get_memory_usage()
print(f"Memory after training - CPU: {cpu_mem:.1f}GB, GPU: {gpu_mem:.1f}GB")

# Clean up: collect unreachable Python objects first so that any CUDA memory they were
# holding becomes free, and only then ask PyTorch to release its cached, unused blocks.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

cpu_mem, gpu_mem = get_memory_usage()
print(f"Memory after cleanup - CPU: {cpu_mem:.1f}GB, GPU: {gpu_mem:.1f}GB")

In [None]:
# Long-format table (one row per epoch and split) so plotly draws one line per split
loss_df = pd.DataFrame({
    'Epoch': [*range(len(train_losses)), *range(len(val_losses))],
    'Loss': train_losses + val_losses,
    'Type': ['Train'] * len(train_losses) + ['Validation'] * len(val_losses),
})

loss_fig = px.line(
    loss_df,
    x='Epoch',
    y='Loss',
    color='Type',
    title='Training and Validation Loss',
    labels={'Loss': 'Loss', 'Epoch': 'Epoch'},
)
loss_fig.show()

# Weight interpretation

In [None]:
def get_marker_gene_lists(
    gene_names: np.ndarray,
    vecs: torch.Tensor,
    n_top_comps: int = 1,
    n_top_genes: int = 50,
) -> np.ndarray:
    """Extract marker genes optimized for GO analysis.

    For each of the first ``n_top_comps`` columns of ``vecs``, collects the genes with the
    ``n_top_genes`` largest weights ("top") and the ``n_top_genes`` smallest weights
    ("bottom").

    Args:
        gene_names: 1-D array of gene names, aligned with the rows of ``vecs``.
        vecs: 2-D weight matrix with one row per gene and one column per component.
            A torch tensor on any device (with or without autograd history) or anything
            ``torch.as_tensor`` accepts, such as a NumPy array.
        n_top_comps: Number of leading columns of ``vecs`` to process.
        n_top_genes: Number of genes to take from each end of a component.

    Returns:
        np.ndarray: Array indexed as ``[component, top/bottom, gene]`` containing gene names.
    """
    gene_names = np.asarray(gene_names)
    # Detach from autograd; index tensors are moved to host memory below because NumPy
    # arrays cannot be indexed with CUDA tensors.
    vecs = torch.as_tensor(vecs).detach()

    gene_lists = []
    for i in range(n_top_comps):
        weights = vecs[:, i]
        top_idxs = weights.topk(n_top_genes).indices.cpu().numpy()
        bottom_idxs = (-weights).topk(n_top_genes).indices.cpu().numpy()
        gene_lists.append([
            gene_names[top_idxs].tolist(),
            gene_names[bottom_idxs].tolist(),
        ])
    return np.array(gene_lists)

In [None]:
# Merge the train and validation splits into one AnnData for the interpretation plots
# (the test split is not included). Note that this rebinds the name `adata`.
adata = sc.concat([train_dataset.adata, val_dataset.adata])

## Top component hists

In [None]:
# Contract the three weight matrices over the hidden dimension to get one gene-by-gene
# interaction matrix per output:
#   b[out, in1, in2] = sum_hid w_p[out, hid] * w_l[hid, in1] * w_r[hid, in2]
# NOTE(review): this materializes a full [out, in, in] tensor, which grows with the number
# of classes times the square of the number of genes — consider building one class at a
# time if memory is tight.
b = einops.einsum(model.w_p, model.w_l, model.w_r, "out hid, hid in1, hid in2 -> out in1 in2")
b = 0.5 * (b + b.mT)  # symmetrize

In [None]:
for i in range(n_classes):
    # Eigendecompose this class's symmetric interaction matrix. eigh returns eigenvalues
    # in ascending order, so flipping the columns puts the largest-eigenvalue vector first.
    _, vecs = torch.linalg.eigh(b[i])
    vecs = vecs.flip([1])

    # The weights may live on the GPU and carry autograd history; NumPy conversion needs a
    # detached host tensor.
    top_vec = vecs[:, 0].detach().cpu().numpy()
    proj = adata.X @ top_vec  # project into component space

    hist_df = pd.DataFrame({
        "component": proj,
        class_key: adata.obs[class_key]
    })

    fig = px.histogram(
        hist_df,
        x="component",
        color=class_key,
        opacity=0.7,
        nbins=30,
        barmode="overlay",
        width=600,
        height=300,
        title=f"Class {i}"
    )
    fig.show()

## Bilinear

### Gene markers

In [None]:
# Show how class names map to integer labels, to interpret the class indices used below.
train_dataset.label_mapping

{'Anti-PD-L1+Chemo': 0, 'Chemo': 1, 'Pre-treatment': 2}

In [None]:
# Gene-by-gene interaction matrix for a single output: row 0 of w_p weights the hidden
# units, which are then contracted with w_l and w_r.
# NOTE(review): the output index 0 is hard-coded; which class it denotes depends on
# train_dataset.label_mapping.
q = einops.einsum(model.w_p[0], model.w_l, model.w_r, "hid, hid in1, hid in2 -> in1 in2")
q = 0.5 * (q + q.mT)  # symmetrize

In [None]:
# Eigendecompose to get gene module weights.
# eigh returns eigenvalues in ascending order; flipping the column order puts the
# eigenvector with the largest eigenvalue at column 0.
_, vecs_bmlp = torch.linalg.eigh(q)
vecs_bmlp = vecs_bmlp.flip([1])

In [None]:
# Get gene names per module (i.e. component; "comp"): for each of the first 3 components,
# the 50 genes with the largest and the 50 with the smallest eigenvector weights.
n_top_comps = 3
n_top_genes = 50
gene_names = adata.var_names.values
gene_lists_bmlp = get_marker_gene_lists(
    gene_names, vecs_bmlp, n_top_comps=n_top_comps, n_top_genes=n_top_genes
)  # [comp, top/bottom, gene]

In [None]:
# Preview the first 8 top and bottom marker genes of each component.
# NOTE(review): this cell appears three times verbatim in this section; the copies likely
# correspond to separate runs — consider keeping one.
for i in range(n_top_comps):
    print("="*20, "Component", i, "="*20)
    print(f"Top genes: {gene_lists_bmlp[i,0,:8]}...")
    print(f"Bottom genes: {gene_lists_bmlp[i,1,:8]}...")

Top genes: ['FDCSP' 'HTN1' 'DEFA3' 'FTH1' 'MT-ND4L' 'CCR7' 'S100A9' 'HSP90AA1']...
Bottom genes: ['AZGP1' 'CALML5' 'KRT19' 'KRT15' 'S100A1' 'PF4V1' 'ID4' 'KRT14']...
Top genes: ['MTRNR2L12' 'GZMH' 'NKG7' 'FP671120.3' 'IGKV3-11' 'RPL36' 'MALAT1'
 'FGFBP2']...
Bottom genes: ['DEFA3' 'ALB' 'IFI6' 'SOCS1' 'IFI44L' 'MYADM' 'CREM' 'TNFAIP3']...
Top genes: ['DEFA3' 'HBB' 'HIST1H1D' 'HIST1H1E' 'LYZ' 'GBP4' 'ALB' 'GZMB']...
Bottom genes: ['RPL41' 'CCL3L1' 'DNAJB1' 'HSP90AA1' 'RPS21' 'MALAT1' 'CCL4L2' 'MYOM2']...


In [None]:
# Preview the first 8 top and bottom marker genes of each component.
# NOTE(review): verbatim duplicate of the neighbouring preview cells in this section.
for i in range(n_top_comps):
    print("="*20, "Component", i, "="*20)
    print(f"Top genes: {gene_lists_bmlp[i,0,:8]}...")
    print(f"Bottom genes: {gene_lists_bmlp[i,1,:8]}...")

Top genes: ['FDCSP' 'HTN1' 'DEFA3' 'FTH1' 'MT-ND4L' 'CCR7' 'S100A9' 'HSP90AA1']...
Bottom genes: ['AZGP1' 'CALML5' 'KRT19' 'KRT15' 'S100A1' 'PF4V1' 'ID4' 'KRT14']...
Top genes: ['MTRNR2L12' 'GZMH' 'NKG7' 'FP671120.3' 'IGKV3-11' 'RPL36' 'MALAT1'
 'FGFBP2']...
Bottom genes: ['DEFA3' 'ALB' 'IFI6' 'SOCS1' 'IFI44L' 'MYADM' 'CREM' 'TNFAIP3']...
Top genes: ['DEFA3' 'HBB' 'HIST1H1D' 'HIST1H1E' 'LYZ' 'GBP4' 'ALB' 'GZMB']...
Bottom genes: ['RPL41' 'CCL3L1' 'DNAJB1' 'HSP90AA1' 'RPS21' 'MALAT1' 'CCL4L2' 'MYOM2']...


In [None]:
# Preview the first 8 top and bottom marker genes of each component.
# NOTE(review): verbatim duplicate of the neighbouring preview cells in this section.
for i in range(n_top_comps):
    print("="*20, "Component", i, "="*20)
    print(f"Top genes: {gene_lists_bmlp[i,0,:8]}...")
    print(f"Bottom genes: {gene_lists_bmlp[i,1,:8]}...")

Top genes: ['FDCSP' 'HTN1' 'DEFA3' 'FTH1' 'MT-ND4L' 'CCR7' 'S100A9' 'HSP90AA1']...
Bottom genes: ['AZGP1' 'CALML5' 'KRT19' 'KRT15' 'S100A1' 'PF4V1' 'ID4' 'KRT14']...
Top genes: ['MTRNR2L12' 'GZMH' 'NKG7' 'FP671120.3' 'IGKV3-11' 'RPL36' 'MALAT1'
 'FGFBP2']...
Bottom genes: ['DEFA3' 'ALB' 'IFI6' 'SOCS1' 'IFI44L' 'MYADM' 'CREM' 'TNFAIP3']...
Top genes: ['DEFA3' 'HBB' 'HIST1H1D' 'HIST1H1E' 'LYZ' 'GBP4' 'ALB' 'GZMB']...
Bottom genes: ['RPL41' 'CCL3L1' 'DNAJB1' 'HSP90AA1' 'RPS21' 'MALAT1' 'CCL4L2' 'MYOM2']...


### GO analysis

In [None]:
# Run Enrichr (via gseapy) on the top and bottom gene lists of every component and show
# the first `n_results` rows of each result table.
n_results = 5
results_cols = ["Term", "Genes", "Gene_set", "Adjusted P-value"]

for comp in range(n_top_comps):
    print("="*40, "Component", comp, "="*40)
    # i = 0 is the "top" gene list, i = 1 the "bottom" gene list of this component
    for i in range(2):
        # NOTE(review): per the gseapy docs `cutoff` appears to affect only the generated
        # figure, not `enr.results` — filter on "Adjusted P-value" explicitly if only
        # significant terms should be shown; confirm.
        enr = gp.enrichr(
            gene_list=gene_lists_bmlp[comp, i].tolist(),
            gene_sets=[
                "GO_Biological_Process_2023",
                # "KEGG_2021_Human",
                # "Reactome_2022"
            ],
            cutoff=0.05,
        )
        display(enr.results.head(n_results)[results_cols])



Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Defense Response To Bacterium (GO:0042742),DEFA3;HLA-A;LYZ;S100A9;S100A8;HTN1,GO_Biological_Process_2023,0.003989
1,Positive Regulation Of Alpha-Beta T Cell Activ...,HSPH1;HLA-A;CD55,GO_Biological_Process_2023,0.003989
2,Cytoplasmic Translation (GO:0002181),RPL41;RPS27;RPL10;RPS21,GO_Biological_Process_2023,0.01225
3,Intracellular Sequestering Of Iron Ion (GO:000...,FTH1;FTL,GO_Biological_Process_2023,0.01225
4,Antimicrobial Humoral Response (GO:0019730),DEFA3;HLA-A;LYZ;S100A9,GO_Biological_Process_2023,0.01225


Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Intermediate Filament Organization (GO:0045109),DSP;KRT19;KRT17;KRT16;KRT15;KRT14;KRT7,GO_Biological_Process_2023,1.496672e-07
1,Supramolecular Fiber Organization (GO:0097435),DSP;KRT19;KRT17;KRT16;KRT15;KRT14;KRT7,GO_Biological_Process_2023,0.001705037
2,Establishment Of Skin Barrier (GO:0061436),CLDN4;KRT16;SFN,GO_Biological_Process_2023,0.001705037
3,Skin Epidermis Development (GO:0098773),CLDN4;KRT16;SFN,GO_Biological_Process_2023,0.001705037
4,Epithelial Cell Differentiation (GO:0030855),KRT19;KRT17;KRT16;KRT15;KRT14,GO_Biological_Process_2023,0.001705037




Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Cytoplasmic Translation (GO:0002181),RPL41;RPS27;RPL10;RPS29;RPL36;RPS21,GO_Biological_Process_2023,2.9e-05
1,Peptide Biosynthetic Process (GO:0043043),RPL41;RPS27;RPL10;RPS29;RPL36;RPS21,GO_Biological_Process_2023,0.000333
2,Macromolecule Biosynthetic Process (GO:0009059),RPL41;RPS27;RPL10;RPS29;RPL36;RPS21,GO_Biological_Process_2023,0.000519
3,Gene Expression (GO:0010467),RPL41;RPS27;RPL10;RPS29;RPL36;GZMB;RPS21,GO_Biological_Process_2023,0.000531
4,Natural Killer Cell Mediated Immunity (GO:0002...,GZMB;NKG7;KLRD1,GO_Biological_Process_2023,0.001048


Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Defense Response To Virus (GO:0051607),RSAD2;IFI27;IFI6;IFIT1;USP18;IFI44L,GO_Biological_Process_2023,0.005267
1,Defense Response To Symbiont (GO:0140546),RSAD2;IFI27;IFI6;IFIT1;IFI44L,GO_Biological_Process_2023,0.011961
2,Regulation Of Very-Low-Density Lipoprotein Par...,APOA2;APOC3,GO_Biological_Process_2023,0.014467
3,Negative Regulation Of Lipase Activity (GO:006...,APOA2;APOC3,GO_Biological_Process_2023,0.030236
4,Negative Regulation Of Cholesterol Transport (...,APOA2;APOC3,GO_Biological_Process_2023,0.03875




Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Defense Response To Bacterium (GO:0042742),GNLY;DEFA3;LYZ;S100A9;GBP1;MPEG1;S100A8;GBP4;LTF,GO_Biological_Process_2023,1e-06
1,Retina Homeostasis (GO:0001895),AZGP1;ALB;LYZ;LTF,GO_Biological_Process_2023,0.000787
2,Antimicrobial Humoral Response (GO:0019730),GNLY;DEFA3;LYZ;S100A9;LTF,GO_Biological_Process_2023,0.000986
3,Cellular Response To Cytokine Stimulus (GO:007...,EGR1;HCK;CSF3R;LRRK2;MNDA;GBP1;GBP4,GO_Biological_Process_2023,0.001605
4,Antimicrobial Humoral Immune Response Mediated...,GNLY;DEFA3;S100A9;LTF,GO_Biological_Process_2023,0.002472


Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Cellular Response To Heat (GO:0034605),DNAJB1;HSP90AA1;BAG3;HSPA1B;HSPA1A,GO_Biological_Process_2023,2.2e-05
1,Negative Regulation Of Inclusion Body Assembly...,DNAJB1;HSPA1B;HSPA1A,GO_Biological_Process_2023,0.000259
2,Regulation Of Inclusion Body Assembly (GO:0090...,DNAJB1;HSPA1B;HSPA1A,GO_Biological_Process_2023,0.000259
3,Cytoplasmic Translation (GO:0002181),RPL41;RPS27;RPS29;RPL36;RPS21,GO_Biological_Process_2023,0.000389
4,Response To Unfolded Protein (GO:0006986),DNAJB1;HSP90AA1;HSPH1;HSPA1A,GO_Biological_Process_2023,0.000389


In [None]:
# Run Enrichr (via gseapy) on the top and bottom gene lists of every component and show
# the first `n_results` rows of each result table.
# NOTE(review): this cell is a verbatim copy of the other GO-analysis cells in this section.
n_results = 5
results_cols = ["Term", "Genes", "Gene_set", "Adjusted P-value"]

for comp in range(n_top_comps):
    print("="*40, "Component", comp, "="*40)
    # i = 0 is the "top" gene list, i = 1 the "bottom" gene list of this component
    for i in range(2):
        # NOTE(review): the tables saved below this cell include rows with adjusted
        # P-values above 0.05, so `cutoff` does not seem to filter `enr.results` — filter
        # explicitly if only significant terms should be shown.
        enr = gp.enrichr(
            gene_list=gene_lists_bmlp[comp, i].tolist(),
            gene_sets=[
                "GO_Biological_Process_2023",
                # "KEGG_2021_Human",
                # "Reactome_2022"
            ],
            cutoff=0.05,
        )
        display(enr.results.head(n_results)[results_cols])



Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Cytoplasmic Translation (GO:0002181),RPL41;RPS29;RPS21,GO_Biological_Process_2023,0.014878
1,Peptide Biosynthetic Process (GO:0043043),RPL41;RPS29;RPS21,GO_Biological_Process_2023,0.035472
2,Macromolecule Biosynthetic Process (GO:0009059),RPL41;RPS29;RPS21,GO_Biological_Process_2023,0.036257
3,Translation (GO:0006412),RPL41;RPS29;RPS21,GO_Biological_Process_2023,0.055233
4,Gene Expression (GO:0010467),RPL41;RPS29;RPS21,GO_Biological_Process_2023,0.070429


Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,"Positive Regulation Of CD8-positive, Alpha-Bet...",XCL1;HLA-A,GO_Biological_Process_2023,0.004903
1,Positive Regulation Of Alpha-Beta T Cell Proli...,XCL1;HLA-A,GO_Biological_Process_2023,0.008878
2,Regulation Of T Cell Cytokine Production (GO:0...,XCL1;HLA-A,GO_Biological_Process_2023,0.008878
3,Positive Regulation Of T Cell Cytokine Product...,XCL1;HLA-A,GO_Biological_Process_2023,0.009122
4,Positive Regulation Of T Cell Mediated Cytotox...,XCL1;HLA-A,GO_Biological_Process_2023,0.015181




Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Regulation Of Cell Death (GO:0010941),HBB;HBA1;CRYAB,GO_Biological_Process_2023,0.002585
1,Defense Response To Virus (GO:0051607),IFI27;IFI6;USP18;IFI44L,GO_Biological_Process_2023,0.002585
2,Oxygen Transport (GO:0015671),HBB;HBA1,GO_Biological_Process_2023,0.002585
3,Carbon Dioxide Transport (GO:0015670),HBB;HBA1,GO_Biological_Process_2023,0.002585
4,Gas Transport (GO:0015669),HBB;HBA1,GO_Biological_Process_2023,0.003939


Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Positive Regulation Of Viral Genome Replicatio...,CCL5;PKN2,GO_Biological_Process_2023,0.077153
1,Positive Regulation Of Viral Process (GO:0048524),CCL5;PKN2,GO_Biological_Process_2023,0.077153
2,Regulation Of Viral Genome Replication (GO:004...,CCL5;PKN2,GO_Biological_Process_2023,0.077153
3,"CD4-positive, Alpha-Beta T Cell Activation (GO...",NKG7,GO_Biological_Process_2023,0.077153
4,Natural Killer Cell Degranulation (GO:0043320),NKG7,GO_Biological_Process_2023,0.077153




Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Positive Regulation Of Nitric Oxide Biosynthet...,IFNG;HBB,GO_Biological_Process_2023,0.051863
1,Positive Regulation Of Nitric Oxide Metabolic ...,IFNG;HBB,GO_Biological_Process_2023,0.051863
2,Positive Regulation Of Ion Transmembrane Trans...,IFNG;STK39,GO_Biological_Process_2023,0.051863
3,Regulation Of Nitric Oxide Biosynthetic Proces...,IFNG;HBB,GO_Biological_Process_2023,0.054393
4,Positive Regulation Of Cell Death (GO:0010942),IFNG;HBB,GO_Biological_Process_2023,0.060024


Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Defense Response To Symbiont (GO:0140546),MX2;MX1;ISG15;IFI44L,GO_Biological_Process_2023,0.001476
1,Defense Response To Virus (GO:0051607),MX2;MX1;ISG15;IFI44L,GO_Biological_Process_2023,0.001476
2,Response To Type I Interferon (GO:0034340),MX1;ISG15,GO_Biological_Process_2023,0.001476
3,Response To Cytokine (GO:0034097),MX2;MX1;ISG15,GO_Biological_Process_2023,0.00817
4,B Cell Receptor Signaling Pathway (GO:0050853),IGHG1;IGKC,GO_Biological_Process_2023,0.021413


In [None]:
# Run Enrichr (via gseapy) on the top and bottom gene lists of every component and show
# the first `n_results` rows of each result table.
# NOTE(review): this cell is a verbatim copy of the other GO-analysis cells in this section.
n_results = 5
results_cols = ["Term", "Genes", "Gene_set", "Adjusted P-value"]

for comp in range(n_top_comps):
    print("="*40, "Component", comp, "="*40)
    # i = 0 is the "top" gene list, i = 1 the "bottom" gene list of this component
    for i in range(2):
        # NOTE(review): the tables saved below this cell include rows with adjusted
        # P-values above 0.05, so `cutoff` does not seem to filter `enr.results` — filter
        # explicitly if only significant terms should be shown.
        enr = gp.enrichr(
            gene_list=gene_lists_bmlp[comp, i].tolist(),
            gene_sets=[
                "GO_Biological_Process_2023",
                # "KEGG_2021_Human",
                # "Reactome_2022"
            ],
            cutoff=0.05,
        )
        display(enr.results.head(n_results)[results_cols])



Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Hydrogen Peroxide Catabolic Process (GO:0042744),HBA1;HBQ1,GO_Biological_Process_2023,0.018035
1,Regulation Of Vitamin D Biosynthetic Process (...,SNAI1,GO_Biological_Process_2023,0.073715
2,Positive Regulation Of Hemoglobin Biosynthetic...,SLC25A37,GO_Biological_Process_2023,0.073715
3,Negative Regulation Of Vitamin Metabolic Proce...,SNAI1,GO_Biological_Process_2023,0.073715
4,Negative Regulation Of Cellular Extravasation ...,ENC1,GO_Biological_Process_2023,0.073715


Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Antimicrobial Humoral Response (GO:0019730),PPBP;IGKV3-20,GO_Biological_Process_2023,0.07606
1,"Regulation Of CD8-positive, Alpha-Beta T Cell ...",SOCS1,GO_Biological_Process_2023,0.07606
2,"Negative Regulation Of CD8-positive, Alpha-Bet...",SOCS1,GO_Biological_Process_2023,0.07606
3,Folic Acid Transport (GO:0015884),FOLR3,GO_Biological_Process_2023,0.07606
4,"Positive Regulation Of CD4-positive, Alpha-Bet...",SOCS1,GO_Biological_Process_2023,0.07606




Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Antigen Processing And Presentation Of Peptide...,IFI30;B2M,GO_Biological_Process_2023,0.024762
1,Defense Response To Symbiont (GO:0140546),IFI27;MX1;IFI44L,GO_Biological_Process_2023,0.035493
2,Defense Response To Virus (GO:0051607),IFI27;MX1;IFI44L,GO_Biological_Process_2023,0.048221
3,Regulation Of Immune Response (GO:0050776),LAG3;FCGR1A,GO_Biological_Process_2023,0.064812
4,Inclusion Body Assembly (GO:0070841),UBD,GO_Biological_Process_2023,0.064812


Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Negative Regulation Of MAPK Cascade (GO:0043409),SPRY1;DUSP8;DUSP6,GO_Biological_Process_2023,0.019376
1,Negative Regulation Of ERK1 And ERK2 Cascade (...,SPRY1;DUSP6,GO_Biological_Process_2023,0.045098
2,Defense Response To Bacterium (GO:0042742),DEFA3;LYZ;HTN1,GO_Biological_Process_2023,0.045098
3,Response To Reactive Oxygen Species (GO:0000302),HBB;FOS,GO_Biological_Process_2023,0.063246
4,Defense Response To Gram-negative Bacterium (G...,DEFA3;LYZ,GO_Biological_Process_2023,0.065105




Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Negative Regulation Of Inclusion Body Assembly...,HSPA2;HSPA1B;HSPA1A,GO_Biological_Process_2023,1.3e-05
1,Regulation Of Inclusion Body Assembly (GO:0090...,HSPA2;HSPA1B;HSPA1A,GO_Biological_Process_2023,1.3e-05
2,Positive Regulation Of NF-kappaB Transcription...,S100A9;HSPA1B;S100A8;HSPA1A;LTF,GO_Biological_Process_2023,2.9e-05
3,Chaperone Cofactor-Dependent Protein Refolding...,HSPA2;HSPA1B;HSPA1A,GO_Biological_Process_2023,0.000155
4,Defense Response To Fungus (GO:0050832),S100A9;S100A8;LTF,GO_Biological_Process_2023,0.000155


Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Central Nervous System Development (GO:0007417),ZBTB16;DDIT4;PBX4,GO_Biological_Process_2023,0.098914
1,Antigen Processing And Presentation Of Endogen...,ERAP2,GO_Biological_Process_2023,0.098914
2,Neurotrophin TRK Receptor Signaling Pathway (G...,DDIT4,GO_Biological_Process_2023,0.098914
3,Deadenylation-Dependent Decapping Of Nuclear-T...,PATL2,GO_Biological_Process_2023,0.098914
4,Regulation Of Alpha-Beta T Cell Differentiatio...,PRDM1,GO_Biological_Process_2023,0.098914


## PCA

### Gene markers

In [None]:
# Gene-by-gene matrix X^T X, converted from sparse to dense and moved to DEVICE.
# NOTE(review): no mean-centering of X is visible in this notebook, so this may be an
# uncentered second-moment matrix rather than a covariance — confirm this is what the
# "PCA" comparison intends.
cov = adata.X.T @ adata.X
cov = torch.tensor(cov.toarray()).to(DEVICE)

In [None]:
# Eigendecompose to get gene module weights.
# eigh returns eigenvalues in ascending order; flipping the column order puts the
# eigenvector with the largest eigenvalue at column 0.
_, vecs_cov = torch.linalg.eigh(cov)
vecs_cov = vecs_cov.flip([1])

In [None]:
# Get gene names per module (i.e. component; "comp") for the PCA baseline.
# NOTE(review): n_top_genes is 20 here but 50 in the bilinear section, so the two sets of
# enrichment results are computed on gene lists of different length — confirm intended.
n_top_comps = 3
n_top_genes = 20
gene_names = adata.var_names.values
gene_lists_cov = get_marker_gene_lists(
    gene_names, vecs_cov, n_top_comps=n_top_comps, n_top_genes=n_top_genes
)  # [comp, top/bottom, gene]

In [None]:
# Preview the first 8 top and bottom marker genes of each PCA component.
for i in range(n_top_comps):
    print("="*20, "Component", i, "="*20)
    print(f"Top genes: {gene_lists_cov[i,0,:8]}...")
    print(f"Bottom genes: {gene_lists_cov[i,1,:8]}...")

Top genes: ['AP000439.3' 'LGALS7B' 'SHC3' 'AP000695.2' 'YBX2' 'LINC00887' 'KCNIP1'
 'LGALS7']...
Bottom genes: ['MALAT1' 'B2M' 'RPS27' 'RPL10' 'RPS12' 'MT-CYB' 'RPS21' 'HLA-A']...
Top genes: ['NKG7' 'CCL5' 'IFITM1' 'GNLY' 'RPS27' 'GZMH' 'RPS29' 'GZMB']...
Bottom genes: ['CD74' 'HLA-DRA' 'LYZ' 'HLA-DRB1' 'FTL' 'S100A9' 'S100A8' 'FTH1']...
Top genes: ['S100A4' 'S100A9' 'S100A8' 'GNLY' 'LYZ' 'FTL' 'NKG7' 'FCN1']...
Bottom genes: ['HSP90AA1' 'DNAJB1' 'HSPA1A' 'HSPA1B' 'NR4A2' 'DUSP2' 'HSPH1' 'CREM']...


### GO analysis

In [None]:
# Run Enrichr (via gseapy) on the top and bottom gene lists of every PCA component and show
# the first `n_results` rows of each result table.
# NOTE(review): this cell queries Reactome_2022, whereas the bilinear section queries
# GO_Biological_Process_2023 — the two sets of results are not directly comparable.
n_results = 5
results_cols = ["Term", "Genes", "Gene_set", "Adjusted P-value"]

for comp in range(n_top_comps):
    print("="*40, "Component", comp, "="*40)
    # i = 0 is the "top" gene list, i = 1 the "bottom" gene list of this component
    for i in range(2):
        # NOTE(review): the tables saved below this cell include rows with adjusted
        # P-values above 0.05, so `cutoff` does not seem to filter `enr.results` — filter
        # explicitly if only significant terms should be shown.
        enr = gp.enrichr(
            gene_list=gene_lists_cov[comp, i].tolist(),
            gene_sets=[
                # "GO_Biological_Process_2023",
                # "KEGG_2021_Human",
                "Reactome_2022"
            ],
            cutoff=0.05,
        )
        display(enr.results.head(n_results)[results_cols])



Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Phase 1 - Inactivation Of Fast Na+ Channels R-...,KCNIP1,Reactome_2022,0.172075
1,Signaling To RAS R-HSA-167044,SHC3,Reactome_2022,0.172075
2,Interleukin-20 Family Signaling R-HSA-8854691,IL20,Reactome_2022,0.172075
3,Signaling To ERKs R-HSA-187687,SHC3,Reactome_2022,0.172075
4,NCAM1 Interactions R-HSA-419037,COL4A5,Reactome_2022,0.172075


Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Influenza Viral RNA Transcription And Replicat...,RPL41;HSP90AA1;RPS27;RPS29;RPL36;RPS21;RPS12,Reactome_2022,6.382606e-09
1,Influenza Infection R-HSA-168255,RPL41;HSP90AA1;RPS27;RPS29;RPL36;RPS21;RPS12,Reactome_2022,6.382606e-09
2,Peptide Chain Elongation R-HSA-156902,RPL41;RPS27;RPS29;RPL36;RPS21;RPS12,Reactome_2022,6.382606e-09
3,Selenocysteine Synthesis R-HSA-2408557,RPL41;RPS27;RPS29;RPL36;RPS21;RPS12,Reactome_2022,6.382606e-09
4,Viral mRNA Translation R-HSA-192823,RPL41;RPS27;RPS29;RPL36;RPS21;RPS12,Reactome_2022,6.382606e-09




Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Peptide Chain Elongation R-HSA-156902,RPL41;RPS27;RPS29;RPL36;RPS21;RPS12,Reactome_2022,5.563747e-09
1,Selenocysteine Synthesis R-HSA-2408557,RPL41;RPS27;RPS29;RPL36;RPS21;RPS12,Reactome_2022,5.563747e-09
2,Viral mRNA Translation R-HSA-192823,RPL41;RPS27;RPS29;RPL36;RPS21;RPS12,Reactome_2022,5.563747e-09
3,Eukaryotic Translation Elongation R-HSA-156842,RPL41;RPS27;RPS29;RPL36;RPS21;RPS12,Reactome_2022,5.563747e-09
4,Eukaryotic Translation Termination R-HSA-72764,RPL41;RPS27;RPS29;RPL36;RPS21;RPS12,Reactome_2022,5.563747e-09


Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Immune System R-HSA-168256,IFITM3;FCN1;CD74;GRN;SERPINA1;FOS;LYZ;FTH1;HLA...,Reactome_2022,1.353797e-08
1,Neutrophil Degranulation R-HSA-6798695,FCN1;GRN;SERPINA1;FTH1;LYZ;CD68;S100A9;S100A8;FTL,Reactome_2022,1.353797e-08
2,Innate Immune System R-HSA-168249,FCN1;GRN;SERPINA1;FTH1;FOS;LYZ;CD68;S100A9;S10...,Reactome_2022,5.255598e-07
3,Metal Sequestration By Antimicrobial Proteins ...,S100A9;S100A8,Reactome_2022,0.000369608
4,Antimicrobial Peptides R-HSA-6803157,LYZ;S100A9;S100A8,Reactome_2022,0.001454456




Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Peptide Chain Elongation R-HSA-156902,RPL41;RPS29;RPL36;RPS21,Reactome_2022,3.2e-05
1,Antimicrobial Peptides R-HSA-6803157,GNLY;LYZ;S100A9;S100A8,Reactome_2022,3.2e-05
2,Selenocysteine Synthesis R-HSA-2408557,RPL41;RPS29;RPL36;RPS21,Reactome_2022,3.2e-05
3,Viral mRNA Translation R-HSA-192823,RPL41;RPS29;RPL36;RPS21,Reactome_2022,3.2e-05
4,Eukaryotic Translation Elongation R-HSA-156842,RPL41;RPS29;RPL36;RPS21,Reactome_2022,3.2e-05


Unnamed: 0,Term,Genes,Gene_set,Adjusted P-value
0,Attenuation Phase R-HSA-3371568,DNAJB1;HSP90AA1;HSPH1;HSPA1B;HSPA1A,Reactome_2022,6.428982e-09
1,HSF1 Activation R-HSA-3371511,DNAJB1;HSP90AA1;HSPH1;HSPA1B;HSPA1A,Reactome_2022,6.428982e-09
2,HSF1-dependent Transactivation R-HSA-3371571,DNAJB1;HSP90AA1;HSPH1;HSPA1B;HSPA1A,Reactome_2022,1.354689e-08
3,Cellular Response To Heat Stress R-HSA-3371556,DNAJB1;HSP90AA1;HSPH1;HSPA1B;HSPA1A,Reactome_2022,1.853195e-06
4,HSP90 Chaperone Cycle For Steroid Hormone Rece...,DNAJB1;HSP90AA1;HSPA1B;HSPA1A,Reactome_2022,1.98443e-06
