In [1]:
import os
import sys
import gc

import scipy.stats as stats
import scvi
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData
from fast_matrix_market import mmread

import scipy
from scipy.spatial.distance import jensenshannon
from scipy.stats import pearsonr

import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams

import torch
import cell2location

# Matplotlib configuration: write PDF text with font type 42 so that text in
# saved PDF figures is plotted correctly.
rcParams.update({'pdf.fonttype': 42})

  self.seed = seed
  self.dl_pin_memory_gpu_training = (
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder holding the saved reference-regression model and its AnnData.
ref_run_name = '/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/larger_ref/cell2location/reference_signatures'

# Folder (and run sub-folder) where the spatial mapping results are written.
results_folder = '/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/larger_ref/cell2location/mixed'
run_name = f'{results_folder}/cell2location_map'

# Read the reference AnnData, then restore the regression model stored in the
# same folder against it.
adata_file = f"{ref_run_name}/reference_major.h5ad"
adata_ref = sc.read_h5ad(adata_file)
mod = cell2location.models.RegressionModel.load(ref_run_name, adata_ref)

[34mINFO    [0m File                                                                                                      
         [35m/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/larger_ref/cell2location/referenc[0m
         [35me_signatures/[0m[95mmodel.pt[0m already downloaded                                                                  


  model = torch.load(model_path, map_location=map_location)
CUDA backend failed to initialize: Unable to load cuDNN. Is it installed? (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
You are using a CUDA device ('NVIDIA A100-PCIE-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 1/523:   0%|          | 1/523 [00:01<13:44,  1.58s/it, v_num=1]

`Trainer.fit` stopped: `max_steps=1` reached.


Epoch 1/523:   0%|          | 1/523 [00:01<15:15,  1.75s/it, v_num=1]


In [None]:
# Directories and paths
input_folder  = "/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/larger_ref/replicates/mixed/h5ad_objects"
output_folder = "/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/larger_ref/cell2location/mixed"

# Prepare reference signatures: one column per reference factor (cell type),
# read from `varm` when present and from `var` otherwise.
ref_factor_names = adata_ref.uns['mod']['factor_names']
signature_columns = [f'means_per_cluster_mu_fg_{i}' for i in ref_factor_names]
if 'means_per_cluster_mu_fg' in adata_ref.varm.keys():
    inf_aver = adata_ref.varm['means_per_cluster_mu_fg'][signature_columns].copy()
else:
    inf_aver = adata_ref.var[signature_columns].copy()
inf_aver.columns = ref_factor_names

# Iterate over all replicates. `os.listdir` returns entries in arbitrary order,
# so sort for a reproducible processing order.
replicates = sorted(f for f in os.listdir(input_folder) if f.endswith("GEX.h5ad"))
if not replicates:
    raise FileNotFoundError(f"No '*GEX.h5ad' replicate files found in {input_folder}")

# scanpy writes `save=` figures into its global `figdir`; remember the current
# value so it can be restored after each replicate's plots are written.
original_figdir = sc.settings.figdir

for replicate_name in replicates:
    replicate_path = os.path.join(input_folder, replicate_name)
    # e.g. "Wu_rep_0_GEX.h5ad" -> ".../cell2location_map_0"
    run_name = os.path.join(output_folder, f"cell2location_map_{replicate_name.split('_')[2]}")

    # Load Visium query dataset (replicate)
    adata_vis_0 = sc.read_h5ad(replicate_path)

    # Prepare `adata_vis_0` for cell2location: invert log1p and round to integers.
    # NOTE(review): assumes `X` holds log1p-transformed values -- confirm against
    # how the replicates were simulated/exported.
    adata_vis_0.X = np.expm1(adata_vis_0.X).round()

    # Find shared genes and subset BOTH objects with the same ordered gene list.
    # Indexing by name reorders the expression columns together with their
    # labels; merely re-assigning sorted `var_names` would detach counts from
    # their genes. A per-replicate copy keeps `inf_aver` intact for later
    # replicates.
    intersect = np.intersect1d(adata_vis_0.var_names, inf_aver.index)
    if len(intersect) == 0:
        raise ValueError(f"No genes shared between {replicate_name} and the reference signatures")
    adata_vis_0 = adata_vis_0[:, intersect].copy()
    inf_aver_rep = inf_aver.loc[intersect, :].copy()

    # Setup cell2location
    cell2location.models.Cell2location.setup_anndata(adata=adata_vis_0)

    # Create and train the model
    mod = cell2location.models.Cell2location(
        adata_vis_0,
        cell_state_df=inf_aver_rep,
        N_cells_per_location=5,
        detection_alpha=200,
    )
    # NOTE(review): `use_gpu` appears to be deprecated in newer scvi-tools
    # releases (the captured logs show a warning originating from device
    # parsing) -- confirm against the installed version before changing it.
    mod.train(max_epochs=30000, batch_size=None, train_size=1, use_gpu=True)

    # Export posterior and save results
    adata_vis_0 = mod.export_posterior(
        adata_vis_0, sample_kwargs={'num_samples': 3000, 'batch_size': mod.adata.n_obs, 'use_gpu': True}
    )
    mod.save(f"{run_name}", overwrite=True)

    # Compute expected expression per cell type
    expected_dict = mod.module.model.compute_expected_per_cell_type(
        mod.samples["post_sample_q05"], mod.adata_manager
    )

    # Add to anndata layers
    for i, n in enumerate(mod.factor_names_):
        adata_vis_0.layers[n] = expected_dict['mu'][i]

    # Save AnnData with results
    adata_file = f"{run_name}/sp.h5ad"
    adata_vis_0.write(adata_file)

    # Add cell abundance quantile (q05) to `obs` so it can be plotted by name
    factor_names = list(adata_vis_0.uns['mod']['factor_names'])
    adata_vis_0.obs[factor_names] = adata_vis_0.obsm['q05_cell_abundance_w_sf']

    # Save proportions.
    # NOTE(review): proportions are derived from the q95 abundance while `obs`
    # and the plots use q05 -- confirm that this difference is intentional.
    abundance = adata_vis_0.obsm['q95_cell_abundance_w_sf']
    proportions = abundance.div(abundance.sum(axis=1), axis=0)
    # Label columns with the factor names directly; splitting the column name
    # on '_' would truncate any cell type whose name contains an underscore.
    proportions.columns = factor_names
    # Name the index before resetting so the column is called 'spot' whether or
    # not the obs index already carries a name.
    proportions = proportions.rename_axis('spot').reset_index()
    proportions.to_csv(f"{run_name}/cell2loc_deconv_predictions.csv", index=False)

    # Export layers to CSV
    layers_output_dir = f"{run_name}/layers"
    os.makedirs(layers_output_dir, exist_ok=True)
    for layer_name in adata_vis_0.layers.keys():
        layer_values = adata_vis_0.layers[layer_name]
        # Layers may be sparse or already dense; only sparse ones expose `toarray`.
        layer_data = layer_values.toarray() if hasattr(layer_values, "toarray") else np.asarray(layer_values)
        df = pd.DataFrame(layer_data, index=adata_vis_0.obs.index, columns=adata_vis_0.var.index)
        df.to_csv(f"{layers_output_dir}/{layer_name}_layer.csv")

    # Generate and save plots
    fig_dir = f"{run_name}/plots"
    os.makedirs(fig_dir, exist_ok=True)

    # Extract a simplified replicate name
    simplified_name = os.path.splitext(os.path.basename(replicate_name))[0]

    # Route scanpy's `save=` output into this replicate's plot folder.
    sc.settings.figdir = fig_dir
    try:
        # Overview of all cell types on the spatial coordinates
        sc.pl.embedding(
            adata_vis_0,
            basis="spatial",
            color=factor_names,
            cmap="magma",
            ncols=2,
            show=False,
            save=f"umap_{simplified_name}.pdf",
        )

        # Spatial abundance visualization, one figure per cell type
        for ct in factor_names:
            sc.pl.spatial(
                adata_vis_0,
                color=[ct],
                spot_size=1,
                cmap="magma",
                size=1.3,
                show=False,
                save=f"{simplified_name}_{ct}_abundance.pdf",
            )
    finally:
        # Restore the global setting and release the figures created with
        # `show=False`, which would otherwise stay open across replicates.
        sc.settings.figdir = original_figdir
        plt.close('all')

    print(f"Figures and results saved for {simplified_name}")

  accelerator, lightning_devices, device = parse_device_args(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
  rank_zero_warn(


Epoch 30000/30000: 100%|██████████| 30000/30000 [31:10<00:00, 15.98it/s, v_num=1, elbo_train=1.98e+7]

`Trainer.fit` stopped: `max_epochs=30000` reached.


Epoch 30000/30000: 100%|██████████| 30000/30000 [31:10<00:00, 16.04it/s, v_num=1, elbo_train=1.98e+7]


  _, _, device = parse_device_args(


Sampling local variables, batch: 100%|██████████| 1/1 [00:41<00:00, 41.76s/it]
Sampling global variables, sample: 100%|██████████| 2999/2999 [00:42<00:00, 70.49it/s]


  x = um.multiply(x, x, out=x)


Figures and results saved for Wu_rep_0_GEX


  accelerator, lightning_devices, device = parse_device_args(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
  rank_zero_warn(


Epoch 30000/30000: 100%|██████████| 30000/30000 [31:15<00:00, 15.87it/s, v_num=1, elbo_train=1.92e+7]

`Trainer.fit` stopped: `max_epochs=30000` reached.


Epoch 30000/30000: 100%|██████████| 30000/30000 [31:15<00:00, 16.00it/s, v_num=1, elbo_train=1.92e+7]


  _, _, device = parse_device_args(


Sampling local variables, batch: 100%|██████████| 1/1 [00:40<00:00, 40.97s/it]
Sampling global variables, sample: 100%|██████████| 2999/2999 [00:43<00:00, 69.73it/s]
Figures and results saved for Wu_rep_1_GEX


  accelerator, lightning_devices, device = parse_device_args(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
  rank_zero_warn(


Sampling global variables, sample: 100%|██████████| 2999/2999 [00:42<00:00, 69.86it/s]_train=1.92e+7]
Figures and results saved for Wu_rep_2_GEX


  accelerator, lightning_devices, device = parse_device_args(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
  rank_zero_warn(


Epoch 1058/30000:   4%|▎         | 1058/30000 [01:05<29:45, 16.21it/s, v_num=1, elbo_train=6.5e+8] 

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 30000/30000: 100%|██████████| 30000/30000 [31:15<00:00, 15.97it/s, v_num=1, elbo_train=1.93e+7]

`Trainer.fit` stopped: `max_epochs=30000` reached.


Epoch 30000/30000: 100%|██████████| 30000/30000 [31:15<00:00, 16.00it/s, v_num=1, elbo_train=1.93e+7]


  _, _, device = parse_device_args(


Sampling local variables, batch: 100%|██████████| 1/1 [00:40<00:00, 40.80s/it]
Sampling global variables, sample: 100%|██████████| 2999/2999 [00:42<00:00, 70.03it/s]
Figures and results saved for Wu_rep_3_GEX


  accelerator, lightning_devices, device = parse_device_args(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
  rank_zero_warn(


Epoch 30000/30000: 100%|██████████| 30000/30000 [30:54<00:00, 15.80it/s, v_num=1, elbo_train=1.93e+7] 

`Trainer.fit` stopped: `max_epochs=30000` reached.


Epoch 30000/30000: 100%|██████████| 30000/30000 [30:54<00:00, 16.18it/s, v_num=1, elbo_train=1.93e+7]


  _, _, device = parse_device_args(


Sampling local variables, batch:   0%|          | 0/1 [00:00<?, ?it/s]