In [None]:
import os
import matplotlib as plt
import numpy as np
import scanpy as sc
import cell2location
from cell2location.utils.filtering import filter_genes
from cell2location.models import RegressionModel

In [None]:
# Report whether the PyTorch MPS fallback switch is set for this kernel.
# To turn it on, export PYTORCH_ENABLE_MPS_FALLBACK=1 before starting Jupyter, or run
#     os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
# NOTE(review): presumably this must happen before torch is first imported — confirm.
# `.get` is used so an unset variable is reported as "<unset>" instead of raising KeyError.
mps_fallback = os.environ.get("PYTORCH_ENABLE_MPS_FALLBACK", "<unset>")
mps_fallback

In [None]:
# Display Python's operating-system identifier for the machine running this kernel.
os.name

In [None]:
# Kevin's Macbook
# Annotated single-cell reference object (h5ad) read by the loading cell below.
hamstring_h5ad = (
    "/Users/kevin/Nexus365/Jolet Mimpen - CZI - Tendon Seed Network/"
    "Manuscripts/Hamstring paper/cell2loc/"
    "hamstring_integrated_annotated_res0p15_20220922.h5ad"
)
# Root output folder; each model gets its own sub-folder underneath it:
# one for the reference regression run, one for the cell2location mapping run.
RESULTS_FOLDERNAME = '/Users/kevin/git/kevinrue/tendonhca/003-snakemake/notebooks/hamstring-gpu'
ref_run_name = RESULTS_FOLDERNAME + '/reference_signatures'
run_name = RESULTS_FOLDERNAME + '/cell2location_map'

In [None]:
# JADE
# NOTE(review): the defaults below are identical to the laptop paths in the previous cell,
# so this cell only changes anything when an override is set. On another machine, export
# HAMSTRING_H5AD and/or RESULTS_FOLDERNAME before starting the kernel instead of editing
# the notebook. With neither variable set, the values are exactly what they were before.
hamstring_h5ad = os.environ.get(
    "HAMSTRING_H5AD",
    "/Users/kevin/Nexus365/Jolet Mimpen - CZI - Tendon Seed Network/Manuscripts/Hamstring paper/cell2loc/hamstring_integrated_annotated_res0p15_20220922.h5ad",
)
# create paths and names to results folders for reference regression and cell2location models
RESULTS_FOLDERNAME = os.environ.get(
    "RESULTS_FOLDERNAME",
    '/Users/kevin/git/kevinrue/tendonhca/003-snakemake/notebooks/hamstring-gpu',
)
ref_run_name = f'{RESULTS_FOLDERNAME}/reference_signatures'
run_name = f'{RESULTS_FOLDERNAME}/cell2location_map'

In [None]:
# Load the annotated single-cell reference from the path configured above.
adata_hamstring = sc.read(hamstring_h5ad)
# Show the object's summary as the cell output.
adata_hamstring

The model setup further down expects a 'counts' layer, which this object does not have.
Here, I assign `.X` to the 'counts' layer.

In [None]:
# The regression-model setup below reads layer "counts", which this object lacks,
# so expose .X under that name. Copy it so the layer does not alias .X: otherwise any
# later in-place change to .X would silently change "counts" as well.
# NOTE(review): this assumes .X holds raw (un-normalised) counts — confirm for this file.
adata_hamstring.layers["counts"] = adata_hamstring.X.copy()

In [None]:
# Number of cells per value of 'cell_type' (the column used as labels_key in the model setup).
adata_hamstring.obs['cell_type'].value_counts()

In [None]:
# Inspect the gene-level table before gene filtering.
adata_hamstring.var

In [None]:
# Select the genes to keep for signature estimation.
# `filter_genes` is already imported in the first cell, so it is not re-imported here.
# cell_percentage_cutoff2 is not passed, so the library default applies
# (an explicit 0.03 override was commented out in the original call).
selected = filter_genes(
    adata_hamstring,
    cell_count_cutoff=30,
    nonz_mean_cutoff=1.12,
)

In [None]:
# filter the object
# Keep only the genes in `selected`; .copy() materialises the subset as its own object.
# Note that this rebinds `adata_hamstring`, so the unfiltered object is no longer reachable.
adata_hamstring = adata_hamstring[:, selected].copy()
# Show the gene table of the filtered object.
adata_hamstring.var

In [None]:
# Inspect the per-cell metadata; the 'sample', 'batch', 'sex' and 'cell_type' columns
# are the ones referenced by the regression-model setup.
adata_hamstring.obs

In [None]:
# Cell counts per level of one metadata column; only 'sex' is active,
# the 'batch' and 'sample' variants are left commented out.
#adata_hamstring.obs['batch'].value_counts()
adata_hamstring.obs['sex'].value_counts()
#adata_hamstring.obs['sample'].value_counts()

In [None]:
# List the layers on the filtered object; the model setup reads layer "counts".
adata_hamstring.layers

In [None]:
# prepare anndata for the regression model
# Register the reference AnnData with the regression model.
# `RegressionModel` comes from the import cell at the top, so it is used directly here
# instead of being re-imported or spelled out via `cell2location.models`.
RegressionModel.setup_anndata(
    adata=adata_hamstring,
    # layer holding the expression values the model is fitted on
    layer="counts",
    # 10X reaction / sample / batch
    batch_key='sample',
    # cell type, covariate used for constructing signatures
    labels_key='cell_type',
    # multiplicative technical effects (platform, 3' vs 5', donor effect)
    categorical_covariate_keys=['batch', 'sex'],
)

# create the regression model
mod = RegressionModel(adata_hamstring)

# view anndata_setup as a sanity check
mod.view_anndata_setup()

In [None]:
%%time
# Train the reference regression model for 250 epochs, requesting the GPU;
# the %%time cell magic reports how long the cell took.
# NOTE(review): `use_gpu` may be deprecated or removed in newer scvi-tools releases —
# confirm against the installed version.
mod.train(max_epochs=250, use_gpu=True)

In [None]:
# Plot the training history of the reference model.
# NOTE(review): the argument 20 presumably hides the first 20 epochs — confirm.
mod.plot_history(20)

In [None]:
# Summarise the posterior distribution of the trained reference model and store the
# summary on the AnnData object (5000 samples, drawn in batches of 2500, on the GPU).
adata_hamstring = mod.export_posterior(
    adata_hamstring,
    sample_kwargs=dict(num_samples=5000, batch_size=2500, use_gpu=True),
)

# Persist the trained model in the reference-signature run folder, replacing any earlier save
mod.save(ref_run_name, overwrite=True)

# Write the AnnData, now carrying the posterior summary, into the same folder
adata_file = f"{ref_run_name}/sc.h5ad"
adata_hamstring.write(adata_file)
# Echo the output path as the cell result
adata_file

In [None]:
# Draw the QC plots for the trained reference model.
mod.plot_QC()

In [None]:
# Optional: reload the saved reference AnnData and model from `ref_run_name`
# instead of retraining (uncomment the three lines below to use).
# adata_file = f"{ref_run_name}/sc.h5ad"
# adata_ref = sc.read_h5ad(adata_file)
# mod = cell2location.models.RegressionModel.load(f"{ref_run_name}", adata_ref)

In [None]:
# export estimated expression in each cluster
if 'means_per_cluster_mu_fg' in adata_hamstring.varm.keys():
    inf_aver = adata_hamstring.varm['means_per_cluster_mu_fg'][[f'means_per_cluster_mu_fg_{i}'
                                    for i in adata_hamstring.uns['mod']['factor_names']]].copy()
else:
    inf_aver = adata_hamstring.var[[f'means_per_cluster_mu_fg_{i}'
                                    for i in adata_hamstring.uns['mod']['factor_names']]].copy()
inf_aver.columns = adata_hamstring.uns['mod']['factor_names']
inf_aver

# Cell2Location Spatial Mapping

In [None]:
# Load the AnnData to be mapped (presumably the concatenated spatial samples — confirm).
# The path is relative to the notebook's working directory.
adata = sc.read_h5ad("../data/concatenated.h5ad")
# Show the object's summary as the cell output.
adata

In [None]:
# find shared genes and subset both anndata and reference signatures
intersect = np.intersect1d(adata.var_names, inf_aver.index)
adata = adata[:, intersect].copy()
inf_aver = inf_aver.loc[intersect, :].copy()

# prepare anndata for cell2location model
cell2location.models.Cell2location.setup_anndata(adata=adata, batch_key="sample")

In [None]:
# Summary of the object after subsetting to the shared genes and registering it with cell2location.
adata

In [None]:
# Check whether the object carries a .raw attribute.
adata.raw

In [None]:
# create and train the model
mod2 = cell2location.models.Cell2location(
    adata, 
    cell_state_df=inf_aver,
    # the expected average cell abundance: tissue-dependent
    # hyper-prior which can be estimated from paired histology:
    N_cells_per_location=17,
    # hyperparameter controlling normalisation of
    # within-experiment variation in RNA detection:
    detection_alpha=20
)
mod2.view_anndata_setup()

In [None]:
# torch is imported here, at first use, rather than in the import cell at the top.
import torch
# Select the 'high' float32 matmul precision setting before the training cell runs.
torch.set_float32_matmul_precision('high')

In [None]:
# The import cell binds `plt` to the top-level `matplotlib` package, where `plt.legend`
# is not pyplot's legend function; rebind `plt` to pyplot so the legend call below works.
import matplotlib.pyplot as plt

# Train the cell2location mapping model.
mod2.train(
    max_epochs=15000,
    # train using full data (batch_size=None)
    batch_size=None,
    # use all data points in training because
    # we need to estimate cell abundance at all locations
    train_size=1,
    use_gpu=True,
    log_every_n_steps=1,
)

# plot ELBO loss history during training, removing first 1000 epochs from the plot
mod2.plot_history(1000)
plt.legend(labels=['full data training']);

In [None]:
# (empty cell: a stray "|" was removed here because it is a SyntaxError under Restart & Run All)