# Import libraries and setup

In [None]:
# Import libraries we may need
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
import scanpy.external as sce
import wget
import yaml
import wget
import astir
import dill
import umap.umap_ as umap
reducer = umap.UMAP()
import statsmodels as sm
import anndata as ad

#Packages to do Elbow analysis
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

#To display multiple dataframes in same output
from IPython.display import display

#Loading bar functionality for long tasks
from tqdm import tqdm

#For multiprocessing
from functools import partial
from multiprocessing import Pool

#import imc_import
#import utils
#import pop_id


In [None]:
# Set up output figure settings
# Global matplotlib figure size in inches (width, height); increase the values for larger figures.
# NOTE(review): sc.set_figure_params() below appears to reset 'figure.figsize' to scanpy's own
# default unless figsize= is passed to it - confirm that this setting actually takes effect.
plt.rcParams['figure.figsize']=(64,64)

# Set up scanpy settings
sc.settings.verbosity = 3
sc.set_figure_params(dpi=100, dpi_save=300) # Increase DPI for better resolution figures
#sc.logging.print_versions()

# Save / restore environment
Use these cells to save or load the session variables; otherwise you will lose all objects when you close and reopen a Jupyter session!

In [None]:
#load the session
dill.load_session('DC_ASTIR_Pf_spleen_Malawi.db')

In [None]:
#save the session
dill.dump_session('DC_ASTIR_Pf_spleen_Malawi.db')

# Load/save anndata object

Save adata object
This will save the adata object in the current directory as a file called 'adata'

Load adata
This will open up a stored adata object (from the 'raw' directory)

In [None]:
# Load the Malawi cohort - only covid cases
adata_Malawi = ad.read_h5ad('./2_h5ad files/adata_COVID') #only covid cases included

In [None]:
adata_Malawi.obs['Group']

In [None]:
# Load the Brazilian cohort - only covid cases
adata_Brazil = ad.read_h5ad('./8_h5ad_files_Brazil_cohort/adata_subset4.h5ad') #control case removed

In [None]:
adata_Brazil.obs['Type']

In [None]:
# Load the US cohort
# NOTE(review): the original comments on this cell disagree ("only covid cases" vs
# "control cases included"); the file name suggests a COVID-only subset - confirm which is correct.
adata_US = ad.read_h5ad('./7_h5ad_files_US_cohort/adata_covid')

In [None]:
adata_US.obs['ROI']

In [None]:
# Tag every cell with its cohort of origin in a new .obs column
for cohort_name, cohort_adata in (
    ("Malawi", adata_Malawi),
    ("Brazil", adata_Brazil),
    ("US", adata_US),
):
    cohort_adata.obs["Cohort"] = cohort_name


# Concatenate adatas

https://anndata.readthedocs.io/en/latest/concatenation.html

When the variables present in the objects to be concatenated aren’t exactly the same, you can choose to take either the intersection or union of these variables. This is otherwise called taking the "inner" (intersection) or "outer" (union) join.

In [None]:
adata_list = adata_Malawi, adata_Brazil, adata_US

In [None]:
adata_list

In [None]:
#Try both methods
adata = ad.concat(adata_list, join="inner")
#adata2 = ad.concat(adata_list, join="outer")

In [None]:
adata.obs

In [None]:
# Save adatas
adata.write('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid')
#adata2.write('./2_h5ad files/adata_Malawi_Brazil_US_outer_concatenated')

In [None]:
adata = ad.read_h5ad('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid') #all cases included
#adata2 = ad.read_h5ad('./2_h5ad files/adata_Malawi_Brazil_US_outer_concatenated') #all cases included


In [None]:
adata

In [None]:
adata.var_names

In [None]:
# Save the scVI-integrated object to disk.
# NOTE(review): in this notebook adata_scvi is only assigned in the later VAE section
# (adata_scvi = adata.copy()), so this cell raises NameError on a fresh top-to-bottom run;
# it has to be executed after that section.
adata_scvi.write('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid_scvi')


In [None]:
adata_immune = ad.read_h5ad('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid_immune') 
adata_stromal = ad.read_h5ad('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid_stromal') 

In [None]:
# obs_names_make_unique is a method: without the call parentheses the cell only displays the
# bound method and leaves duplicated observation names untouched. Calling it de-duplicates
# the observation names in place.
adata_immune.obs_names_make_unique()

In [None]:
adata_immune.write('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid_immune')
adata_stromal.write('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid_stromal')

In [None]:
adata_immune

Adding the variant and progression information

In [None]:
adata = ad.read_h5ad('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid')
adata_immune = ad.read_h5ad('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid_immune') 
adata_stromal = ad.read_h5ad('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid_stromal') 

In [None]:
# export everything except the data using `.write_csvs`.
# Set `skip_data=False` if you also want to export the data.
adata.write_csvs(dirname='./1_CSV files/Cohorts', skip_data=False)

In [None]:
# Adding Variant status
anno = pd.read_csv(filepath_or_buffer='./1_CSV files/Cohorts/obs.csv', index_col=1)

In [None]:
# Copy the annotation columns read from obs.csv onto the cells.
# The values are assigned positionally (.values ignores the index), so `anno` must contain
# one row per cell in exactly the same order as adata.obs.
for column in ('Variant', 'Progression', 'Cohort_Progression'):
    adata.obs[column] = anno[column].values

In [None]:
adata.obs

In [None]:
adata.write('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid')

In [None]:
# Split the annotated object into two independent copies using the 'hierarchy' column in .obs.
hierarchy = adata.obs['hierarchy']
immune_mask = hierarchy.isin(['Lymphoid', 'Myeloid'])
stromal_mask = hierarchy.isin(['Stromal', 'Vascular'])

adata_immune = adata[immune_mask, :].copy()
adata_stromal = adata[stromal_mask, :].copy()

In [None]:
adata_immune.write('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid_immune')
adata_stromal.write('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid_stromal')

In [None]:
adata_immune2 = adata_immune[~adata_immune.obs['Cohort_Progression'].isin(['Malawi_LD']),:].copy()
adata_stromal2 = adata_stromal[~adata_stromal.obs['Cohort_Progression'].isin(['Malawi_LD']),:].copy()

In [None]:
adata_immune2.obs['Cohort_Progression']

Removing cohort-specific cell types:

In [None]:
adata = ad.read_h5ad('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid') #all cases included


In [None]:
adata

In [None]:
adata2 = adata[~adata.obs['pheno_cluster_edited2'].isin(['NK cell', 'SARSCoV2+ NK cell', 'Dendritic cell', 'Mast cell']),:].copy()

In [None]:
adata_immune2 = adata2[adata2.obs['hierarchy'].isin(['Lymphoid', 'Myeloid']),:].copy()
adata_stromal2 = adata2[adata2.obs['hierarchy'].isin(['Stromal', 'Vascular']),:].copy()

In [None]:
adata2.write('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid2')
adata_immune2.write('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid_immune2')
adata_stromal2.write('./2_h5ad files/adata_Malawi_Brazil_US_inner_integrated_covid_stromal2')

Re-annotation of cell types based on the 17 markers only

In [None]:
# Translate the 'pheno_cluster_edited2' labels into the 'Cell_Class3' labels listed in the
# dictionary CSV; labels that are missing from the CSV become NaN.
covid_dict = pd.read_csv('mikeimc_approach/Dictionaries/covid_pheno_cluster_edited_dict3.csv')
ep = covid_dict.set_index('pheno_cluster_edited2').to_dict()

relabelled = adata.obs['pheno_cluster_edited2'].map(ep['Cell_Class3'])
adata.obs['pheno_cluster_edited3'] = relabelled
# Re-read the stored column and convert it to a categorical, as scanpy plotting expects for group labels.
adata.obs['pheno_cluster_edited3'] = adata.obs['pheno_cluster_edited3'].astype('category')

In [None]:
# Plot the UMAP coloured by the re-annotated cell types and save it as a PDF.
# NOTE(review): colour_palette is only built in the later "Cell proportions across cohorts"
# section, so this cell raises NameError on a fresh top-to-bottom run; the palette must also be
# the one keyed by 'pheno_cluster_edited3'.
# NOTE(review): both s= and size= are passed - presumably only one is needed; confirm which one wins.
sc.pl.umap(adata, s = 1, color="pheno_cluster_edited3", palette=colour_palette['colour'], ncols=1, size=3, save='.pdf')

In [None]:
sc.pl.matrixplot(adata, adata.var_names, groupby='pheno_cluster_edited3', vmax=0.75, dendrogram=False, save=True)


# Integration

Harmony

In [None]:
#Calculate PCA for adata - this must be done first (adata2 already contains X_pca)
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
adata

In [None]:
sc.pl.umap(adata, size =1, color="Cohort", save = "unintegrated_inner_dataset.pdf")

In [None]:
import scanpy.external as sce
sce.pp.harmony_integrate(adata, 'Cohort')

In [None]:
#Calculate UMAP based on harmony integration
sc.pp.neighbors(adata, use_rep='X_pca_harmony')
sc.tl.umap(adata)

In [None]:
adata

In [None]:
sc.pl.umap(adata, s = 1, color="pheno_cluster_edited2", palette = colour_palette['colour'], ncols=1, size=3, save='_harmony_pheno_cluster_edited.pdf')



In [None]:
sc.pl.umap(adata, s = 1, color="hierarchy", ncols=1, size=3, save='_harmony_hierarchy.pdf')

Plot UMAPs
You can add extra .obs to UMAP_groups to colour the UMAPs by, e.g. treatment

In [None]:
# Plot UMAP
sc.pl.umap(adata, s = 1, color="Cohort", ncols=1, size=3, save='_harmony_inner_dataset.pdf')

In [None]:
# Harmonise some labels first - mainly the SARSCov2+ populations.
# The re-annotation is read from a dictionary CSV: the 'pheno_cluster2' column holds the
# existing labels and 'Cell_Class2' the harmonised ones; unmatched labels become NaN.
covid_dict = pd.read_csv('mikeimc_approach/Dictionaries/covid_pheno_cluster_edited_dict2.csv')
ep = covid_dict.set_index('pheno_cluster2').to_dict()

harmonised = adata.obs['pheno_cluster'].map(ep['Cell_Class2'])
adata.obs['pheno_cluster_edited2'] = harmonised
# Re-read the stored column and convert it to a categorical.
adata.obs['pheno_cluster_edited2'] = adata.obs['pheno_cluster_edited2'].astype('category')


In [None]:
adata

Visualizing distributions across batches

Often, batches correspond to experiments that one wants to compare. Scanpy offers two convenient visualization possibilities for this:

- a density plot
- a partial visualization of a subset of categories/groups in an embedding

In [None]:
adata

In [None]:
# obs_names_make_unique is a method: without the call parentheses the cell only displays the
# bound method and leaves duplicated observation names untouched. Calling it de-duplicates
# the observation names in place.
adata.obs_names_make_unique()

In [None]:
#Density plot - it can be calculated for different columns in adata.obs
sc.tl.embedding_density(adata, groupby='Cohort')

In [None]:
sc.pl.embedding_density(adata, groupby='Cohort', save='_harmony_inner_dataset_density.pdf')

Variational autoencoder (VAE) based integration

References: https://www.sc-best-practices.org/cellular_structure/integration.html#variational-autoencoder-vae-based-integration
https://docs.scvi-tools.org/en/stable/tutorials/notebooks/api_overview.html

In [None]:
#Use scvi-env environment for this part of the analysis
# Python packages
import scanpy as sc
import scvi
import scib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import anndata as ad

In [None]:
adata_scvi = adata.copy()
#adata_scvi2 = adata2.copy()

In [None]:
adata_scvi

In [None]:
# Create layer counts
# Keep a copy of the expression matrix in a "counts" layer; scVI is pointed at it via
# layer="counts" in setup_anndata.
adata_scvi.layers["counts"]=adata_scvi.X.copy()
# adata_scvi2 is never created (its assignment is commented out earlier in this section),
# so this line raised NameError; it is disabled like the other adata_scvi2 statements.
#adata_scvi2.layers["counts"]=adata_scvi2.X.copy()

In [None]:
adata_scvi

In [None]:
# Define the expression matrix to use and what the batch key is.
batch_key='Cohort'

In [None]:
# Data preparation
# this creates fields marked with scvi
scvi.model.SCVI.setup_anndata(adata_scvi, layer="counts", batch_key=batch_key)
adata_scvi

#scvi.model.SCVI.setup_anndata(adata_scvi2, layer="counts", batch_key=batch_key)
#adata_scvi2

In [None]:
# Building the model
# Construct the scVI model object
# The scvi-tools package contains various other models - here we build the plain scVI model
# (a scANVI model is derived from it further down via SCANVI.from_scvi_model)
model_scvi = scvi.model.SCVI(adata_scvi)
model_scvi

In [None]:
#model_scvi2 = scvi.model.SCVI(adata_scvi2)
#model_scvi2

In [None]:
# We can also print a more detailed description of the model that shows us where things are stored in the associated AnnData object.

model_scvi.view_anndata_setup()

In [None]:
#model_scvi2.view_anndata_setup()

In [None]:
# Training the model
# The model will be trained for a given number of epochs, a training iteration where every cell is passed through the network. 
# By default scVI uses the following heuristic to set the number of epochs. 
# For datasets with fewer than 20,000 cells, 400 epochs will be used and as the number of cells grows above 20,000 the number of epochs is continuously reduced. 
# The reasoning behind this is that as the network sees more cells during each epoch it can learn the same amount of information as it would from more epochs with fewer cells.
max_epochs_scvi = np.min([round((20000 / adata_scvi.n_obs) * 400), 400])
max_epochs_scvi

In [None]:
#max_epochs_scvi2 = np.min([round((20000 / adata_scvi2.n_obs) * 400), 400])
#max_epochs_scvi2

In [None]:
# We now train the model for the selected number of epochs (this will take ~20-40 minutes depending on the computer you are using).
# Additionally to setting a target number of epochs, it is possible to also set early_stopping=True in the training function. This will let scVI decide to stop training early depending on the convergence of the model. 
# The exact conditions for stopping can be controlled by other parameters.
# Use the epoch count computed by the heuristic above (max_epochs_scvi) rather than a hard-coded
# number, so that training stays in step with the dataset size. int() turns the NumPy integer
# into a plain Python int.
model_scvi.train(max_epochs=int(max_epochs_scvi))

In [None]:
# Extract the embedding

# The main result we want to extract from the trained model is the latent representation for each cell. 
# This is a multi-dimensional embedding where the batch effects have been removed that can be used in a similar way to how we use PCA dimensions when analysing a single dataset. 
# We store this in obsm with the key "X_scVI" (note the capitalisation - later cells refer to exactly this key).

adata_scvi.obsm["X_scVI"] = model_scvi.get_latent_representation()

In [None]:
# Calculate a batch-corrected UMAP

#We calculate a new UMAP embedding but instead of finding nearest neighbors in PCA space, 
# we start with the corrected representation from scVI.

sc.pp.neighbors(adata_scvi, use_rep="X_scVI")
sc.tl.umap(adata_scvi)
adata_scvi

In [None]:
sc.pl.umap(adata_scvi, size =1, color=batch_key, save = "scVI_inner_dataset.pdf")

In [None]:
sc.tl.embedding_density(adata_scvi, groupby='Patient')

In [None]:
sc.pl.embedding_density(adata_scvi, groupby='Patient', save='_scVI_density.pdf')

VAE integration using cell labels

When performing integration with scVI we pretended that we didn’t already have any cell labels (although we showed them in plots). While this scenario is common there are some cases where we do know something about cell identity in advance. Most often this is when we want to combine one or more publicly available datasets with data from a new study. When we have labels for at least some of the cells we can use scANVI (single-cell ANnotation using Variational Inference) [Xu et al., 2021]. This is an extension of the scVI model that can incorporate cell identity label information as well as batch information. Because it has this extra information it can try to keep the differences between cell labels while removing batch effects. Benchmarking suggests that scANVI tends to better preserve biological signals compared to scVI but sometimes it is not as effective at removing batch effects [Luecken et al., 2021]. While we have labels for all cells here it is also possible to use scANVI in a semi-supervised manner where labels are only provided for some cells.

Label harmonization
If you are using scANVI to integrate multiple datasets for which you already have labels it is important to first perform label harmonization. This refers to a process of checking that labels are consistent across the datasets that are being integrated. For example, a cell may be annotated as a “T cell” in one dataset, but a cell of the same type could have been given the label “CD8+ T cell” in another dataset. How best to harmonize labels is an open question but often requires input from subject-matter experts.

In [None]:
# VAE integration using cell labels
# The existing 'pheno_cluster' annotation is used as the cell label key here
# (the earlier comment mentioned leiden0.3, which does not match the code).
label_key = "pheno_cluster"
# Derive a scANVI model from the scVI model; cells whose label equals "unlabelled" are treated as unlabelled.
model_scanvi = scvi.model.SCANVI.from_scvi_model(model_scvi, labels_key=label_key, unlabeled_category="unlabelled")
print(model_scanvi)
model_scanvi.view_anndata_setup()

In [None]:
# Harmonise some labels first - mainly the SARSCov2+ populations

#Re-annotation of the clusters in a dictionary file

covid_dict = pd.read_csv('mikeimc_approach/Dictionaries/covid_pheno_cluster_edited_dict.csv')
ep = covid_dict.set_index('pheno_cluster').to_dict()

adata_scvi.obs['pheno_cluster_edited']=adata_scvi.obs['pheno_cluster'].map(ep['Cell_Class'])

adata_scvi.obs['pheno_cluster_edited']=adata_scvi.obs['pheno_cluster_edited'].astype('category')


In [None]:
adata_scvi.obs

In [None]:
label_key = "pheno_cluster_edited2"
model_scanvi = scvi.model.SCANVI.from_scvi_model(model_scvi, labels_key=label_key, unlabeled_category="unlabelled")
print(model_scanvi)
model_scanvi.view_anndata_setup()

In [None]:
max_epochs_scanvi = int(np.min([10, np.max([2, round(max_epochs_scvi / 3.0)])]))
model_scanvi.train(max_epochs=max_epochs_scanvi)

In [None]:
# extract the new latent representation from the model and create a new UMAP embedding.
adata_scvi.obsm["X_scANVI"] = model_scanvi.get_latent_representation()
sc.pp.neighbors(adata_scvi, use_rep="X_scANVI")
sc.tl.umap(adata_scvi)

In [None]:
sc.pl.umap(adata_scvi, color=[label_key], wspace=1)

In [None]:
# UMAP of the scANVI embedding coloured by batch. The saved file name previously read
# "sANcVI" (typo); it now spells the method name correctly.
sc.pl.umap(adata_scvi, color=[batch_key], wspace=1, save = "scANVI_inner_dataset.pdf")

In [None]:
# The UMAP was recomputed from X_scANVI above, so the per-patient density has to be
# re-estimated on the new coordinates; otherwise the plot would reuse the density values
# that were computed on the scVI UMAP.
sc.tl.embedding_density(adata_scvi, groupby='Patient')
sc.pl.embedding_density(adata_scvi, groupby='Patient', save='_scANVI_density.pdf')

In [None]:
adata_scvi

# Cell proportions across cohorts

In [None]:
col_df = pd.read_csv('mikeimc_approach/colours/pheno_colours_edited2.csv')
colour_palette = col_df.set_index('pheno_cluster_edited2').to_dict()
colour_palette['colour']

In [None]:
col_df = pd.read_csv('mikeimc_approach/colours/pheno_colours_edited3.csv')
colour_palette = col_df.set_index('pheno_cluster_edited3').to_dict()
colour_palette['colour']

In [None]:
# Remove RBCs
adata = adata[~adata.obs['pheno_cluster_edited2'].isin(['RBC']),:].copy()

In [None]:
adata_scvi = adata_scvi[~adata_scvi.obs['pheno_cluster_edited2'].isin(['RBC']),:].copy()

In [None]:
adata_immune = adata[adata.obs['hierarchy'].isin(['Lymphoid', 'Myeloid']),:].copy()
adata_stromal = adata[adata.obs['hierarchy'].isin(['Stromal', 'Vascular']),:].copy()

In [None]:
tmp = pd.crosstab(adata.obs['pheno_cluster_edited2'],adata.obs['Cohort'], margins=False, margins_name='Total', normalize='columns')

In [None]:
tmp = pd.crosstab(adata_immune.obs['pheno_cluster_edited3'],adata_immune.obs['Cohort_Progression'], margins=False, margins_name='Total', normalize='columns')

In [None]:
tmp = pd.crosstab(adata_stromal.obs['pheno_cluster_edited3'],adata_stromal.obs['Cohort_Progression'], margins=False, margins_name='Total', normalize='columns')

In [None]:
tmp

In [None]:
tmp2 = tmp.reindex(['B cell', "CD4 T cell", "CD4 Treg cell", "CD8 T cell",
                    'Dendritic cell',  'Mast cell',
                     "SARSCoV2+ NK cell",  'NK cell',
                    'SARSCoV2+ neutrophil', "Apoptotic neutrophil", 'Neutrophil', 
                    'SARSCoV2+ monocyte', "Classical monocyte", 'SARSCoV2+ IM', 'Interstitial macrophage',
                    'SARSCoV2+ AM', 'Apoptotic alveolar macrophage', 'Alveolar macrophage'])

In [None]:
tmp2 = tmp.reindex(['B cell', "CD4 T cell", "CD4 Treg cell", "CD8 T cell",
                    'SARSCoV2+ neutrophil', "Apoptotic neutrophil", 'Neutrophil', 
                    'SARSCoV2+ monocyte', "Classical monocyte", 'SARSCoV2+ IM', 'Interstitial macrophage',
                    'SARSCoV2+ AM', 'Apoptotic alveolar macrophage', 'Alveolar macrophage'])

In [None]:
tmp2 = tmp.reindex(['SARSCoV2+ epithelial cell', 'Apoptotic epithelial cell', 'Epithelial cell', 
                    'Activated endothelial cell', 'Endothelial cell', 'Fibroblast', 'Apoptotic fibroblast',
                    'SMC', 'Apoptotic SMC', "Mesenchymal"])

In [None]:
tmp2 = tmp.reindex(['B cell', "CD4 T cell", "CD4 Treg cell", "CD8 T cell",
                    'CD11c+ cell',
                     "SARSCoV2+ CD45+ cell",  'CD45+ cell',
                    'SARSCoV2+ neutrophil', 'Neutrophil', 
                    'SARSCoV2+ monocyte', "Classical monocyte", 'SARSCoV2+ IM', 'Interstitial macrophage',
                    'SARSCoV2+ AM', 'Alveolar macrophage'])

In [None]:
tmp2 = tmp.reindex(['SARSCoV2+ epithelial cell', 'Epithelial cell', 
                    'Endothelial cell', 'Fibroblast',
                    'SMC', "Mesenchymal"])

In [None]:
tmp3 = tmp2.reindex(index=tmp2.index[::-1])

In [None]:
tmp4 = pd.DataFrame.transpose(tmp3)

In [None]:
tmp4

In [None]:
tmp5 = tmp4.reindex(['Malawi_ED', 'Brazil_ED', 'US_ED', 'Malawi_LD', 'Brazil_LD', 'US_LD'])

In [None]:
# Stacked bar chart of the frequency table in tmp5 (one bar per row of tmp5).
# Candidate colours for the disease groups, matching the other bar graphs:
#F08080 - light coral
#87CEFA - lightskyblue
#D3D3D3 - lightgrey

import seaborn as sb

# NOTE(review): pretty_colors and color_pal are not used by the plot below (the bars are
# coloured through colour_palette['colour']) - presumably leftovers; confirm before removing.
pretty_colors = ['#F08080','#87CEFA','#D3D3D3']
color_pal = sb.color_palette('Paired')

#sb.set_style("whitegrid", {'axes.grid' : True})
# Colours are looked up per column via the colour_palette['colour'] mapping.
tmp5.plot.bar(stacked=True, color=colour_palette['colour'], figsize=(4, 4), rot=45).legend(bbox_to_anchor=(1, 1))
# NOTE(review): the output name is hard-coded to "immune"; tmp5 may hold the stromal table
# depending on which crosstab/reindex cells were run last - confirm before saving.
plt.savefig("Frequency_immune_Cohorts_Progression_iteration3_reversed.pdf", format="pdf", bbox_inches="tight")

In [None]:
compression_opts = dict(method='zip',
                         archive_name='Immune_Cohort_Progression.csv') 
tmp2.to_csv('Immune_Cohort_Progression.zip', index=True, compression=compression_opts)

In [None]:
import mikeimc_v2

# Alter this list with adata.obs variables that you want to plot against the resulting leiden population
image_var = 'ROI'

for i in ['Cohort_Progression']:

    mikeimc_v2.grouped_graph(adata_immune,
                             ROI_id=image_var,
                             group_by_obs=i,
                             x_axis='pheno_cluster_edited2',
                             fig_size=(8, 4),
                             log_scale=False, scale_factor=True, crosstab_norm='columns',
                            display_tables=True) #If you change display_tables to True, will also do stats on the groups
    # use crosstab_norm='columns' to plot and do statistics based on cell frequencies per group
    plt.show()


In [None]:
cells = pd.crosstab([adata_immune.obs['Variant'], adata_immune.obs['ROI']],adata_immune.obs['pheno_cluster_edited2'],normalize='index')
cells.columns=cells.columns.astype('str') 

In [None]:
cells = pd.crosstab([adata_stromal.obs['Variant'], adata_stromal.obs['ROI']],adata_stromal.obs['pheno_cluster_edited2'],normalize='index')
cells.columns=cells.columns.astype('str') 

In [None]:
cells = pd.crosstab([adata.obs['Cohort_Progression'], adata.obs['ROI']],adata.obs['pheno_cluster_edited3'],normalize='index')
cells.columns=cells.columns.astype('str') 

In [None]:
cells

In [None]:
compression_opts = dict(method='zip',
                         archive_name='Proportions_Cohort_Progression_iteration3.csv') 
cells.to_csv('Proportions_Cohort_Progression_iteration3.zip', index=True, compression=compression_opts)

In [None]:
sc.pl.matrixplot(adata, adata.var_names, groupby='pheno_cluster_edited2', vmax=0.75, dendrogram=True, save=True)


# Proportion comparisons analysis - scanpro

In [None]:
from scanpro import scanpro

In [None]:
adata_stromal.obs['Variant']

In [None]:
adata_stromal

In [None]:
adata_immune3 = adata_immune[adata_immune.obs['Cohort_Progression'].isin(['Malawi_ED', "Brazil_ED", "US_ED"]),:].copy()
adata_stromal3 = adata_stromal[adata_stromal.obs['Cohort_Progression'].isin(['Malawi_ED', "Brazil_ED", "US_ED"]),:].copy()

In [None]:
adata_immune4 = adata_immune[adata_immune.obs['Cohort_Progression'].isin(['Malawi_LD', "Brazil_LD", "US_LD"]),:].copy()
adata_stromal4 = adata_stromal[adata_stromal.obs['Cohort_Progression'].isin(['Malawi_LD', "Brazil_LD", "US_LD"]),:].copy()
adata4 = adata[adata.obs['Cohort_Progression'].isin(['Malawi_LD', "Brazil_LD", "US_LD"]),:].copy()

In [None]:
# Subset 5: Malawi_ED together with Brazil_LD and US_LD.
# The same three groups are applied to all three objects. adata5 previously selected
# 'Malawi_LD' (copy-paste from the cell above), which made it a duplicate of adata4.
groups_5 = ['Malawi_ED', "Brazil_LD", "US_LD"]
adata_immune5 = adata_immune[adata_immune.obs['Cohort_Progression'].isin(groups_5),:].copy()
adata_stromal5 = adata_stromal[adata_stromal.obs['Cohort_Progression'].isin(groups_5),:].copy()
adata5 = adata[adata.obs['Cohort_Progression'].isin(groups_5),:].copy()

In [None]:
out = scanpro(adata_immune, clusters_col='pheno_cluster_edited3', conds_col='Progression', samples_col='ROI')

In [None]:
out.results

# Proportion comparisons analysis - scCODA

In [None]:
# run in the scvi-env environment
from sccoda.util import comp_ana as mod
from sccoda.util import cell_composition_data as dat
from sccoda.util import data_visualization as viz

import sccoda.datasets as scd

In [None]:
# scCODA models cell-type counts per sample, not individual cells. The cell-level AnnData is
# therefore first aggregated into a samples x cell-types count table (one row per ROI) with
# dat.from_scanpy; passing adata_immune directly would treat its variables as cell types.
# 'Variant' is carried over as a sample-level covariate and must be constant within each ROI.
# NOTE(review): 'pheno_cluster_edited3' / 'ROI' mirror the scanpro cell above - adjust if a
# different annotation level or sample unit is intended.
data_immune = dat.from_scanpy(
    adata_immune,
    cell_type_identifier="pheno_cluster_edited3",
    sample_identifier="ROI",
    covariate_obs=["Variant"],
)
model_salm = mod.CompositionalAnalysis(data_immune, formula="Variant", reference_cell_type="automatic")

In [None]:
# Run MCMC
sim_results = model_salm.sample_hmc()

In [None]:
sim_results.summary()

In [None]:
print(sim_results.credible_effects())

In [None]:
sim_results.set_fdr(est_fdr=0.05)
sim_results.summary()

In [None]:
# `pkl` is not imported anywhere else in the visible notebook, so the load step raised NameError.
import pickle as pkl

# saving
path = "test"
sim_results.save(path)

# loading
# NOTE: unpickling can execute arbitrary code - only load result files written by this notebook.
with open(path, "rb") as f:
    sim_results_2 = pkl.load(f)

sim_results_2.summary()

# QC integration methods

In [None]:
import faiss
from scib_metrics.nearest_neighbors import NeighborsOutput

In [None]:
from scib_metrics.benchmark import Benchmarker

In [None]:
import time

In [None]:
adata_scvi

In [None]:
# Benchmark the listed embeddings stored in adata_scvi.obsm and report the wall-clock time.
start = time.time()

benchmark_settings = dict(
    batch_key=batch_key,
    label_key=label_key,
    embedding_obsm_keys=["X_pca", "X_scVI", "X_scANVI", "X_pca_harmony"],
    pre_integrated_embedding_obsm_key="X_pca",
    bio_conservation_metrics=None,
    n_jobs=-1,
)
bm = Benchmarker(adata_scvi, **benchmark_settings)
bm.benchmark()

end = time.time()
elapsed = end - start
minutes = int(elapsed / 60)
seconds = int(elapsed % 60)
print(f"Time: {minutes} min {seconds} sec")

In [None]:
label_key = "population"
batch_key='Patient'

In [None]:
# Compute scib's fast metrics on adata_scvi, once per integration result.
metrics_scvi = scib.metrics.metrics_fast(adata_scvi, batch_key, label_key, embed="X_scVI")
metrics_scanvi = scib.metrics.metrics_fast(adata_scvi, batch_key, label_key, embed="X_scANVI")
# No embed= is passed for BBKNN. Original note: BBKNN is stored in the neighbours key; because it
# was run on adata_subset, the result might not differ between these adatas.
# NOTE(review): confirm that adata_scvi really carries the BBKNN neighbour graph at this point.
metrics_bbknn = scib.metrics.metrics_fast(adata_scvi, batch_key, label_key)
metrics_harmony = scib.metrics.metrics_fast(adata_scvi, batch_key, label_key, embed="X_pca_harmony")


# Export data

In [None]:
# Make a simplified dataframe to export (a copy of the selected .obs columns only)
# NOTE(review): adata_subset3 is not defined anywhere in the visible notebook - presumably it
# comes from another notebook or a restored session; confirm which object should be exported.
adata_export = adata_subset3.obs[['Case','ROI','Type','hierarchy','pheno_cluster']].copy()

In [None]:
#Split up the X and Y columns from the adata
adata_export['X'], adata_export['Y'] = np.split(adata_subset3.obsm['spatial'],[-1],axis=1)
#Save to file
adata_export.to_csv('adata_cluster_export.csv')

In [None]:
# export everything except the data using `.write_csvs`.
# Set `skip_data=False` if you also want to export the data.
adata_subset3.write_csvs(dirname='./', skip_data=False)