
# Preprocessing - Integration
Michael Sterr

2022-09-21 13:29:49 


# Setup

Run the following scripts before this notebook:
 * 05-2_scRNA-seq_iPSC_IIR-KO_S6_Preprocessing_Normalization_WT-IIR-KO_v5

In [1]:
# Set the width of elements with CSS class `container` to 85% of the page.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

In [2]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
from itertools import chain
import session_info
import gc # Free memory #gc.collect()
import scipy.stats as stats

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from matplotlib import cm
from matplotlib.pyplot import rc_context
import seaborn as sb
from adjustText import adjust_text

# Analysis
import anndata as ad
import scanpy as sc
import scvi
import scanpy.external as sce

Global seed set to 0


In [3]:
# Settings

## Scanpy settings
sc.settings.verbosity = 3
sc.logging.print_versions()
session_info.show()

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                         8.4.0
absl                        NA
adjustText                  NA
anyio                       NA
astunparse                  1.6.3
attr                        21.2.0
babel                       2.9.1
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
bottleneck                  1.3.2
certifi                     2022.06.15
cffi                        1.15.0
chardet                     4.0.0
charset_normalizer          2.0.7
chex                        0.1.1
cloudpickle                 2.0.0
colorama                    0.4.4
cupy                        10.1.0
cupy_backends               NA
cupyx                       NA
cycler                      0.10.0
cython_runtime              NA
dask                        2021.10.0
dateutil                    2.8.2
debugpy                     1.4.1
decorator                   5.1.0
defusedxml                  0.7.1
deprecate  

In [4]:
# Color maps
# Run the colormap definition script in the notebook's global namespace.
# The file is opened in a `with` block so the handle is closed after reading.
# NOTE(review): exec() runs arbitrary code from this path; only point it at a trusted script.
with open("/home/michi/Software/viscm/maps/michi_bk_bl_gn_yl.py") as cmap_script:
    exec(cmap_script.read())

In [5]:
# Plot settings
# Configures figure output directory, scanpy figure parameters, fonts, grid and tick visibility.
%matplotlib inline

## Directory
sc.settings.figdir='/home/michi/Projects/scRNA-seq_iPSC_IGFRL-KO_Notebooks/Figures'

## Plotting parameters
# NOTE(review): sc.set_figure_params(scanpy=True) below presumably resets 'figure.figsize',
# in which case this line has no lasting effect — confirm the intended figure size.
rcParams['figure.figsize']=(20,20) #rescale figures
#sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False, color_map='tab10' ,transparent=True, dpi=150, dpi_save=300)
sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False ,transparent=True, dpi=150, dpi_save=300)

## Font
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Source Sans 3']

## Grid & Ticks
# grid.alpha = 0 makes grid lines fully transparent; tick marks are switched on for the bottom and left axes.
rcParams['grid.alpha'] = 0
rcParams['xtick.bottom'] = True
rcParams['ytick.left'] = True

## Embed font
plt.rc('pdf', fonttype=42)

## Define new default settings
# NOTE(review): this binds the same object, not a snapshot, so any later change to rcParams
# is also visible through plt.rcParamsDefault; use plt.rcParams.copy() if a frozen copy is intended.
plt.rcParamsDefault = plt.rcParams

In [6]:
# Color maps
# Three continuous cubehelix colormaps (as_cmap=True), each running from light=1 to its own dark value.
ch_YlRd = sb.cubehelix_palette(
    n_colors=100, start=.7, rot=.25, gamma=0.6,
    hue=2, light=1, dark=0.05, as_cmap=True,
)
ch_Bl = sb.cubehelix_palette(
    n_colors=100, start=2.65, rot=0, gamma=0.8,
    hue=1.8, light=1, dark=0, as_cmap=True,
)
ch_Bl2 = sb.cubehelix_palette(
    n_colors=100, start=2.75, rot=-.12, gamma=0.8,
    hue=1.8, light=1, dark=0, as_cmap=True,
)

# Setup R

In [7]:
#R
# Sets up the rpy2 bridge: R_HOME, imports, logging level, automatic converters and the %%R cell magic.
import os
# R_HOME is set before rpy2 is imported below.
# NOTE(review): presumably rpy2 reads R_HOME at import time to locate R — keep this order.
os.environ['R_HOME'] = '/home/michi/Software/venvs/scAnalysis_sc1.9_ad0.8_mu0.1.2_md0.2_R4.1_FVF/lib/R' #path to your R installation

import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri

## R settings

### Ignore R warning messages
#### Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

### Automatically convert rpy2 outputs to pandas dataframes
# anndata2ri.activate() additionally enables conversion of AnnData objects passed with `%%R -i`.
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

In [8]:
%%R

.libPaths()

[1] "/home/michi/Software/venvs/scAnalysis_sc1.9_ad0.8_mu0.1.2_md0.2_R4.1_FVF/lib/R/library"


In [9]:
%%R
# Loads scry (presumably the source of devianceFeatureSelection() used later), configures
# future-based parallelization for BiocParallel, and prints the R session information.
library(scry)

# Parallelization
library("BiocParallel.FutureParam")
register(FutureParam())
plan(multicore, workers=8)
# 2 * 1024^3 bytes = 2 GiB limit for future.globals.maxSize (original note: "for 50 Gb RAM").
options(future.globals.maxSize = 2 * 1024 ^ 3)

sessionInfo()

R version 4.1.1 (2021-08-10)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.5 LTS

Matrix products: default
BLAS/LAPACK: /home/michi/Software/venvs/scAnalysis_sc1.9_ad0.8_mu0.1.2_md0.2_R4.1_FVF/lib/libopenblasp-r0.3.18.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] tools     stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] BiocParallel.FutureParam_0.2.1 BiocParallel_1.28.3           
[3] future_1.27.0                  scry_1.6.0                    

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.9                  parallelly_1.32.1          
 [3] BiocSingular_1

# Functions

In [10]:
def sparsify_all_layers(adata):
    """
    Convert dense matrices in ``adata.X`` and in every entry of ``adata.layers``
    to SciPy CSR matrices, in place.

    Parameters
    ----------
    adata
        Object exposing ``.X`` and a dict-like ``.layers`` (e.g. an AnnData).

    Returns
    -------
    None
        ``adata`` is modified in place; one status message is printed per matrix.
    """
    # Import the submodule explicitly: a bare ``import scipy`` is not guaranteed
    # to expose ``scipy.sparse`` as an attribute on every SciPy version.
    from scipy import sparse

    if adata.X is None:
        # Nothing to convert; csr_matrix(None) would raise.
        print('.X is empty, skipping...')
    elif not sparse.issparse(adata.X):
        print('Sparsify .X...')
        adata.X = sparse.csr_matrix(adata.X)
    else:
        print('.X already sparse...')

    # Iterate over a snapshot of the keys because entries are reassigned inside the loop.
    for layer in list(adata.layers):
        if not sparse.issparse(adata.layers[layer]):
            print('Sparsify ', layer)
            adata.layers[layer] = sparse.csr_matrix(adata.layers[layer])
        else:
            print('Layer', layer, 'already sparse...')

####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################

    
def print_r_session():
    """Print the embedded R session's ``sessionInfo()`` by evaluating R code through ``ro.r``."""
    r_code = 'print(sessionInfo())'
    ro.r(r_code)

# Load Data

In [11]:
adata=sc.read('/home/michi/Projects/scRNA-seq_iPSC_IGFRL-KO_Notebooks/Files/scRNA-seq_iPSC_IIR-KO_S6_adata_rmDoublets_normalized.h5ad')

# HVGs

In [13]:
adata_raw = ad.AnnData(X=adata.layers['raw_counts'])

In [14]:
%%R -i adata_raw
sce = devianceFeatureSelection(adata_raw, assay='X')

  return AnnData(exprs, obs, var, uns, obsm or None, layers=layers)
  return AnnData(exprs, obs, var, uns, obsm or None, layers=layers)


In [15]:
binomial_deviance = ro.r('rowData(sce)$binomial_deviance').T

In [16]:
# Flag the 4000 features with the largest binomial deviance.
# NOTE(review): assumes binomial_deviance has one entry per gene in adata.var_names order — confirm.
mask = np.zeros(adata.var_names.shape, dtype=bool)
idx = np.argsort(binomial_deviance)[-4000:]
mask[idx] = True

# The same selection is stored under both column names.
for column in ('highly_deviant', 'highly_variable'):
    adata.var[column] = mask

In [17]:
# Get HVGs and overlap with cell cycle & ambient genes
# NOTE(review): `all_cc_genes` is not defined in any visible cell of this notebook; it must
# already exist in the kernel when this cell runs — confirm where it is loaded.

## HVGs
adata.var['highly_variable'] = adata.var['highly_deviant'].copy()
hvgs = pd.Series(adata.var_names[adata.var['highly_variable']])
# Count the True entries directly; value_counts()[1] depended on the key 1 resolving to the
# True label and raised when no gene was flagged.
print('\nHighly variable genes before filtering:', int(adata.var['highly_variable'].sum()))

# overlap HVGs with CC genes
hvcc = list(hvgs[hvgs.isin(all_cc_genes)])
print('\nHighly variable cell cycle genes:',len(hvcc),'\n',hvcc)

# overlap HVGs with ambient genes
# Gene names are read straight from .var_names instead of building an AnnData view first.
ambient_genes = adata.var_names[adata.var['is_ambient'].eq(True).to_numpy()]
hvambi = list(hvgs[hvgs.isin(ambient_genes)])
print('\nHighly variable ambient genes:',len(hvambi),'\n',hvambi)

# remove cell cycle genes
adata.var.loc[hvcc,'highly_variable'] = False

# # remove ambient genes
# adata.var.loc[hvambi,'highly_variable'] = False

print('\nHighly variable genes after filtering:', int(adata.var['highly_variable'].sum()))


Highly variable genes before filtering: 4000

Highly variable cell cycle genes: 215 
 ['TOP1', 'SEPHS1', 'VCL', 'MYCBP2', 'GAS1', 'BIRC5', 'HMGB2', 'RHEB', 'TSC22D1', 'BRD7', 'TULP4', 'NT5DC1', 'CTSD', 'SEC62', 'CADM1', 'DHFR', 'PCNA', 'CFLAR', 'LNPEP', 'GNB1', 'TUBB', 'PHIP', 'KDM5B', 'NUMA1', 'FYN', 'ARL4A', 'NKTR', 'CEP57', 'CASP3', 'SSR3', 'DCTN6', 'BCLAF1', 'CBX5', 'CDC42', 'LARP7', 'GADD45A', 'MSL1', 'PTMS', 'EIF4E', 'TOP2A', 'DNAJB6', 'ZBTB7A', 'CCDC88A', 'TTLL7', 'DCAF16', 'CEP350', 'RAN', 'MAD2L1', 'HMGCR', 'CENPF', 'CTR9', 'FXR1', 'TUBB4B', 'CTCF', 'DNAJA1', 'LARP1', 'TMPO', 'LMO4', 'POM121', 'ANP32E', 'RERE', 'TUBB2A', 'HIPK2', 'ZNF24', 'INSR', 'THRAP3', 'KLF6', 'NR3C1', 'NFIC', 'BUB3', 'REEP1', 'UBL3', 'AHI1', 'PPP6R3', 'SLBP', 'KRAS', 'EIF2A', 'ZNF207', 'SFPQ', 'CALD1', 'ZBED5', 'EIF4EBP2', 'UBE2C', 'BTBD3', 'LMNA', 'KIF20B', 'NIPBL', 'AMD1', 'CCDC14', 'TPX2', 'TSN', 'HMGB3', 'CKS2', 'TXNRD1', 'ATF7IP', 'HP1BP3', 'CNIH4', 'TYMS', 'DYNLL1', 'RSRC2', 'PSMD11', 'CAPN7', 'PPP

In [18]:
del adata_raw
gc.collect()

571

# Initial Embedding

In [None]:
sc.pp.pca(adata, svd_solver='arpack', use_highly_variable=True)
sc.pl.pca_overview(adata)

In [None]:
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=30, metric='correlation')

In [None]:
sc.tl.umap(adata, min_dist=0.4, spread=0.9)

In [None]:
genes = ['INS','GCG','TPH1','SST','ARX','NKX6-1','LMX1A','LMX1B','SLC18A1','ASCL1','GAP43','MKI67','KRT19','VIM']

In [None]:
# Plot sample, cluster and marker-gene expression on the UMAP.
# NOTE(review): 'leiden' is plotted here before any sc.tl.leiden call in this notebook writes that key,
# so it presumably comes from the loaded .h5ad — confirm.
sc.pl.umap(adata, color=['sample','leiden'] + genes, size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, color_map=ch_YlRd)

# Initial Clustering & Annotation

In [None]:
# Leiden clustering over a range of resolutions; each result is written to
# adata.obs['leiden_r<resolution>'] (e.g. 'leiden_r0', 'leiden_r0.1', 'leiden_r1.25').
for resolution in (0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.25, 1.5, 1.75, 2):
    sc.tl.leiden(adata, resolution=resolution, key_added=f'leiden_r{resolution}')

In [None]:
# Generate reduced adata object to pass to R
# Holds only the 'sct_logcounts' layer as X plus the per-resolution Leiden columns.
resolution_labels = ['0', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9',
                     '1', '1.25', '1.5', '1.75', '2']
adata_r = ad.AnnData(X = adata.layers['sct_logcounts'].copy())
adata_r.obs_names = adata.obs_names.copy()
adata_r.var_names = adata.var_names.copy()
adata_r.obs = adata.obs.loc[:, ['leiden_r' + label for label in resolution_labels]].copy()

In [None]:
%%R
library(SingleCellExperiment)
library(clustree)

In [None]:
%%R -i adata_r

clustree(adata_r, prefix = 'leiden_r', exprs='X')

In [None]:
sc.pl.umap(adata, color=['leiden_r0','leiden_r0.1','leiden_r0.2','leiden_r0.3','leiden_r0.4','leiden_r0.5','leiden_r0.6','leiden_r0.7','leiden_r0.8','leiden_r0.9','leiden_r1','leiden_r1.25','leiden_r1.5','leiden_r1.75','leiden_r2'], size=5, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4)

In [None]:
sc.tl.leiden(adata, resolution=1.75)

In [None]:
sc.pl.umap(adata, color=['leiden'], size=10, add_outline=True, alpha=0.7, outline_width=(0.3, 0.0), ncols=4, color_map=ch_YlRd)

In [None]:
marker_genes = ['INS','GCG','SST','ARX','TPH1','MKI67','CHGA','KRT19','VIM','EPCAM','GAP43','RPS26']

In [None]:
sc.pl.umap(adata, color=marker_genes + ['leiden'], size=15, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, color_map=ch_YlRd)

In [None]:
sc.tl.dendrogram(adata, groupby='leiden', var_names=marker_genes, key_added='marker_gene_dendrogram')
sc.pl.DotPlot(adata, var_names=marker_genes, groupby='leiden', cmap=ch_YlRd, use_raw=False, categories_order=adata.uns['marker_gene_dendrogram']['categories_ordered']).style(color_on='square', dot_edge_lw=1, grid=True, dot_min=0.15, dot_edge_color=None).show()

In [None]:
groupby = 'leiden'

# Per-cell expression of the marker genes as a dense frame; columns follow adata.var_names order.
# np.isin replaces np.in1d, which is deprecated in recent NumPy releases.
is_marker = np.isin(adata.var_names, marker_genes)
df = pd.DataFrame(data = adata[:, is_marker].X.toarray(),
                  index = adata.obs_names,
                  columns = adata.var_names[is_marker].values)

df[groupby] = pd.Series(adata.obs[groupby], index=df.index)

# Mean expression per cluster: one 'mean_<gene>' column per marker, in marker_genes order.
# A single groupby replaces the per-marker loop (which grouped twice per gene) and the
# globals() check, since df_all is rebuilt from scratch on every run anyway.
df_all = df.groupby(by=groupby)[marker_genes].mean().add_prefix('mean_')

df_all

In [None]:
# z-score cut-off between 'low' and 'high' mean expression: 0.5 for every marker
# unless a marker-specific value is given below.
zscore_cutoffs = dict.fromkeys(marker_genes, 0.5)
zscore_cutoffs.update({'CHGA': 0, 'TPH1': 1, 'GAP43': 1.5, 'SST': 1.5})

# z-scores in (-10, cutoff] are labelled 'low', those in (cutoff, 10] 'high'.
for marker, cutoff in zscore_cutoffs.items():
    marker_zscores = stats.zscore(df_all['mean_'+marker])
    df_all['lowhigh_'+marker] = pd.cut(marker_zscores, bins=[-10, cutoff, 10], labels=['low', 'high'])

df_all.iloc[:,len(marker_genes):]

In [None]:
# Annotate cells by cluster: start from the Leiden labels and overwrite clusters that match
# a marker rule with a cell-type name. Clusters matching no rule keep their Leiden label.
adata.obs['initial_cell_type'] = adata.obs['leiden'].cat.add_categories(['Alpha','Endocrine (ARX+)', 'Beta','Endocrine (GAP43+)','Delta', 'EC','Cycling Endocrine','Polyhormonal','Non-Endocrine','Non-Endocrine (KRT19+)','Non-Epithelial'])

# Boolean per-cluster flags derived from the 'lowhigh_<gene>' columns of df_all.
is_high = {gene: (df_all['lowhigh_' + gene] == 'high').to_numpy() for gene in marker_genes}
is_low = {gene: (df_all['lowhigh_' + gene] == 'low').to_numpy() for gene in marker_genes}

# Rules are applied top to bottom, so a later rule overrides an earlier one for the same cluster.
annotation_rules = [
    ('Alpha', is_high['GCG']),
    ('Beta', is_high['INS']),
    ('Delta', is_high['SST']),
    ('Polyhormonal', is_high['INS'] & is_high['GCG']),
    ('EC', is_high['TPH1']),
    ('Endocrine (GAP43+)', is_high['GAP43']),
    ('Endocrine (ARX+)', is_low['GCG'] & is_high['ARX']),
    ('Cycling Endocrine', is_high['MKI67'] & is_high['CHGA']),
    ('Non-Endocrine', is_low['CHGA']),
    # Disabled rule, kept for reference:
    # ('Non-Endocrine (KRT19+)', is_low['CHGA'] & is_high['KRT19']),
    ('Non-Epithelial', is_high['VIM']),
]

for cell_type, cluster_selected in annotation_rules:
    clusters = df_all.index[cluster_selected].tolist()
    # Single .loc assignment instead of chained indexing (obs[col][mask] = ...), which
    # triggers SettingWithCopyWarning and is not guaranteed to write to adata.obs.
    adata.obs.loc[adata.obs['leiden'].isin(clusters), 'initial_cell_type'] = cell_type


adata.obs['initial_cell_type']= adata.obs['initial_cell_type'].cat.remove_unused_categories()

In [None]:
pd.value_counts(adata.obs['initial_cell_type'])

In [None]:
sc.pl.umap(adata, color=['initial_cell_type','sample','leiden'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), wspace=0.5)

In [None]:
sc.tl.dendrogram(adata, groupby='initial_cell_type', var_names=marker_genes, key_added='marker_gene_dendrogram')
sc.pl.DotPlot(adata, var_names=marker_genes, groupby='initial_cell_type', cmap=ch_YlRd, use_raw=False, categories_order=adata.uns['marker_gene_dendrogram']['categories_ordered']).style(color_on='square', dot_edge_lw=1, grid=True, dot_min=0.15, dot_edge_color=None).show()

# Initial Clustering & Annotation - WT Only

In [None]:
adata_wt = adata[adata.obs.genotype.isin(['WT'])].copy()

In [None]:
# NOTE(review): no PCA or neighbour graph is recomputed for adata_wt in the visible cells, so this UMAP
# presumably uses the graph carried over from the full `adata` subset — confirm this is intended.
sc.tl.umap(adata_wt, min_dist=0.4, spread=0.9)

In [None]:
# Leiden clustering of the WT subset over a range of resolutions; each result is written to
# adata_wt.obs['leiden_r<resolution>'] (e.g. 'leiden_r0', 'leiden_r0.1', 'leiden_r1.25').
for resolution in (0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.25, 1.5, 1.75, 2):
    sc.tl.leiden(adata_wt, resolution=resolution, key_added=f'leiden_r{resolution}')

In [None]:
# Generate reduced adata_wt object to pass to R
# Holds only the 'sct_logcounts' layer as X plus the per-resolution Leiden columns.
resolution_labels = ['0', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9',
                     '1', '1.25', '1.5', '1.75', '2']
adata_wt_r = ad.AnnData(X = adata_wt.layers['sct_logcounts'].copy())
adata_wt_r.obs_names = adata_wt.obs_names.copy()
adata_wt_r.var_names = adata_wt.var_names.copy()
adata_wt_r.obs = adata_wt.obs.loc[:, ['leiden_r' + label for label in resolution_labels]].copy()

In [None]:
%%R
library(SingleCellExperiment)
library(clustree)

In [None]:
%%R -i adata_wt_r

clustree(adata_wt_r, prefix = 'leiden_r', exprs='X')

In [None]:
sc.pl.umap(adata_wt, color=['leiden_r0','leiden_r0.1','leiden_r0.2','leiden_r0.3','leiden_r0.4','leiden_r0.5','leiden_r0.6','leiden_r0.7','leiden_r0.8','leiden_r0.9','leiden_r1','leiden_r1.25','leiden_r1.5','leiden_r1.75','leiden_r2'], size=5, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4)

In [None]:
sc.pl.umap(adata_wt, color=['sample','leiden'] + genes, size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, color_map=ch_YlRd)

In [None]:
sc.tl.leiden(adata_wt, resolution=1.75)

In [None]:
sc.pl.umap(adata_wt, color=['leiden'], size=10, add_outline=True, alpha=0.7, outline_width=(0.3, 0.0), ncols=4, color_map=ch_YlRd)

In [None]:
marker_genes = ['INS','GCG','SST','ARX','TPH1','MKI67','CHGA','KRT19','VIM','EPCAM','GAP43','RPS26']

In [None]:
sc.pl.umap(adata_wt, color=marker_genes + ['leiden'], size=15, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, color_map=ch_YlRd)

In [None]:
sc.tl.dendrogram(adata_wt, groupby='leiden', var_names=marker_genes, key_added='marker_gene_dendrogram')
sc.pl.DotPlot(adata_wt, var_names=marker_genes, groupby='leiden', cmap=ch_YlRd, use_raw=False, categories_order=adata_wt.uns['marker_gene_dendrogram']['categories_ordered']).style(color_on='square', dot_edge_lw=1, grid=True, dot_min=0.15, dot_edge_color=None).show()

In [None]:
groupby = 'leiden'

# Per-cell expression of the marker genes as a dense frame; columns follow adata_wt.var_names order.
# np.isin replaces np.in1d, which is deprecated in recent NumPy releases.
is_marker = np.isin(adata_wt.var_names, marker_genes)
df = pd.DataFrame(data = adata_wt[:, is_marker].X.toarray(),
                  index = adata_wt.obs_names,
                  columns = adata_wt.var_names[is_marker].values)

df[groupby] = pd.Series(adata_wt.obs[groupby], index=df.index)

# Mean expression per cluster: one 'mean_<gene>' column per marker, in marker_genes order.
# A single groupby replaces the per-marker loop (which grouped twice per gene) and the
# globals() check, since df_all is rebuilt from scratch on every run anyway.
df_all = df.groupby(by=groupby)[marker_genes].mean().add_prefix('mean_')

df_all

In [None]:
# z-score cut-off between 'low' and 'high' mean expression: 0.5 for every marker
# unless a marker-specific value is given below.
zscore_cutoffs = dict.fromkeys(marker_genes, 0.5)
zscore_cutoffs.update({'CHGA': 0.2, 'INS': 0.4, 'TPH1': 1, 'SST': 1.5, 'GAP43': 2})

# z-scores in (-10, cutoff] are labelled 'low', those in (cutoff, 10] 'high'.
for marker, cutoff in zscore_cutoffs.items():
    marker_zscores = stats.zscore(df_all['mean_'+marker])
    df_all['lowhigh_'+marker] = pd.cut(marker_zscores, bins=[-10, cutoff, 10], labels=['low', 'high'])

df_all.iloc[:,len(marker_genes):]

In [None]:
# Annotate WT cells by cluster: start from the Leiden labels and overwrite clusters that match
# a marker rule with a cell-type name. Clusters matching no rule keep their Leiden label.
adata_wt.obs['wt_cell_type'] = adata_wt.obs['leiden'].cat.add_categories(['Alpha','Endocrine (ARX+)', 'Beta','Endocrine (GAP43+)','Delta', 'EC','Cycling Endocrine','Polyhormonal','Non-Endocrine','Non-Endocrine (KRT19+)','Non-Epithelial'])

# Boolean per-cluster flags derived from the 'lowhigh_<gene>' columns of df_all.
is_high = {gene: (df_all['lowhigh_' + gene] == 'high').to_numpy() for gene in marker_genes}
is_low = {gene: (df_all['lowhigh_' + gene] == 'low').to_numpy() for gene in marker_genes}

# Rules are applied top to bottom, so a later rule overrides an earlier one for the same cluster.
annotation_rules = [
    ('Alpha', is_high['GCG']),
    ('Beta', is_high['INS']),
    ('Delta', is_high['SST']),
    ('Polyhormonal', is_high['INS'] & is_high['GCG']),
    ('EC', is_high['TPH1']),
    ('Endocrine (GAP43+)', is_high['GAP43']),
    ('Endocrine (ARX+)', is_low['GCG'] & is_high['ARX']),
    ('Cycling Endocrine', is_high['MKI67'] & is_high['CHGA']),
    ('Non-Endocrine', is_low['CHGA']),
    # Disabled rule, kept for reference:
    # ('Non-Endocrine (KRT19+)', is_low['CHGA'] & is_high['KRT19']),
    ('Non-Epithelial', is_high['VIM']),
]

for cell_type, cluster_selected in annotation_rules:
    clusters = df_all.index[cluster_selected].tolist()
    # Single .loc assignment instead of chained indexing (obs[col][mask] = ...), which
    # triggers SettingWithCopyWarning and is not guaranteed to write to adata_wt.obs.
    adata_wt.obs.loc[adata_wt.obs['leiden'].isin(clusters), 'wt_cell_type'] = cell_type


adata_wt.obs['wt_cell_type']= adata_wt.obs['wt_cell_type'].cat.remove_unused_categories()

In [None]:
pd.value_counts(adata_wt.obs['wt_cell_type'])

In [None]:
sc.pl.umap(adata_wt, color=['wt_cell_type','sample','leiden'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), wspace=0.5)

In [None]:
sc.tl.dendrogram(adata_wt, groupby='wt_cell_type', var_names=marker_genes, key_added='marker_gene_dendrogram')
sc.pl.DotPlot(adata_wt, var_names=marker_genes, groupby='wt_cell_type', cmap=ch_YlRd, use_raw=False, categories_order=adata_wt.uns['marker_gene_dendrogram']['categories_ordered']).style(color_on='square', dot_edge_lw=1, grid=True, dot_min=0.15, dot_edge_color=None).show()

In [None]:
sc.pl.violin(adata_wt, keys=['INS','GAP43'], groupby='wt_cell_type', rotation=90)

In [None]:
# Copy the WT annotation back onto the full object; every non-WT cell keeps 'unlabelled'.
is_wt = adata.obs['genotype'].isin(['WT'])
adata.obs['wt_cell_type'] = 'unlabelled'
# The right-hand Series is aligned to the selected rows by its index.
adata.obs.loc[is_wt, 'wt_cell_type'] = adata_wt.obs['wt_cell_type']
adata.obs['wt_cell_type'] = adata.obs['wt_cell_type'].astype('category')

In [None]:
sc.pl.umap(adata, color=['wt_cell_type'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), wspace=0.5)

# Integration

In [None]:
batch_key = 'batch'
labels_key = 'initial_cell_type'

## HVG Overlap

In [None]:
adata_hvg = adata.copy()

In [None]:
adata_raw_0 = ad.AnnData(X=adata[adata.obs.batch.isin(['0'])].layers['raw_counts'])
adata_raw_1 = ad.AnnData(X=adata[adata.obs.batch.isin(['1'])].layers['raw_counts'])

In [None]:
%%R -i adata_raw_0
sce = devianceFeatureSelection(adata_raw_0, assay='X')

In [None]:
binomial_deviance_0 = ro.r('rowData(sce)$binomial_deviance').T

In [None]:
idx = binomial_deviance_0.argsort()[-4000:]
mask = np.zeros(adata_hvg.var_names.shape, dtype=bool)
mask[idx] = True

adata_hvg.var['highly_deviant_0'] = mask

In [None]:
%%R -i adata_raw_1
sce = devianceFeatureSelection(adata_raw_1, assay='X')

In [None]:
binomial_deviance_1 = ro.r('rowData(sce)$binomial_deviance').T

In [None]:
idx = binomial_deviance_1.argsort()[-4000:]
mask = np.zeros(adata_hvg.var_names.shape, dtype=bool)
mask[idx] = True

adata_hvg.var['highly_deviant_1'] = mask

In [None]:
hvg_list = list(set(adata_hvg.var[adata_hvg.var['highly_deviant_0']].index).intersection(set(adata_hvg.var[adata_hvg.var['highly_deviant_1']].index)))

In [None]:
# Get HVGs and overlap with cell cycle & ambient genes
# NOTE(review): `all_cc_genes` is not defined in any visible cell of this notebook; it must
# already exist in the kernel when this cell runs — confirm where it is loaded.

## HVGs
adata_hvg.var['highly_variable'] = adata_hvg.var_names.isin(hvg_list)
hvgs = pd.Series(adata_hvg.var_names[adata_hvg.var['highly_variable']])
# Count the True entries directly; value_counts()[1] depended on the key 1 resolving to the
# True label and raised when no gene was flagged.
print('\nHighly variable genes before filtering:', int(adata_hvg.var['highly_variable'].sum()))

# overlap HVGs with CC genes
hvcc = list(hvgs[hvgs.isin(all_cc_genes)])
print('\nHighly variable cell cycle genes:',len(hvcc),'\n',hvcc)

# overlap HVGs with ambient genes
# Gene names are read straight from .var_names instead of building an AnnData view first.
ambient_genes = adata_hvg.var_names[adata_hvg.var['is_ambient'].eq(True).to_numpy()]
hvambi = list(hvgs[hvgs.isin(ambient_genes)])
print('\nHighly variable ambient genes:',len(hvambi),'\n',hvambi)

# remove cell cycle genes
adata_hvg.var.loc[hvcc,'highly_variable'] = False

# # remove ambient genes
# adata_hvg.var.loc[hvambi,'highly_variable'] = False

print('\nHighly variable genes after filtering:', int(adata_hvg.var['highly_variable'].sum()))

In [None]:
sc.pp.pca(adata_hvg, svd_solver='arpack', use_highly_variable=True)

In [None]:
sc.pp.neighbors(adata_hvg)
sc.tl.leiden(adata_hvg)

In [None]:
sc.tl.umap(adata_hvg)

In [None]:
sc.pl.umap(adata_hvg, color=['sample','leiden'] + genes, size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, color_map=ch_YlRd)

In [None]:
# One figure per genotype: all cells drawn faintly in the background, the cells of the
# current genotype drawn on top on the same axes.
# .unique() yields genotypes in order of first appearance, so the figure order is the same on
# every run (iteration order of a set of strings can change between Python sessions).
for genotype in adata_hvg.obs['genotype'].unique():
    # Colours already stored for 'initial_cell_type'.
    # Alternative fixed palette kept for reference:
    # ['#1f77b4','#7ba2cd','#becfe6', '#279e68','#7abf98','#bddfca','#d62728','#ff7f0e', '#8c564b','#aa40fc','#cd85ff', '#17becf',  '#b5bd61', '#dcddaf', '#c6c6c6', '#919191',  '#5e5e5e']
    palette = list(adata_hvg.uns['initial_cell_type_colors'])
    # Background layer (show=False returns the axes instead of displaying the figure).
    background_ax = sc.pl.umap(adata_hvg, color=['initial_cell_type'], size=5, legend_loc=None, add_outline=False, alpha=0.3, outline_width=(0.3, 0.0), ncols=4, show=False)
    # Foreground layer for the current genotype.
    sc.pl.umap(adata_hvg[adata_hvg.obs['genotype']==genotype], color=['initial_cell_type'], title= genotype, size=12, add_outline=True, alpha=0.7, outline_width=(0.3, 0.0), ncols=4, show=True, palette=palette,
                    ax=background_ax)

## SCVI Tools

In [None]:
adata_scvi = adata.copy()

In [None]:
scvi.model.SCVI.setup_anndata(adata_scvi, layer='raw_counts', batch_key=batch_key)

In [None]:
model_scvi = scvi.model.SCVI(adata_scvi, n_hidden=512, n_layers=2, n_latent=30, gene_likelihood='nb', dispersion='gene-batch')
print(model_scvi)
model_scvi.view_anndata_setup()

In [None]:
model_scvi.train()

In [None]:
adata_scvi.obsm['X_scVI'] = model_scvi.get_latent_representation()

In [None]:
sc.pp.neighbors(adata_scvi, use_rep='X_scVI')
sc.tl.leiden(adata_scvi)

In [None]:
sc.tl.umap(adata_scvi)

In [None]:
sc.pl.umap(adata_scvi, color=['sample','leiden'] + marker_genes, size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4)

In [None]:
# One figure per genotype: all cells drawn faintly in the background, the cells of the
# current genotype drawn on top on the same axes.
# .unique() yields genotypes in order of first appearance, so the figure order is the same on
# every run (iteration order of a set of strings can change between Python sessions).
for genotype in adata_scvi.obs['genotype'].unique():
    # Colours already stored for 'initial_cell_type'.
    # Alternative fixed palette kept for reference:
    # ['#1f77b4','#7ba2cd','#becfe6', '#279e68','#7abf98','#bddfca','#d62728','#ff7f0e', '#8c564b','#aa40fc','#cd85ff', '#17becf',  '#b5bd61', '#dcddaf', '#c6c6c6', '#919191',  '#5e5e5e']
    palette = list(adata_scvi.uns['initial_cell_type_colors'])
    # Background layer (show=False returns the axes instead of displaying the figure).
    background_ax = sc.pl.umap(adata_scvi, color=['initial_cell_type'], size=5, legend_loc=None, add_outline=False, alpha=0.3, outline_width=(0.3, 0.0), ncols=4, show=False)
    # Foreground layer for the current genotype.
    sc.pl.umap(adata_scvi[adata_scvi.obs['genotype']==genotype], color=['initial_cell_type'], title= genotype, size=12, add_outline=True, alpha=0.7, outline_width=(0.3, 0.0), ncols=4, show=True, palette=palette,
                    ax=background_ax)

## scANVI

In [None]:
adata_scanvi = adata.copy()

In [None]:
model_scanvi = scvi.model.SCANVI.from_scvi_model(
    model_scvi,
    adata=adata_scanvi, 
    labels_key=labels_key, 
    unlabeled_category='unlabelled'
)
print(model_scanvi)
model_scanvi.view_anndata_setup()

In [None]:
model_scanvi.train(max_epochs=30)

In [None]:
adata_scanvi.obsm['X_scANVI'] = model_scanvi.get_latent_representation()

In [None]:
sc.pp.neighbors(adata_scanvi, use_rep='X_scANVI')
sc.tl.leiden(adata_scanvi)

In [None]:
sc.tl.umap(adata_scanvi)

In [None]:
sc.pl.umap(adata_scanvi, color=['sample','leiden'] + marker_genes, size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4)

In [None]:
# One figure per genotype: all cells drawn faintly in the background, the cells of the
# current genotype drawn on top on the same axes.
# .unique() yields genotypes in order of first appearance, so the figure order is the same on
# every run (iteration order of a set of strings can change between Python sessions).
for genotype in adata_scanvi.obs['genotype'].unique():
    # Colours already stored for 'initial_cell_type'.
    # Alternative fixed palette kept for reference:
    # ['#1f77b4','#7ba2cd','#becfe6', '#279e68','#7abf98','#bddfca','#d62728','#ff7f0e', '#8c564b','#aa40fc','#cd85ff', '#17becf',  '#b5bd61', '#dcddaf', '#c6c6c6', '#919191',  '#5e5e5e']
    palette = list(adata_scanvi.uns['initial_cell_type_colors'])
    # Background layer (show=False returns the axes instead of displaying the figure).
    background_ax = sc.pl.umap(adata_scanvi, color=['initial_cell_type'], size=5, legend_loc=None, add_outline=False, alpha=0.3, outline_width=(0.3, 0.0), ncols=4, show=False)
    # Foreground layer for the current genotype.
    sc.pl.umap(adata_scanvi[adata_scanvi.obs['genotype']==genotype], color=['initial_cell_type'], title= genotype, size=12, add_outline=True, alpha=0.7, outline_width=(0.3, 0.0), ncols=4, show=True, palette=palette,
                    ax=background_ax)

### Label Prediction

In [None]:
adata_scanvi_label = adata.copy()

In [None]:
sc.pl.umap(adata_scanvi, color=['sample','wt_cell_type'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4)

In [None]:
# Build a scANVI model from the trained scVI model, using 'wt_cell_type' as the label column;
# cells whose label is 'unlabelled' (every non-WT cell, see where adata.obs['wt_cell_type'] is filled)
# are declared as the unlabeled category.
# NOTE: this rebinds `model_scanvi`, so the scANVI model created earlier with labels_key=labels_key
# is no longer reachable under this name.
model_scanvi = scvi.model.SCANVI.from_scvi_model(
    model_scvi,
    adata=adata_scanvi_label, 
    labels_key='wt_cell_type', 
    unlabeled_category='unlabelled'
)
print(model_scanvi)
model_scanvi.view_anndata_setup()

In [None]:
model_scanvi.train(max_epochs=30)

In [None]:
adata_scanvi_label.obsm['X_scANVI'] = model_scanvi.get_latent_representation()

In [None]:
sc.pp.neighbors(adata_scanvi_label, use_rep='X_scANVI')
sc.tl.leiden(adata_scanvi_label)

In [None]:
sc.tl.umap(adata_scanvi_label)

In [None]:
sc.pl.umap(adata_scanvi_label, color=['sample','leiden'] + marker_genes, size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4)

In [None]:
adata_scanvi_label.obs['predicted_cell_type'] = model_scanvi.predict()

In [None]:
sc.pl.umap(adata_scanvi_label, color=['initial_cell_type','predicted_cell_type'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4, wspace=0.6)

In [None]:
adata_scanvi_label.obs.initial_cell_type.value_counts()

In [None]:
adata_scanvi_label.obs.predicted_cell_type.value_counts()

## Scanorama

In [None]:
adata_scanorama = adata.copy()

In [None]:
# Integrate batches with Scanorama; the neighbours call that follows reads the result from obsm['X_scanorama'].
# NOTE(review): scanorama_integrate presumably reads adata.obsm['X_pca'] by default and, as far as
# scanpy documents it, expects the cells of each batch to be contiguous in adata — confirm both hold here.
sce.pp.scanorama_integrate(adata_scanorama, key=batch_key,batch_size=50000)

In [None]:
sc.pp.neighbors(adata_scanorama, use_rep='X_scanorama')
sc.tl.leiden(adata_scanorama)

In [None]:
sc.tl.umap(adata_scanorama)

In [None]:
sc.pl.umap(adata_scanorama, color=['sample','leiden'] + marker_genes, size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4)

## Harmony

In [None]:
adata_harmony = adata.copy()

In [None]:
sce.pp.harmony_integrate(adata_harmony, key=batch_key, adjusted_basis='X_harmony')

In [None]:
sc.pp.neighbors(adata_harmony, use_rep='X_harmony')
sc.tl.leiden(adata_harmony)

In [None]:
sc.tl.umap(adata_harmony)

In [None]:
sc.pl.umap(adata_harmony, color=['sample','leiden'] + marker_genes, size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4)

# Benchmark

In [None]:
# NOTE(review): scib is imported here rather than in the notebook's import
# cell at the top; consider moving it there so dependencies are visible up front.
import scib
# Figure defaults for the benchmark plots below
sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False ,transparent=True, dpi=150, dpi_save=300)

In [None]:
# Compute the fast scIB metrics for every integration result, always against
# the same reference `adata`. Embedding-based methods pass the obsm key of
# their latent space via `embed`.
metrics_scvi = scib.metrics.metrics_fast(
    adata, adata_scvi, batch_key, labels_key, embed='X_scVI'
)
# NOTE(review): the scANVI latent space computed above was stored on
# adata_scanvi_label (obsm['X_scANVI']), while adata_scanvi is benchmarked
# here — confirm adata_scanvi carries the intended 'X_scANVI' embedding.
metrics_scanvi = scib.metrics.metrics_fast(
    adata, adata_scanvi, batch_key, labels_key, embed='X_scANVI'
)
metrics_scanorama = scib.metrics.metrics_fast(
    adata, adata_scanorama, batch_key, labels_key, embed='X_scanorama'
)
metrics_harmony = scib.metrics.metrics_fast(
    adata, adata_harmony, batch_key, labels_key, embed='X_harmony'
)
# No `embed` key for these two: the function's default representation is used
metrics_hvg = scib.metrics.metrics_fast(adata, adata_hvg, batch_key, labels_key)
# Baseline: `adata` evaluated against itself
metrics_unintegrated = scib.metrics.metrics_fast(adata, adata, batch_key, labels_key)

In [None]:
# Concatenate metrics results
# Build the comparison table in one chain: methods end up as rows and the
# selected fast metrics as columns.
metrics = (
    # One column per method, in the order the results are listed
    pd.concat(
        [
            metrics_scvi,
            metrics_scanvi,
            metrics_scanorama,
            metrics_harmony,
            metrics_hvg,
            metrics_unintegrated,
        ],
        axis='columns',
    )
    # Name each column after its method
    .set_axis(
        ['scVI', 'scANVI', 'Scanorama', 'Harmony', 'HVG Overlap', 'Unintegrated'],
        axis='columns',
    )
    # Keep only the fast metrics
    .loc[
        [
            'ASW_label',
            'ASW_label/batch',
            'PCR_batch',
            'isolated_label_silhouette',
            'graph_conn',
            "hvg_overlap",
        ],
        :,
    ]
    # Transpose: metrics become columns, methods become rows
    .T
)
metrics

In [None]:
# Min-max scale each metric column across methods: subtract the column
# minimum, then divide by the column range (max - min).
metrics_scaled = metrics.sub(metrics.min()).div(metrics.max() - metrics.min())
metrics_scaled.style.background_gradient(cmap='Blues')

In [None]:
# Unscaled metrics, shaded with a blue gradient for comparison
metrics.style.background_gradient(cmap='Blues')

In [None]:
# Summarise each method with two aggregate scores (row-wise means):
#   Batch - mean of the batch-related metrics
#   Bio   - mean of the label/biology-related metrics
metrics = metrics.assign(
    Batch=metrics[['ASW_label/batch', 'PCR_batch', 'graph_conn']].mean(axis=1),
    Bio=metrics[['ASW_label', 'isolated_label_silhouette', "hvg_overlap"]].mean(axis=1),
)
metrics.style.background_gradient(cmap='Blues')

In [None]:
# Scatter of the aggregate Batch score (x) against the Bio score (y), one
# point per integration method, both axes fixed to [0, 1].
fig, ax = plt.subplots()
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
metrics.plot.scatter(
    x='Batch',
    y='Bio',
    c=range(len(metrics)),
    # Pass the colormap by name: mpl.cm.get_cmap() was deprecated in
    # Matplotlib 3.7 and removed in 3.9, whereas pandas accepts the name directly.
    colormap='Set1',
    ax=ax,
)
# Label every point with its method name. Text is created on the explicit
# Axes (not pyplot's current-axes state) so it always lands on this figure.
anno = [
    ax.text(scores['Batch'], scores['Bio'], method, fontsize=10)
    for method, scores in metrics[['Batch', 'Bio']].iterrows()
]
# Push overlapping labels apart and connect them to their points with thin lines
adjust_text(anno, expand_points=(3,3), expand_text=(3,3), expand_objects=(3,3), precision=0.00001, lim=5000, autoalign='xy', arrowprops=dict(arrowstyle="-",  color='k',  lw=0.5), ax=ax)

In [None]:
# Overall score: weighted sum of the two aggregates (40% Batch, 60% Bio)
metrics = metrics.assign(
    Overall=0.4 * metrics["Batch"] + 0.6 * metrics["Bio"]
)
metrics.style.background_gradient(cmap="Blues")

In [None]:
# Bar chart of the Overall score per method (unscaled metrics)
metrics.plot.bar(y='Overall')

In [None]:
# Same Batch / Bio aggregates as for the unscaled table, computed on the
# min-max scaled metrics (row-wise means).
metrics_scaled = metrics_scaled.assign(
    Batch=metrics_scaled[['ASW_label/batch', 'PCR_batch', 'graph_conn']].mean(axis=1),
    Bio=metrics_scaled[['ASW_label', 'isolated_label_silhouette', "hvg_overlap"]].mean(axis=1),
)
metrics_scaled.style.background_gradient(cmap='Blues')

In [None]:
# Scatter of the scaled Batch score (x) against the scaled Bio score (y),
# one point per integration method, both axes fixed to [0, 1].
fig, ax = plt.subplots()
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
metrics_scaled.plot.scatter(
    x='Batch',
    y='Bio',
    c=range(len(metrics_scaled)),
    # Pass the colormap by name: mpl.cm.get_cmap() was deprecated in
    # Matplotlib 3.7 and removed in 3.9, whereas pandas accepts the name directly.
    colormap='Set1',
    ax=ax,
)

# Annotate each point with its method name, offset slightly from the marker
for k, v in metrics_scaled[['Batch', 'Bio']].iterrows():
    ax.annotate(
        k,
        v,
        xytext=(6, -3),
        textcoords='offset points',
        family='sans-serif',
        fontsize=12,
    )

In [None]:
# Overall score on the scaled metrics: weighted sum (40% Batch, 60% Bio)
metrics_scaled = metrics_scaled.assign(
    Overall=0.4 * metrics_scaled["Batch"] + 0.6 * metrics_scaled["Bio"]
)
metrics_scaled.style.background_gradient(cmap="Blues")

In [None]:
# Bar chart of the Overall score per method (scaled metrics)
metrics_scaled.plot.bar(y='Overall')

# Collect Results

In [131]:
# Get latent representations and collect them all on `adata`.
# scVI / scANVI embeddings are recomputed from the trained models;
# Scanorama / Harmony embeddings are copied from their working copies.
# NOTE(review): rows are assigned positionally, so this assumes the models'
# AnnData objects and the copies hold the same cells in the same order as
# `adata` — confirm.
adata.obsm['X_scVI'] = model_scvi.get_latent_representation()
adata.obsm['X_scANVI'] = model_scanvi.get_latent_representation()
adata.obsm['X_scanorama'] = adata_scanorama.obsm['X_scanorama']
adata.obsm['X_harmony'] = adata_harmony.obsm['X_harmony']

In [132]:
# Get denoised (normalized) expression from scVI and store it as layer 'scvi_counts'.
# NOTE(review): library_size=10e4 equals 1e5 (100,000), not 1e4 — confirm this
# is the intended scaling.
adata.layers['scvi_counts'] = model_scvi.get_normalized_expression(library_size=10e4, n_samples=10)

In [133]:
# Get label predictions for KO: copy the scANVI predictions onto `adata`.
# The assignment aligns on the obs index, so any cell of `adata` that is
# missing from adata_scanvi_label would receive NaN.
adata.obs['predicted_cell_type'] = adata_scanvi_label.obs['predicted_cell_type']

# Save

In [134]:
# Convert .X and any dense layers to sparse before saving; per the output
# below only the new 'scvi_counts' layer needed converting.
# (`sparsify_all_layers` is a helper defined outside this section.)
sparsify_all_layers(adata)

.X already spase...
Layer ambiguous already spase...
Layer log_raw_counts already spase...
Layer matrix already spase...
Layer raw_counts already spase...
Layer scran_counts already spase...
Layer sct_counts already spase...
Layer sct_logcounts already spase...
Layer sct_scale_data already spase...
Layer spliced already spase...
Layer unspliced already spase...
Sparsify  scvi_counts


In [135]:
# Save the integrated AnnData object.
# NOTE(review): the target is an absolute, user-specific path with no file
# extension; consider a configurable output directory so the notebook runs
# on other machines.
sc.write('/home/michi/Projects/scRNA-seq_iPSC_IGFRL-KO_Notebooks/Files/scRNA-seq_iPSC_IIR-KO_S6_adata_rmDoublets_normalized_integrated', adata)

# Session Info

In [136]:
# Record the Python package versions used for this run
session_info.show()



In [137]:
# Print the R session details (see output below); `print_r_session` is a
# helper defined outside this section.
print_r_session()

R version 4.1.1 (2021-08-10)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.5 LTS

Matrix products: default
BLAS/LAPACK: /home/michi/Software/venvs/scAnalysis_sc1.9_ad0.8_mu0.1.2_md0.2_R4.1_FVF/lib/libopenblasp-r0.3.18.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    tools     stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] clustree_0.5.0                 ggraph_2.0.6                  
 [3] ggplot2_3.3.6                  Matrix_1.4-1                  
 [5] SingleCellExperiment_1.16.0    SummarizedExperiment_1.24.0   
 [7] Biobase_2.54.0                 GenomicRa