
# Preprocessing - SCT & Scran Normalization
Michael Sterr

2021-06-02 11:41:56     


# Setup

Run following scripts before:
 * scRNA-seq_iPCS_IIR-KO_Preprocessing_Doublet-Detection_XXX_XXX

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

In [2]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
from itertools import chain
import session_info

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from matplotlib import cm
import seaborn as sb

# Analysis
import scanpy as sc

In [3]:
# Settings

## Scanpy settings
sc.settings.verbosity = 3
sc.logging.print_versions()
session_info.show()

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                         8.4.0
anyio                       NA
attr                        21.2.0
babel                       2.9.1
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
bottleneck                  1.3.2
certifi                     2022.06.15
cffi                        1.15.0
chardet                     4.0.0
charset_normalizer          2.0.7
cloudpickle                 2.0.0
colorama                    0.4.4
cupy                        10.1.0
cupy_backends               NA
cupyx                       NA
cycler                      0.10.0
cython_runtime              NA
dask                        2021.10.0
dateutil                    2.8.2
debugpy                     1.4.1
decorator                   5.1.0
defusedxml                  0.7.1
entrypoints                 0.3
fastrlock                   0.8
fsspec                      2021.10.1
google                      NA
h5py    

In [4]:
# Color maps
exec(open("/home/michi/Software/viscm/maps/michi_bk_bl_gn_yl.py").read())

In [5]:
# Plot settings
%matplotlib inline

## Directory
sc.settings.figdir='/home/michi/Projects/scRNA-seq_iPSC_IGFRL-KO_Notebooks/Figures'

## Plotting parameters
rcParams['figure.figsize']=(20,20) #rescale figures
#sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False, color_map='tab10' ,transparent=True, dpi=150, dpi_save=300)
sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False ,transparent=True, dpi=150, dpi_save=300)

## Font
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Source Sans 3']

## Grid & Ticks
rcParams['grid.alpha'] = 0
rcParams['xtick.bottom'] = True
rcParams['ytick.left'] = True

## Embed font
plt.rc('pdf', fonttype=42)

## Define new default settings
plt.rcParamsDefault = plt.rcParams

# Setup R

In [6]:
#R
import os
os.environ['R_HOME'] = '/home/michi/Software/venvs/scAnalysis_sc1.9_ad0.8_mu0.1.2_md0.2_R4.1_FVF/lib/R' #path to your R installation

import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri

## R settings

### Ignore R warning messages
#### Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

### Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

In [7]:
%%R

.libPaths()

[1] "/home/michi/Software/venvs/scAnalysis_sc1.9_ad0.8_mu0.1.2_md0.2_R4.1_FVF/lib/R/library"


In [8]:
%%R
# # Parallelization
# library("BiocParallel.FutureParam")
# register(FutureParam())
# plan(multicore, workers=8)
# options(future.globals.maxSize = 2 * 1024 ^ 3) # for 50 Gb RAM

sessionInfo()

R version 4.1.1 (2021-08-10)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.5 LTS

Matrix products: default
BLAS/LAPACK: /home/michi/Software/venvs/scAnalysis_sc1.9_ad0.8_mu0.1.2_md0.2_R4.1_FVF/lib/libopenblasp-r0.3.18.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] tools     stats     graphics  grDevices utils     datasets  methods  
[8] base     

loaded via a namespace (and not attached):
[1] compiler_4.1.1


# Functions

In [9]:
def qc_metrics(adata, ambient=True, plot=True, counts_per_gene=True, make_dense=False, genome='auto', mt_genes_path='/mnt/ssd/Resources/sus_scrofa_mt_ens101_ext.txt'):
    """\
    Calculate QC metrics.
    genome: {'auto','Mus_musculus','Homo_sapiens','Sus_scrofa'}
    mt_genes_path: Path to mitochondrial genes for sus scrofa. Tab delimited file without header and with gene symbols in column 2. default: '/mnt/ssd/Resources/sus_scrofa_mt_ens101_ext.txt'
    ambient: Requires adata.var['is_ambient'] = pd.Categorical(list(map(str,list(adata.var['ambient_genes'] > cut_off))))
    """
    
    is_ambi_key = 'is_ambient_' + adata.obs['sample'][0]
    
    if genome=='auto':
        genome = '_'.join(adata.var.loc[:,'genome'][0].split('_')[0:2])
        print('Genome is', genome)

    if make_dense:
        adata.X = adata.X.toarray()

    if counts_per_gene:
        # counts per gene
        adata.var['n_counts'] = adata.X.sum(0)

    # counts per cell
    adata.obs['n_counts'] = adata.X.sum(1)
    # log counts per cell
    adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
    # rank by counts
    adata.obs['n_counts_rank'] = adata.obs['n_counts'].rank(method='first',ascending=False)
    # genes per cell
    adata.obs['n_genes'] = (adata.X > 0).sum(1)
    # log genes per cell
    adata.obs['log_genes'] = np.log(adata.obs['n_genes'])
    # fraction of mitochondrial genes
    if (genome == 'Homo_sapiens') | (genome == 'homo_sapiens'):
        mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
        adata.obs['mt_frac'] = adata.X[:, mt_gene_mask].sum(1)/adata.obs['n_counts']

        rp_gene_mask = [gene.startswith(('RPS','RPL')) for gene in adata.var_names]
        adata.obs['rp_frac'] = adata.X[:,rp_gene_mask].sum(1) / adata.obs['n_counts']

    elif (genome == 'Mus_musculus') | (genome == 'mus_musculus'):
        mt_gene_mask = [gene.startswith('mt-') for gene in adata.var_names]
        adata.obs['mt_frac'] = adata.X[:, mt_gene_mask].sum(1)/adata.obs['n_counts']

        rp_gene_mask = [gene.startswith(('Rps','Rpl')) for gene in adata.var_names]
        adata.obs['rp_frac'] = adata.X[:,rp_gene_mask].sum(1) / adata.obs['n_counts']

    elif (genome == 'Sus_scrofa') | (genome == 'sus_scrofa'):
        mt_genes = [gene.split('-')[0] for gene in list(pd.read_csv(mt_genes_path , header=None, sep="\t")[1])]
        mt_gene_mask = adata.var_names.isin(mt_genes)
        adata.obs['mt_frac'] = adata.X[:, mt_gene_mask].sum(1)/adata.obs['n_counts']

        rp_gene_mask = [gene.startswith(('RPS','RPL')) for gene in adata.var_names]
        adata.obs['rp_frac'] = adata.X[:,rp_gene_mask].sum(1) / adata.obs['n_counts']

    if ambient:
        adata.obs['ambi_frac'] = adata.X[:,adata.var[is_ambi_key]=='True'].sum(1) / adata.obs['n_counts']

    if plot:
        sb.jointplot(
            data=adata.obs,
            x="log_counts",
            y="log_genes",
            kind="hist", bins=100, cmap="rocket_r", color="#f69c73", space=0
        )

        fig, ax1 = plt.subplots()
        ax1.scatter(x=adata.obs['n_counts_rank'], y=adata.obs['n_counts'], s=1, alpha=0.2, c='black', label='Total UMI Counts')
        ax1.scatter(x=adata.obs['n_counts_rank'], y=adata.obs['n_genes'], s=1, alpha=0.2, c='tab:green', label='Gene Counts')
        ax1.set(xscale='log', yscale='log')
        ax1.set_ylabel('Total UMI/Gene Counts')
        ax1.set_xlabel('Ranked Droplets')
        #ax1.vlines(x=[max_rank], color="black", lw=0.5).set_linestyle("--")

        ax2 = ax1.twinx()
        ax2.scatter(x=adata.obs['n_counts_rank'], y=adata.obs['mt_frac']*100, s=1, alpha=0.2, c='tab:red', label='% Mito. Counts')
        ax2.set_ylabel('%')

        fig.legend(loc='center left', fontsize='xx-small', bbox_to_anchor=(0.2, 0.35))
        
####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################


def sparsify_all_layers(adata):
    """
    Loop trough all layers and make dense matrices sparse.
    """
          
    if not sci.sparse.issparse(adata.X):
        print('Sparsify .X...')
        adata.X = sci.sparse.csr_matrix(adata.X)
    else:
        print('.X already spase...')  
        
    for layer in list(adata.layers):
        if not sci.sparse.issparse(adata.layers[layer]):
            print('Sparsify ', layer)
            adata.layers[layer] = sci.sparse.csr_matrix(adata.layers[layer])
        else:
            print('Layer', layer, 'already spase...')

####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################

            
def normalize_scran(adata, r=0.5):
    import rpy2
    import rpy2.robjects as ro
    import gc
    
    print('Normalization with Scran:')
    print('\n-----------------------------------\n\nPreprocess data... ')
    adata_pp = adata.copy()
    sc.pp.normalize_total(adata_pp, target_sum=1e4)#, exclude_highly_expressed=True) #sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
    sc.pp.log1p(adata_pp)
    sc.pp.pca(adata_pp)
    sc.pp.neighbors(adata_pp)
    sc.tl.leiden(adata_pp, key_added='groups', resolution=r) #sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)
    
    
    print('\n-----------------------------------\n\nTransfer data... ')
    ro.globalenv['data_mat'] = adata_pp.X.T
    ro.globalenv['input_groups'] = adata_pp.obs['groups']
    
    print('\n-----------------------------------\n\nCalculate size factors... ')
    ro.r('library("scran")')
    # calculate size factors
    ro.r('''
    size_factors = calculateSumFactors(data_mat, clusters=input_groups, min.mean=0.1)
    ''')
    
    print('\n-----------------------------------\n\nTransfer data... ')
    # add to andata.obs
    adata.obs['size_factors'] = ro.r['size_factors']
    
    print('\n-----------------------------------\n\nPlot results... ')
    # plot results
    sc.pl.scatter(adata, 'size_factors', 'n_counts', color='leiden')
    sc.pl.scatter(adata, 'size_factors', 'n_genes', color='leiden')

    sb.distplot(adata.obs['size_factors'], bins=100, kde=True)
    
    print('\n-----------------------------------\n\nAdd results to anndata... ')
    #Keep the count data in a counts layer
    adata.layers['raw_counts'] = adata.X.copy()

    #Logarithmize raw counts
    adata.layers['log_raw_counts'] = sc.pp.log1p(adata.layers['raw_counts'], copy=True)

    #Normalize adata 
    adata.X /= adata.obs['size_factors'].values[:,None]
    sc.pp.log1p(adata)

    #Keep the normalized count data in a counts layer
    adata.layers['scran_counts'] = adata.X.copy()
    
    # delete
    print('\n-----------------------------------\n\nClean up... ')
    del adata_pp
    gc.collect()

    
    
####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################    
    


def normalize_sct(adata, batch=None, layer=None, results_to_X=None, min_cells=None, clip_range_denominator=1, n_core=64, max_memory_gb=128):
    '''
    adata: adata object to normalize
    layer: layer to use for normalization. Default = None -> use .X
    results_to_X: Set results layer to adata.X (e.g. 'sct_logcounts')
    '''
    
    import rpy2
    import rpy2.robjects as ro
    import gc

       
    print('Normalization with SCT:')
    # load packages
    ro.globalenv['clip_range_denominator'] = clip_range_denominator
    ro.globalenv['n_core'] = n_core
    ro.globalenv['max_memory'] = max_memory_gb#/64
    ro.r('''
    print(paste0("Cores: ", n_core))
    print(paste0("Memory: ", max_memory))
    ''')
    ro.r('''
    # Packages
    library(Seurat)
    library(sctransform)
    library(SingleCellExperiment)

    # Parallelization
    library("BiocParallel.FutureParam")
    register(FutureParam())
    plan(multicore, workers=n_core)
    options(future.globals.maxSize = max_memory * 1024^3)
    ''')
    # transfer data
    print('\tTransfer data...')
    if layer is None:
        ro.globalenv['data_mat'] = adata.X.T#.toarray()
        ro.globalenv['obs_names'] = adata.obs_names
        ro.globalenv['var_names'] = adata.var_names
    else:
        print('\tNormalizing layer \'', layer,'\'...')
        ro.globalenv['data_mat'] = adata.layers[layer].T#.toarray()
        ro.globalenv['obs_names'] = adata.obs_names
        ro.globalenv['var_names'] = adata.var_names
        
    ro.r('''
    rownames(data_mat) <- var_names
    colnames(data_mat) <- obs_names
    seurat <- CreateSeuratObject(counts = data_mat, project = "0", min.cells = 0, min.features = 0)
    ''')   
    # perform sct
    print('\tPerform SCT...')
    if batch is None:
        ro.r('''
        # SCTransform
        seurat <- SCTransform(seurat, verbose = FALSE, return.only.var.genes = FALSE, variable.features.n = NULL, vst.flavor = "v2", clip.range = c(-sqrt(x = ncol(x = seurat[["RNA"]])/clip_range_denominator), sqrt(x = ncol(x = seurat[["RNA"]])/clip_range_denominator)))
        ''') 
    else:
        ro.globalenv['batch_obs'] = adata.obs[batch]
        ro.globalenv['batch_key'] = batch
        ro.r('''
        batch_df <- data.frame(batch_obs, row.names = obs_names)
        colnames(batch_df) <- batch_key
        seurat <- AddMetaData(seurat, batch_df)
        Idents(seurat) <- batch_key
        print(head(seurat@meta.data))
        ''')
        ro.r('''
        # SCTransform
        seurat <- SCTransform(seurat, batch_var=batch_key, verbose = FALSE, return.only.var.genes = FALSE, variable.features.n = NULL, vst.flavor = "v2", clip.range = c(-sqrt(x = ncol(x = seurat[["RNA"]])/clip_range_denominator), sqrt(x = ncol(x = seurat[["RNA"]])/clip_range_denominator)))
        ''') 
    # convert to singleCellExperiment
    print('\tConvert data...')
    ro.r('''
    # Add feature meta data (since Seurat v4 -> will be fixed?)
    var <- c('detection_rate','gmean', 'variance', 'residual_variance')
    seurat[["SCT"]]@meta.features <- SCTResults(seurat[["SCT"]], slot = "feature.attributes")[, var]
    seurat[["SCT"]]@meta.features$variable <- FALSE
    seurat[["SCT"]]@meta.features[VariableFeatures(seurat[["SCT"]] ), "variable"] <- TRUE
    colnames(seurat[["SCT"]]@meta.features) <- paste0("sct.", colnames(seurat[["SCT"]]@meta.features) )

    # Convert to SingleCellExperiment
    sce <- as.SingleCellExperiment(seurat)

    # Add feature meta data (since Seurat v4 -> will be fixed?)
    rowData(sce) <- seurat[["SCT"]]@meta.features

    # Rename and add layers
    SummarizedExperiment::assay(sce, i = 1) <- seurat[["SCT"]]@counts
    SummarizedExperiment::assay(sce, i = 2) <- seurat[["SCT"]]@data
    SummarizedExperiment::assay(sce, i = 3) <- seurat[["SCT"]]@scale.data
    #SummarizedExperiment::assay(sce, i = 4) <- seurat[["RNA"]]@counts
    SummarizedExperiment::assayNames(sce) <- c("sct_counts", "sct_logcounts", "sct_scale_data")#, "raw_counts")
    ''')
    
    # transfer data
    print('\tTransfer data...')
    
    # add to andata.obs
    adata_sct = ro.globalenv['sce']
    adata_sct.layers['sct_counts'] = adata_sct.X.copy()
    
    
    
    # Harmonize var_names
    ## Remove underscores
    adata.var_names = ['-'.join(var_name.split('_')) for var_name in adata.var_names]
    var_adata = set(adata.var_names)
    var_sct = set(adata_sct.var_names)
    var_intersect = list(var_adata.intersection(var_sct))
    # Subset adata
    adata = adata[:,var_intersect]
    adata_sct = adata_sct[:,var_intersect]
    
    # Add SCT data
    print('\tAdd results to anndata...')
    adata.layers['sct_counts'] = adata_sct.layers['sct_counts'].copy()
    adata.layers['sct_logcounts'] = adata_sct.layers['sct_logcounts'].copy()
    adata.layers['sct_scale_data'] = adata_sct.layers['sct_scale_data'].copy()
    adata.var[['sct.detection_rate', 'sct.gmean', 'sct.variance', 'sct.residual_variance', 'sct.variable']] = adata_sct.var[['sct.detection_rate', 'sct.gmean', 'sct.variance', 'sct.residual_variance', 'sct.variable']].copy()

    if results_to_X is not None:
        print('\tSet',results_to_X,' anndata.X...')
        adata.X = adata.layers[results_to_X].copy()
        
    # Set HVGs
    print('\tSet HVGs...')
    adata.var.loc[:,'highly_variable'] = [bool(i) for i in adata_sct.var['sct.variable']]
    #hvgs = pd.Series(adata.var['sct.variable'][adata.var['sct.variable'] > 0].index) # use HVGs from sct
    #adata.var['highly_variable']= False
    #adata.var.loc[hvgs,'highly_variable'] = True
    
    if min_cells is not None:
        # Filter genes: Min 20 cells - filters out 0 count genes
        print('\tFilter genes...')
        sc.pp.filter_genes(adata, min_cells=min_cells)
    
    # delete
    ro.r('''
    rm(list = ls())
    gc()
    ''')
      
    del adata_sct
    gc.collect()
    
    return adata

####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################
####################################################################################################################################################################################################################################

 

    
def print_r_session():
    ro.r('print(sessionInfo())')

# Load Data

In [10]:
adata_1=sc.read('/storage/scRNA-seq/scRNA-seq_iPSC_IGFRL-KO/cellranger/MUC18396/count_matrices/MUC18396_raw_feature_bc_matrix_filtered_markedDoublets.h5ad')
adata_2=sc.read('/storage/scRNA-seq/scRNA-seq_iPSC_IGFRL-KO/cellranger/MUC18397/count_matrices/MUC18397_raw_feature_bc_matrix_filtered_markedDoublets.h5ad')

In [11]:
adata_1_velo=sc.read_loom('/storage/scRNA-seq/scRNA-seq_iPSC_IGFRL-KO/velocyto/MUC18396/possorted_genome_bam_5KIVH.loom')
adata_2_velo=sc.read_loom('/storage/scRNA-seq/scRNA-seq_iPSC_IGFRL-KO/velocyto/MUC18397/possorted_genome_bam_7YBY3.loom')

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


## Add Velocyto Results

In [13]:
adata_1_velo.obs_names = [name[27:43] + '-1' for name in adata_1_velo.obs_names]
adata_1_velo.var_names_make_unique()

adata_2_velo.obs_names = [name[27:43] + '-1' for name in adata_2_velo.obs_names]
adata_2_velo.var_names_make_unique()

In [14]:
adata_1.layers = adata_1_velo[adata_1.obs_names,adata_1.var_names].layers.copy()
adata_2.layers = adata_2_velo[adata_2.obs_names,adata_2.var_names].layers.copy()

## Concatenate Samples, Filter Genes & Doublets, & Save

In [15]:
# Concatenate
adata=adata_1.concatenate(adata_2).copy()

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [16]:
# Ambient genes
ambi_cols = adata.var.columns[[column.startswith('is_ambient_') for column in adata.var.columns]]
ambi_bool = [False] * adata.var.shape[0] #np.array()
for col in ambi_cols:
    ambi_bool = list(np.add(ambi_bool, [string in ('True') for string in adata.var[ambi_cols].loc[:,col]]))
    
adata.var['is_ambient'] = ambi_bool

In [17]:
get_umap_leiden(adata)

normalizing counts per cell
    finished (0:00:00)
computing PCA
    with n_comps=50
    finished (0:00:27)
computing neighbors
    using 'X_pca' with n_pcs = 50


2022-10-05 13:22:35.672547: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:01:25)
running Leiden clustering
    finished: found 11 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:04)


In [18]:
#Filter genes:
print('Total number of genes: {:d}'.format(adata.n_vars))

# Min 20 cells - filters out 0 count genes
sc.pp.filter_genes(adata, min_cells=20)
print('Number of genes after cell filter: {:d}'.format(adata.n_vars))

Total number of genes: 17255
filtered out 1596 genes that are detected in less than 20 cells
Number of genes after cell filter: 15659


In [None]:
sc.pl.umap(adata, color=['sample','reporter','n_counts','log_counts','n_genes','log_genes','mt_frac','rp_frac'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4)

In [None]:
sc.pl.umap(adata, color=['final_doublets_cat','doublet_calls'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=2)

In [None]:
# Number of doublet calls calls
print(adata.obs['doublet_calls'].value_counts())

In [None]:
# Number of final doublets
print('Number of doublets:')
print(adata.obs['final_doublets'].value_counts())

# Percentage:
print('\nOverall doublet rate:\t\t',round(adata.obs['final_doublets'].value_counts()[1]/len(adata.obs['final_doublets'])*100,2),'%')

for sample in adata.obs['sample'].cat.categories:
    print(f"\n{sample} doublet rate:\t\t{round(adata.obs['final_doublets'][adata.obs['sample']==sample].value_counts()[1]/len(adata.obs['final_doublets'][adata.obs['sample']==sample])*100,2)} % ")

In [None]:
# Annotate the data sets
print(adata.obs['sample'].value_counts())

# Checking the total size of the data set
adata.shape

In [None]:
# Save
sc.write('/home/michi/Projects/scRNA-seq_iPSC_IGFRL-KO_Notebooks/Files/scRNA-seq_IIR-KO_S6_adata_markedDoublets', adata)

In [None]:
adata_rmD = adata[adata.obs['final_doublets'] == False].copy()

In [None]:
# Annotate the data sets
print(adata_rmD.obs['sample'].value_counts())

# Checking the total size of the data set
adata_rmD.shape

In [None]:
# Save
sc.write('/home/michi/Projects/scRNA-seq_iPSC_IGFRL-KO_Notebooks/Files/scRNA-seq_IIR-KO_S6_adata_rmDoublets', adata_rmD)

# Doublets Removed

In [None]:
adata = adata_rmD.copy()

## Normalization with Scran

In [None]:
#adata.layers['raw_counts'] = adata.X.copy()

In [None]:
normalize_scran(adata)

## Normalization with Seurat SCT 


In [None]:
adata = normalize_sct(adata, layer='raw_counts', results_to_X='sct_logcounts')

In [None]:
# Get HVGs and overlap with cell cycle & ambient genes

## HVGs from SCT
hvgs = pd.Series(adata.var_names[adata.var['highly_variable']])
print('\nHighly variable genes before filtering:',adata.var.loc[:,'highly_variable'].value_counts()[1])

# overlap HVGs with CC genes
hvcc = list(hvgs[hvgs.isin(all_cc_genes)])
print('\nHighly variable cell cycle genes:',len(hvcc),'\n',hvcc)

# overlap HVGs with ambient genes
hvambi = list(hvgs[hvgs.isin(list(adata[:,adata.var['is_ambient'] == True].var_names))])
print('\nHighly variable ambient genes:',len(hvambi),'\n',hvambi)

# remove cell cycle genes
adata.var.loc[hvcc,'highly_variable'] = False

# remove ambient genes
adata.var.loc[hvambi,'highly_variable'] = False

print('\nHighly variable genes after filtering:',adata.var.loc[:,'highly_variable'].value_counts()[1])

In [None]:
# Calc umap
sc.pp.pca(adata, svd_solver='arpack', use_highly_variable=True)
sc.pp.neighbors(adata)
sc.tl.leiden(adata, resolution=0.5)

sc.tl.umap(adata)

In [None]:
fig, ax = plt.subplots()
ax.scatter(x=adata.var['sct.gmean'], y=adata.var['sct.residual_variance'], c=adata.var['highly_variable'], s=1, alpha=0.8)
#ax.set_yscale('log')
ax.set_ylim((0,500)) 
ax.set_xscale('log')
ax.set_xlabel('Mean UMI Counts')
ax.set_ylabel('Residual Variance')

In [None]:
fig, ax = plt.subplots()
ax.scatter(x=adata.var['sct.gmean'], y=adata.var['sct.residual_variance'], c=adata.var['highly_variable'], s=1, alpha=0.8)
ax.set_yscale('log')
#ax.set_ylim((0,500)) 
ax.set_xscale('log')
ax.set_xlabel('Mean UMI Counts')
ax.set_ylabel('Residual Variance')

In [None]:
qc_metrics(adata, ambient=False, make_dense=True)

## Nomalization Results

In [None]:
genes = ['INS','GCG','TPH1','SST','ARX','NKX6-1','LMX1A','LMX1B','SLC18A1','ASCL1','GAP43']

In [None]:
sc.pl.umap(adata, color=['n_counts', 'log_counts','n_genes','log_genes','mt_frac','rp_frac','sample','leiden'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4)

In [None]:
sc.pl.umap(adata, color=['sample','leiden'] + genes, size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4)

In [None]:
sc.pl.violin(adata, use_raw=False, keys=['n_counts', 'log_counts','n_genes','log_genes','mt_frac','rp_frac'], groupby='leiden', rotation=90)

In [None]:
sc.pl.violin(adata, use_raw=False, keys=genes, groupby='leiden', layer='raw_counts', rotation=90)

In [None]:
sc.pl.violin(adata, use_raw=False, keys=genes, groupby='leiden', layer='sct_counts', rotation=90)

In [None]:
sc.pl.violin(adata, use_raw=False, keys=genes, groupby='leiden', layer='sct_logcounts', rotation=90)

In [None]:
sc.pl.violin(adata, use_raw=False, keys=genes, groupby='leiden', layer='scran_counts', rotation=90)

In [None]:
sc.pl.violin(adata, use_raw=False, keys=genes, groupby='leiden', layer='sct_scale_data', rotation=90)

In [None]:
#genes = ['Lgr5','Sis','Pou2f3','Spdef','Defa24','Chga','Neurog3','Tph1','Isl1','Foxa2']
for gene in genes:
    df = pd.DataFrame({'sct':list(chain.from_iterable(adata[:,gene].layers['sct_logcounts'].toarray())), 'scran':list(chain.from_iterable(adata[:,gene].layers['scran_counts'].toarray())), 'raw':list(chain.from_iterable(adata[:,gene].layers['log_raw_counts'].toarray())), 'leiden':list(adata.obs['leiden'].astype(int))})
    df = df.sort_values(by=['leiden'])
    df.loc[:,'leiden'] = df.loc[:,'leiden'].astype('category')
    lims_x = []
    lims_y = []
    lims_line = []

    fig, axs = plt.subplots(1, 3, constrained_layout=True, figsize=(10, 3))
    # Plots
    axs[0].scatter(df.loc[:,'sct'], y=df.loc[:,'scran'], s=2, alpha=0.2, c=df.loc[:,'leiden'], cmap=ListedColormap(adata.uns['leiden_colors']))
    axs[1].scatter(df.loc[:,'raw'], y=df.loc[:,'sct'], s=2, alpha=0.2, c=df.loc[:,'leiden'], cmap=ListedColormap(adata.uns['leiden_colors']))
    axs[2].scatter(df.loc[:,'raw'], y=df.loc[:,'scran'], s=2, alpha=0.2, c=df.loc[:,'leiden'], cmap=ListedColormap(adata.uns['leiden_colors']))

    # Aesthetics
    for i,ax in enumerate(axs):
        lims_x.append(ax.get_xlim())
        lims_y.append(ax.get_ylim())
        lims_line.append([np.min([ax.get_xlim(), ax.get_ylim()]), np.max([ax.get_xlim(), ax.get_ylim()])])

    axs[0].set_xlabel('SCT Normalized')
    axs[0].set_ylabel('SCRAN Normalized')
    #axs[0].set_aspect('equal')
    axs[0].plot(lims_line[0], lims_line[0], 'k-', alpha=1, zorder=0, ls='--', lw=1)
    axs[0].set_xlim(lims_x[0])
    axs[0].set_ylim(lims_y[0])

    axs[1].set_xlabel('Raw')
    axs[1].set_ylabel('SCT Normalized')
    axs[1].set_title(gene, fontweight='bold')
    #axs[1].set_aspect('equal')
    axs[1].plot(lims_line[1], lims_line[1], 'k-', alpha=1, zorder=0, ls='--', lw=1)
    axs[1].set_xlim(lims_x[1])
    axs[1].set_ylim(lims_y[1])

    axs[2].set_xlabel('Raw')
    axs[2].set_ylabel('SCRAN Normalized')
    #axs[2].set_aspect('equal')
    axs[2].plot(lims_line[2], lims_line[2], 'k-', alpha=1, zorder=0, ls='--', lw=1)
    axs[2].set_xlim(lims_x[2])
    axs[2].set_ylim(lims_y[2])

    plt.show()

## Save

In [68]:
sparsify_all_layers(adata)

Sparsify .X...
Layer matrix already spase...
Layer ambiguous already spase...
Layer spliced already spase...
Layer unspliced already spase...
Layer raw_counts already spase...
Layer log_raw_counts already spase...
Sparsify  scran_counts
Layer sct_counts already spase...
Layer sct_logcounts already spase...
Sparsify  sct_scale_data


In [69]:
# Save
sc.write('/home/michi/Projects/scRNA-seq_iPSC_IGFRL-KO_Notebooks/Files/scRNA-seq_iPSC_IIR-KO_S6_adata_rmDoublets_normalized', adata)

# Session Info

In [70]:
session_info.show()

In [71]:
print_r_session()

R version 4.1.1 (2021-08-10)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.5 LTS

Matrix products: default
BLAS/LAPACK: /home/michi/Software/venvs/scAnalysis_sc1.9_ad0.8_mu0.1.2_md0.2_R4.1_FVF/lib/libopenblasp-r0.3.18.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    tools     stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] BiocParallel.FutureParam_0.2.1 BiocParallel_1.28.3           
 [3] future_1.27.0                  sctransform_0.3.3             
 [5] sp_1.5-0                       SeuratObject_4.1.0            
 [7] Seurat_4.1.1                   scran_1.2