
# Preprocessing - QC & Filtering - WT S6

Michael Sterr

2022-09-15 16:30:36 

# Setup

In [1]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from matplotlib.pyplot import rc_context
import seaborn as sb

# Analysis
import scanpy as sc
import scvi

Global seed set to 0


In [2]:
# Display options
# `display`/`HTML` are imported from the public `IPython.display` module; the
# `IPython.core.display` path used previously is a deprecated alias for them.
from IPython.display import display, HTML
# Widen the classic-notebook container to 85 % of the browser window.
display(HTML("<style>.container { width:85% !important; }</style>"))
#pd.set_option("display.max_rows", None)

# Render every DataFrame output as an interactive itables table.
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

<IPython.core.display.Javascript object>

In [3]:
# Settings

## Scanpy settings
# Logging level for scanpy; the progress messages captured further down
# ("normalizing counts per cell", "computing PCA", ...) are emitted at this level.
sc.settings.verbosity = 3
# Record the versions of the loaded packages in the notebook output.
sc.logging.print_versions()

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                         8.4.0
absl                        NA
anyio                       NA
astunparse                  1.6.3
attr                        21.2.0
babel                       2.9.1
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
bottleneck                  1.3.2
certifi                     2022.06.15
cffi                        1.15.0
chardet                     4.0.0
charset_normalizer          2.0.7
chex                        0.1.1
cloudpickle                 2.0.0
colorama                    0.4.4
cupy                        10.1.0
cupy_backends               NA
cupyx                       NA
cycler                      0.10.0
cython_runtime              NA
dask                        2021.10.0
dateutil                    2.8.2
debugpy                     1.4.1
decorator                   5.1.0
defusedxml                  0.7.1
deprecate                   0.3.1
docrep  

In [4]:
# Color maps
# Executes a local Python file that is expected to define `michi_bk_bl_gn_yl`
# (used as `color_map` in the scatter plots below).
# NOTE(review): `exec` runs arbitrary code from that path — only use with a trusted file.
# The context manager closes the file handle, which the bare `open(...).read()` did not.
with open("/home/michi/Software/viscm/maps/michi_bk_bl_gn_yl.py") as cmap_file:
    exec(cmap_file.read())

In [5]:
# Plot settings
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# %config InlineBackend.figure_format='retina'

## Directory
# Folder scanpy writes saved figures to.
# NOTE(review): this path names a different project (Gut / Wnt-PCP) than the data
# paths used below (iPSC IGFRL-KO) — confirm this is the intended figure folder.
sc.settings.figdir='/home/michi/Projects/scRNA-seq_Gut_Maren_Wnt-PCP_Böttcher-et-al_Notebooks/Figures'

## Plotting parameters
rcParams['figure.figsize']=(4,4) #rescale figures
#sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False, color_map='tab10' ,transparent=True, dpi=150, dpi_save=300)
# NOTE(review): set_figure_params runs after the figsize assignment above and may
# override it — confirm the resulting figure size is the one wanted.
sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False ,transparent=True, dpi=150, dpi_save=300)

## Font
# The font overrides are applied after set_figure_params so they are not reset by it.
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Source Sans 3']

## Grid & Ticks
# Grid lines are made fully transparent; bottom/left tick marks are switched on.
rcParams['grid.alpha'] = 0
rcParams['xtick.bottom'] = True
rcParams['ytick.left'] = True

## Embed font
# Sets rcParams['pdf.fonttype'] = 42 for saved PDF figures.
plt.rc('pdf', fonttype=42)

## Define new default settings
# NOTE(review): this binds the name `plt.rcParamsDefault` to the live `plt.rcParams`
# object itself (no copy is made), so later rcParams edits also change this "default".
# Confirm that whatever restores defaults later actually reads this attribute.
plt.rcParamsDefault = plt.rcParams

# Load Data

In [6]:
# Load the count matrix of sequencing run MUC18396 (raw feature/barcode matrix, per the file name).
adata = sc.read(
    '/storage/scRNA-seq/scRNA-seq_iPSC_IGFRL-KO/cellranger/MUC18396/count_matrices/MUC18396_raw_feature_bc_matrix.h5ad'
)

# Constant per-sample annotations: every barcode gets the same value, so a scalar
# is broadcast over the column instead of building a list of n_obs copies.
for _obs_column, _obs_value in (
    ('sample', 'S6_WT'),
    ('sample_name', 'S6 WT'),
    ('seq_id', 'MUC18396'),
    ('int_id', '394'),
    ('reporter', 'INS-mCherry'),
    ('stage', 'S6'),
    ('genotype', 'WT'),
):
    adata.obs[_obs_column] = _obs_value

# adata.X = adata.X.toarray()
# Gene symbols must be unique before they are used as .loc labels further down.
adata.var_names_make_unique()

  utils.warn_names_duplicates("var")


In [7]:
# All per-sample tables live in the same count-matrix folder.
_count_matrix_dir = '/storage/scRNA-seq/scRNA-seq_iPSC_IGFRL-KO/cellranger/MUC18396/count_matrices/'

# DropletUtils output
cell_barcodes_du = pd.read_csv(_count_matrix_dir + 'MUC18396_DropletUtils_CellBarcodes.csv')
cell_probs = pd.read_csv(_count_matrix_dir + 'MUC18396_DropletUtils_LogProbabilities.csv')
ambient_genes = pd.read_csv(_count_matrix_dir + 'MUC18396_DropletUtils_AmbientGenes.csv')

# CellRanger cell barcodes: header-less single-column file, read into column 'x'
# and kept as a Series of barcode strings.
cell_barcodes_cr = pd.Series(
    pd.read_csv(_count_matrix_dir + 'filtered_feature_bc_matrix/barcodes.tsv', names=['x']).loc[:, 'x']
)

# Functions

In [8]:
# Functions
def get_umap_leiden(adata, resolution=0.5, exclude_highly_expressed=False):
    """Compute a quick Leiden clustering and UMAP embedding for ``adata``.

    All preprocessing runs on a copy, so ``adata.X`` is left untouched. Only two
    results are written back to ``adata``:

    * ``adata.obs['leiden']`` – cluster labels
    * ``adata.obsm['X_umap']`` – UMAP coordinates

    Parameters
    ----------
    adata
        Annotated data matrix to cluster and embed.
    resolution
        Passed to ``sc.tl.leiden``.
    exclude_highly_expressed
        Passed to ``sc.pp.normalize_total``.
    """
    scratch = adata.copy()

    # Steps are applied to the scratch copy strictly in this order.
    pipeline = (
        (sc.pp.normalize_total, dict(target_sum=1e4,
                                     exclude_highly_expressed=exclude_highly_expressed,
                                     key_added='size_factor')),
        (sc.pp.log1p, {}),
        (sc.pp.pca, {}),
        (sc.pp.neighbors, dict(metric='correlation')),
        (sc.tl.leiden, dict(resolution=resolution)),
        (sc.tl.umap, {}),
    )
    for step, options in pipeline:
        step(scratch, **options)

    # Copy back only the clustering and the embedding.
    adata.obs['leiden'] = scratch.obs['leiden'].copy()
    adata.obsm['X_umap'] = scratch.obsm['X_umap'].copy()

###################################################################################################################
###################################################################################################################
###################################################################################################################
    
    
def add_droplet_utils_results(adata, ambient_probs=None, cell_probs=None):
    """Attach DropletUtils results to ``adata`` in place.

    Parameters
    ----------
    adata
        Object exposing ``.obs`` and ``.var`` DataFrames. ``adata.obs['sample']``
        must exist when ``ambient_probs`` is given: its first entry is used as
        the suffix of the ambient column name.
    ambient_probs
        DataFrame with gene names in column ``'Unnamed: 0'`` and the per-gene
        values in column ``'ambient_genes'``. Skipped when ``None``.
    cell_probs
        DataFrame with columns ``'barcodes'`` and ``'cell_probs'``.
        Skipped when ``None``.

    Side effects
    ------------
    * ``adata.var['ambient_genes_<sample>']`` – value from ``ambient_probs`` for
      genes present in both tables, ``0.0`` for every other gene.
    * ``adata.obs['log_cell_probs']`` – value from ``cell_probs`` for barcodes
      present in both tables, NaN for every other barcode.

    The input DataFrames are not modified.
    """
    if ambient_probs is not None:
        # .iloc[0]: the obs index holds barcode strings, so `[0]` would be a
        # positional fallback that newer pandas versions no longer support.
        ambi_key = 'ambient_genes_' + adata.obs['sample'].iloc[0]
        ambient = pd.Series(ambient_probs['ambient_genes'].values,
                            index=ambient_probs['Unnamed: 0'].values)
        # Ordered Index of shared genes (not a set: pandas 2 rejects set indexers).
        shared_genes = adata.var.index[adata.var.index.isin(ambient.index)]
        adata.var[ambi_key] = 0.0
        adata.var.loc[shared_genes, ambi_key] = ambient.loc[shared_genes].values

    if cell_probs is not None:
        cell_probs_key = 'log_cell_probs'
        probs = pd.Series(cell_probs['cell_probs'].values,
                          index=cell_probs['barcodes'].values)
        shared_barcodes = adata.obs.index[adata.obs.index.isin(probs.index)]
        # Make sure the column exists even when no barcode is shared.
        if cell_probs_key not in adata.obs.columns:
            adata.obs[cell_probs_key] = np.nan
        adata.obs.loc[shared_barcodes, cell_probs_key] = probs.loc[shared_barcodes].values

###################################################################################################################
###################################################################################################################
###################################################################################################################
    
def set_ambient_threshold(adata, threshold=0.0005, lower_limit=0.0002, upper_limit=0.002, bins=60, kde=True):
    """Plot the ambient-gene score distribution and flag genes above ``threshold``.

    Reads ``adata.var['ambient_genes_<sample>']`` (``<sample>`` is the first entry
    of ``adata.obs['sample']``) and writes ``adata.var['is_ambient_<sample>']`` as
    a categorical of the strings ``'True'`` / ``'False'``.

    Parameters
    ----------
    threshold
        Genes with a score strictly above this value are flagged ``'True'``;
        also drawn as a vertical line.
    lower_limit, upper_limit
        Only scores strictly inside this window are shown in the histogram;
        they do not affect which genes are flagged.
    bins, kde
        Histogram bin count and whether to overlay a kernel density estimate.
    """
    # .iloc[0]: positional access that does not depend on the barcode index labels.
    sample = adata.obs['sample'].iloc[0]
    ambi_key = 'ambient_genes_' + sample
    is_ambi_key = 'is_ambient_' + sample

    scores = adata.var[ambi_key]
    above = scores > threshold
    in_window = (scores > lower_limit) & (scores < upper_limit)

    # histplot replaces the deprecated distplot. distplot normalised the histogram
    # only when a KDE was drawn, hence the conditional `stat`; `cut=3` lets the KDE
    # extend past the data range as distplot did.
    sb.histplot(scores[in_window], bins=bins, kde=kde,
                stat='density' if kde else 'count',
                kde_kws=dict(cut=3), alpha=0.4, edgecolor=(1, 1, 1, 0.4))
    plt.axvline(threshold, 0, 1)
    plt.title(label='Ambient Genes Threshold (' + str(int(above.sum())) + ' Genes)', fontweight='bold')

    # String categories (not booleans) because qc_metrics compares against 'True'.
    adata.var[is_ambi_key] = pd.Categorical(np.where(above, 'True', 'False'))

###################################################################################################################
###################################################################################################################
###################################################################################################################
    

def _flat_sum(matrix, axis):
    """Sum ``matrix`` along ``axis`` and return a flat 1-D ndarray.

    ``np.asarray(...).ravel()`` collapses the 2-D result that scipy sparse matrices
    return from ``.sum(axis)``, so dense and sparse inputs give the same shape.
    """
    return np.asarray(matrix.sum(axis)).ravel()


def qc_metrics(adata, ambient=True, plot=True, counts_per_gene=True, make_dense=False, genome='auto', mt_genes_path='/mnt/ssd/Resources/sus_scrofa_mt_ens101_ext.txt'):
    """\
    Calculate QC metrics in place.

    Written columns
    ---------------
    ``adata.var['n_counts']`` (only if ``counts_per_gene``), and in ``adata.obs``:
    ``n_counts``, ``log_counts``, ``n_counts_rank`` (rank 1 = most counts),
    ``n_genes``, ``log_genes``, ``mt_frac``, ``rp_frac`` and, if ``ambient``,
    ``ambi_frac``. Barcodes with zero counts get ``-inf`` logs and NaN fractions.

    Parameters
    ----------
    ambient: Requires ``adata.var['is_ambient_<sample>']`` holding the strings
        'True'/'False' (as written by ``set_ambient_threshold``), where ``<sample>``
        is the first entry of ``adata.obs['sample']``.
    plot: Draw a log-counts/log-genes joint histogram and a ranked-droplet plot.
    counts_per_gene: Also store the total counts per gene.
    make_dense: Convert ``adata.X`` to a dense array first (no-op if already dense).
    genome: {'auto','Mus_musculus','Homo_sapiens','Sus_scrofa'} (case-insensitive).
        'auto' takes the first two '_'-separated tokens of the first entry of
        ``adata.var['genome']``. For any other value ``mt_frac``/``rp_frac`` are
        skipped with a printed message.
    mt_genes_path: Path to mitochondrial genes for sus scrofa. Tab delimited file
        without header and with gene symbols in column 2.
        default: '/mnt/ssd/Resources/sus_scrofa_mt_ens101_ext.txt'
    """
    if genome == 'auto':
        # .iloc[0]: positional access independent of the gene-name index.
        genome = '_'.join(adata.var['genome'].iloc[0].split('_')[0:2])
        print('Genome is', genome)

    # Only sparse matrices have .toarray(); a dense X is left as it is.
    if make_dense and hasattr(adata.X, 'toarray'):
        adata.X = adata.X.toarray()

    if counts_per_gene:
        # counts per gene
        adata.var['n_counts'] = _flat_sum(adata.X, 0)

    # counts per cell
    adata.obs['n_counts'] = _flat_sum(adata.X, 1)
    # log counts per cell
    adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
    # rank by counts
    adata.obs['n_counts_rank'] = adata.obs['n_counts'].rank(method='first', ascending=False)
    # genes per cell
    adata.obs['n_genes'] = _flat_sum(adata.X > 0, 1)
    # log genes per cell
    adata.obs['log_genes'] = np.log(adata.obs['n_genes'])

    def _fraction(gene_mask):
        """Per-cell share of counts that falls on the genes selected by ``gene_mask``."""
        selected = adata.X[:, np.asarray(gene_mask, dtype=bool)]
        return _flat_sum(selected, 1) / adata.obs['n_counts']

    # Gene-symbol prefixes of ribosomal protein genes per supported genome.
    rp_prefixes = {
        'homo_sapiens': ('RPS', 'RPL'),
        'mus_musculus': ('Rps', 'Rpl'),
        'sus_scrofa': ('RPS', 'RPL'),
    }
    genome_key = genome.lower()

    # fraction of mitochondrial genes
    if genome_key == 'homo_sapiens':
        mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
    elif genome_key == 'mus_musculus':
        mt_gene_mask = [gene.startswith('mt-') for gene in adata.var_names]
    elif genome_key == 'sus_scrofa':
        # Symbols come from an external table; anything after the first '-' is dropped.
        mt_genes = [gene.split('-')[0] for gene in pd.read_csv(mt_genes_path, header=None, sep="\t")[1]]
        mt_gene_mask = adata.var_names.isin(mt_genes)
    else:
        mt_gene_mask = None
        print("Genome '{}' is not supported; skipping mt_frac and rp_frac.".format(genome))

    if mt_gene_mask is not None:
        adata.obs['mt_frac'] = _fraction(mt_gene_mask)
        rp_gene_mask = [gene.startswith(rp_prefixes[genome_key]) for gene in adata.var_names]
        adata.obs['rp_frac'] = _fraction(rp_gene_mask)

    if ambient:
        # The sample name is only needed (and only required to exist) here.
        is_ambi_key = 'is_ambient_' + adata.obs['sample'].iloc[0]
        adata.obs['ambi_frac'] = _fraction((adata.var[is_ambi_key] == 'True').values)

    if plot:
        sb.jointplot(
            data=adata.obs,
            x="log_counts",
            y="log_genes",
            kind="hist", bins=100, cmap="rocket_r", color="#f69c73", space=0
        )

        # Ranked-droplet plot: UMI and gene counts on the left (log) axis.
        fig, ax1 = plt.subplots()
        ax1.scatter(x=adata.obs['n_counts_rank'], y=adata.obs['n_counts'], s=1, alpha=0.2, c='black', label='Total UMI Counts')
        ax1.scatter(x=adata.obs['n_counts_rank'], y=adata.obs['n_genes'], s=1, alpha=0.2, c='tab:green', label='Gene Counts')
        ax1.set(xscale='log', yscale='log')
        ax1.set_ylabel('Total UMI/Gene Counts')
        ax1.set_xlabel('Ranked Droplets')

        # Mitochondrial percentage on a secondary axis, when it is available.
        if 'mt_frac' in adata.obs.columns:
            ax2 = ax1.twinx()
            ax2.scatter(x=adata.obs['n_counts_rank'], y=adata.obs['mt_frac']*100, s=1, alpha=0.2, c='tab:red', label='% Mito. Counts')
            ax2.set_ylabel('%')

        fig.legend(loc='center left', fontsize='xx-small', bbox_to_anchor=(0.2, 0.35))
        

###################################################################################################################
###################################################################################################################
###################################################################################################################
    

        
def filter_qc(adata, counts_filter=None, genes_filter=None, mito_filter=None, qc_filter=None):
    """\
    Subset ``adata`` to the cells passing QC and report how many were removed.

    counts_filter, genes_filter, mito_filter: array of booleans. E.g. genes_filter = adata.obs['n_genes'] > min_genes
        When all three are given they are combined with logical AND and
        ``qc_filter`` is ignored.
    qc_filter: array of booleans, used when the three individual filters are
        not all given.

    Returns ``adata`` indexed by the chosen boolean mask, or ``None`` (after
    printing a hint) when neither a complete set of individual filters nor
    ``qc_filter`` was supplied.
    """
    have_all_parts = (counts_filter is not None
                      and genes_filter is not None
                      and mito_filter is not None)

    if not have_all_parts and qc_filter is None:
        print('Specify QC filter.')
        return None

    n_before = adata.n_obs
    mask = (counts_filter & genes_filter & mito_filter) if have_all_parts else qc_filter
    filtered = adata[mask]

    n_removed = n_before - filtered.n_obs
    # Guard against an empty input: nothing to filter means 0 % removed.
    pct = n_removed / n_before * 100 if n_before else 0.0
    print('Filtered out {:d} cells ({:.1f} %).'.format(n_removed, pct))
    print('Number of cells after filter: {:d}'.format(filtered.n_obs))
    return filtered
    




###################################################################################################################
###################################################################################################################
###################################################################################################################
    
    
    
def prefilter_barcodes(adata, barcodes=None, plot=True):
    """Keep only the droplets whose barcode is listed in ``barcodes``.

    Parameters
    ----------
    adata
        Annotated data matrix; rows are matched on ``adata.obs.index``.
    barcodes
        List-like of barcode names to keep (required despite the ``None`` default).
    plot
        If true, draw QC overviews of the retained droplets. This needs the
        ``obs`` columns ``log_counts``, ``log_genes``, ``n_counts``, ``n_genes``,
        ``n_counts_rank``, ``mt_frac`` and ``log_cell_probs``. They are used as
        stored and are not recomputed after subsetting.

    Returns
    -------
    A copy of ``adata`` restricted to the matching barcodes, in original order.

    Raises
    ------
    TypeError
        If ``barcodes`` is ``None``.
    """
    if barcodes is None:
        raise TypeError('prefilter_barcodes() needs `barcodes`: a list-like of barcode names to keep.')

    # A boolean mask keeps the original row order.
    adata = adata[adata.obs.index.isin(barcodes)].copy()

    if plot:
        cell_probs_key = 'log_cell_probs'
        ranks = adata.obs['n_counts_rank']

        def _rank_overview(secondary, color, label, ylabel):
            """Ranked-droplet plot of UMI/gene counts with ``secondary`` on a twin y-axis."""
            fig, ax1 = plt.subplots()
            ax1.scatter(x=ranks, y=adata.obs['n_counts'], s=1, alpha=0.2, c='black', label='Total UMI Counts')
            ax1.scatter(x=ranks, y=adata.obs['n_genes'], s=1, alpha=0.2, c='tab:green', label='Gene Counts')
            ax1.set(xscale='log', yscale='log')
            ax1.set_ylabel('Total UMI/Gene Counts')
            ax1.set_xlabel('Ranked Droplets')

            ax2 = ax1.twinx()
            ax2.scatter(x=ranks, y=secondary, s=1, alpha=0.2, c=color, label=label)
            ax2.set_ylabel(ylabel)

            fig.legend(loc='center left', fontsize='xx-small', bbox_to_anchor=(0.2, 0.35))
            plt.show()

        sb.jointplot(
            data=adata.obs,
            x="log_counts",
            y="log_genes",
            kind="hist", bins=100, cmap="rocket_r", color="#f69c73", space=0
        )

        _rank_overview(adata.obs['mt_frac'] * 100, 'tab:red', '% Mito. Counts', '%')
        _rank_overview(adata.obs[cell_probs_key], 'tab:blue', 'Log Cell Probabilities', 'Cell Probabilities')

        # Distribution of the log cell probabilities; barcodes without a value are dropped.
        # histplot(stat='density', cut=3) is the replacement for the deprecated distplot.
        sb.histplot(adata.obs[cell_probs_key].dropna(), kde=True, bins=60, stat='density',
                    kde_kws=dict(cut=3), alpha=0.4, edgecolor=(1, 1, 1, 0.4))
        plt.title(label='Log Cell Probabilities', fontweight='bold')

    return adata




###################################################################################################################
###################################################################################################################
###################################################################################################################
    
    
    
def sparsify_all_layers(adata):
    """
    Convert ``adata.X`` and every entry of ``adata.layers`` to a CSR matrix, in place.

    Matrices that are already sparse are left untouched; a status line is
    printed for ``.X`` and for each layer.
    """
    is_sparse = sci.sparse.issparse
    to_csr = sci.sparse.csr_matrix

    if is_sparse(adata.X):
        print('.X already spase...')
    else:
        print('Sparsify .X...')
        adata.X = to_csr(adata.X)

    # Snapshot the layer names first because entries are reassigned inside the loop.
    for name in list(adata.layers):
        if is_sparse(adata.layers[name]):
            print('Layer', name, 'already spase...')
            continue
        print('Sparsify ', name)
        adata.layers[name] = to_csr(adata.layers[name])

# Preprocessing

## Ambient Genes

In [None]:
# Write the DropletUtils ambient-gene values into adata.var and the cell
# log-probabilities into adata.obs['log_cell_probs'] (in place).
add_droplet_utils_results(adata, ambient_probs=ambient_genes, cell_probs=cell_probs)

In [None]:
# Flag genes whose ambient value exceeds 0.0007 (function default is 0.0005) and plot the distribution.
set_ambient_threshold(adata, threshold=0.0007)

## Filter Cell Droplets

In [None]:
# Compute the QC columns on all loaded barcodes; the genome is auto-detected from adata.var['genome'].
# NOTE(review): make_dense=True converts the whole matrix to a dense array — confirm memory allows this.
qc_metrics(adata, make_dense=True)

In [None]:
# Keep only the barcodes listed in CellRanger's filtered barcodes.tsv; `adata` is rebound to the copy.
adata = prefilter_barcodes(adata, barcodes=cell_barcodes_cr)

## QC & Filtering

### QC & Thresholding Decisions

In [None]:
# Exploratory clustering/embedding of the unfiltered cells; adds obs['leiden'] and obsm['X_umap'].
get_umap_leiden(adata)

In [None]:
# UMAP coloured by cluster and by each QC metric (4 panels per row).
sc.pl.umap(adata, color=['leiden','n_counts','log_counts','n_genes','log_genes','mt_frac','rp_frac', 'ambi_frac'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=4)

In [None]:
# Quality control - plot QC metrics
#Sample quality plots
# Per-sample violins: total counts (log axis), then mitochondrial fraction on a
# linear and on a log axis.
sc.pl.violin(adata, 'n_counts', groupby='sample', log=True, cut=0, palette='Set1')
sc.pl.violin(adata, 'mt_frac', groupby='sample', palette='Set1')
sc.pl.violin(adata, 'mt_frac', groupby='sample', palette='Set1', log=True)

#### Mitochondrial Gene Fraction

##### High & Low MT-Gene Fraction

In [None]:
# Mitochondrial fraction against gene count, log counts and ribosomal fraction,
# followed by its overall distribution.
sc.pl.scatter(adata, 'mt_frac', 'n_genes', color='n_counts', color_map=michi_bk_bl_gn_yl)
sc.pl.scatter(adata, 'mt_frac', 'log_counts', color='n_genes', color_map=michi_bk_bl_gn_yl)
sc.pl.scatter(adata, 'mt_frac', 'rp_frac', color='n_genes', color_map=michi_bk_bl_gn_yl)
sb.distplot(adata.obs['mt_frac'], kde=True, bins=60)

In [None]:
# Bounds on the mitochondrial fraction; both are applied as strict inequalities
# when mito_filter is built below.
max_mito = 0.4
min_mito = 0.025

In [None]:
# KDE joint plots of the mitochondrial fraction against gene count and log counts.
# The dashed vertical lines mark min_mito / max_mito and span from 0 to the
# largest y value.
for _y_column in ('n_genes', 'log_counts'):
    _grid = sb.jointplot(
        x=adata.obs['mt_frac'],
        y=adata.obs[_y_column],
        n_levels=15,
        thresh=0.05,
        kind="kde",
        space=0,
        fill=True,
        cmap="rocket_r",
        color="#f69c73",
    )
    # Invisible scatter layer on the joint axes, as in the chained original.
    _grid.plot_joint(sb.scatterplot, alpha=0)
    _y_top = max(adata.obs[_y_column])
    _threshold_lines = _grid.ax_joint.vlines(
        x=[min_mito, max_mito], ymin=[0, 0], ymax=[_y_top, _y_top], color="black", lw=0.5
    )
    _threshold_lines.set_linestyle("--")

In [None]:
# String-valued ('True'/'False') categorical marking cells whose mitochondrial
# fraction lies strictly between min_mito and max_mito; used only for colouring plots.
adata.obs['filter_mt_frac'] = pd.Categorical(
    [str(flag) for flag in (adata.obs['mt_frac'] < max_mito) & (adata.obs['mt_frac'] > min_mito)]
)

In [None]:
# Where on the UMAP do the cells outside the mitochondrial bounds fall?
sc.pl.umap(adata, color=['filter_mt_frac','mt_frac'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0))

In [None]:
# Wide scatter of mt_frac vs. n_genes coloured by the pass/fail flag; show=False
# returns the axes so the two threshold lines can be added before rendering.
with rc_context({'figure.figsize': (8, 3)}):
    p=sc.pl.scatter(adata, 'mt_frac', 'n_genes', color='filter_mt_frac', color_map=michi_bk_bl_gn_yl, show=False)
    p.vlines(x=[min_mito, max_mito], ymin=[0,0], ymax=[max(adata.obs['n_genes']),max(adata.obs['n_genes'])], color="black", lw=0.5).set_linestyle("--")
    plt.show()

In [None]:
#Thresholding decision: MT-gene fraction
# Distribution of mt_frac with both bounds drawn as full-height vertical lines.
with rc_context({'figure.figsize': (8, 3)}):
    sb.distplot(adata.obs['mt_frac'], kde=True, bins=60)
    plt.axvline(max_mito, 0, 1)
    plt.axvline(min_mito, 0, 1)

In [None]:
# Boolean Series (indexed like adata.obs): True where min_mito < mt_frac < max_mito.
mito_filter = (adata.obs['mt_frac'] < max_mito) & (adata.obs['mt_frac'] > min_mito)

#### Ribosomal Gene Fraction

In [None]:
# Ribosomal fraction against gene count, log counts and mitochondrial fraction (all cells).
sc.pl.scatter(adata, 'rp_frac', 'n_genes', color='n_counts', color_map=michi_bk_bl_gn_yl)
sc.pl.scatter(adata, 'rp_frac', 'log_counts', color='n_genes', color_map=michi_bk_bl_gn_yl)
sc.pl.scatter(adata, 'rp_frac', 'mt_frac', color='n_genes', color_map=michi_bk_bl_gn_yl)

In [None]:
# Same three relationships as KDE joint plots, restricted to cells passing mito_filter.
# The scatter layer is drawn with alpha=0, i.e. invisible.
sb.jointplot(x=adata[mito_filter].obs['rp_frac'], y=adata[mito_filter].obs['n_genes'], n_levels=30, thresh=0.05, kind="kde", space=0, fill=True, cmap="rocket_r", color="#f69c73").plot_joint(
    sb.scatterplot, alpha=0)

sb.jointplot(x=adata[mito_filter].obs['rp_frac'], y=adata[mito_filter].obs['log_counts'], n_levels=30, thresh=0.05, kind="kde", space=0, fill=True, cmap="rocket_r", color="#f69c73").plot_joint(
    sb.scatterplot, alpha=0)

sb.jointplot(x=adata[mito_filter].obs['rp_frac'], y=adata[mito_filter].obs['mt_frac'], n_levels=30, thresh=0.05, kind="kde", space=0, fill=True, cmap="rocket_r", color="#f69c73").plot_joint(
    sb.scatterplot, alpha=0)

#### Counts

In [None]:
# log counts vs. log genes for the cells passing mito_filter (KDE; scatter layer invisible).
sb.jointplot(x=adata[mito_filter].obs['log_counts'], y=adata[mito_filter].obs['log_genes'], n_levels=30, thresh=0.05, kind="kde", space=0, fill=True, cmap="rocket_r", color="#f69c73").plot_joint(
    sb.scatterplot, alpha=0)

In [None]:
#Thresholding decision: counts
# Overlaid n_counts distributions: cells passing mito_filter first, then all cells.
sb.distplot(adata[mito_filter].obs['n_counts'], kde=True)
sb.distplot(adata.obs['n_counts'], kde=True)

In [None]:
# Lower bound on total UMI counts per cell; drawn as the vertical line below.
min_counts = 3500
# Only the upper bound x_lim[1] is applied in this cell; x_lim[0] is not used here.
x_lim = [0,7000]
# Low-count range: mito-filtered cells first, then all cells.
sb.distplot(adata.obs['n_counts'][(adata.obs['n_counts']<x_lim[1]) & mito_filter], kde=True, bins=60)
sb.distplot(adata.obs['n_counts'][(adata.obs['n_counts']<x_lim[1])], kde=True, bins=60)
plt.axvline(min_counts, 0, 1)

In [None]:
# Same view restricted to x_lim[0] < n_counts < x_lim[1] (both bounds strict).
x_lim = [100,6000]
sb.distplot(adata.obs['n_counts'][(adata.obs['n_counts']>x_lim[0]) & (adata.obs['n_counts']<x_lim[1]) & mito_filter], kde=True, bins=60)
sb.distplot(adata.obs['n_counts'][(adata.obs['n_counts']>x_lim[0]) & (adata.obs['n_counts']<x_lim[1])], kde=True, bins=60)
plt.axvline(min_counts, 0, 1)

In [None]:
# Upper bound on total UMI counts per cell; drawn as the vertical line below.
max_counts = 40000
# High-count range: mito-filtered cells first, then all cells.
x_lim = [8000,100000]
sb.distplot(adata.obs['n_counts'][(adata.obs['n_counts']>x_lim[0]) & (adata.obs['n_counts']<x_lim[1]) & mito_filter], kde=True, bins=60)
sb.distplot(adata.obs['n_counts'][(adata.obs['n_counts']>x_lim[0]) & (adata.obs['n_counts']<x_lim[1])], kde=True, bins=60)
plt.axvline(max_counts, 0, 1)

In [31]:
# Boolean Series: True where min_counts < n_counts < max_counts (both strict).
counts_filter = (adata.obs['n_counts'] > min_counts) & (adata.obs['n_counts'] < max_counts)

In [None]:
# Counts vs. genes on linear and on log scale, with the count thresholds as dashed
# vertical lines. In the log plot the lines span log(min n_genes) .. log(max n_genes).
sc.pl.scatter(adata, 'n_counts', 'n_genes', color='mt_frac', color_map=michi_bk_bl_gn_yl, show=False).vlines(x=[min_counts, max_counts], ymin=[0,0], ymax=[max(adata.obs['n_genes']),max(adata.obs['n_genes'])], color="black", lw=0.5).set_linestyle("--")
sc.pl.scatter(adata, 'log_counts', 'log_genes', color='mt_frac', color_map=michi_bk_bl_gn_yl, show=False).vlines(x=[np.log(min_counts), np.log(max_counts)], ymin=[np.log(min(adata.obs['n_genes'])),np.log(min(adata.obs['n_genes']))], ymax=[np.log(max(adata.obs['n_genes'])),np.log(max(adata.obs['n_genes']))], color="black", lw=0.5).set_linestyle("--")

#### Genes

In [None]:
# KDE of log counts vs. log genes for mito-filtered cells, with the count thresholds
# (log scale) as dashed lines. The line extent uses min/max n_genes over ALL cells,
# not only the mito-filtered ones.
sb.jointplot(x=adata[mito_filter].obs['log_counts'], y=adata[mito_filter].obs['log_genes'], n_levels=30, thresh=0.05, kind="kde", space=0, fill=True, cmap="rocket_r", color="#f69c73").plot_joint(
    sb.scatterplot, alpha=0).ax_joint.vlines(x=[np.log(min_counts), np.log(max_counts)], ymin=[np.log(min(adata.obs['n_genes'])),np.log(min(adata.obs['n_genes']))], ymax=[np.log(max(adata.obs['n_genes'])),np.log(max(adata.obs['n_genes']))], color="black", lw=0.5).set_linestyle("--")

In [None]:
#Thresholding decision: genes
# Overlaid n_genes distributions, from most to least filtered:
# mito+counts filtered, mito filtered only, all cells.
sb.distplot(adata.obs['n_genes'][mito_filter & counts_filter], kde=True, bins=60)
sb.distplot(adata.obs['n_genes'][mito_filter], kde=True, bins=60)
sb.distplot(adata.obs['n_genes'], kde=True, bins=60)

In [None]:
# Lower bound on detected genes per cell; drawn as the vertical line below.
min_genes = 1500
# Restrict the histograms to x_lim[0] < n_genes < x_lim[1]; same three filter
# levels as in the previous cell.
x_lim = [100,3000]
sb.distplot(adata.obs['n_genes'][(adata.obs['n_genes']>x_lim[0]) & (adata.obs['n_genes']<x_lim[1])& mito_filter & counts_filter], kde=True, bins=60)
sb.distplot(adata.obs['n_genes'][(adata.obs['n_genes']>x_lim[0]) & (adata.obs['n_genes']<x_lim[1])& mito_filter], kde=True, bins=60)
sb.distplot(adata.obs['n_genes'][(adata.obs['n_genes']>x_lim[0]) & (adata.obs['n_genes']<x_lim[1])], kde=True, bins=60)
plt.axvline(min_genes, 0, 1)

In [None]:
# Zoom around the gene threshold with finer binning (100 bins).
x_lim = [1000,2000]
sb.distplot(adata.obs['n_genes'][(adata.obs['n_genes']>x_lim[0]) & (adata.obs['n_genes']<x_lim[1]) & mito_filter & counts_filter], kde=True, bins=100)
sb.distplot(adata.obs['n_genes'][(adata.obs['n_genes']>x_lim[0]) & (adata.obs['n_genes']<x_lim[1]) & mito_filter], kde=True, bins=100)
sb.distplot(adata.obs['n_genes'][(adata.obs['n_genes']>x_lim[0]) & (adata.obs['n_genes']<x_lim[1])], kde=True, bins=100)
plt.axvline(min_genes, 0, 1)

In [37]:
# Boolean Series: True where n_genes > min_genes (strict; no upper bound on genes).
genes_filter = (adata.obs['n_genes'] > min_genes)

In [None]:
# Counts vs. genes with all three thresholds overlaid (linear scale).
p=sc.pl.scatter(adata, 'n_counts', 'n_genes', color='mt_frac', color_map=michi_bk_bl_gn_yl, show=False)
p.vlines(x=[min_counts, max_counts], ymin=[0,0], ymax=[max(adata.obs['n_genes']),max(adata.obs['n_genes'])], color="black", lw=0.5).set_linestyle("--")
# axhline takes xmin/xmax as axes fractions (0..1), not data values, so the previous
# xmax=max(n_counts) was out of range. The defaults already span the full axes width.
p.axhline(y=min_genes, color="black", lw=0.5).set_linestyle("--")
plt.show()

# Same view on log scale; thresholds are log-transformed to match the axes.
p=sc.pl.scatter(adata, 'log_counts', 'log_genes', color='mt_frac', color_map=michi_bk_bl_gn_yl, show=False)
p.vlines(x=[np.log(min_counts), np.log(max_counts)], ymin=[np.log(min(adata.obs['n_genes'])),np.log(min(adata.obs['n_genes']))], ymax=[np.log(max(adata.obs['n_genes'])),np.log(max(adata.obs['n_genes']))], color="black", lw=0.5).set_linestyle("--")
p.axhline(y=np.log(min_genes), color="black", lw=0.5).set_linestyle("--")
plt.show()

### Filtering

In [39]:
# A cell is kept only if it passes the count, gene and mitochondrial filters.
qc_filter = counts_filter & genes_filter & mito_filter

In [40]:
# String-valued categorical copy of qc_filter for plotting: 'True' marks cells that
# PASS the filter (are kept), 'False' marks cells that will be removed.
adata.obs['filtered_cells']=pd.Categorical(list(map(str,list(qc_filter))))

In [None]:
# One figure per QC metric with two violin panels: by Leiden cluster (left, wider)
# and by keep/remove flag (right).
for col in ['log_cell_probs', 'log_counts', 'n_counts_rank', 'log_genes', 'mt_frac', 'rp_frac', 'ambi_frac']:
    fig, axes = plt.subplots(1,2, figsize=(10, 4), gridspec_kw=dict(width_ratios=[2,1],wspace = 0.3))
    
    sb.violinplot(x='leiden', y=col, data=adata.obs, ax=axes[0])
    
    sb.violinplot(x='filtered_cells', y=col, data=adata.obs, ax=axes[1])
    # The y label is already shown on the left panel.
    axes[1].set_ylabel(None)
    fig.suptitle(col)

In [None]:
# UMAP coloured by the QC metrics, then by keep/remove flag and Leiden cluster.
sc.pl.umap(adata, color=['log_counts','log_genes','mt_frac','rp_frac','ambi_frac','log_cell_probs'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=3)
sc.pl.umap(adata, color=['filtered_cells','leiden'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=6)

###############################################################################
###############################################################################

# Stacked bar chart: per Leiden cluster, the percentage of cells removed vs. kept.
with rc_context({'figure.figsize': (6, 4)}): #rcParams['figure.figsize']=(6,4)
    key = 'leiden'
    labels = list(adata.obs[key].cat.categories)
    width = 0.85       # the width of the bars: can also be len(x) sequence

    # Boolean view of the string flag ('True' = cell is kept). Using the mean per
    # cluster avoids looking up a 'True'/'False' category that may not exist when
    # every cell is kept or every cell is removed.
    kept = adata.obs['filtered_cells'].astype(str) == 'True'
    keep_pct = [kept[adata.obs[key] == label].mean() * 100 for label in labels]
    filter_pct = [100 - pct for pct in keep_pct]

    fig, ax = plt.subplots()

    ax.bar(labels, filter_pct, width, label='Filter Out', edgecolor='0', linewidth=0.5)
    ax.bar(labels, keep_pct, width, bottom=filter_pct, label='Keep', edgecolor='0', linewidth=0.5)

    ax.set_ylabel('%')
    ax.set_title('Percentage of Filtered Cells')
    # Rotate the existing tick labels instead of replacing them via set_xticklabels.
    ax.tick_params(axis='x', labelrotation=90)
    ax.legend(bbox_to_anchor=(1, .5),loc='center left', edgecolor='1')

    # Small margins so the bar outlines are not clipped.
    plt.ylim([-2.5,100+2.5])
    plt.xlim([-1+0.25,len(labels)-0.25])

    plt.show()

#################################################################################
#################################################################################

# Final check of the combined filter in the two main QC planes.
sc.pl.scatter(adata, 'mt_frac', 'n_genes', color='filtered_cells')
sc.pl.scatter(adata, 'log_counts', 'log_genes', color='filtered_cells')

#### Filter

In [43]:
# Filter cells according to identified QC thresholds:
# `adata` is rebound to the subset returned by filter_qc, so this cell is not
# idempotent: qc_filter is indexed like the unfiltered obs.
adata = filter_qc(adata, qc_filter=qc_filter)

Filtered out 1760 cells (20.1 %).
Number of cells after filter: 6980


In [44]:
# Remove the plotting-only flag columns before saving.
del adata.obs['filtered_cells']
del adata.obs['filter_mt_frac']
#del adata.obs['low_mt_frac']

#### Plot Results

In [45]:
# Recompute clustering and embedding on the filtered cells; overwrites obs['leiden'] and obsm['X_umap'].
get_umap_leiden(adata)

normalizing counts per cell
    finished (0:00:00)
computing PCA
    with n_comps=50
    finished (0:00:04)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 12 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:06)


In [None]:
# QC metrics on the post-filter UMAP.
sc.pl.umap(adata, color=['log_counts','log_genes','mt_frac','rp_frac'], size=20, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=5)

In [None]:
# 2-D histogram of log counts vs. log genes for the retained cells.
sb.jointplot(
    data=adata.obs,
    x="log_counts",
    y="log_genes",
    kind="hist", bins=100, cmap="rocket_r", color="#f69c73", space=0
)

# Save

In [None]:
# X was densified by qc_metrics(make_dense=True); convert it (and any dense layers) back to CSR before writing.
sparsify_all_layers(adata)

In [49]:
# Save
# Written next to the raw input matrix, with a `_filtered` suffix in the file name.
sc.write('/storage/scRNA-seq/scRNA-seq_iPSC_IGFRL-KO/cellranger/MUC18396/count_matrices/MUC18396_raw_feature_bc_matrix_filtered.h5ad', adata)