In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 500)

from rpy2.robjects import pandas2ri
from rpy2.robjects import r
import rpy2.rinterface_lib.callbacks
import anndata2ri
import rpy2.robjects.numpy2ri
#import numpy2ri
import anndata

# Switch on the rpy2 converters for pandas, AnnData and numpy objects.
# NOTE(review): presumably this is what lets the later `%%R -i adata` cell hand
# `adata` to R and read vectors back; the call order is kept exactly as written
# because it may affect which converter takes precedence. Confirm against the
# rpy2 / anndata2ri versions in use.
pandas2ri.activate()
anndata2ri.activate()
rpy2.robjects.numpy2ri.activate()

# Load the rpy2 IPython extension; the `%%R` cells further down depend on it.
%load_ext rpy2.ipython

In [None]:
# Inputs: the filtered count matrix plus its gene and cell tables.
# (Paths are kept verbatim, including the directory spelling on disk.)
count_matrix_path = '../Parse_alingment/Parse_hg38_aligned/output_combined/all-sample/DGE_filtered/count_matrix.mtx'
gene_table_path = '../Parse_alingment/Parse_hg38_aligned/output_combined/all-sample/DGE_filtered/all_genes.csv'
cell_table_path = '../Parse_alingment/Parse_hg38_aligned/output_combined/all-sample/DGE_filtered/cell_metadata.csv'

adata = sc.read_mtx(count_matrix_path)
gene_data = pd.read_csv(gene_table_path)
cell_meta = pd.read_csv(cell_table_path)

In [None]:
# Keep only genes that have a name, then attach gene and cell annotations.
# `gene_data` and `cell_meta` are deliberately NOT modified in place, so the raw
# tables stay available for inspection after this cell has run.

# The code below pairs matrix columns with rows of the gene table and matrix rows
# with rows of the cell table, so the sizes must agree. A mismatch usually means
# this cell was already run on `adata`; re-run the loading cell first.
assert adata.n_vars == len(gene_data), (
    f"count matrix has {adata.n_vars} columns but the gene table has {len(gene_data)} rows"
)
assert adata.n_obs == len(cell_meta), (
    f"count matrix has {adata.n_obs} rows but the cell table has {len(cell_meta)} rows"
)

# Boolean mask over the gene table rows; used positionally on the matrix columns
# (this does not rely on the table's index labels being 0..n-1).
has_gene_name = gene_data['gene_name'].notnull().to_numpy()

# remove genes with nan values and assign gene names
# .copy() turns the slice into a real object before its annotations are replaced.
adata = adata[:, has_gene_name].copy()
adata.var = gene_data.loc[has_gene_name].set_index('gene_name').rename_axis(None)
adata.var_names_make_unique()

# add cell meta data to anndata object
adata.obs = cell_meta.set_index('bc_wells').rename_axis(None)
adata.obs_names_make_unique()

In [None]:
# Checkpoint: store the annotated, unfiltered object on disk.
adata.write_h5ad('../Data/SC/parse.h5ad')

In [None]:
# Reload the checkpoint written above (lets the notebook be resumed from here).
adata = sc.read_h5ad('../Data/SC/parse.h5ad')

In [None]:
from scipy.stats import median_abs_deviation

def is_outlier(adata, metric: str, nmads: int):
    """Flag observations whose ``metric`` is far from the median, measured in MADs.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` table that can be indexed by column name.
    metric
        Name of the ``obs`` column to test.
    nmads
        Number of median absolute deviations allowed on either side of the median.

    Returns
    -------
    Boolean mask aligned with ``adata.obs[metric]``: True where the value lies
    strictly below ``median - nmads * MAD`` or strictly above
    ``median + nmads * MAD``. Values exactly on a bound are not flagged.
    """
    values = adata.obs[metric]
    center = np.median(values)
    spread = nmads * median_abs_deviation(values)

    too_low = values < center - spread
    too_high = values > center + spread
    return too_low | too_high

# Per-sample QC. For every sample:
#   * compute QC metrics and flag outlier cells,
#   * keep an annotated copy of all cells in `adata_dict_unfiltered`,
#   * keep a copy restricted to passing cells, with rare, mitochondrial and
#     ribosomal genes removed, in `adata_dict_filtered`.
adata_dict_unfiltered, adata_dict_filtered = {}, {}

for batch in adata.obs['sample'].unique():

    print(batch)

    adata_temp = adata[adata.obs['sample'].isin([batch])].copy()

    # Gene flags consumed by calculate_qc_metrics (mitochondrial / ribosomal)
    gene_names = adata_temp.var_names
    adata_temp.var["mt"] = gene_names.str.startswith("MT-")
    adata_temp.var["ribo"] = gene_names.str.startswith(("RPS", "RPL"))

    sc.pp.calculate_qc_metrics(adata_temp, qc_vars=["mt", "ribo"], inplace=True, percent_top=[20], log1p=True)

    # Library-size / complexity outliers (MAD-based)
    depth_outlier = is_outlier(adata_temp, "log1p_total_counts", 3)
    complexity_outlier = is_outlier(adata_temp, "log1p_n_genes_by_counts", 5)
    adata_temp.obs["outlier"] = depth_outlier | complexity_outlier

    # Mitochondrial outliers: MAD-based, or above the fixed 8 % cut-off
    mt_mad_outlier = is_outlier(adata_temp, "pct_counts_mt", 5)
    adata_temp.obs["mt_outlier"] = mt_mad_outlier | (adata_temp.obs["pct_counts_mt"] > 8)

    # A cell passes QC when neither flag is set
    adata_temp.obs['pass_qc'] = ~(adata_temp.obs["outlier"] | adata_temp.obs["mt_outlier"])
    adata_dict_unfiltered[batch] = adata_temp.copy()

    print(f"Total number of cells: {adata_temp.n_obs}")
    adata_temp = adata_temp[adata_temp.obs['pass_qc']].copy()
    print(f"Number of cells after filtering of low quality cells: {adata_temp.n_obs}")
    print('______________________________________________________________________')

    # Remove rare genes
    sc.pp.filter_genes(adata_temp, min_cells=3)

    # Remove mito and ribo genes (one prefix test instead of two masks that are OR-ed)
    mito_or_ribo = adata_temp.var_names.str.startswith(('MT-', 'RPL', "RPS"))
    adata_temp = adata_temp[:, np.logical_not(mito_or_ribo)]

    adata_dict_filtered[batch] = adata_temp.copy()

In [None]:
# Check the results before and after filtering
batch = 'AM34K_d50'


def _plot_qc_overview(adata_qc, title):
    """Draw a one-row, six-panel QC summary for one sample.

    The panels read the ``pct_counts_mt``, ``n_genes_by_counts`` and
    ``total_counts`` columns that the per-sample QC loop already stored in
    ``adata_qc.obs``. Nothing is recomputed and ``adata_qc`` is not modified.

    Parameters
    ----------
    adata_qc
        AnnData object whose ``obs`` holds the three QC columns named above.
    title
        Figure title, so each row of panels can be identified on its own.
    """
    fig, (ax0, ax1, ax2, ax3, ax4, ax5) = plt.subplots(1, 6, figsize=(20, 4), gridspec_kw={'wspace': 0.5})
    sc.pl.violin(adata_qc, ["pct_counts_mt"], jitter=0.5, show=False, ax=ax0, stripplot=False)
    sc.pl.violin(adata_qc, ['n_genes_by_counts'], jitter=0.5, show=False, ax=ax1, stripplot=False)
    sc.pl.violin(adata_qc, ['total_counts'], jitter=0.5, show=False, ax=ax2, stripplot=False)
    sns.histplot(adata_qc.obs["n_genes_by_counts"], ax=ax3)
    sns.histplot(adata_qc.obs["total_counts"], ax=ax4)
    sc.pl.scatter(adata_qc, x='total_counts', y='n_genes_by_counts', show=False, ax=ax5)
    fig.suptitle(title)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


with plt.rc_context({"figure.dpi": 300}):
    # The QC metrics are NOT recomputed here. The filtered objects have had their
    # MT-/ribosomal genes removed, so recomputing would report pct_counts_mt == 0
    # and smaller counts, and would overwrite the stored .obs values for this one
    # sample only, leaving it inconsistent with the other samples when the
    # filtered objects are concatenated later.
    _plot_qc_overview(adata_dict_unfiltered[batch], f"{batch}: all cells")
    _plot_qc_overview(adata_dict_filtered[batch], f"{batch}: cells passing QC")



In [None]:
# Drop the per-sample gene annotations, then concatenate all filtered samples
# into one object for later use.
result_dict = {}

for batch, adata_temp in adata_dict_filtered.items():
    # Copy first, then strip .var from the copy, so the objects kept in
    # `adata_dict_filtered` retain their gene annotations.
    stripped = adata_temp.copy()
    del stripped.var
    result_dict[batch] = stripped

# anndata.concat replaces the deprecated AnnData.concatenate.
#   join='outer'     -> union of genes across samples
#   index_unique='-' -> cell names get a "-<position>" suffix, as concatenate did by default
#   no `label`       -> no batch column is added (the original passed batch_key=None)
adata = anndata.concat(list(result_dict.values()), join='outer', index_unique='-')


In [None]:
%%R 
# Load the R packages used for doublet detection

library(Seurat)
library(scater)
library(scDblFinder)
library(BiocParallel)

In [None]:
%%R -i adata -o doublet_score -o doublet_class
# Detecting doublets: run scDblFinder per sample and hand the per-cell score and
# class vectors back to Python (-o doublet_score, -o doublet_class).

library(Seurat)
library(scater)
library(scDblFinder)
library(BiocParallel)

set.seed(123)
# `adata` arrives from Python via -i; its single assay is renamed to "counts".
sce = adata
names(assays(sce))=c("counts")

# With several workers, set.seed() alone does not make the result reproducible;
# the seed has to be given to the BiocParallel backend through RNGseed.
sce = scDblFinder(sce, samples='sample', BPPARAM=MulticoreParam(8, RNGseed=123))
doublet_score = sce$scDblFinder.score
doublet_class = sce$scDblFinder.class

In [None]:
# Add results from scDblFinder to adata and remove doublets
adata.obs["scDblFinder_score"] = doublet_score
adata.obs["scDblFinder_class"] = doublet_class
print(adata.obs.scDblFinder_class.value_counts())

# Keep singlets only. .copy() makes `adata` a real object instead of a view,
# because the following cells write to it (layers, X, obs) and would otherwise
# rely on an implicit copy-on-modify.
adata = adata[adata.obs.scDblFinder_class == 'singlet'].copy()

In [None]:
# Keep the raw counts in a layer before normalising.
# Guarded so the cell can be re-run safely: on a second run X is already
# log-normalised, so it is restored from the stored counts instead of
# overwriting the 'counts' layer with transformed values.
if 'counts' in adata.layers:
    adata.X = adata.layers['counts'].copy()
    # Drop the marker written by the previous log1p call so the restored raw
    # counts are not reported as already log-transformed.
    adata.uns.pop('log1p', None)
else:
    adata.layers['counts'] = adata.X.copy()

# Log-normalize
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
# Add day, cell line and bmp treatment columns, derived from the sample label
# (split once at the first underscore: "<prefix>_<day>").

# One row per sample prefix: (cell line, BMP treatment)
sample_annotation = {
    'AM30A': ('RC17', 'no BMP'),
    'AM30B': ('RC17', 'BMP 5-14'),
    'AM30C': ('RC17', 'BMP 5-11'),
    'AM30D': ('RC17', 'BMP 5-9'),
    'AM30E': ('RC17', 'BMP 5-7'),
    'AM34A': ('KOLF', 'no BMP'),
    'AM34B': ('KOLF', 'BMP 5-14'),
    'AM34C': ('KOLF', 'BMP 5-11'),
    'AM34D': ('KOLF', 'BMP 5-9'),
    'AM34E': ('KOLF', 'BMP 5-7'),
    'AM34F': ('Bio-N', 'no BMP'),
    'AM34G': ('Bio-N', 'BMP 5-14'),
    'AM34H': ('Bio-N', 'BMP 5-11'),
    'AM34I': ('Bio-N', 'BMP 5-9'),
    'AM34K': ('Bio-N', 'BMP 5-7'),
}
cell_line_by_prefix = {prefix: line for prefix, (line, _) in sample_annotation.items()}
bmp_by_prefix = {prefix: bmp for prefix, (_, bmp) in sample_annotation.items()}

label_parts = adata.obs['sample'].str.split('_', n=1, expand=True)
adata.obs['sample_temp'] = label_parts[0]
adata.obs['day'] = label_parts[1]

# Prefixes missing from the table map to NaN.
adata.obs['cell_line'] = adata.obs['sample_temp'].map(cell_line_by_prefix)
adata.obs['bmp_treatment'] = adata.obs['sample_temp'].map(bmp_by_prefix)

# The prefix was only needed for the two lookups.
del adata.obs['sample_temp']

# Overview: one row per distinct sample annotation
adata.obs[['sample','day','cell_line','bmp_treatment']].drop_duplicates().reset_index(drop=True)

In [None]:
# Save the processed object (QC-filtered, doublets removed, log-normalised, annotated).
adata.write_h5ad('../Data/SC/processed_bmp_timing_exp.h5ad')