In [None]:
# Author: Janssen Kotah
# snRNAseq analysis for WT/SHIP1 KO mice as part of Matera et al. project

# Scrublet doublet detection on SoupX-processed samples; the lab pipeline was adapted from the code at:
# https://cellgeni.github.io/notebooks/html/new-doublets-scrublet.html

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import scipy.io
import os
import scrublet as scr

In [2]:
# Scanpy session setup for this notebook.
# Logging verbosity is set to level 3 before anything else runs.
sc.settings.verbosity = 3
# Print the versions of scanpy and its main dependencies for provenance.
sc.logging.print_header()
# Default figure styling: white background, 800 dpi.
sc.settings.set_figure_params(facecolor="white", dpi=800)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.9.3 pandas==1.5.3 scikit-learn==1.2.0 statsmodels==0.13.5 python-igraph==0.10.2 pynndescent==0.5.8


In [29]:
# Root directory containing the per-sample input folders (presumably one
# SoupX-corrected 10x matrix folder per sample -- confirm against the SoupX step).
SOUPX_OUTPUT_DIR = "/data/bcn/p310674/Documents/009_Ship1_Paolicelli/04. Analysis_resequencing_2/001_SoupX_outputs_for_scrublet"

# os.listdir returns entries in arbitrary order and also lists stray files
# (hidden files, logs, ...). Keep only sub-directories and sort them so that
# every run processes the same samples in the same order and never hands a
# non-directory to sc.read_10x_mtx.
samples_list = sorted(
    entry
    for entry in os.listdir(SOUPX_OUTPUT_DIR)
    if os.path.isdir(os.path.join(SOUPX_OUTPUT_DIR, entry))
)

for sample_name in samples_list:
    # The sample ID is taken from the first three characters of the folder name.
    current_sample = sample_name[:3]
    print("Running scrublet for " + current_sample)

    # The trailing "" component keeps the trailing path separator that the
    # original concatenated path string carried.
    input_dir = os.path.join(SOUPX_OUTPUT_DIR, sample_name, "")

    # Load the 10x-format matrix for this sample and de-duplicate gene names.
    adata = sc.read_10x_mtx(input_dir, cache=False)
    adata.var_names_make_unique()

    # Run scrublet on the count matrix with a 10% expected doublet rate.
    scrub = scr.Scrublet(adata.X, expected_doublet_rate=0.1)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30,
    )
    # Store per-cell results alongside the barcodes (same column order as before).
    adata.obs['doublet_scores'] = doublet_scores
    adata.obs['predicted_doublets'] = predicted_doublets

    # Write one TSV per sample into the current working directory.
    print("Saving results for " + current_sample)
    path_save = current_sample + "_scrublet_results.tsv"
    # adata.obs is already a DataFrame, so no pd.DataFrame(...) wrapper is needed.
    adata.obs.to_csv(path_save, sep='\t', header=True)


Running scrublet for JK7
--> This might be very slow. Consider passing `cache=True`, which enables much faster reading from a cache file.
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.50
Detected doublet rate = 0.6%
Estimated detectable doublet fraction = 13.0%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 4.3%
Elapsed time: 0.8 seconds
Saving results for JK7
Running scrublet for JK4
--> This might be very slow. Consider passing `cache=True`, which enables much faster reading from a cache file.
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.43
Detected doublet rate = 2.2%
Estimated detectable doublet fraction = 44.1%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 4.9%
Elapsed time: 6.1 seconds
Saving results for JK4
Running scrublet for JK5
--> This 