# Import packages and data 

#### created by SW and based on PV script (for HCA skin analysis)

In [None]:
#%% Import
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc 
import scrublet as scr
from statsmodels import robust
import sys
import os.path

In [None]:
# Limit scanpy's log output to warnings and errors.
sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of the loaded packages so the run can be reproduced later.
sc.logging.print_versions()

In [None]:
# Set up the plot config for viewing the annotation clearly.
# A high DPI (300) is requested for the figures scanpy/matplotlib render.
sc.settings.set_figure_params(dpi=300)
# Raise pandas' display limits so wide/long metadata tables are shown in full
# instead of being truncated when printed.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

# Load in raw dataset

In [None]:
# Load the AnnData object from disk (relative path, so the notebook must be run
# from the directory containing the file). Going by the file name this is
# presumably the raw data before doublet removal -- confirm with the data owner.
adata = sc.read('raw_pre_scrublet.h5ad')

In [None]:
# De-duplicate variable (gene) names in place so that name-based indexing is unambiguous.
adata.var_names_make_unique()

In [None]:
# Bare expression: in a notebook cell this displays the summary of the loaded object.
adata

In [None]:
# Bare expression: displays the per-cell metadata table for inspection.
adata.obs

# Running Scrublet according to Peter Vegh's script

In [None]:
#%% Scrublet
# Name of the `adata.obs` column whose unique values define the groups of cells
# that Scrublet is run on separately (one Scrublet run per value).
meta_10x_channels = 'lanes'

In [None]:
# Run Scrublet separately on every channel and collect, per channel:
#   RUNs   - channel label
#   DSs    - observed doublet scores (one per cell)
#   CELLs  - cell names, in the same order as the scores
#   THRs   - Scrublet's automatic threshold (or the fallback, see below)
#   MEDs / MADs / CUTs - median, MAD and MAD-based cutoff of the scores
#   no_thr - channels for which Scrublet found no automatic threshold
RUNs, DSs, CELLs, THRs, MEDs, MADs, CUTs, no_thr = [], [], [], [], [], [], [], []

# Threshold assigned when Scrublet cannot determine one automatically.
FALLBACK_THRESHOLD = 0.4
# MAD-based cutoff = median + N_MADS * MAD of the observed doublet scores.
N_MADS = 3

# Make sure the output folder exists before the log file and plots are written.
os.makedirs('scrublet_output', exist_ok=True)

# Loop through channels in anndata object.
# Everything printed inside the loop (including Scrublet's own messages) goes to
# the log file. The redirect is undone in `finally`, so an error in any channel
# cannot leave the notebook's stdout pointing at the file, and the `with` block
# guarantees the file is closed.
orig_stdout = sys.stdout
with open('scrublet_output/scrublet_output_file_mad.txt', 'w') as log_file:
    sys.stdout = log_file
    try:
        for run in adata.obs[meta_10x_channels].unique():
            print(run)
            ad = adata[adata.obs[meta_10x_channels] == run, :]
            scrub = scr.Scrublet(ad.X)
            # Only the scores are used; the doublet calls are recomputed below
            # from explicit thresholds.
            ds, _ = scrub.scrub_doublets()
            RUNs.append(run)
            DSs.append(ds)
            CELLs.append(ad.obs_names)

            # MAD calculation of threshold:
            MED = np.median(ds)
            MAD = robust.mad(ds)
            CUT = MED + N_MADS * MAD
            MEDs.append(MED)
            MADs.append(MAD)
            CUTs.append(CUT)

            # Scrublet cannot always calculate an automatic threshold; in that
            # case the `threshold_` attribute is missing after scrub_doublets().
            auto_thr = getattr(scrub, 'threshold_', None)
            if auto_thr is not None:
                THRs.append(auto_thr)
                print('Threshold found by scrublet')
            else:
                THRs.append(FALLBACK_THRESHOLD)
                no_thr.append(run)
                print('No threshold found, assigning', FALLBACK_THRESHOLD, 'to', run)
                # Needed so that plot_histogram() has a threshold to draw.
                scrub.call_doublets(threshold=FALLBACK_THRESHOLD)
            fig = scrub.plot_histogram()
            fig[0].savefig(f'scrublet_output/{run}.png')

            # Alternative histogram for MAD-based cutoff
            scrub.call_doublets(threshold=CUT)
            fig = scrub.plot_histogram()
            fig[0].savefig(f'scrublet_output/{run}_mad_.png')
            # Free the figures so they do not accumulate across channels.
            plt.close('all')
            print()
            print()

        print()
        print('The following sample(s) do not have automatic threshold:')
        print(no_thr)
    finally:
        # Always hand stdout back to the notebook, even if a channel failed.
        sys.stdout = orig_stdout

# Number of scored cells in each channel, in the same order as RUNs/DSs.
ns = np.array([len(scores) for scores in DSs])


def _per_cell(per_run_values):
    """Repeat each per-channel value once for every cell of that channel."""
    return np.repeat(per_run_values, ns)


# One row per cell (indexed by cell name): its channel, its doublet score and
# the channel-level thresholds/statistics it is compared against.
tbl = pd.DataFrame(
    {
        'run': _per_cell(RUNs),
        'ds': np.concatenate(DSs),
        'thr': _per_cell(THRs),
        'mad_MED': _per_cell(MEDs),
        'mad_MAD': _per_cell(MADs),
        'mad_thr': _per_cell(CUTs),
    },
    index=np.concatenate(CELLs),
)

# A cell is called a doublet when its score is strictly above the threshold.
tbl['auto_prd'] = tbl['ds'].gt(tbl['thr'])
tbl['mad_prd'] = tbl['ds'].gt(tbl['mad_thr'])

# Persist the full per-cell table next to the Scrublet plots.
tbl.to_csv('scrublet_output/doublets_score_mad.csv', header=True, index=True)

In [None]:
# Attach the MAD-based doublet call to the cell metadata. `tbl` is indexed by
# cell name, so pandas matches rows to `adata.obs` by index rather than by position.
adata.obs["mad_prd"] = tbl['mad_prd']

In [None]:
# Attach the doublet call based on Scrublet's automatic (or fallback) threshold,
# matched to the cells by index (cell name).
adata.obs["auto_prd"] = tbl['auto_prd']

In [None]:
# Bare expression: displays the metadata table, now including the two prediction columns.
adata.obs

In [None]:
# Store the boolean call as text ("True"/"False"); the singlet filter further
# down selects on the string "False".
adata.obs['mad_prd'] = adata.obs['mad_prd'].astype("str") 

In [None]:
# Store the automatic-threshold call as text as well, matching `mad_prd`.
adata.obs['auto_prd'] = adata.obs['auto_prd'].astype("str") 

In [None]:
# Keep only the cells whose MAD-based call is the string "False" (i.e. not a
# doublet), and copy so the result is an independent object, not a view.
singlet_mask = adata.obs['mad_prd'].isin(["False"])
adata_singlet = adata[singlet_mask].copy()

In [None]:
# Bare expression: displays the metadata of the retained singlets.
adata_singlet.obs

In [None]:
# Bare expression: displays the summary of the singlet-only object.
adata_singlet

In [None]:
# Report how many singlets remain in each channel. The column name comes from
# `meta_10x_channels`, the same setting the Scrublet loop splits on, so the
# report stays correct if that setting is changed.
print(adata_singlet.obs[meta_10x_channels].value_counts())

# Saving singlets from downs bone marrow dataset

In [None]:
# Write the singlet-only object to 'raw.h5ad' in the current working directory.
adata_singlet.write('raw.h5ad')