In [1]:
import pandas as pd
import scanpy as sc
import gc

from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.metrics import precision_recall_curve
from pathlib import Path
import time

import sys, os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "src"))
import dbldec

# scanpy settings
sc.settings.set_figure_params(dpi=100, frameon=False, facecolor='white')
sc.settings.verbosity = 0             # verbosity: errors (0), warnings (1), info (2), hints (3)

# Reproducibility: seed the global random number generators.
# NOTE(review): `seed` is not one of scanpy's documented settings, so the
# assignment to `sc.settings.seed` is presumably just a plain attribute write
# that scanpy never reads. It is kept in case project code looks it up, but the
# actual seeding is done on Python's `random` module and NumPy's global RNG.
# Whether dbldec draws from these global generators is not visible from this
# notebook -- TODO confirm, and pass an explicit seed to it if it accepts one.
import random

import numpy as np

SEED = 42
sc.settings.seed = SEED
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Folder holding one .h5ad file per benchmark sample (relative to the notebook).
sample_path = r'../../SUM25/h5ad_format/'
folder_path = Path(sample_path)

# glob() yields files in a filesystem-dependent order, so sort explicitly
# (case-insensitively) to make the processing order reproducible.
h5ad_files = sorted(folder_path.glob('*.h5ad'), key=lambda file: file.stem.lower())

# Path.stem drops only the final '.h5ad' suffix, so that
# sample_path + sample_name + '.h5ad' always points back to the same file.
sample_names = [file.stem for file in h5ad_files]

# Temporary: rotate the list so that processing starts at `start_sample`;
# the samples listed before it are moved to the end, nothing is dropped.
start_sample = 'pbmc-1A-dm'
if start_sample in sample_names:
    start_index = sample_names.index(start_sample)
else:
    # Fall back to the sorted order instead of failing with a bare ValueError.
    start_index = 0
    print(f"Warning: start sample '{start_sample}' not found in {sample_path}; "
          "keeping the sorted order.")
sample_names = sample_names[start_index:] + sample_names[:start_index]
sample_names

['pbmc-1A-dm',
 'pbmc-1B-dm',
 'pbmc-1C-dm',
 'pbmc-2ctrl-dm',
 'pbmc-2stim-dm',
 'pbmc-ch',
 'pdx-MULTI',
 'cline-ch',
 'HEK-HMEC-MULTI',
 'hm-12k',
 'hm-6k',
 'HMEC-orig-MULTI',
 'HMEC-rep-MULTI',
 'J293t-dm',
 'mkidney-ch',
 'nuc-MULTI']

In [3]:
# Benchmark dbldec on every sample: load the data, run doublet detection,
# score the result against the ground-truth labels stored in adata.obs.y_true,
# and collect one single-row metrics frame per sample.
dfs = []
for sample_name in sample_names:
    print(sample_name)

    # Load the sample. Building the path with pathlib avoids depending on
    # `sample_path` ending with a path separator.
    sample_file = Path(sample_path) / f'{sample_name}.h5ad'
    adata = sc.read_h5ad(sample_file)
    # Densify only when X is sparse; a dense array has no .toarray().
    if hasattr(adata.X, 'toarray'):
        adata.X = adata.X.toarray()
    # Drop genes seen in no cell and cells expressing no gene.
    sc.pp.filter_genes(adata, min_cells=1)
    sc.pp.filter_cells(adata, min_genes=1)
    adata.raw = adata.copy()

    # Time only the detection call; perf_counter is a monotonic clock meant
    # for measuring intervals, unlike the wall-clock time.time().
    start_time = time.perf_counter()
    doublet_probs, doublet_preds = dbldec.dbl_dec(adata, n_features=1000, verbose=0)
    execution_time = time.perf_counter() - start_time

    # Get metrics in df
    # assumes y_true is binary with 1 = doublet (TP is read from the
    # [1, 1] cell of the confusion matrix) -- TODO confirm
    y_test = adata.obs.y_true
    y_pred = doublet_preds.astype(int)
    n_cells = adata.n_obs

    # AUPRC here is the trapezoidal area under the precision-recall curve,
    # which is not the same summary as sklearn's average_precision_score.
    precision, recall, _ = precision_recall_curve(y_test, doublet_probs)
    auprc = auc(recall, precision)
    fpr, tpr, _ = roc_curve(y_test, doublet_probs)
    auroc = auc(fpr, tpr)

    # Pin the label order so the matrix is always 2x2, even when a sample
    # (or its predictions) contains a single class; otherwise the matrix
    # collapses to 1x1 and indexing FP/FN/TP raises IndexError.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()

    # One row per sample, indexed by sample name. Building a single column
    # and transposing stores every metric as float (counts show as e.g. 78.0).
    df_metrics = pd.DataFrame([n_cells, execution_time, auprc, auroc, TN, FP, FN, TP],
                      index = ['Num cells', 'time', 'auprc', 'auroc', 'TN', 'FP', 'FN', 'TP']).T
    df_metrics.index = [sample_name]
    dfs.append(df_metrics)
    display(df_metrics)

    # Release the per-sample objects before loading the next dataset.
    del adata, doublet_probs, doublet_preds, y_test, y_pred, precision, recall, fpr, tpr, cm
    gc.collect()

# Combine the per-sample rows, persist them, and show the summary table.
df_output = pd.concat(dfs, ignore_index=False)
df_output.to_csv('gbc_output.csv')
display(df_output)

pbmc-1A-dm
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...



 To achieve the future defaults please pass: flavor="igraph" and n_iterations=2.  directed must also be False to work with igraph's implementation.
  sc.tl.leiden(adata, resolution=2.0, key_added='main', flavor='leidenalg')


Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 3298 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 190


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
pbmc-1A-dm,3298.0,70.293965,0.512352,0.832018,3066.0,112.0,42.0,78.0


pbmc-1B-dm
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 3790 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 250


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
pbmc-1B-dm,3790.0,52.249696,0.425062,0.785512,3486.0,174.0,54.0,76.0


pbmc-1C-dm
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 5270 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 479


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
pbmc-1C-dm,5270.0,105.065603,0.535934,0.82915,4684.0,270.0,107.0,209.0


pbmc-2ctrl-dm
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 13913 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 2165


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
pbmc-2ctrl-dm,13913.0,285.24845,0.676974,0.919817,11455.0,860.0,293.0,1305.0


pbmc-2stim-dm
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 13916 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 2090


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
pbmc-2stim-dm,13916.0,276.368513,0.662997,0.914259,11492.0,793.0,334.0,1297.0


pbmc-ch
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 15272 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 2234


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
pbmc-ch,15272.0,353.740901,0.628293,0.834048,12008.0,719.0,1030.0,1515.0


pdx-MULTI
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 10296 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 1792


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
pdx-MULTI,10296.0,193.497124,0.417183,0.735265,7885.0,1094.0,619.0,698.0


cline-ch
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 7954 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 526


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
cline-ch,7954.0,150.81303,0.406343,0.607305,6294.0,195.0,1134.0,331.0


HEK-HMEC-MULTI
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 10641 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 618


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
HEK-HMEC-MULTI,10641.0,192.936906,0.498714,0.78039,9796.0,356.0,227.0,262.0


hm-12k
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 12820 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 1258


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
hm-12k,12820.0,264.219542,0.815524,0.984241,11497.0,593.0,65.0,665.0


hm-6k
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 6806 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 424


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
hm-6k,6806.0,123.717122,0.942155,0.9985,6382.0,253.0,0.0,171.0


HMEC-orig-MULTI
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 26426 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 2714


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
HMEC-orig-MULTI,26426.0,890.967836,0.445429,0.75007,21632.0,1226.0,2080.0,1488.0


HMEC-rep-MULTI
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 10580 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 1360


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
HMEC-rep-MULTI,10580.0,214.870074,0.554666,0.657275,6919.0,379.0,2301.0,981.0


J293t-dm
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 500 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 17


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
J293t-dm,500.0,8.89773,0.124125,0.483209,446.0,12.0,37.0,5.0


mkidney-ch
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 21179 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 4623


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
mkidney-ch,21179.0,590.993744,0.580453,0.681254,11667.0,1611.0,4889.0,3012.0


nuc-MULTI
Selecting 1000 features...
Finished!
Preprocessing...
Finished!
Running Main Clustering...
Running Leiden Clustering...
Isolating density outliers...
Finished!
Generating 5578 Doublets...
Done!

Training Gradient Boosting classifier for heterotypic doublet detection...
Threshold found: 0.5
Number of doublets detected by classifier: 816


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
nuc-MULTI,5578.0,96.323686,0.447252,0.767841,4540.0,563.0,222.0,253.0


Unnamed: 0,Num cells,time,auprc,auroc,TN,FP,FN,TP
pbmc-1A-dm,3298.0,70.293965,0.512352,0.832018,3066.0,112.0,42.0,78.0
pbmc-1B-dm,3790.0,52.249696,0.425062,0.785512,3486.0,174.0,54.0,76.0
pbmc-1C-dm,5270.0,105.065603,0.535934,0.82915,4684.0,270.0,107.0,209.0
pbmc-2ctrl-dm,13913.0,285.24845,0.676974,0.919817,11455.0,860.0,293.0,1305.0
pbmc-2stim-dm,13916.0,276.368513,0.662997,0.914259,11492.0,793.0,334.0,1297.0
pbmc-ch,15272.0,353.740901,0.628293,0.834048,12008.0,719.0,1030.0,1515.0
pdx-MULTI,10296.0,193.497124,0.417183,0.735265,7885.0,1094.0,619.0,698.0
cline-ch,7954.0,150.81303,0.406343,0.607305,6294.0,195.0,1134.0,331.0
HEK-HMEC-MULTI,10641.0,192.936906,0.498714,0.78039,9796.0,356.0,227.0,262.0
hm-12k,12820.0,264.219542,0.815524,0.984241,11497.0,593.0,65.0,665.0
