In [None]:
import gc
import random
import sys, os
import time
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.metrics import precision_recall_curve

# dbldec is imported from the sibling "src" directory, so that directory must
# be on sys.path before the import statement runs.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "src"))
import dbldec

# scanpy settings
sc.settings.set_figure_params(dpi=100, frameon=False, facecolor='white')
sc.settings.verbosity = 0             # verbosity: errors (0), warnings (1), info (2), hints (3)

# Reproducibility: one seed, applied everywhere it can be.
# NOTE(review): `seed` does not appear among scanpy's documented settings, so
# the assignment on `sc.settings` may not seed anything by itself -- confirm.
# The Python and NumPy global RNGs are seeded as well so that any code drawing
# from them is repeatable; whether dbldec uses those RNGs is not visible here.
SEED = 42
sc.settings.seed = SEED
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Folder that holds one .h5ad file per benchmark sample.
sample_path = r'../../SUM25/h5ad_format/'
folder_path = Path(sample_path)

# glob() yields entries in arbitrary, filesystem-dependent order; sorting makes
# the processing order (and the row order of the output table) reproducible.
h5ad_files = sorted(folder_path.glob('*.h5ad'))
if not h5ad_files:
    raise FileNotFoundError(f"No .h5ad files found in '{folder_path}'")

# Path.stem drops only the trailing suffix, whereas str.replace would also
# remove a '.h5ad' substring occurring in the middle of a file name.
sample_names = [file.stem for file in h5ad_files]

# Temporary: rotate the list so that processing starts at `start_sample`;
# the samples that precede it are moved to the end rather than dropped.
start_sample = 'pbmc-1A-dm'
if start_sample not in sample_names:
    raise ValueError(
        f"start_sample '{start_sample}' not found in '{folder_path}'; "
        f"available samples: {sample_names}"
    )
start_index = sample_names.index(start_sample)
sample_names = sample_names[start_index:] + sample_names[:start_index]
sample_names

In [None]:
# Run the doublet detector on every sample and collect one row of metrics per
# sample: cell count, run time, AUPRC, AUROC and the confusion-matrix counts.
dfs = []
for sample_name in sample_names:
    print(sample_name)
    # get data
    sample_file = Path(sample_path) / f'{sample_name}.h5ad'
    adata = sc.read_h5ad(sample_file)
    # Densify only when X is sparse; a dense ndarray has no .toarray() method.
    if hasattr(adata.X, 'toarray'):
        adata.X = adata.X.toarray()
    sc.pp.filter_genes(adata, min_cells=1)
    sc.pp.filter_cells(adata, min_genes=1)
    adata.raw = adata.copy()

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    doublet_probs, doublet_preds = dbldec.dbl_dec(adata, n_features=1000, verbose=0)
    execution_time = time.perf_counter() - start_time

    # Get metrics in df
    # NOTE(review): assumes adata.obs.y_true holds 0/1 (or boolean) ground-truth
    # doublet labels -- confirm against the data files.
    y_test = adata.obs.y_true
    y_pred = doublet_preds.astype(int)
    n_cells = adata.n_obs
    precision, recall, thresholds = precision_recall_curve(y_test, doublet_probs)
    auprc = auc(recall, precision)
    fpr, tpr, _ = roc_curve(y_test, doublet_probs)
    auroc = auc(fpr, tpr)
    # labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in
    # y_test/y_pred; without it the matrix is 1x1 and the unpacking fails.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()

    # Building the row from a dict keeps a dtype per column, so the counts stay
    # integers instead of being coerced to float alongside the scores.
    df_metrics = pd.DataFrame(
        {
            'Num cells': n_cells,
            'time': execution_time,
            'auprc': auprc,
            'auroc': auroc,
            'TN': TN,
            'FP': FP,
            'FN': FN,
            'TP': TP,
        },
        index=[sample_name],
    )
    dfs.append(df_metrics)
    display(df_metrics)

    # Checkpoint after every sample so that a failure on a later sample does
    # not discard the results already computed.
    pd.concat(dfs, ignore_index=False).to_csv('gbc_output.csv')

    # Release the per-sample objects before the next (possibly large) sample.
    del adata, doublet_probs, doublet_preds, y_test, y_pred, precision, recall, thresholds, fpr, tpr, cm
    gc.collect()

df_output = pd.concat(dfs, ignore_index=False)
df_output.to_csv('gbc_output.csv')
display(df_output)