In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
import umap as um
import pathlib as pl
from scipy.io import mmread
from sklearn.preprocessing import StandardScaler
import scipy.sparse as scs
from sklearn.decomposition import PCA


def sigmoid(x):
    """Steep logistic curve centred on x = 0.5.

    Evaluates 1 / (1 + exp(6 - 12*x)), so an input of 0.5 maps to exactly
    0.5. Because it relies on ``np.exp`` it works on scalars and, elementwise,
    on NumPy arrays.

    Parameters
    ----------
    x : scalar or array-like accepted by NumPy arithmetic

    Returns
    -------
    Same shape as ``x``; values lie between 0 and 1.
    """
    exponent = 6 - 12 * x
    return 1 / (1 + np.exp(exponent))

In [None]:
# Root folder for everything this notebook writes (cached UMAP coordinates
# and figures).
save_path = '../results_manuscript/UMAPs_preds/'
# Sub-folder of ../data/ that the annotation CSVs are read from.
data_dir = 'sce_normalized_data_inflate'
# Folder holding the benchmark result CSVs.
path = '../results_benchmark/'

# Split the name of every regular file ending in 'csv' on '_'. Later cells
# read element 0 of each split as the dataset name and element 1 as the
# method name.
files = np.array([
    name.split('_')
    for name in listdir(path)
    if name.endswith('csv') and isfile(join(path, name))
])
files

In [None]:
# Distinct dataset names (first '_'-separated token of every result file).
# np.unique already returns its result sorted, so no extra np.sort is needed.
data_names = np.unique(files[:, 0])
data_names

In [None]:
# Display the distinct values of the second '_'-separated token of the result
# file names (used as the method name by the plotting cells).
np.unique(files[:,1])

In [None]:
# Methods drawn in the combined figure, one column each, in this order.
method_names = ['vaeda', 'scDblFinder', 'DoubletFinder', 'hybrid', 'solo', 'bcds', 'Scrublet', 'cxds']

In [None]:
# For every dataset: load the cached UMAP embedding (computing and caching it
# first when no cache file exists) and save a scatter plot coloured by the
# annotated labels.

# np.save and savefig do not create missing folders, so make sure both output
# folders exist before any (potentially slow) work starts.
pl.Path(save_path + 'UMAP_coords/').mkdir(parents=True, exist_ok=True)
pl.Path(save_path + 'individual_plots/').mkdir(parents=True, exist_ok=True)

for data_name in data_names:

    print('----------------')
    print('')
    print(data_name)

    #read in UMAP coords
    umap_file = save_path + 'UMAP_coords/' + data_name + '_UMAP.npy'
    if pl.Path(umap_file).exists():
        proj = np.load(umap_file)
    else:
        #read in real data and make UMAP
        real_path = '../data/mtx_files/' + data_name + '.mtx'

        print('making UMAP projection')
        dat_real = mmread(real_path)
        # Densify and transpose; presumably this gives cells x genes -- TODO confirm.
        Xr = scs.csr_matrix(dat_real).toarray().T
        print(Xr.shape)

        # Keep the highest-variance columns (at most 2000). Capping at the
        # number of available columns avoids the ValueError np.argpartition
        # raises when asked for more elements than exist. The selection is
        # deterministic, so no RNG seeding is required here.
        var = np.var(Xr, axis=0)
        n_hvgs = min(2000, Xr.shape[1])
        hvgs = np.argpartition(var, -n_hvgs)[-n_hvgs:]
        Xr = Xr[:, hvgs]

        #scale
        Xr = np.log2(Xr + 1)
        # The scaler is fitted on the transpose, i.e. every ROW of Xr is
        # centred and scaled (not every column). StandardScaler is
        # deterministic, so it needs no seed either.
        scaler = StandardScaler().fit(Xr.T)
        Xr = scaler.transform(Xr.T).T

        #PCA
        # PCA is constructed without random_state, so seed the global NumPy
        # RNG in case a randomized solver is selected.
        np.random.seed(42)
        pca = PCA(n_components=20)
        pca_proj = pca.fit_transform(Xr)
        del Xr

        #UMAP
        proj = um.UMAP(n_neighbors=7, random_state=42).fit_transform(pca_proj)

        np.save(umap_file, proj)
        print('done making UMAP projection')

    #read in annotations
    ano_path = '../data/' + data_dir + '/' + data_name + '_anno.csv'

    #- READ IN BARCODE ANNOTATIONS
    ano = pd.read_csv(ano_path)
    labels = ano.x
    # Integer coding of the labels. pd.factorize gives the first label seen
    # code 0; when that label is 'doublet' the two codes are swapped so that
    # 'doublet' becomes 1. (`true` is not used by the plot below.)
    true = pd.factorize(labels)[0]
    if labels[0] == 'doublet':
        tmp = true + 3
        tmp[tmp == 3] = 1
        tmp[tmp == 4] = 0
        true = tmp

    #UMAP of raw data colored by actual labels
    labs = labels
    colors = ['#d7e4ef', '#4c004b']
    alphas = [1, 1]
    # Marker size shrinks with the square root of the number of points.
    size = (1 / np.sqrt(proj.shape[0])) * 30

    fig, ax = plt.subplots()

    # Labels are drawn in reverse sorted order; the label drawn last ends up
    # on top and gets the second (dark) colour.
    for i, lab in enumerate(np.unique(labs)[::-1]):
        ind = (labs == lab).to_numpy()
        ax.scatter(proj[ind, 0], proj[ind, 1], c=colors[i], s=size,
                   alpha=alphas[i], label=lab)

    ax.legend(bbox_to_anchor=(1, 1), loc="upper left")
    #make square
    x0, x1 = ax.get_xlim()
    y0, y1 = ax.get_ylim()
    ax.set_aspect(abs(x1 - x0) / abs(y1 - y0))
    #remove ticks
    ax.tick_params(axis='both',
                   which='both',
                   bottom=False,
                   top=False,
                   labelbottom=False,
                   right=False,
                   left=False,
                   labelleft=False)

    fig.savefig(save_path + 'individual_plots/' + data_name + '_true.png', dpi=300)
    plt.show()
    plt.close(fig)
    
    

In [None]:
# One UMAP per (dataset, method) pair, coloured by that method's doublet score.
#
# `files` has one row per result CSV, but the scores are always read from the
# '<dataset>_<method>_scores_1.csv' file. Iterating over the distinct
# (dataset, method) pairs therefore produces the same set of figures without
# redrawing and re-saving one when a pair has more than one CSV.
# dict.fromkeys keeps the first-seen order.
pairs = list(dict.fromkeys((str(file[0]), str(file[1])) for file in files))

# savefig does not create missing folders.
pl.Path(save_path + 'individual_plots/').mkdir(parents=True, exist_ok=True)

for data_name, method_name in pairs:

    print('----------------')
    print('')
    print(data_name)
    print(method_name)

    #read in UMAP coords (expects the per-dataset .npy cache to exist already)
    umap_file = save_path + 'UMAP_coords/' + data_name + '_UMAP.npy'
    proj = np.load(umap_file)

    #read in the method's scores
    preds_file = path + data_name + '_' + method_name + '_scores_1.csv'
    results = pd.read_csv(preds_file)
    preds = np.array(results.doublet_scores)

    #UMAP of raw data colored by predicted scores
    alpha = 1
    # Marker size shrinks with the square root of the number of points.
    size = (1 / np.sqrt(proj.shape[0])) * 30

    fig, ax = plt.subplots()

    # Draw points in ascending score order so the highest scores end up on top.
    ind = np.argsort(preds)
    points = ax.scatter(proj[ind, 0], proj[ind, 1], c=preds[ind], s=size,
                        alpha=alpha, cmap='BuPu')
    fig.colorbar(points, ax=ax)

    #make square
    x0, x1 = ax.get_xlim()
    y0, y1 = ax.get_ylim()
    ax.set_aspect(abs(x1 - x0) / abs(y1 - y0))
    #remove ticks
    ax.tick_params(axis='both',
                   which='both',
                   bottom=False,
                   top=False,
                   labelbottom=False,
                   right=False,
                   left=False,
                   labelleft=False)

    fig.savefig(save_path + 'individual_plots/' + data_name + '_' + method_name + '_preds.png', dpi=300)

    plt.show()
    plt.close(fig)
    



In [None]:
# Combined figure: one row per dataset; the first column shows the annotated
# labels and each following column shows one method's scores on the same UMAP.
rows = len(data_names)
cols = len(method_names) + 1

width = 18.267717
width_p_plot = width / cols
# Each row is as tall as a panel is wide, plus 0.3 extra per row.
height = rows*width_p_plot + rows*0.3

# Create the whole grid up front and address every panel through its own Axes
# object, so the "make square" and tick-removal steps act on the panel that
# was just drawn. squeeze=False keeps `axes` 2-D even for a single row.
fig, axes = plt.subplots(rows, cols, figsize=(width, height), dpi=300, squeeze=False)

# savefig does not create missing folders.
pl.Path(save_path).mkdir(parents=True, exist_ok=True)

for r, data_name in enumerate(data_names):

    print(data_name)

    #read in UMAP coords (expects the per-dataset .npy cache to exist already)
    umap_file = save_path + 'UMAP_coords/' + data_name + '_UMAP.npy'
    proj = np.load(umap_file)

    ano_path = '../data/' + data_dir + '/' + data_name + '_anno.csv'

    #- READ IN BARCODE ANNOTATIONS
    ano = pd.read_csv(ano_path)
    labels = ano.x
    # Integer coding of the labels. pd.factorize gives the first label seen
    # code 0; when that label is 'doublet' the two codes are swapped so that
    # 'doublet' becomes 1. (`true` is not used by the plots below.)
    true = pd.factorize(labels)[0]
    if labels[0] == 'doublet':
        tmp = true + 3
        tmp[tmp == 3] = 1
        tmp[tmp == 4] = 0
        true = tmp

    #UMAP of raw data colored by actual labels
    labs = labels
    colors = ['#d7e4ef', '#4c004b']
    alphas = [1, 1]
    # Marker size shrinks with the square root of the number of points.
    size = (1 / np.sqrt(proj.shape[0])) * 30

    panel = axes[r, 0]

    # Labels are drawn in reverse sorted order; the label drawn last ends up
    # on top and gets the second (dark) colour.
    for i, lab in enumerate(np.unique(labs)[::-1]):
        ind = (labs == lab).to_numpy()
        panel.scatter(proj[ind, 0], proj[ind, 1], c=colors[i], s=size,
                      alpha=alphas[i], label=lab)

    #make square
    x0, x1 = panel.get_xlim()
    y0, y1 = panel.get_ylim()
    panel.set_aspect(abs(x1 - x0) / abs(y1 - y0))
    #remove ticks
    panel.tick_params(axis='both',
                      which='both',
                      bottom=False,
                      top=False,
                      labelbottom=False,
                      right=False,
                      left=False,
                      labelleft=False)

    panel.set_ylabel(data_name)
    if r == 0:
        panel.set_title('Doublet Labels')

    #read in other method's things
    for c, method_name in enumerate(method_names, start=1):
        print(method_name)

        preds_file = path + data_name + '_' + method_name + '_scores_1.csv'
        results = pd.read_csv(preds_file)
        preds = np.array(results.doublet_scores)

        # Min-max scale to [0, 1]. The divisor has to be the range
        # (max - min): dividing by max alone is wrong whenever min != 0 and
        # divides by zero or flips the sign when max <= 0. Constant scores
        # map to 0.
        lo = np.min(preds)
        span = np.max(preds) - lo
        if span > 0:
            preds = (preds - lo) / span
        else:
            preds = np.zeros(len(preds))

        #UMAP of raw data colored by predicted scores
        alpha = 1

        panel = axes[r, c]

        # Draw points in ascending score order so the highest scores end up
        # on top; vmin/vmax pin the colour scale to the scaled [0, 1] range.
        ind = np.argsort(preds)
        panel.scatter(proj[ind, 0], proj[ind, 1], c=preds[ind], s=size,
                      alpha=alpha, cmap='BuPu', vmin=0, vmax=1)

        #make square
        x0, x1 = panel.get_xlim()
        y0, y1 = panel.get_ylim()
        panel.set_aspect(abs(x1 - x0) / abs(y1 - y0))
        #remove ticks
        panel.tick_params(axis='both',
                          which='both',
                          bottom=False,
                          top=False,
                          labelbottom=False,
                          right=False,
                          left=False,
                          labelleft=False)

        if r == 0:
            panel.set_title(method_name)


fig.savefig(save_path + 'ALL_PREDS.png', dpi=300)

plt.show()
plt.close(fig)


In [None]:
# Progress marker: printed once the cells above have finished running.
print('done')

In [None]:
# Score histogram for every dataset that has 'vaedaRF' results, together with
# the fraction of positive scores and the fraction of annotated doublets.
#
# `files` has one row per result CSV, but the scores are always read from the
# '<dataset>_<method>_scores_1.csv' file, so iterate over the distinct
# (dataset, method) pairs to avoid repeating identical work when a pair has
# more than one CSV. dict.fromkeys keeps the first-seen order.
pairs = list(dict.fromkeys((str(file[0]), str(file[1])) for file in files))

for data_name, method_name in pairs:

    print('----------------')
    print('')
    print(data_name)
    print(method_name)

    if method_name != 'vaedaRF':
        continue

    preds_file = path + data_name + '_' + method_name + '_scores_1.csv'
    results = pd.read_csv(preds_file)
    preds = np.array(results.doublet_scores)

    #read in annotations
    ano_path = '../data/' + data_dir + '/' + data_name + '_anno.csv'

    #- READ IN BARCODE ANNOTATIONS
    ano = pd.read_csv(ano_path)
    labels = ano.x
    # Integer coding of the labels. pd.factorize gives the first label seen
    # code 0; when that label is 'doublet' the two codes are swapped so that
    # 'doublet' becomes 1.
    true = pd.factorize(labels)[0]
    if labels[0] == 'doublet':
        tmp = true + 3
        tmp[tmp == 3] = 1
        tmp[tmp == 4] = 0
        true = tmp

    # Fraction of cells with a positive score, then fraction with a positive
    # label code.
    print(np.sum(preds > 0) / len(preds))
    print(np.sum(true > 0) / len(true))

    #histogram of the scores
    fig, ax = plt.subplots()

    ax.hist(preds, bins=50)

    #make square
    x0, x1 = ax.get_xlim()
    y0, y1 = ax.get_ylim()
    ax.set_aspect(abs(x1 - x0) / abs(y1 - y0))
    #remove ticks
    # NOTE(review): this also hides the histogram's axis values -- confirm
    # that is intended.
    ax.tick_params(axis='both',
                   which='both',
                   bottom=False,
                   top=False,
                   labelbottom=False,
                   right=False,
                   left=False,
                   labelleft=False)

    plt.show()
    plt.close(fig)