In [None]:
import numpy as np
from numpy import *
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from os import listdir
from os.path import isfile, join, isdir
import pathlib as pl

from plot_results import get_dbl_metrics

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef


In [None]:
# Output directory for the figures/tables written by this notebook, and the
# input directory that holds the benchmark result CSVs.
save_path = '../results_manuscript/doublet_calls/'

path = '../results_benchmark/'

# Every CSV name is split on '_'; the rest of the notebook reads field 0 as the
# data-set name and field 1 as the method name.
# A previous listing that additionally filtered out 'lib-sze' was overwritten
# on the very next line (dead code) and could raise IndexError for names
# without '_', so it has been removed.
files = [f.split('_') for f in listdir(path) if isfile(join(path, f)) and f[-3:] == 'csv']
files = np.array(files)
files

In [None]:
# Column 0 of `files` carries the data-set names, column 1 the method names.
# np.unique returns its result already sorted, so no separate sort is needed.
data_names = np.unique(files[:, 0])
methods = np.unique(files[:, 1])

In [None]:
# Display the unique method names found in the result file names.
methods

In [None]:
# Column layout shared by both result tables: identifiers, doublet fractions,
# classification metrics and two removal ratios.
cols = [
    'method', 'data_name',
    'actual_frac', 'expected_frac', 'estimated_frac',
    'f1', 'mcc', 'precision', 'recall', 'accuracy',
    'doublets removed ÷ total doublets',
    'singlets removed ÷ total removed',
]
# One row per (data set, method) combination.
num = len(data_names) * len(methods)

# Two independent zero-filled buffers so the two frames never share memory.
tmp1 = np.zeros((num, len(cols)))
tmp2 = np.zeros((num, len(cols)))

df_res = pd.DataFrame(tmp1, columns=cols)
df_exp = pd.DataFrame(tmp2, columns=cols)

In [None]:
pd.options.mode.chained_assignment = None


def _to_binary_calls(calls):
    """Convert a column of doublet calls into an integer 0/1 array (1 = doublet).

    The encoding is detected from the first element, as booleans, integers,
    'Doublet'/'doublet' strings, or floats.

    Raises:
        TypeError: if the first element has any other type.
    """
    calls = np.asarray(calls)
    first = calls[0]
    # bool must be tested before int: Python's bool is a subclass of int.
    if isinstance(first, (bool, np.bool_)):
        return calls.astype(bool).astype(int)
    if isinstance(first, (int, np.integer)):
        return calls.astype(int)
    # The builtins replace np.str / np.float, which were removed in NumPy 1.24.
    if isinstance(first, str):
        return ((calls == 'Doublet') | (calls == 'doublet')).astype(int)
    if isinstance(first, (float, np.floating)):
        return calls.astype(int)
    raise TypeError('UNKNOWN DATATYPE for predicted_doublets: ' + str(type(first)))


def _score_calls(true, calls):
    """Return classification metrics and removal ratios of `calls` against `true`."""
    return {
        'f1': f1_score(true, calls, average="binary"),
        'mcc': matthews_corrcoef(true, calls),
        'precision': precision_score(true, calls, average="binary"),
        'recall': recall_score(true, calls, average="binary"),
        'accuracy': accuracy_score(true, calls),
        'doublets removed ÷ total doublets': np.sum(calls[true == 1]) / np.sum(true),
        'singlets removed ÷ total removed': np.sum(calls[true == 0]) / np.sum(calls),
    }


# Rows are collected as records and turned into DataFrames once at the end.
# The previous chained `df[col].iloc[i] = value` writes depend on pandas
# internals and silently do nothing under copy-on-write.
res_rows = []
exp_rows = []
for file in files:
    res = pd.read_csv(path + '_'.join(file))
    scores = np.array(res['doublet_scores'])
    try:
        calls = _to_binary_calls(res['predicted_doublets'])
    except TypeError:
        # Name the offending result file before propagating the error.
        print(file)
        raise

    #- READ IN BARCODE ANNOTATIONS
    ano = pd.read_csv('../data/mtx_files/' + file[0] + '_anno.csv')
    # factorize gives code 0 to the first label it encounters; when that label
    # is 'doublet' the codes are flipped so that 1 always marks a doublet.
    # NOTE(review): assumes the annotation has exactly two label classes.
    true = pd.factorize(ano.x)[0]
    if ano.x.iloc[0] == 'doublet':
        true = 1 - true

    total = len(calls)
    actual = np.sum(true) / total
    estimated = np.sum(calls) / total

    # Alternative calls: mark the exp_num highest-scoring cells as doublets,
    # with exp_num = n**2 / 10**5. The slice start is computed explicitly
    # because `[-exp_num:]` would select *every* cell when exp_num == 0.
    exp_num = int(len(scores) ** 2 / (10 ** 5))
    exp_calls = np.zeros(len(scores))
    top = np.argsort(scores)[max(len(scores) - exp_num, 0):]
    exp_calls[top] = 1

    # Identifier and fraction columns are identical in both tables.
    shared = {
        'method': file[1],
        'data_name': file[0],
        'actual_frac': actual,
        'expected_frac': (total) * (10 ** (-5)),
        'estimated_frac': estimated,
    }
    res_rows.append({**shared, **_score_calls(true, calls)})
    exp_rows.append({**shared, **_score_calls(true, exp_calls)})

df_res = pd.DataFrame(res_rows, columns=cols)
df_exp = pd.DataFrame(exp_rows, columns=cols)

# For the 'lib-sze' method the expected-count calls replace its own calls.
is_lib_sze = df_res.method == 'lib-sze'
df_res.loc[is_lib_sze, :] = df_exp.loc[is_lib_sze, :]

In [None]:
# Width passed as the first `figsize` entry of the figures below
# (matplotlib interprets figsize values as inches).
MAX_WIDTH = 6.726

In [None]:
# Use Dark2 both as the default image colormap and as the colour cycle for lines/markers.
plt.rcParams.update({
    "image.cmap": "Dark2",
    'axes.prop_cycle': plt.cycler(color=plt.cm.Dark2.colors),
})

In [None]:
# One scatter plot per method: actual vs. estimated doublet fraction, with the
# correlation coefficient (rounded to 3 decimals) in the title. Each figure is
# saved as '<method>_doublet_frac_scatter.png' under save_path.
corrs = []
for method in methods:
    df = df_res.loc[df_res.method == method, :]
    fig, ax = plt.subplots(figsize=(MAX_WIDTH, 3), dpi=300)

    # Black identity line as a visual reference.
    ax.plot([0, 0.4], [0, 0.4], c='black')
    ax.scatter(df.actual_frac, df.estimated_frac, alpha=0.8)

    corr = np.round(np.corrcoef(df.actual_frac, df.estimated_frac)[0, 1], 3)
    corrs.append(corr)

    ax.set_title('Doublet Fraction (R: ' + str(corr) + ')')
    ax.set_xlabel('actual doublet fraction')
    ax.set_ylabel('estimated doublet fraction')

    ax.set_facecolor("white")
    ax.grid(False)

    # Aspect = x-range / y-range, which renders the axes box as a square.
    x0, x1 = ax.get_xlim()
    y0, y1 = ax.get_ylim()
    ax.set_aspect(abs(x1 - x0) / abs(y1 - y0))

    fig.tight_layout()
    fig.savefig(save_path + method + '_doublet_frac_scatter.png', dpi=300)
    plt.show()
    plt.close(fig)


In [None]:
# Actual vs. expected doublet fraction, drawn from the rows of the 'vaeda'
# method, and saved as 'EXPECTED_doublet_frac_scatter.png'.
df = df_res.loc[df_res.method == 'vaeda', :]
fig, ax = plt.subplots(figsize=(MAX_WIDTH, 3), dpi=300)

# Black identity line as a visual reference.
ax.plot([0, 0.4], [0, 0.4], c='black')
ax.scatter(df.actual_frac, df.expected_frac, alpha=0.8)

corr = np.round(np.corrcoef(df.actual_frac, df.expected_frac)[0, 1], 3)
corrs.append(corr)

ax.set_title('Doublet Fraction (R: ' + str(corr) + ')')
ax.set_xlabel('actual doublet fraction')
ax.set_ylabel('expected doublet fraction')

ax.set_facecolor("white")
ax.grid(False)

# Aspect = x-range / y-range, which renders the axes box as a square.
x0, x1 = ax.get_xlim()
y0, y1 = ax.get_ylim()
ax.set_aspect(abs(x1 - x0) / abs(y1 - y0))

fig.tight_layout()
fig.savefig(save_path + 'EXPECTED_doublet_frac_scatter.png', dpi=300)
plt.show()
plt.close(fig)

In [None]:
# Re-order the methods by their mean F1 over all data sets, best first.
# numeric_only=True skips the string `data_name` column; since pandas 2.0 a
# plain mean over it raises TypeError instead of silently dropping it.
methods = (
    df_res.groupby(['method'])
    .mean(numeric_only=True)
    .sort_values(by='f1', ascending=False)
    .index.to_numpy()
)

In [None]:
# Display the methods in their current order.
methods

In [None]:
# Mean of each metric per method, sorted by F1 (best first), rounded to three
# decimals and written to 'table.csv'.
# numeric_only=True skips the string `data_name` column; since pandas 2.0 a
# plain mean over it raises TypeError instead of silently dropping it.
table = (
    df_res.groupby(['method'])
    .mean(numeric_only=True)
    .sort_values(by='f1', ascending=False)
    .loc[:, ['f1', 'mcc', 'precision', 'recall', 'accuracy']]
    .round(3)
)
table.to_csv(save_path + 'table.csv')

In [None]:
# Display the per-method summary table.
table

In [None]:
# Per-method metric means for each method's own calls (d1) and for the
# expected-count calls (d2), both in the row order given by `methods`.
# numeric_only=True skips the string `data_name` column; since pandas 2.0 a
# plain mean over it raises TypeError instead of silently dropping it.
# No sort is needed here: `.loc[methods, ...]` already fixes the row order.
metric_cols = ['f1', 'mcc', 'precision', 'recall', 'accuracy']
d1 = df_res.groupby(['method']).mean(numeric_only=True).loc[methods, metric_cols]
d2 = df_exp.groupby(['method']).mean(numeric_only=True).loc[methods, metric_cols]
d2.round(3).to_csv(save_path + 'table_exp.csv')

# Positive values mean the method's own calls beat the expected-count calls.
caller_minus_exp = d1.subtract(d2).round(3)
caller_minus_exp.to_csv(save_path + 'table_caller-exp.csv')
caller_minus_exp

In [None]:
def mk_heatmap(hm, title = 'Heatmap'):
    """Draw an annotated seaborn heatmap of `hm` on a new 300-dpi figure.

    Cell colours follow a row-wise min-max rescaling of `hm` to the range
    0-100, while the numbers printed in the cells are the unscaled values.

    Args:
        hm: table to plot; must support the DataFrame methods used below
            (`min`, `max`, `subtract`, `div`).
        title: text placed above the heatmap.
    """
    # Row-wise min-max normalisation, expressed in percent.
    row_min = hm.min(axis=1)
    row_span = hm.max(axis=1) - row_min
    hm_norm = hm.subtract(row_min, axis=0).div(row_span, axis=0) * 100

    fig, ax = plt.subplots(1, 1, dpi=300)
    ax = sns.heatmap(hm_norm, square=False, ax=ax, cbar=True, annot=hm, cmap="YlOrRd", fmt='g')
    ax.set_yticklabels(ax.get_yticklabels(), rotation=45)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.set_title(title, fontsize=18)

    plt.tight_layout()

In [None]:
# Zero-filled table with one row per data set and one column per method.
tmp1 = np.zeros((len(data_names), len(methods)))
hm = pd.DataFrame(data=tmp1, columns=methods, index=data_names)

In [None]:
# For every metric column: a violin + swarm plot per method (methods ordered by
# their mean, best first), overlaid with each method's mean marker and an
# error bar of one standard deviation. Saved as '<metric>_violin.png'.
for col in cols[5:-2]:
    df = df_res

    per_method = df.groupby(['method'])[col]
    df_means = per_method.mean().reset_index().sort_values([col], ascending=False)
    pr_methods = df_means['method'].to_numpy()

    sns.set(rc={"figure.figsize": (MAX_WIDTH, 4)})
    sns.set_style("whitegrid")

    # Standard deviation per method, arranged in the same order as df_means.
    spread = np.sqrt(per_method.var().loc[pr_methods])

    plt.errorbar(df_means.method, df_means[col], spread,
                 fmt='none', ecolor='black', capsize=5)

    ax = sns.violinplot(data=df, x='method', y=col, palette="Set2", linewidth=0, dodge=True, order=pr_methods)
    # Fade the violin bodies so the individual points stay visible.
    plt.setp(ax.collections, alpha=.3)

    ax = sns.swarmplot(data=df, x='method', y=col, palette="Set2", dodge=True, size=3, order=pr_methods)

    # Outlined markers for the per-method means.
    sns.swarmplot(data=df_means, x='method', y=col, marker='o', palette="Set2", size=3, linewidth=1, edgecolor='black', order=pr_methods)

    plt.xticks(rotation=45, horizontalalignment='right')
    plt.title(col)
    plt.xlabel('')

    plt.tight_layout()
    plt.savefig(save_path + col + '_violin.png', dpi=300)
    plt.show()
    plt.close()

In [None]:
from scipy.stats import ranksums, wilcoxon


In [None]:
# Pairwise "tournament" between all methods, one table per metric.
# Two methods are compared with a Wilcoxon signed-rank test on their per-data-set
# values; at p < 0.05 the method with the larger summed value wins, otherwise
# the pair is recorded as a 'tie'.
for col in cols[5:-2]:
    print(col)
    # Object dtype lets method names / 'tie' be stored next to the 0.0 diagonal;
    # writing strings into a float64 frame is deprecated and later rejected by pandas.
    torny = pd.DataFrame(
        np.zeros((len(methods), len(methods))), index=methods, columns=methods
    ).astype(object)

    df = df_res.loc[:, ['method', 'data_name', col]]
    df.index = df['data_name']

    for i in range(len(methods)):
        m1 = methods[i]
        for j in range(i + 1, len(methods)):
            m2 = methods[j]

            # Bring both samples into the same data-set order so they are paired.
            d1 = df.loc[df.method == m1, col].loc[data_names]
            d2 = df.loc[df.method == m2, col].loc[data_names]

            w = wilcoxon(d1, d2)
            diff = (d1 - d2).sum()

            significant = w[1] < 0.05
            if significant and diff < 0:
                outcome = m2      # loss for m1
            elif significant and diff > 0:
                outcome = m1      # win for m1
            else:
                outcome = 'tie'

            # The table is symmetric: both cells name the winner (or 'tie').
            torny.loc[m1, m2] = outcome
            torny.loc[m2, m1] = outcome

    print(torny)



In [None]:
# Same pairwise tournament as for all methods, restricted to three methods and
# printing the details of every comparison: the per-method mean, the summed
# difference, the Wilcoxon signed-rank p-value ('p') and the rank-sum p-value ('p2').
meths = ['vaeda', 'scDblFinder', 'DoubletFinder']
for col in cols[5:-2]:
    print(col)
    # Object dtype lets method names / 'tie' be stored next to the 0.0 diagonal;
    # writing strings into a float64 frame is deprecated and later rejected by pandas.
    torny = pd.DataFrame(
        np.zeros((len(meths), len(meths))), index=meths, columns=meths
    ).astype(object)

    df = df_res.loc[:, ['method', 'data_name', col]]
    df.index = df['data_name']

    for i in range(len(meths)):
        m1 = meths[i]
        for j in range(i + 1, len(meths)):
            m2 = meths[j]

            # Bring both samples into the same data-set order so they are paired.
            d1 = df.loc[df.method == m1, col].loc[data_names]
            d2 = df.loc[df.method == m2, col].loc[data_names]

            w = wilcoxon(d1, d2)
            diff = (d1 - d2).sum()

            r = ranksums(d1, d2)

            # Means are taken over the actual number of data sets instead of a
            # hard-coded 16, so the printout stays correct if the set changes.
            print('m1', m1)
            print('d1', d1.sum() / len(d1))
            print('m2', m2)
            print('d2', d2.sum() / len(d2))
            print('diff', diff)
            print('p', w[1])
            print('p2', r[1])

            significant = w[1] < 0.05
            if significant and diff < 0:
                outcome = m2      # loss for m1
            elif significant and diff > 0:
                outcome = m1      # win for m1
            else:
                outcome = 'tie'

            # The table is symmetric: both cells name the winner (or 'tie').
            torny.loc[m1, m2] = outcome
            torny.loc[m2, m1] = outcome

    print(torny)

In [None]:
# Keep only the result-file entries whose method field (column 1) is 'vaeda'.
fs = files[files[:, 1] == 'vaeda']
fs

In [None]:
def log_norm(x, mean, sd):
    """Return the log-density of a normal distribution evaluated at `x`.

    Computes ``-log(sd * sqrt(2*pi)) - 0.5 * ((x - mean) / sd)**2``.

    Args:
        x: point(s) at which to evaluate; a scalar or a NumPy array.
        mean: mean of the distribution.
        sd: standard deviation of the distribution.

    Returns:
        The log-density, with the same shape as the broadcast inputs.
    """
    # np.pi replaces math.pi: `math` was never imported here and was only
    # reachable through `from numpy import *` on old NumPy releases.
    log_scale = -np.log(sd * np.sqrt(2 * np.pi))
    log_kernel = (-.5) * ((x - mean) / sd) ** 2
    return log_scale + log_kernel

In [None]:
# For every vaeda result file: sweep a score threshold, build a cost curve from
# three terms, record the threshold with the lowest cost in `threshes`, and
# save the curves as '<data set>_cost.png'.
pd.options.mode.chained_assignment = None
threshes = []

plt.rcParams["image.cmap"] = "Dark2"
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=plt.cm.Dark2.colors)

for i, file in enumerate(fs):

    # Scores stored for this data set in the '_scores_on_sim.npy' file, and the
    # 'doublet_scores' column of the benchmark result file.
    scores_on_sim = np.load('../results_PU/final_vaeda_result/VAEDA_new_calls/' + file[0] + '_scores_on_sim.npy')
    res = pd.read_csv(path + '_'.join(file))
    scores = np.array(res['doublet_scores'])

    # Candidate thresholds span the joint score range in steps of 0.001.
    minimum = np.min([np.min(scores), np.min(scores_on_sim)])
    maximum = np.max([np.max(scores), np.max(scores_on_sim)])
    thresholds = np.arange(minimum, maximum, 0.001)

    # Expected doublet count n * (n / 10**5) and its binomial-style spread.
    n = len(scores)
    dbr = n/10**5
    dbl_expected = n*dbr
    dbr_sd = np.sqrt(n*dbr*(1-dbr))

    # The log-likelihood term is scaled by its value at the highest threshold.
    norm_factor = log_norm(np.sum(scores >= thresholds[-1]), dbl_expected, dbr_sd)

    # Number of scores at or above each threshold.
    n_above = [np.sum(scores >= thresh) for thresh in thresholds]

    FNR = [np.sum(scores_on_sim < thresh) / len(scores_on_sim) for thresh in thresholds]
    FPR = [d_t / len(scores) for d_t in n_above]
    ll_doub = [log_norm(d_t, dbl_expected, dbr_sd) / norm_factor for d_t in n_above]

    ll_squared = np.array(ll_doub)**2
    cost = np.array(FNR) + np.array(FPR) + ll_squared

    # Threshold with the lowest total cost.
    t = thresholds[np.argmin(cost)]
    clls = scores > t

    threshes.append(t)

    fig, ax = plt.subplots(figsize=(MAX_WIDTH, 3), dpi=300)

    ax.plot(thresholds, FNR, linewidth=2, label='FNR')
    ax.plot(thresholds, FPR, linewidth=2, label='FPR')
    ax.plot(thresholds, ll_squared, linewidth=2, label='\u03B1LL\u00b2')
    ax.plot(thresholds, cost, linewidth=2, label='cost')

    # Dotted vertical line at the selected threshold.
    ax.plot([t, t], [0, 1], c='black', alpha=0.7, linestyle='dotted', label='threshold')

    ax.set_facecolor("white")
    ax.grid(False)

    # Aspect = x-range / y-range, which renders the axes box as a square.
    ax.set_ylim(0, 1)
    x0, x1 = ax.get_xlim()
    y0, y1 = ax.get_ylim()
    ax.set_aspect(abs(x1 - x0) / abs(y1 - y0))

    ax.legend(bbox_to_anchor=(1, 1), loc="upper left")
    ax.set_xlabel('thresholds')
    ax.set_ylabel('cost')

    ax.set_title(file[0])

    fig.tight_layout()
    fig.savefig(save_path + file[0] + '_cost.png', dpi=300)

    plt.show()
    plt.close(fig)
    


In [None]:
# All per-data-set cost curves of the vaeda results in one grid figure,
# saved as 'ALL_cost.png'. The selected thresholds are collected in `threshes`.
pd.options.mode.chained_assignment = None
threshes = []

plt.rcParams["image.cmap"] = "Dark2"
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=plt.cm.Dark2.colors)

# Grid geometry. The names are n_rows / n_cols on purpose: assigning to `cols`
# here would overwrite the metric-column list that earlier cells slice.
width = 15
n_cols = 6
n_rows = max(1, int(np.ceil(len(fs) / n_cols)))

# Create the whole grid once. Creating a single Axes with plt.subplots() and
# then adding panels with plt.subplot() leaves that first full-figure Axes
# underneath the panels on newer matplotlib versions.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(width, (width / n_cols) * n_rows),
                         dpi=300, squeeze=False)
panels = axes.ravel()

for idx, file in enumerate(fs):

    pth = '../results_PU/final_vaeda_result/VAEDA_new_calls/' + file[0] + '_scores_on_sim.npy'
    scores_on_sim = np.load(pth)

    res = pd.read_csv(path + '_'.join(file))
    scores = np.array(res['doublet_scores'])

    # Candidate thresholds span the joint score range in steps of 0.001.
    maximum = np.max([np.max(scores), np.max(scores_on_sim)])
    minimum = np.min([np.min(scores), np.min(scores_on_sim)])
    thresholds = np.arange(minimum, maximum, 0.001)

    # Expected doublet count n * (n / 10**5) and its binomial-style spread.
    n = len(scores)
    dbr = n/10**5
    dbl_expected = n*dbr
    dbr_sd = np.sqrt(n*dbr*(1-dbr))

    FNR = []
    FPR = []
    ll_doub = []

    # The log-likelihood term is scaled by its value at the highest threshold.
    d_t = np.sum(scores >= thresholds[-1])
    norm_factor = log_norm(d_t, dbl_expected, dbr_sd)

    for thresh in thresholds:
        d_t = np.sum(scores >= thresh)

        FNR.append(np.sum(scores_on_sim < thresh) / len(scores_on_sim))
        FPR.append(d_t / len(scores))
        ll_doub.append(log_norm(d_t, dbl_expected, dbr_sd) / norm_factor)

    ll_squared = np.array(ll_doub)**2
    cost = np.array(FNR) + np.array(FPR) + ll_squared

    # Threshold with the lowest total cost.
    t = thresholds[np.argmin(cost)]
    threshes.append(t)

    ax = panels[idx]

    ax.plot(thresholds, FNR, linewidth=2, label='FNR')
    ax.plot(thresholds, FPR, linewidth=2, label='FPR')
    ax.plot(thresholds, ll_squared, linewidth=2, label='LL\u00b2')
    ax.plot(thresholds, cost, linewidth=2, label='cost')

    # Dotted vertical line at the selected threshold.
    ax.plot([t, t], [0, 1], c='black', alpha=0.7, linestyle='dotted', label='threshold')

    ax.grid(False)

    # Aspect = threshold range / y-range (0..1), which renders the panel as a square.
    ax.set_ylim(0, 1)
    x0 = thresholds.min()
    x1 = thresholds.max()
    y0, y1 = (0, 1)
    ax.set_aspect(abs(x1 - x0) / abs(y1 - y0))

    # Label only the outer panels: the lowest panel of every column gets the
    # x label, the first panel of every row gets the y label.
    if idx + n_cols >= len(fs):
        ax.set_xlabel('thresholds')
    if idx % n_cols == 0:
        ax.set_ylabel('cost')

    ax.set_title(file[0])

# Drop the grid cells that did not receive a data set.
for unused in panels[len(fs):]:
    fig.delaxes(unused)

fig.tight_layout()
fig.savefig(save_path + 'ALL_cost.png', dpi=300)

plt.show()
plt.close(fig)