In [None]:
import numpy as np
from numpy import *
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from os import listdir
from os.path import isfile, join, isdir
import pathlib as pl

from plot_results import get_dbl_metrics
import multiprocessing


In [None]:
def mk_heatmap(hm, title = 'Heatmap'):
    """Draw an annotated heatmap whose colours encode within-row ranks.

    Every cell is coloured by the rank of its value inside its own row
    (tied values all receive the highest rank of their tie group), while
    the number printed in the cell is the raw value taken from ``hm``.
    A new 6x5 inch, 300 dpi figure is created on each call and left as
    the current matplotlib figure; nothing is returned.

    Parameters
    ----------
    hm : pandas.DataFrame
        Values to display.
    title : str, optional
        Title placed above the heatmap. Defaults to ``'Heatmap'``.
    """
    plt.figure(figsize=(6,5),  dpi=300)

    # Rank along axis=1 (within each row) so colours compare columns of the
    # same row only; the raw values are still shown through `annot`.
    hm_norm = hm.rank(axis=1, method='max')

    ax = sns.heatmap(hm_norm, square=False, cbar=True, annot=hm, cmap="YlOrRd", fmt='g')
    ax.set_yticklabels(ax.get_yticklabels(), rotation=45)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.set_title(title, fontsize=18)

    plt.tight_layout()


In [None]:
# Output directory for the figures and CSV tables written by the cells below.
# NOTE(review): none of the visible cells create this directory, so it presumably has to exist already.
save_path = '../results_manuscript/downsample_95_dwnsamp_cells/'


In [None]:
# Directory holding the per-run result files.
path = '../results_downsample_cells/'
# Split the name of every regular file (sub-directories are skipped) on '_'.
# Later cells read the tokens as [data_name, 'frac<value>', 'rep<n>', method].
files = np.array([name.split('_') for name in listdir(path) if isfile(join(path, name))])
files

In [None]:
# Distinct values of each filename token: dataset (column 0), down-sampling
# fraction (column 1, leading 'frac' characters stripped) and method (column 3).
data_names = np.sort(np.unique(files[:, 0]))
fracs = np.unique(np.char.lstrip(files[:, 1], 'frac')).astype(float)
methods = np.unique(files[:, 3])

In [None]:
# Show the method names parsed from the filenames (sorted, as returned by np.unique).
methods

In [None]:
# Show the sorted dataset names parsed from the filenames.
data_names

In [None]:
# Show the distinct down-sampling fractions parsed from the filenames.
fracs

In [None]:
def get_results(file):
    """Score a single result file against its dataset's annotations.

    Parameters
    ----------
    file : sequence of str
        One result filename split on '_', laid out as
        ``[data_name, 'frac<value>', 'rep<n>', method]``.

    Returns
    -------
    list of pandas.DataFrame
        ``[df1, df2]``: two one-row frames with the columns ``method``,
        ``data_name``, ``frac``, ``rep`` and ``score``. ``df1`` carries the
        second value returned by ``get_dbl_metrics`` (``score_pr``) and
        ``df2`` the first (``score_roc``). Both scores are NaN when the
        first column of the result file is named ``'x'``.

    Notes
    -----
    Reads the result CSV from the module-level ``path``, the annotation CSV
    from ``../data/mtx_files/`` and an index array from
    ``../results_PU/downsample_cells/data/``.
    """
    res = pd.read_csv(path + '_'.join(file))

    #- READ IN BARCODE ANNOTATIONS
    ano = pd.read_csv('../data/mtx_files/' + file[0] + '_anno.csv')
    labels = ano.x
    true = pd.factorize(labels)[0]
    if labels[0] == 'doublet':
        # factorize numbers labels by first appearance, so 'doublet' received
        # code 0 here. Swap codes 0 and 1 so that 'doublet' becomes 1; any
        # other code is left shifted by 3.
        shifted = true + 3
        true = np.where(shifted == 3, 1, np.where(shifted == 4, 0, shifted))

    # Keep only the annotations selected by the saved index array
    # (presumably the cells retained by the down-sampling - TODO confirm).
    ind = np.load('../results_PU/downsample_cells/data/' + '_'.join(file[:3]) + '_ind.npy')
    true = true[ind]

    if res.columns[0] == 'x':
        # NOTE(review): a leading 'x' column looks like a placeholder for a
        # run without scores - confirm against the code that writes the files.
        score_roc = score_pr = np.nan
    else:
        score_roc, score_pr, _ = get_dbl_metrics(true, res['doublet_scores'])
        plt.close()

    frac = float(np.char.lstrip(file[1], 'frac'))
    rep = int(np.char.lstrip(file[2], 'rep'))

    # Both rows share the same identifying columns and differ only in the score.
    meta = {'method': [file[3]], 'data_name': [file[0]], 'frac': [frac], 'rep': [rep]}
    df1 = pd.DataFrame({**meta, 'score': [score_pr]})
    df2 = pd.DataFrame({**meta, 'score': [score_roc]})

    return [df1, df2]

In [None]:
# Score every result file in parallel. map() blocks until all results are
# back, and the context manager then shuts the worker processes down so they
# are not left running for the rest of the kernel session.
with multiprocessing.Pool() as pool_obj:
    res = pool_obj.map(get_results, files)


In [None]:
# Each element of `res` is a [PR frame, ROC frame] pair; separate the two
# sides and stack each into one long table. reset_index() keeps the old
# per-frame index as an extra 'index' column.
pr_res = [pair[0] for pair in res]
roc_res = [pair[1] for pair in res]
df_pr = pd.concat(pr_res).reset_index()
df_roc = pd.concat(roc_res).reset_index()

In [None]:
# Rank the methods by their overall mean PR score, best first. From here on
# `methods` holds this ranking instead of the alphabetical list.
df_means = (
    df_pr.groupby(['method'])['score']
    .mean()
    .reset_index()
    .sort_values(['score'], ascending=False)
)
methods = df_means['method'].to_numpy()
methods

In [None]:
# Variance of the score inside every (method, dataset) group ...
var = df_pr.groupby(['method', 'data_name'])['score'].agg('var').reset_index()

# ... summed per method and divided by n_datasets**2, the same scaling the
# error-bar cells apply. The result of divide() has to be assigned back:
# it returns a new Series and leaves df_var unchanged otherwise.
df_var = var.groupby(['method'])['score'].agg('sum').reset_index()
df_var['score'] = df_var['score'].divide(len(data_names)**2)
df_var.index = df_var.method
df_var

In [None]:
# NOTE(review): not referenced by any visible cell; presumably a maximum figure width in inches - confirm or remove.
MAX_WIDTH = 6.726

In [None]:
##### Mean PR score per method, error bars of 3 standard deviations
df = df_pr

sns.set(rc={"figure.figsize": (3.4*0.9, 4.5*0.9)})
sns.set_style("whitegrid")

# Average inside each (method, dataset) pair first and then across datasets,
# so every dataset carries the same weight. Methods are ordered best first.
df_means = df.groupby(['method', 'data_name'])['score'].mean().reset_index()
df_means = (
    df_means.groupby(['method'])['score']
    .mean()
    .reset_index()
    .sort_values(['score'], ascending=False)
)
pr_methods = df_means['method'].to_numpy()

# Per-method sum of the (method, dataset) variances, rows in the same order
# as df_means so that the error bars line up with the means.
var = df.groupby(['method', 'data_name'])['score'].var().reset_index()
df_var = var.groupby(['method'])['score'].sum().reset_index()
df_var = df_var.set_index('method', drop=False).loc[pr_methods, :]

# sqrt(sum of variances / n_datasets**2), drawn at three times its size.
sd_of_mean = np.sqrt(df_var.score.divide(len(data_names)**2))
plt.errorbar(df_means.method, df_means.score, 3*sd_of_mean,
             fmt='none', ecolor='black', capsize=4, linewidth=0.5)

sns.swarmplot(data=df_means, x='method', y='score', palette="Set2", size=10, order=pr_methods)

plt.xticks(rotation=45, horizontalalignment='right')

plt.title('Downsample Cells to 95%')
plt.ylabel('Average AUPRC')
plt.xlabel('')
plt.tight_layout()
plt.savefig(save_path + 'PR_violin_3SDs.png', dpi=300)
plt.show()
plt.close()


In [None]:
# Display the combined PR score table (one row per result file).
df_pr

In [None]:
##### Numbers behind the PR error-bar figure: per-method mean and one standard deviation
df = df_pr

sns.set(rc={"figure.figsize": (3.4*0.9, 4.5*0.9)})
sns.set_style("whitegrid")

# Mean of the per-dataset means for every method, best first.
df_means = df.groupby(['method', 'data_name'])['score'].mean().reset_index()
df_means = (
    df_means.groupby(['method'])['score']
    .mean()
    .reset_index()
    .sort_values(['score'], ascending=False)
)
pr_methods = df_means['method'].to_numpy()

# Per-method sum of the (method, dataset) variances, ordered like df_means.
var = df.groupby(['method', 'data_name'])['score'].var().reset_index()
df_var = var.groupby(['method'])['score'].sum().reset_index()
df_var = df_var.set_index('method', drop=False).loc[pr_methods, :]

# sqrt(sum of variances / n_datasets**2); printed and also shown as the cell output.
sd_of_mean = np.sqrt(df_var.score.divide(len(data_names)**2))

print(df_means)
print(sd_of_mean)

sd_of_mean



In [None]:
##### Mean ROC score per method, error bars of 3 standard deviations
df = df_roc

sns.set(rc={"figure.figsize": (9, 4)})
sns.set_style("whitegrid")

# Average inside each (method, dataset) pair first and then across datasets,
# so every dataset carries the same weight. Methods are ordered best first.
df_means = df.groupby(['method', 'data_name'])['score'].mean().reset_index()
df_means = (
    df_means.groupby(['method'])['score']
    .mean()
    .reset_index()
    .sort_values(['score'], ascending=False)
)
roc_methods = df_means['method'].to_numpy()

# Per-method sum of the (method, dataset) variances, ordered like df_means.
var = df.groupby(['method', 'data_name'])['score'].var().reset_index()
df_var = var.groupby(['method'])['score'].sum().reset_index()
df_var = df_var.set_index('method', drop=False).loc[roc_methods, :]

# sqrt(sum of variances / n_datasets**2), drawn at three times its size.
sd_of_mean = np.sqrt(df_var.score.divide(len(data_names)**2))
plt.errorbar(df_means.method, df_means.score, 3*sd_of_mean,
             fmt='none', ecolor='black', capsize=5)

sns.swarmplot(data=df_means, x='method', y='score', palette="Set2", size=10, order=roc_methods)

plt.xticks(rotation=45)

plt.title('Downsample Cells')
plt.tight_layout()
plt.savefig(save_path + 'ROC_violin_3SDs.png', dpi=300)
plt.show()
plt.close()

In [None]:
df = df_pr

sns.set(rc={"figure.figsize":(15, 4*0.9)})
sns.set_style("whitegrid")

# Order the methods by their overall mean score, best first.
df_means = df.groupby(['method'])['score'].agg('mean').reset_index().sort_values(['score'], ascending=False)
pr_methods = np.array(df_means['method'])

# Variance over all scores of a method, rows aligned with df_means.
df_var = df.groupby(['method'])['score'].agg('var').reset_index()
df_var.index = df_var.method
df_var = df_var.loc[pr_methods,:]

# Error bars: one standard deviation around each method's mean.
plt.errorbar(df_means.method, df_means.score, np.sqrt(df_var.score),
            fmt='none', ecolor='black', capsize=5)

ax = sns.violinplot(x='method', y='score', data=df, inner=None, linewidth=1, edgecolor='black', color='white', order=pr_methods)

# One colour per dataset present in the frame, instead of a fixed count that
# only fits when there happen to be exactly 16 datasets.
n_datasets = df['data_name'].nunique()
ax = sns.swarmplot(x='method', y='score', data=df, hue='data_name', palette=sns.color_palette("hls", n_datasets), dodge=False, size=3.5, order=pr_methods)

plt.title('Down-sample Cells to 95%', fontsize=14)
plt.legend(bbox_to_anchor=(1,1), loc="upper left", fontsize=7.9)
plt.ylabel('AUPRC', fontsize=14)
plt.xlabel('', fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14)


plt.tight_layout()
plt.savefig(save_path + 'PR_violin_combined.png', dpi=300)
plt.show()
plt.close()

In [None]:
df = df_roc

sns.set(rc={"figure.figsize":(24, 5)})
sns.set_style("whitegrid")

# Order the methods by their overall mean score, best first.
df_means = df.groupby(['method'])['score'].agg('mean').reset_index().sort_values(['score'], ascending=False)
roc_methods = np.array(df_means['method'])

# Variance over all scores of a method, rows aligned with df_means.
df_var = df.groupby(['method'])['score'].agg('var').reset_index()
df_var.index = df_var.method
df_var = df_var.loc[roc_methods,:]

# Error bars: one standard deviation around each method's mean.
plt.errorbar(df_means.method, df_means.score, np.sqrt(df_var.score),
          fmt='none', ecolor='black', capsize=5)

ax = sns.violinplot(x='method', y='score', data=df, inner=None, linewidth=1, edgecolor='black', color='white', order=roc_methods)

# One colour per dataset present in the frame, instead of a fixed count that
# only fits when there happen to be exactly 16 datasets.
n_datasets = df['data_name'].nunique()
ax = sns.swarmplot(x='method', y='score', data=df, hue='data_name', palette=sns.color_palette("hls", n_datasets), dodge=False, size=4, order=roc_methods)


plt.title('Downsample Cells')
plt.legend(bbox_to_anchor=(1,1), loc="upper left")

plt.tight_layout()
plt.savefig(save_path + 'ROC_violin_combined.png', dpi=300)
plt.show()
plt.close()

In [None]:
df = df_pr

sns.set(rc={"figure.figsize": (10, 4)})
sns.set_style("whitegrid")

# Methods ordered by overall mean PR score, best first.
df_means = (
    df.groupby(['method'])['score']
    .mean()
    .reset_index()
    .sort_values(['score'], ascending=False)
)
pr_methods = df_means['method'].to_numpy()

# Variance over all scores of a method, rows aligned with df_means.
df_var = df.groupby(['method'])['score'].var().reset_index()
df_var = df_var.set_index('method', drop=False).loc[pr_methods, :]

# Error bars: one standard deviation around each method's mean.
method_sd = np.sqrt(df_var.score)
plt.errorbar(df_means.method, df_means.score, method_sd,
             fmt='none', ecolor='black', capsize=5)

# Translucent violins in the background ...
ax = sns.violinplot(data=df, x='method', y='score', palette="Set2", linewidth=0, dodge=True, order=pr_methods)
plt.setp(ax.collections, alpha=.3)

# ... individual scores on top ...
ax = sns.swarmplot(data=df, x='method', y='score', palette="Set2", dodge=True, size=3, order=pr_methods)

# ... and each method's mean as a black-edged marker.
sns.swarmplot(data=df_means, x='method', y='score', marker='o', palette="Set2", size=3, linewidth=1, edgecolor='black', order=pr_methods)

plt.xticks(rotation=45)

plt.tight_layout()
plt.savefig(save_path + 'PR_violin_combined2.png', dpi=300)
plt.show()
plt.close()

In [None]:
df = df_roc

sns.set(rc={"figure.figsize": (10, 4)})
sns.set_style("whitegrid")

# Methods ordered by overall mean ROC score, best first.
df_means = (
    df.groupby(['method'])['score']
    .mean()
    .reset_index()
    .sort_values(['score'], ascending=False)
)
roc_methods = df_means['method'].to_numpy()

# Variance over all scores of a method, rows aligned with df_means.
df_var = df.groupby(['method'])['score'].var().reset_index()
df_var = df_var.set_index('method', drop=False).loc[roc_methods, :]

# Error bars: one standard deviation around each method's mean.
method_sd = np.sqrt(df_var.score)
plt.errorbar(df_means.method, df_means.score, method_sd,
             fmt='none', ecolor='black', capsize=5)

# Translucent violins in the background ...
ax = sns.violinplot(data=df, x='method', y='score', palette="Set2", linewidth=0, dodge=True, order=roc_methods)
plt.setp(ax.collections, alpha=.3)

# ... individual scores on top ...
ax = sns.swarmplot(data=df, x='method', y='score', palette="Set2", dodge=True, size=3, order=roc_methods)

# ... and each method's mean as a black-edged marker.
sns.swarmplot(data=df_means, x='method', y='score', marker='o', palette="Set2", size=3, linewidth=1, edgecolor='black', order=roc_methods)

plt.xticks(rotation=45)

plt.tight_layout()
plt.savefig(save_path + 'ROC_violin_combined2.png', dpi=300)
plt.show()
plt.close()

In [None]:
df = df_pr

sns.set(rc={"figure.figsize":(18, 4)})
sns.set_style("whitegrid")

# One panel per dataset, four panels per row, each with its own y range.
g = sns.FacetGrid(df, col='data_name', col_wrap=4, sharey=False, aspect = 1.5, col_order=data_names)

# map_dataframe passes each facet's own subset of rows as `data=`, which is
# what keyword-style seaborn plotters expect. g.map(..., data=df) would hand
# every panel the full frame, and the positional column it adds collides with
# the explicit keywords on recent seaborn releases.
g.map_dataframe(sns.violinplot,
       x='frac', y='score', hue='method', palette="Set2", linewidth=0, dodge=True, hue_order=methods)
# Fade the violins so the individual points drawn next stay readable.
for ax in g.axes.flat:
    plt.setp(ax.collections, alpha=.3)

g.map_dataframe(sns.swarmplot,
       x='frac', y='score', hue='method', palette="Set2", dodge=True, size=3, hue_order=methods)

plt.legend(bbox_to_anchor=(1,1), loc="upper left")

plt.tight_layout()
plt.savefig(save_path + 'PR_violin.png', dpi=300)
plt.show()
plt.close()

In [None]:
df = df_roc

sns.set(rc={"figure.figsize":(18, 4)})
sns.set_style("whitegrid")

# One panel per dataset, four panels per row, each with its own y range.
g = sns.FacetGrid(df, col='data_name', col_wrap=4, sharey=False, aspect = 1.5)

# map_dataframe passes each facet's own subset of rows as `data=`, which is
# what keyword-style seaborn plotters expect. g.map(..., data=df) would hand
# every panel the full frame, and the positional column it adds collides with
# the explicit keywords on recent seaborn releases.
g.map_dataframe(sns.violinplot,
       x='method', y='score', hue='method', palette="Set2", linewidth=0, dodge=True, hue_order=methods)
# Fade the violins so the individual points drawn next stay readable.
for ax in g.axes.flat:
    plt.setp(ax.collections, alpha=.3)

g.map_dataframe(sns.swarmplot,
       x='method', y='score', hue='method', palette="Set2", dodge=True, size=3, hue_order=methods)

plt.legend(bbox_to_anchor=(1,1), loc="upper left")

plt.tight_layout()
plt.savefig(save_path + 'ROC_violin.png', dpi=300)
plt.show()
plt.close()

In [None]:
from scipy.stats import ranksums, wilcoxon


In [None]:
'''

Symbol 	Meaning
ns 	     P > 0.05
* 	     P ≤ 0.05
** 	     P ≤ 0.01
*** 	 P ≤ 0.001
**** 	 P ≤ 0.0001 (see note)

'''

In [None]:
# Key every row by "<data_name>_<rep>" so that scores of different methods
# on the same dataset and replicate can be matched up by index label.
# The replicate number is turned into text first so it can be concatenated.
df_pr['rep'] = df_pr['rep'].astype(str)
df_pr.index = df_pr['data_name'] + '_' + df_pr['rep']
index_keep = np.unique(df_pr.index)
df_pr

In [None]:
# Compare all methods; to restrict the comparison assign a subset instead,
# e.g. meths = ['scDblFinder', 'vaeda', 'DoubletFinder'].
meths = methods

# Pairwise result table. It starts as zeros (the diagonal keeps them) and is
# cast to object so method names and 'tie' can be stored without relying on
# an implicit float -> object upcast, which newer pandas versions deprecate.
torny = pd.DataFrame(np.zeros((len(meths), len(meths))), index=meths, columns=meths).astype(object)
df = df_pr
for i in range(len(meths)):
    m1 = meths[i]
    for j in range(i+1, len(meths)):
        m2 = meths[j]
        d1 = df.loc[df.method==m1, 'score']
        d2 = df.loc[df.method==m2, 'score']

        # Pair the two methods on the index labels they share, in d1's order.
        index_keep = d1.index[d1.index.isin(d2.index)]
        d1 = d1.loc[index_keep]
        d2 = d2.loc[index_keep]

        # Paired Wilcoxon signed-rank test; w[1] is the p-value. The sign of
        # the summed differences decides which method is ahead.
        w = wilcoxon(d1, d2)
        diff = (d1-d2).sum()

        significant = w[1] < 0.05
        if significant and diff < 0:
            outcome = m2      # loss for m1
        elif significant and diff > 0:
            outcome = m1      # win for m1
        else:
            outcome = 'tie'
        torny.loc[m1, m2] = outcome
        torny.loc[m2, m1] = outcome

print(torny)
# NOTE(review): the rank-sum cell that follows writes to the same file name
# and therefore replaces this table when the notebook is run top to bottom.
torny.to_csv(save_path + 'lookup_table.csv')

In [None]:
# Leftover inspection: index labels shared by d1 and d2, which are whatever the loop in the preceding cell left behind for the last pair compared.
np.intersect1d(d1.index, d2.index)

In [None]:
# Pairwise result table. It starts as zeros (the diagonal keeps them) and is
# cast to object so method names and 'tie' can be stored without relying on
# an implicit float -> object upcast, which newer pandas versions deprecate.
torny = pd.DataFrame(np.zeros((len(methods), len(methods))), index=methods, columns=methods).astype(object)
df = df_pr
for i in range(len(methods)):
    m1 = methods[i]
    for j in range(i+1, len(methods)):
        m2 = methods[j]
        # Unpaired Wilcoxon rank-sum test over all scores of the two methods:
        # w[0] is the statistic, w[1] the p-value.
        w = ranksums(df[df.method==m1].score, df[df.method==m2].score)

        significant = w[1] < 0.05
        if significant and w[0] < 0:
            outcome = m2      # loss for m1
        elif significant and w[0] > 0:
            outcome = m1      # win for m1
        else:
            outcome = 'tie'
        torny.loc[m1, m2] = outcome
        torny.loc[m2, m1] = outcome

print(torny)
torny.to_csv(save_path + 'lookup_table.csv')

In [None]:
# Win / tie / loss tally per method, accumulated over every dataset and
# every pair of methods.
torny = pd.DataFrame(np.zeros((len(methods), 3)), index=methods, columns=['wins', 'ties', 'losses'])
df0 = df_pr
for data_name in data_names:
    print('*****', data_name, '*****')
    df = df0[df0.data_name == data_name]
    for i in range(len(methods)):
        m1 = methods[i]
        for j in range(i+1, len(methods)):
            m2 = methods[j]
            # Unpaired Wilcoxon rank-sum test on this dataset's scores:
            # w[0] is the statistic, w[1] the p-value.
            w = ranksums(df[df.method==m1].score, df[df.method==m2].score)

            significant = w[1] < 0.05
            if significant and w[0] < 0:
                # loss for m1
                torny.loc[m1, 'losses'] += 1
                torny.loc[m2, 'wins'] += 1
            elif significant and w[0] > 0:
                # win for m1
                torny.loc[m1, 'wins'] += 1
                torny.loc[m2, 'losses'] += 1
            else:
                # tie (also reached when the p-value is NaN)
                torny.loc[m1, 'ties'] += 1
                torny.loc[m2, 'ties'] += 1

print(torny)
torny.to_csv(save_path + 'tornament.csv')
                    

In [None]:
# Display the win / tie / loss tally built in the preceding cell.
torny

In [None]:
# Same win / tie / loss tally as for all methods, restricted to three of them.
meths = ['vaeda', 'scDblFinder', 'DoubletFinder']
torny = pd.DataFrame(np.zeros((len(meths), 3)), index=meths, columns=['wins', 'ties', 'losses'])
df0 = df_pr
for data_name in data_names:
    df = df0[df0.data_name == data_name]
    for i in range(len(meths)):
        m1 = meths[i]
        for j in range(i+1, len(meths)):
            m2 = meths[j]
            # Unpaired Wilcoxon rank-sum test on this dataset's scores:
            # w[0] is the statistic, w[1] the p-value.
            w = ranksums(df[df.method==m1].score, df[df.method==m2].score)

            significant = w[1] < 0.05
            if significant and w[0] < 0:
                # loss for m1
                torny.loc[m1, 'losses'] += 1
                torny.loc[m2, 'wins'] += 1
            elif significant and w[0] > 0:
                # win for m1
                torny.loc[m1, 'wins'] += 1
                torny.loc[m2, 'losses'] += 1
            else:
                # tie (also reached when the p-value is NaN)
                torny.loc[m1, 'ties'] += 1
                torny.loc[m2, 'ties'] += 1

print(torny)


In [None]:
# Scratch check of how a 'methodA-methodB' label splits into its two method names.
'vaeda-scDblFinder'.split('-')

In [None]:
meths = ['vaeda', 'scDblFinder', 'DoubletFinder']
# Each label names the two methods being compared, separated by '-'.
combos = ['vaeda-scDblFinder', 'vaeda-DoubletFinder', 'scDblFinder-DoubletFinder']

# Winner per (method pair, dataset). The frame is cast to object so that
# method names and 'tie' can be stored without relying on an implicit
# float -> object upcast, which newer pandas versions deprecate.
torny = pd.DataFrame(np.zeros((len(combos), len(data_names))), index=combos, columns=data_names).astype(object)
df0 = df_pr
for data_name in data_names:
    df = df0[df0.data_name == data_name]
    for combo in combos:
        m1, m2 = combo.split('-')

        # Unpaired Wilcoxon rank-sum test on this dataset's scores:
        # w[0] is the statistic, w[1] the p-value.
        w = ranksums(df[df.method==m1].score, df[df.method==m2].score)

        significant = w[1] < 0.05
        if significant and w[0] < 0:
            torny.loc[combo, data_name] = m2      # loss for m1
        elif significant and w[0] > 0:
            torny.loc[combo, data_name] = m1      # win for m1
        else:
            torny.loc[combo, data_name] = 'tie'

print(torny)
torny.to_csv(save_path + 'torny_combos_vs_dataset.csv')

In [None]:

# For every dataset print a method-by-method table of significance stars from
# an unpaired Wilcoxon rank-sum test ('ns' when p > 0.05 or p is NaN).
df0 = df_pr
for data_name in data_names:
    print('*****', data_name, '*****')
    df = df0[df0.data_name == data_name]
    tmp = np.full((len(methods), len(methods)), 'ns')
    sig = pd.DataFrame(tmp, index=methods, columns=methods)
    for method in methods:
        for m in methods:
            w = ranksums(df[df.method==method].score, df[df.method==m].score)
            p_value = w[1]
            # Check the strictest threshold first; the first one met sets the label.
            if p_value <= 0.0001:
                sig.loc[method, m] = '****'
            elif p_value <= 0.001:
                sig.loc[method, m] = '***'
            elif p_value <= 0.01:
                sig.loc[method, m] = '**'
            elif p_value <= 0.05:
                sig.loc[method, m] = '*'
    print(sig)

