# cfDNA spikein series on single chrom

In [None]:
# Imports

%load_ext autoreload
%autoreload 2

import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
import warnings
from sklearn.metrics import precision_recall_curve, f1_score, average_precision_score
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix

# Move one directory up unless the notebook already runs from a folder whose
# path ends with 'cfdna_snv_benchmark', then report where we ended up.
if not os.getcwd().endswith('cfdna_snv_benchmark'):
    os.chdir('../')
print(f'Current working directory: {os.getcwd()}')

from utils.config import Config
from utils.viz import *
from utils.table import *
from utils.metrics import *
from utils.calltable import *
from utils.venn import venn6, get_labels

In [None]:
# Config and display parameters

# Load the visualisation config, apply the display settings and list the callers in use.
config = Config("config/", "config_viz.yaml")
set_display_params(config)
print(config.methods)

In [None]:
# Chromosome, mutation type and spike-in series selection

chrom = '3'
muttype = 'snv'
# Other spike-in ids that were used with this notebook:
#   'CRC-COSMIC-5p_CRC-123_121115-CW-T'
#   'CRC-COSMIC-5p_CRC-986_300316-CW-T'
spikeinid = 'CRC-COSMIC-5p_CRC-1014_090516-CW-T'
# Everything before the first '_' is the plasma sample name, the remainder is
# the healthy sample name.
plasmasample, healthysample = spikeinid.partition('_')[::2]
print(plasmasample)
print(healthysample)
filterparam = 'all'

In [None]:
# Run switches:
#   fixedvar - selects the dilution series and is forwarded to the curve helpers
#   reload   - when True the call tables are rebuilt even if their CSV exists
#   save     - forwarded as `save=` to the curve helpers
fixedvar = 'coverage'
reload, save = True, False

# Load call table

In [None]:
# Build the per-sample call tables when they are missing (or when `reload` is
# set), then load the SNV / indel / SNP tables of every spike-in sample.

calltables = {'sampleid': [], 'vaf': [], 'snv': [], 'indel': [], 'snp': []}
spikeinfolder = os.path.join(*config.spikeinfolder, 'spikeins_chr'+chrom, 'spikeins_chr'+chrom+'_'+spikeinid)
# Only entries whose name ends with 'x' or 'T' are treated as spike-in samples.
for spikeinpath in [entry for entry in os.listdir(spikeinfolder) if entry.endswith(('x', 'T'))]:
    print(spikeinpath)
    callsdir = os.path.join(spikeinfolder, spikeinpath, 'calls')
    snvcsv = os.path.join(callsdir, spikeinpath+'_snv_calls_'+filterparam+'.csv')
    indelcsv = os.path.join(callsdir, spikeinpath+'_indel_calls_'+filterparam+'.csv')
    snpcsv = os.path.join(callsdir, spikeinpath+'_snp_calls_'+filterparam+'.csv')
    # FIX: the existence check used to look for '...<filterparam>csv' (missing
    # '.'), a path that never exists, so the tables were rebuilt on every run
    # even with reload=False.
    if reload or not os.path.exists(snvcsv):
        calltable_snv, calltable_indel, calltable_snp = get_calltable(os.path.join(spikeinfolder, spikeinpath), config.methods, save=True, filter=filterparam)
    calltables['sampleid'].append(spikeinpath)
    # The VAF is read from the folder name: the text between 'vaf' and the next '_'.
    calltables['vaf'].append(float(spikeinpath.split('vaf')[1].split('_')[0]))
    # Always reload from disk so that cached and freshly built tables look alike.
    calltable_snv = pd.read_csv(snvcsv, index_col=0)
    calltable_indel = pd.read_csv(indelcsv, index_col=0)
    calltable_snp = pd.read_csv(snpcsv, index_col=0)
    calltables['snv'].append(calltable_snv)
    calltables['indel'].append(calltable_indel)
    calltables['snp'].append(calltable_snp)
calltables.keys()

In [None]:
# For each mutation type, merge the per-sample call tables into one wide table
# whose caller columns are prefixed with the sample VAF, and cache it as CSV.
# The table for `muttype` is then loaded back from disk.
callsfolder = os.path.join(spikeinfolder, 'calls')
keycols = ['chrom', 'pos', 'ref', 'alt', 'type']
for mt in ['snv', 'indel']:
    if not os.path.exists(callsfolder):
        os.mkdir(callsfolder)
    seriespath = os.path.join(callsfolder, spikeinid+'_'+mt+'_calls_'+filterparam+'.csv')
    if os.path.exists(seriespath) and not reload:
        continue
    for ci, table in enumerate(calltables[mt]):
        vaf = calltables['vaf'][ci]
        # call flag + score for every caller first, then the coverage / VAF triplets
        callcols = [name for m in config.methods
                    for name in (f'{vaf:.2f}_{m}', f'{vaf:.2f}_{m}_score')]
        covcols = [name for m in config.methods
                   for name in (f'{vaf:.2f}_{m}_altcov', f'{vaf:.2f}_{m}_totcov', f'{vaf:.2f}_{m}_vaf')]
        table.columns = keycols + callcols + covcols
    # ensure no duplicated index (prints the number of rows sharing an index label)
    first = calltables[mt][0]
    dupmask = first.index.duplicated(keep=False)
    print(first.loc[first.index[dupmask]].shape[0])
    # get call series: align all samples on the mutation key columns
    keyed = [table.set_index(keycols) for table in calltables[mt]]
    calltablesseries = pd.concat(keyed, axis=1).reset_index()
    calltablesseries['chrom_pos_ref_alt'] = (
        calltablesseries['chrom'].astype('str') + '_'
        + calltablesseries['pos'].astype('str') + '_'
        + calltablesseries['ref'].astype('str') + '_'
        + calltablesseries['alt'].astype('str')
    )
    calltablesseries = calltablesseries.set_index('chrom_pos_ref_alt')
    print(calltablesseries.shape)
    calltablesseries.to_csv(seriespath)

calltablesseries = pd.read_csv(os.path.join(spikeinfolder, 'calls', spikeinid+'_'+muttype+'_calls_'+filterparam+'.csv'), index_col=0)
calltablesseries.head()

In [None]:
# Distinct column-name prefixes (text before the first '_'); columns without '_' count as ''.
np.unique([col.partition('_')[0] if '_' in col else '' for col in calltablesseries.columns])

In [None]:
# Ground-truth rows, restricted to the columns whose name contains exactly one
# '_', without the rows that are NaN everywhere.
single_underscore_cols = [col for col in calltablesseries.columns if col.count('_') == 1]
calltablesseries[calltablesseries['truth'] == True][single_underscore_cols].dropna(how='all', axis=0)

In [None]:
# Ground-truth rows, restricted to every column that mentions freebayes.
freebayes_cols = [col for col in calltablesseries.columns if 'freebayes' in col]
calltablesseries[calltablesseries['truth'] == True][freebayes_cols]

In [None]:
# Sanity check: the VAF list, the number of columns and the number of distinct column names.
colnames = list(calltablesseries.columns)
print(calltables['vaf'])
print(len(colnames))
print(len(np.unique(colnames)))

# Plot curves

In [None]:
# Truth in COSMIC database
# Every COSMIC entry whose start position matches a called position is turned
# into a 'chrom_pos_ref_alt' key (ref taken from the first matching call, alt
# from COSMIC) and flagged in the boolean 'truth' column.
gt = pd.read_csv(os.path.join(*config.extdatafolder, 'cosmic_mutations_atleast5patients', 'CRC_chr'+str(chrom)+'_'+muttype.upper()+'_tf1.bed'), sep='\t', header=None)
if muttype == 'snv':
    gt.columns = ['chrom', 'startpos', 'endpos', 'vaf', 'alt']
else: # indel
    gt.columns = ['chrom', 'startpos', 'endpos', 'vaf', 'type', 'alt']
truthpos = []
posvalues = calltablesseries['pos'].values
calledpos = set(posvalues)  # O(1) membership test instead of scanning the array per entry
for gtchrom, gtpos, gtalt in zip(gt['chrom'], gt['startpos'], gt['alt']):
    if gtpos not in calledpos:
        continue
    refs = calltablesseries.loc[calltablesseries['pos'] == gtpos, 'ref']
    if refs.shape[0] > 1:
        # several calls share this position: the first one supplies the reference allele
        print('ISSUE: cannot retrieve reference easily')
        print(refs.shape[0])
    truthpos.append(str(gtchrom) + '_' + str(gtpos) + '_' + refs.values[0] + '_' + gtalt)
# Number of COSMIC entries read (matched or not).
c = len(gt)
print(c)
# Add truth keys that were never called as all-NaN rows; Index.unique() keeps the
# existing row order (the previous list(set(...)) shuffled rows between runs).
calltablesseries = calltablesseries.reindex(pd.Index(list(calltablesseries.index) + truthpos).unique())
# FIX: `calltablesseries['truth'].loc[truthpos] = True` was a chained assignment,
# which pandas does not guarantee to write back to the frame (it is a no-op
# under copy-on-write). Build the column in a single step instead.
calltablesseries['truth'] = calltablesseries.index.isin(truthpos)
calltablesseries['truth'].value_counts()

In [None]:
# AUPRC / recall / precision as a function of the dilution level.
if fixedvar == 'coverage':
    dilutionseries = [0.20, 0.15, 0.10, 0.05, 0.03, 0.01, 0.00]

# Keyword arguments shared by the three metric_curve calls.
curve_kwargs = dict(ground_truth_method='spikein', refsample=plasmasample, muttype=muttype,
                    chrom=chrom, methods=config.methods, fixedvar=fixedvar, save=save)
results_auprc_df = metric_curve(config, calltablesseries, plasmasample, healthysample, dilutionseries,
                                metric='auprc', **curve_kwargs)
results_recall_df = metric_curve(config, calltablesseries, plasmasample, healthysample, dilutionseries,
                                 metric='recall', **curve_kwargs)
results_precision_df = metric_curve(config, calltablesseries, plasmasample, healthysample, dilutionseries,
                                    metric='precision', **curve_kwargs)


In [None]:
# Precision-recall curves for the spike-in series.
# FIX: chrom was hard-coded to '22' although the call table and the
# metric_curve calls use the chromosome selected in `chrom`.
figure_curve(config, calltablesseries, plasmasample, healthysample, dilutionseries, xy='pr', ground_truth_method='spikein',
             refsample=plasmasample, muttype=muttype.upper(), chrom=chrom, methods=None, fixedvar=fixedvar, save=save)

# Confusion matrix

# Call set similarity: Jaccard Index

In [None]:

# TODO: compute the call set similarity between callers (Jaccard index).
# The previous draft line, `for sklearn.metrics.jaccard_similarity_score(y_true, y_pred, normalize=True)`,
# was not valid Python and named a function that scikit-learn has removed;
# use sklearn.metrics.jaccard_score(y_true, y_pred) once y_true / y_pred are defined.

In [None]:
# Patient / sample selection for the chr22 dilution series, followed by the
# tumor burden lookup table `tb_dict` used to name the ABEMUS columns.
patient = '1014'

if patient == '809':
    plasmasample1 = 'CRC-809_110914'
    plasmasample2 = 'CRC-809_030915'
    tumorsample1 = 'NCC_CRC-809_290714-T1W'
    plasmasampleltb = None
    healthysamples = ['pooledhealthy']
elif patient == '986':
    plasmasample1 = 'CRC-986_100215'
    plasmasample2 = 'CRC-986_261016'
    tumorsample1 = 'NCC_CRC-986_100215-T1W'
    plasmasampleltb = 'CRC-986_300316'
    healthysamples = ['pooledhealthy', 'pooledhealthy_986_filter_snv', plasmasampleltb]
elif patient == '1014':
    plasmasample2 = 'CRC-1014_110116'
    tumorsample1 = None
    plasmasampleltb = 'CRC-1014_090516'
    healthysamples = ['pooledhealthy', plasmasampleltb]

# The last listed healthy sample is the one used for the dilutions.
healthysample = healthysamples[-1]

if patient == '1014':
    vcf_ref_path = 'data/bcbio_output/dilution_chr22_CRC-1014_110116_1_pooledhealthy_0/dilution_chr22_CRC-1014_110116_1_pooledhealthy_0-ensemble-annotated.vcf'
else:
    vcf_ref_path = None

print(patient, healthysamples)
# Pairs that appear in the estimated_tf file names; the first pair is the undiluted sample.
dilutionseries = [(1,0), (1, 0.72), (0.75, 0.765), (0.5, 0.81), (0.25, 0.855), (0.125, 0.875), (0.0625, 0.88)]
# FIX: a redundant outer `for i, d in dilutionseries:` loop re-ran this whole
# loop once per dilution and shared its loop variables with it.
tb_dict = {}
for i, d in enumerate(dilutionseries):
    tb_path = os.path.join(*config.dilutionfolder, "estimated_tf_chr22_"+plasmasample2+"_"+str(d[0])+"_"+healthysample+"_"+str(d[1])+".txt")
    # FIX: the fallback used to be guarded by `d == (70, 0)`, which never matches
    # this series, and its filter `("estimated_tf_chr22_"+plasmasample2) and ...`
    # was always truthy, so any file ending in '_0.txt' could be picked.
    if i == 0 and not os.path.exists(tb_path):
        tb_path = [os.path.join(*config.dilutionfolder, f) for f in os.listdir(os.path.join(*config.dilutionfolder))
                   if f.startswith("estimated_tf_chr22_"+plasmasample2) and f.endswith('_0.txt')][0]
    # The estimate is the only token in the file, so it is parsed as the CSV header.
    tb_dict[str(d)] = float(pd.read_csv(tb_path).columns[0])
print(tb_dict)

# Hard-coded values, keyed by the pairs of the mixture series used in the next
# cell; they replace the values read from disk above.
tb_dict =  {'(70, 0)': 0.45449979526423284,
            '(70, 80)': 0.19021542447303608,
            '(50, 100)': 0.1531168888180089,
            '(30, 120)': 0.11015117236383784,
            '(20, 130)': 0.05980496272648487,
            '(10, 140)': 0.031326826579643155,
            '(5, 145)': 0.01902154244730361}
print(tb_dict)

In [None]:
# Collect the ABEMUS calls of every mixture into `res_df`: one boolean call
# column and one float score column per mixture, indexed by 'chrom_pos'.
dilutionseries = [(70,0), (70, 80), (50, 100), (30, 120), (20, 130), (10, 140), (5, 145)]

for i, d in enumerate(dilutionseries):
    print(i, d)
    mixture = "mixture_chr22_CRC-1014_180816-CW-T_"+str(d[0])+"x_CRC-1014_090516-CW-T_"+str(d[1])+"x.sorted"
    path_folder = "~/Repositories/cfdna_snv_benchmark/data/callers_output/mixtures/mixtures_chr22/"+mixture+"/abemus"
    path_file = "pmtab_F3_optimalR_"+mixture+".tsv"
    aux_df = pd.read_csv(os.path.join(path_folder, path_file), sep='\t')
    aux_df['chrom_pos'] = aux_df.chr.astype(str) + '_' + aux_df.pos.astype(str)
    aux_df.set_index('chrom_pos', inplace=True)
    # .copy() so that the column assignments below write to an independent frame
    aux_df = aux_df[['filter.pbem_coverage', 'pass.filter.pbem_coverage']].copy()
    # Columns are named after the tumor burden of the mixture, in percent.
    tbprefix = str(round(100*tb_dict[str(d)], 3))
    aux_df.columns = [tbprefix+'_abemus_score', tbprefix+'_abemus']
    aux_df[tbprefix+'_abemus'] = aux_df[tbprefix+'_abemus'].astype(bool)
    # FIX: removed `aux_df.drop('chr_pos', inplace=True)`. It tried to drop a row
    # labelled 'chr_pos', which does not exist, and raised KeyError; 'chrom_pos'
    # is already the index, so there is nothing left to drop.
    aux_df[tbprefix+'_abemus_score'] = aux_df[tbprefix+'_abemus_score'].astype(float)
    aux_df = aux_df[~aux_df.index.duplicated()] ### TODO
    if i == 0:
        res_df = aux_df.copy()
    else:
        res_df = pd.concat([res_df, aux_df], axis=1)

res_df

In [None]:

# Re-read the visualisation config and re-apply the display settings.
config = Config("config/", "config_viz.yaml")
set_display_params(config)


In [None]:
# Settings for the chr22 dilution series: file prefix, chromosome, dilution
# pairs and the samples belonging to the selected patient.
prefix = 'dilution_chr22_'
chrom = '22'
dilutionseries = [(1,0), (1, 0.72), (0.75, 0.765), (0.5, 0.81), (0.25, 0.855), (0.125, 0.875), (0.0625, 0.88)]

patient = '1014'

if patient == '809':
    plasmasample1, plasmasample2 = 'CRC-809_110914', 'CRC-809_030915'
    tumorsample1, plasmasampleltb = 'NCC_CRC-809_290714-T1W', None
    healthysamples = ['pooledhealthy']
elif patient == '986':
    plasmasample1, plasmasample2 = 'CRC-986_100215', 'CRC-986_261016'
    tumorsample1, plasmasampleltb = 'NCC_CRC-986_100215-T1W', 'CRC-986_300316'
    healthysamples = ['pooledhealthy', 'pooledhealthy_986_filter_snv', plasmasampleltb]
elif patient == '1014':
    # no plasmasample1 is set for this patient
    plasmasample2 = 'CRC-1014_110116'
    tumorsample1, plasmasampleltb = None, 'CRC-1014_090516'
    healthysamples = ['pooledhealthy', plasmasampleltb]

# The last listed healthy sample is the one used for the dilutions.
healthysample = healthysamples[-1]

# Only patient 1014 has a reference VCF.
vcf_ref_path = (
    'data/bcbio_output/dilution_chr22_CRC-1014_110116_1_pooledhealthy_0/dilution_chr22_CRC-1014_110116_1_pooledhealthy_0-ensemble-annotated.vcf'
    if patient == '1014' else None
)

print(patient, healthysamples)

In [None]:
# Build the call table of the dilution series and preview it.
call_table_kwargs = dict(ground_truth_method=3, refsample='undiluted', chrom=chrom,
                         muttype='SNV', vcf_ref_path=vcf_ref_path, tumorsample=None)
df_table = get_call_table(config, prefix, plasmasample2, healthysample, dilutionseries, **call_table_kwargs)
print(df_table.shape)
df_table.head()

In [None]:
# Append the ABEMUS columns to the call table. Loci that only ABEMUS reports
# have no 'truth' value after the concat and are marked as not true.
df_table = pd.concat([df_table, res_df], axis=1)
# FIX: `df_table['truth'].fillna(False, inplace=True)` was a chained in-place
# call on a column, which pandas does not guarantee to write back to df_table.
df_table['truth'] = df_table['truth'].fillna(False)
df_table

In [None]:
# Number of truth loci called by ABEMUS in the '45.45' column, then the total number of truth loci.
is_truth = df_table['truth'] == True
called_by_abemus = df_table['45.45_abemus'] == True
print(len(df_table[is_truth & called_by_abemus]))
print(len(df_table[is_truth]))

# AUPRC plots


In [None]:

# Reload the config, then derive one colour per caller and one alpha level per
# tissue-benchmark fraction (1, 0.7, 0.4, ... in listing order).
config = Config("config/", "config_viz.yaml")
set_display_params(config)

color_dict = {method: config.colors[idx] for idx, method in enumerate(config.methods)}
alpha_dict = {fraction: 1-idx*0.3 for idx, fraction in enumerate(config.tissuebenchmark.fractions)}

print(color_dict)
print(alpha_dict)

In [None]:
# Discard the two ABEMUS columns named '11.015'.
df_table.drop(columns=['11.015_abemus', '11.015_abemus_score'], inplace=True)

In [None]:
# Display the column names of the merged call table.
df_table.columns

In [None]:
# Metric curve over the dilution series.
# NOTE(review): the result is stored as `results_auprc_df` but the requested
# metric is 'precision' - confirm which of the two is intended.
results_auprc_df = metric_curve(
    config, df_table, plasmasample2, healthysample, dilutionseries,
    metric='precision', ground_truth_method=3, refsample='undiluted',
    muttype='SNV', chrom='22', methods=config.methods, save=False,
)

In [None]:
# AUPRC of every caller per sample and tumor purity, plus a 'baseline' entry
# (share of ground-truth loci), drawn as one point plot per sample.
for muttype in config.muttype:
    aux_auprc = []
    aux_method = []
    aux_sample = []
    aux_tp = []
    for i, sample in enumerate(config.tissuebenchmark.samples):
        for f in config.tissuebenchmark.fractions:
            purity = round(config.tissuebenchmark.purities[i]*f, 2)
            purity_pct = round(100*config.tissuebenchmark.purities[i]*f, 2)
            # The subset depends only on (sample, muttype, purity): select it once per fraction.
            vcf_sample = df_table[(df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['purity'] == purity)]
            for method in config.tissuebenchmark.methods:
                # FIX: `fillna(0, inplace=True)` on a column of a sliced frame is
                # not guaranteed to write back, so NaN scores could reach
                # average_precision_score. Work on an explicit filled Series.
                # Missing scores are scored 0.
                scores = vcf_sample[method+'_score'].fillna(0)
                auprc = average_precision_score(vcf_sample['TRUTH'], scores)
                aux_auprc.append(auprc)
                aux_method.append(method)
                aux_sample.append(sample)
                aux_tp.append(purity_pct)
            # baseline
            aux_auprc.append(len(vcf_sample['TRUTH'][vcf_sample['TRUTH']])/len(vcf_sample['TRUTH']))
            aux_method.append('baseline')
            aux_sample.append(sample)
            aux_tp.append(purity_pct)

    results_df = pd.DataFrame({'AUPRC': aux_auprc, 'tumor purity': aux_tp, 'caller': aux_method, 'sample': aux_sample})

    for si, sample in enumerate(config.tissuebenchmark.samples):
        sample_results = results_df[results_df['sample'] == sample]
        sns.catplot(x="tumor purity", y="AUPRC", hue="caller",
                    capsize=.2, height=4, aspect=1.5, kind="point", colors=config.colors + ['k'],
                    order=sorted(sample_results['tumor purity'].unique(), reverse=True),
                    data=sample_results)
        plt.ylim([0, 1])
        plt.title("AUPRC score for {} calling in {}".format(muttype, sample))
        plt.savefig('figures/tissue_benchmark/'+sample+'_'+muttype+'_auprcscore',  bbox_inches='tight')

# Change in prediction with dilution

In [None]:
# Preview the first rows of the call table.
df_table.head()

In [None]:
# Keep only the ground-truth loci.
truth_mask = df_table['truth'] == True
df = df_table[truth_mask]
# NOTE(review): this displays the bound `unstack` method without calling it - confirm whether `df.unstack()` was meant.
df.unstack

In [None]:

# Plot, for every caller, how the call made at each ground-truth locus changes
# across purities. Each distinct (TRUTH, call per purity) pattern is one line
# whose width is the number of loci sharing that pattern.
# NOTE(review): `i`, `sample` and `muttype` are not assigned in this cell; they
# are whatever an earlier cell left behind - confirm before relying on the plot.
# NOTE(review): rows are filtered on the 'truth' column but the 'TRUTH' column
# is selected - verify that df_table holds both.
fig, ax = plt.subplots(figsize=(15,8))
for m, method in enumerate(config.methods):
    # loci that are flagged as truth and called by this caller, indexed by CHROM_POS
    change_df_aux = df_table[(df_table['truth']) & (df_table[method])][['CHROM_POS', 'TRUTH', 'purity'] + [method]].set_index(['CHROM_POS'])
    # one row per locus, one column per purity level (column names are the rounded purities as strings)
    change_df = pd.DataFrame(index=change_df_aux.index.unique(), columns= ['TRUTH'] + [str(round(config.tissuebenchmark.purities[i]*f, 2)) for f in config.tissuebenchmark.fractions])
    for chrpos in change_df.index:
        for f in config.tissuebenchmark.fractions:
            try:
                change_df.loc[chrpos][str(round(config.tissuebenchmark.purities[i]*f, 2))] = change_df_aux[(change_df_aux['purity'] == round(config.tissuebenchmark.purities[i]*f, 2))].loc[chrpos][method]
            # any failure of the lookup above (e.g. locus absent at this purity) is recorded as "not called"
            except:
                change_df.loc[chrpos][str(round(config.tissuebenchmark.purities[i]*f, 2))] = False
        # .loc returns a scalar when the locus occurs once and a Series when it occurs at several purities
        if change_df_aux.loc[chrpos]['TRUTH'].shape == ():
            change_df.loc[chrpos]['TRUTH'] = change_df_aux.loc[chrpos]['TRUTH']
        else:
            change_df.loc[chrpos]['TRUTH'] = np.unique(change_df_aux.loc[chrpos]['TRUTH'].values)[0]
    # print(change_df.shape)
    # number of purity levels at which the locus is called
    change_df['sum'] = sum([change_df[str(round(config.tissuebenchmark.purities[i]*f, 2))] for f in config.tissuebenchmark.fractions])
    # drop loci whose call count is 0 or 3 (presumably: never or always called, assuming three fractions)
    change_df = change_df[~change_df['sum'].isin([0, 3])].dropna(how='all')
    change_df = change_df.astype(int).drop('sum', axis=1)
    # plot
    #for chrpos, row in change_df.astype(int).drop('sum', axis=1).iterrows():
    #    plt.plot(row.astype(int), label=chrpos)
    #plt.legend()
    # count how many loci share each pattern; the count ends up in the 'size' column
    change_df = change_df.groupby(change_df.columns.tolist(),as_index=False).size()
    print(method)
    print(change_df)

    c = 0.7
    for idx, row in change_df.iterrows():
        # line width = number of loci with this pattern; alpha is c minus max(1/size, 0.2)
        plt.plot(row[['TRUTH'] + [str(round(config.tissuebenchmark.purities[i]*f, 2)) for f in config.tissuebenchmark.fractions]], lw=row['size'], label=method, c=config.colors[m], alpha=c-max(1/row['size'], 0.2))
# one legend entry per caller instead of one per plotted line
# NOTE(review): Line2D is not imported explicitly in this notebook - presumably it comes from a star import.
custom_lines = [Line2D([0], [0], color=config.colors[m], lw=2) for m in range(len(config.methods))]
plt.legend(custom_lines, config.methods)

plt.xlabel('purity')
plt.ylabel('mutation calling output prediction')
plt.title('Change in mutation prediction w.r.t purity in {} for {}'.format(sample, muttype))

ax.set_yticks([0,1])
ax.set_yticklabels(['False','True'])
plt.show()

In [None]:
# For every sample and mutation type, plot how each caller's call at the
# ground-truth loci changes across purities. Each distinct
# (TRUTH, call per purity) pattern is one line whose width is the number of
# loci sharing that pattern.
for i, sample in enumerate(config.tissuebenchmark.samples):
    for muttype in config.muttype:
        fig, ax = plt.subplots(figsize=(15,8))
        for m, method in enumerate(config.methods):
            # ground-truth loci of this sample / mutation type that the caller called, indexed by CHROM_POS
            change_df_aux = df_table[(df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['TRUTH']) & (df_table[method])][['CHROM_POS', 'TRUTH', 'purity'] + [method]].set_index(['CHROM_POS'])
            # one row per locus, one column per purity level (column names are the rounded purities as strings)
            change_df = pd.DataFrame(index=change_df_aux.index.unique(), columns= ['TRUTH'] + [str(round(config.tissuebenchmark.purities[i]*f, 2)) for f in config.tissuebenchmark.fractions])
            for chrpos in change_df.index:
                for f in config.tissuebenchmark.fractions:
                    try:
                        change_df.loc[chrpos][str(round(config.tissuebenchmark.purities[i]*f, 2))] = change_df_aux[(change_df_aux['purity'] == round(config.tissuebenchmark.purities[i]*f, 2))].loc[chrpos][method]
                    # any failure of the lookup above (e.g. locus absent at this purity) is recorded as "not called"
                    except:
                        change_df.loc[chrpos][str(round(config.tissuebenchmark.purities[i]*f, 2))] = False
                # .loc returns a scalar when the locus occurs once and a Series when it occurs at several purities
                if change_df_aux.loc[chrpos]['TRUTH'].shape == ():
                    change_df.loc[chrpos]['TRUTH'] = change_df_aux.loc[chrpos]['TRUTH']
                else:
                    change_df.loc[chrpos]['TRUTH'] = np.unique(change_df_aux.loc[chrpos]['TRUTH'].values)[0]
            # print(change_df.shape)
            # number of purity levels at which the locus is called
            change_df['sum'] = sum([change_df[str(round(config.tissuebenchmark.purities[i]*f, 2))] for f in config.tissuebenchmark.fractions])
            # drop loci whose call count is 0 or 3 (presumably: never or always called, assuming three fractions)
            change_df = change_df[~change_df['sum'].isin([0, 3])].dropna(how='all')
            change_df = change_df.astype(int).drop('sum', axis=1)
            # plot
            #for chrpos, row in change_df.astype(int).drop('sum', axis=1).iterrows():
            #    plt.plot(row.astype(int), label=chrpos)
            #plt.legend()
            # count how many loci share each pattern; the count ends up in the 'size' column
            change_df = change_df.groupby(change_df.columns.tolist(),as_index=False).size()
            print(method)
            print(change_df)

            c = 0.7
            for idx, row in change_df.iterrows():
                # line width = number of loci with this pattern; alpha is c minus max(1/size, 0.2)
                plt.plot(row[['TRUTH'] + [str(round(config.tissuebenchmark.purities[i]*f, 2)) for f in config.tissuebenchmark.fractions]], lw=row['size'], label=method, c=config.colors[m], alpha=c-max(1/row['size'], 0.2))
        # one legend entry per caller instead of one per plotted line
        # NOTE(review): Line2D is not imported explicitly in this notebook - presumably it comes from a star import.
        custom_lines = [Line2D([0], [0], color=config.colors[m], lw=2) for m in range(len(config.methods))]
        plt.legend(custom_lines, config.methods)

        plt.xlabel('purity')
        plt.ylabel('mutation calling output prediction')
        plt.title('Change in mutation prediction w.r.t purity in {} for {}'.format(sample, muttype))

        ax.set_yticks([0,1])
        ax.set_yticklabels(['False','True'])
        plt.show()

# True Positive, False Negative and False Positive Mutations

In [None]:
# Long-format confusion table: one row per (mutation type, sample, caller,
# purity, metric) holding the count, the rate and the number of ground-truth
# mutations.
df = pd.DataFrame(columns=['sample', 'mutation type', 'caller', 'purity', 'metric', 'number of loci', 'rate',  'number of mutations (ground truth)'])
metric = ['TN', 'FP', 'FN', 'TP']
c = 0  # running row label
for muttype in config.muttype:
    for i, sample in enumerate(config.tissuebenchmark.samples):
        for method in config.tissuebenchmark.methods:
            print(muttype, sample, method)
            for f in config.tissuebenchmark.fractions:
                purity = round(config.tissuebenchmark.purities[i]*f, 2)
                vcf_sample = df_table[(df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['purity'] == purity)]
                ntruth = sum(vcf_sample['TRUTH'])
                tn, fp, fn, tp = confusion_matrix(vcf_sample['TRUTH'], vcf_sample[method], labels=[0,1]).ravel()
                tpfn = tp + fn
                tnfp = tn + fp
                # TN and FP are normalised by the negatives (tn+fp), FN and TP by the positives (tp+fn)
                for name, count, denom in zip(metric, (tn, fp, fn, tp), (tnfp, tnfp, tpfn, tpfn)):
                    df.loc[c, :] = [sample, muttype, method, purity, name, count, count/denom, ntruth]
                    c += 1
df.head(20)

In [None]:
# For every mutation type and sample, draw three grouped bar plots (true
# positive, false positive and false negative rate per caller, one bar per
# purity), annotate each bar with its number of loci and save the figure.
for muttype in config.muttype:
    for s, sample in enumerate(config.tissuebenchmark.samples):
        # --- True positive rate ---
        plt.figure(figsize=(10,8))
        plots = sns.barplot(x="caller", y="rate", hue="purity", data=df[(df['sample'] == sample) & (df['mutation type'] == muttype) & (df['metric'] == 'TP')], ci=None, palette='Blues_r',
                            hue_order=[round(config.tissuebenchmark.purities[s]*config.tissuebenchmark.fractions[i], 2) for i in range(3)])
        plt.suptitle('True Positive Rate - '+sample+ ' - '+muttype)
        plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0, title='purity')
        plt.ylim([0,1])
        xcoords, ycoords = [], []
        ci = 0
        # Iterating over the bars one-by-one
        # NOTE(review): this loop maps bar b to caller b%5 / fraction b//5, while the
        # loops below use caller b//3 / fraction b%3 - the right mapping depends on
        # the patch order of the installed seaborn version; confirm.
        for b, bar in enumerate(plots.patches):
            plots.annotate(df[(df['sample'] == sample) & (df['mutation type'] == muttype) & (df['metric'] == 'TP') & (df['caller'] == config.methods[b%5]) & (df['purity'] == round(config.tissuebenchmark.purities[s]*config.tissuebenchmark.fractions[b//5], 2))]['number of loci'].values[0],
                   (bar.get_x() + (bar.get_width() / 2),
                    bar.get_height()), ha='center', va='center',
                   size=15, xytext=(0, 8),
                   textcoords='offset points')
            
        # Collect bar centres three at a time and draw the ground-truth mutation counts as a red line.
        # NOTE(review): `ax` is not created in this cell, so the line is drawn on
        # whatever axes an earlier cell left in `ax` - presumably `plots` was meant.
        for m,methods in enumerate(config.methods):
            for b, bar in enumerate(plots.patches):
                if b % len(config.methods) == m:
                    #print(b)
                    if len(xcoords) == 3:
                        xcoords, ycoords = [], []
                    xi = bar.get_x() + (bar.get_width() / 2)
                    yi = df[(df['sample'] == sample) & (df['mutation type'] == muttype) & (df['caller'] == config.methods[b//3]) & (df['purity'] == round(config.tissuebenchmark.purities[s]*config.tissuebenchmark.fractions[b%3], 2))]['number of mutations (ground truth)'].values[0]
                    xcoords.append(xi)
                    ycoords.append(yi)
                    if len(xcoords) == 3:
                        # only the first line carries a legend label
                        if m == 0:
                            redline = ax.plot(xcoords, ycoords, 'ro-', label='Number of ground truths mutations')
                        else:
                            ax.plot(xcoords, ycoords, 'ro-')
        plt.savefig('figures/tissue_benchmark/'+sample+'_'+muttype+'_TPrate',  bbox_inches='tight')

        
        # --- False positive rate ---
        plt.figure(figsize=(10,8))
        plots = sns.barplot(x="caller", y="rate", hue="purity", data=df[(df['sample'] == sample) & (df['mutation type'] == muttype) & (df['metric'] == 'FP')], ci=None, palette='Blues_r',
                            hue_order=[round(config.tissuebenchmark.purities[s]*config.tissuebenchmark.fractions[i], 2) for i in range(3)])
        plt.suptitle('False Positive Rate - '+sample+ ' - '+muttype)
        plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0, title='purity')
        plt.ylim([0,1])
        # Iterating over the bars one-by-one
        for b, bar in enumerate(plots.patches):
            plots.annotate(df[(df['sample'] == sample) & (df['mutation type'] == muttype) & (df['metric'] == 'FP') & (df['caller'] == config.methods[b//3]) & (df['purity'] == round(config.tissuebenchmark.purities[s]*config.tissuebenchmark.fractions[b%3], 2))]['number of loci'].values[0],
                            #format(bar.get_height(), '.2f'),
                           (bar.get_x() + bar.get_width() / 2,
                            bar.get_height()), ha='center', va='center',
                           size=15, xytext=(0, 8),
                           textcoords='offset points')
        plt.savefig('figures/tissue_benchmark/'+sample+'_'+muttype+'_FPrate',  bbox_inches='tight')

            
        # --- False negative rate ---
        plt.figure(figsize=(10,8))
        plots = sns.barplot(x="caller", y="rate", hue="purity", data=df[(df['sample'] == sample) & (df['mutation type'] == muttype) & (df['metric'] == 'FN')], ci=None, palette='Blues_r',
                            hue_order=[round(config.tissuebenchmark.purities[s]*config.tissuebenchmark.fractions[i], 2) for i in range(3)])
        plt.suptitle('False Negative Rate - '+sample+ ' - '+muttype)
        plt.legend(bbox_to_anchor=(1.01, 1),  borderaxespad=0, title='purity')
        plt.ylim([0,1])
        # Iterating over the bars one-by-one
        for b, bar in enumerate(plots.patches):
            plots.annotate(df[(df['sample'] == sample) & (df['mutation type'] == muttype) & (df['metric'] == 'FN') & (df['caller'] == config.methods[b//3]) & (df['purity'] == round(config.tissuebenchmark.purities[s]*config.tissuebenchmark.fractions[b%3], 2))]['number of loci'].values[0],
                            #format(bar.get_height(), '.2f'),
                           (bar.get_x() + bar.get_width() / 2,
                            bar.get_height()), ha='center', va='center',
                           size=15, xytext=(0, 8),
                           textcoords='offset points')
        plt.savefig('figures/tissue_benchmark/'+sample+'_'+muttype+'_FNrate',  bbox_inches='tight')


# Stacked TP, FN, FP, TN

In [None]:
# Wide-format confusion table: one row per (mutation type, sample, caller,
# purity) with the TP / FN / FP / TN counts.
# FIX: the column list named 'FN' twice ('TP', 'FN', 'FP', 'FN') although the
# fourth value written is tn; the last column is now 'TN'.
df = pd.DataFrame(columns=['sample', 'mutation type', 'caller', 'purity', 'TP', 'FN', 'FP', 'TN'])
metric = ['TN', 'FP', 'FN', 'TP']
c = 0  # running row label
for muttype in config.muttype:
    for i, sample in enumerate(config.tissuebenchmark.samples):
        for method in config.tissuebenchmark.methods:
            print(muttype, sample, method)
            for f in config.tissuebenchmark.fractions:
                purity = round(config.tissuebenchmark.purities[i]*f, 2)
                vcf_sample = df_table[(df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['purity'] == purity)]
                # labels=[0,1] forces a 2x2 matrix even when one class is absent
                tn, fp, fn, tp = confusion_matrix(vcf_sample['TRUTH'], vcf_sample[method], labels=[0,1]).ravel()
                df.loc[c, :] = [sample, muttype, method, purity, tp, fn, fp, tn]
                c += 1
df.head()

In [None]:
# Long-format confusion table keyed by '<caller>_<purity>', with one extra
# all-zero '<caller>_0' row appended after each caller.
df = pd.DataFrame(columns=['sample', 'mutation type', 'caller', 'purity',  'caller_purity',  'metric', 'number of loci', 'rate'])
metric = ['TN', 'FP', 'FN', 'TP']
c = 0  # number of metric rows written so far
d = 0  # number of extra '<caller>_0' rows written so far
for muttype in config.muttype:
    for i, sample in enumerate(config.tissuebenchmark.samples):
        for method in config.tissuebenchmark.methods:
            print(muttype, sample, method)
            for f in config.tissuebenchmark.fractions:
                purity = round(config.tissuebenchmark.purities[i]*f, 2)
                vcf_sample = df_table[(df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['purity'] == purity)]
                tn, fp, fn, tp = confusion_matrix(vcf_sample['TRUTH'], vcf_sample[method], labels=[0,1]).ravel()
                tpfn = tp + fn
                tnfp = tn + fp
                # TN and FP are normalised by the negatives (tn+fp), FN and TP by the positives (tp+fn)
                for name, count, denom in zip(metric, (tn, fp, fn, tp), (tnfp, tnfp, tpfn, tpfn)):
                    df.loc[c+d, :] = [sample, muttype, method, purity, method + '_' + str(purity), name, count, count/denom]
                    c += 1

            df.loc[c+d, :] = [sample, muttype, method, 0, method+'_0', 'TN', 0, 0]
            d += 1
df.head(30)

In [None]:
# One bar plot per mutation type and sample: number of loci per
# '<caller>_<purity>' group, coloured by metric (bars are not dodged).
for muttype in config.muttype:
    for s, sample in enumerate(config.tissuebenchmark.samples):
        plt.figure(figsize=(10,4))
        subset = df[(df['sample'] == sample) & (df['mutation type'] == muttype)]
        bar = sns.barplot(x="caller_purity", y="number of loci", hue="metric", data=subset,
                          palette=['lightcoral', 'red', 'lightgreen', 'green'],
                          dodge=False)
        bar.set_xticklabels(bar.get_xticklabels(), rotation=90, horizontalalignment='right')
        # blank out every fourth tick label
        labels = ['' if (pos+1) % 4 == 0 else lab for pos, lab in enumerate(bar.get_xticklabels())]
        bar.set_xticklabels(labels)
        plt.suptitle('Predictions - '+sample+ ' - '+muttype)
        plt.legend(bbox_to_anchor=(1.01, 1))
        plt.savefig('figures/tissue_benchmark/'+sample+'_'+muttype+'_predictions',  bbox_inches='tight')

# Number of mutations

In [None]:
# Long-format table with, for every (mutation type, sample, caller, purity),
# the number of loci flagged by the caller and the number flagged in TRUTH.
df = pd.DataFrame(columns=[
    'sample',
    'mutation type',
    'caller',
    'purity',
    'number of loci called',
    'number of mutations (ground truth)',
])
metric = ['TN', 'FP', 'FN', 'TP']
c = 0  # running row label of the table
for muttype in config.muttype:
    for i, sample in enumerate(config.tissuebenchmark.samples):
        for method in config.tissuebenchmark.methods:
            print(muttype, sample, method)
            for f in config.tissuebenchmark.fractions:
                # Purity of this dilution: sample purity times fraction, rounded to 2 decimals.
                purity = round(config.tissuebenchmark.purities[i]*f, 2)
                in_scope = (
                    (df_table['sample'] == sample)
                    & (df_table['mutation type'] == muttype)
                    & (df_table['purity'] == purity)
                )
                vcf_sample = df_table[in_scope]
                ntruth = sum(vcf_sample['TRUTH'])
                ncaller = sum(vcf_sample[method])
                df.loc[c, :] = [sample, muttype, method, purity, ncaller, ntruth]
                c += 1
df.head()

In [None]:
# One figure per (mutation type, sample): log-scale bars with the number of
# loci called by each caller at each purity, plus red lines showing the
# number of ground-truth mutations.
for muttype in config.muttype:
    for s, sample in enumerate(config.tissuebenchmark.samples):
        plt.figure(figsize=(10,8))
        ax = sns.barplot(x="caller", y="number of loci called", hue="purity", data=df[(df['mutation type'] == muttype) & (df['sample'] == sample)], ci=None, palette="Greys_r", log=True,
                        hue_order=[round(config.tissuebenchmark.purities[s]*config.tissuebenchmark.fractions[i], 2) for i in range(3)])
        # Keep a handle on the purity legend: the second plt.legend call below
        # replaces it, so it is re-attached with add_artist afterwards.
        leg1 = plt.legend(title='purity', loc=2)

        xcoords, ycoords = [], []
        for m,methods in enumerate(config.methods):
            # Iterate over the bars of the axes drawn in this cell (`ax`); the
            # previous code read `plots.patches`, a name this cell never defines.
            for b, bar in enumerate(ax.patches):
                if b % len(config.methods) == m:
                    # Start a fresh polyline once the previous one holds 3 points.
                    if len(xcoords) == 3:
                        xcoords, ycoords = [], []
                    # x: horizontal centre of the bar.
                    xi = bar.get_x() + (bar.get_width() / 2)
                    # y: ground-truth mutation count looked up in the table.
                    # NOTE(review): bars are selected with b % len(config.methods)
                    # while this lookup indexes with b//3 and b%3; confirm both
                    # schemes match the patch order produced by seaborn.
                    yi = df[(df['sample'] == sample) & (df['mutation type'] == muttype) & (df['caller'] == config.methods[b//3]) & (df['purity'] == round(config.tissuebenchmark.purities[s]*config.tissuebenchmark.fractions[b%3], 2))]['number of mutations (ground truth)'].values[0]
                    xcoords.append(xi)
                    ycoords.append(yi)
                    if len(xcoords) == 3:
                        # Only the first line carries a label and is kept for the legend.
                        if m == 0:
                            redline = ax.plot(xcoords, ycoords, 'ro-', label='Number of ground truths mutations')
                        else:
                            ax.plot(xcoords, ycoords, 'ro-')
        plt.ylim([0,15000])
        plt.legend(redline, ['# ground truths mutations'], loc='upper right', bbox_to_anchor=(1, 1))
        plt.gca().add_artist(leg1)
        plt.suptitle('Number of mutations called - '+sample+' - '+muttype)
    

# Ratio performance attenuation

In [None]:
# Build one row per (mutation type, sample, caller, purity, metric) with the
# confusion-matrix count, its rate and the ground-truth count, then keep the
# TP rows and derive TP/GT (true positives over ground-truth mutations).
df = pd.DataFrame(columns=['sample', 'mutation type', 'caller', 'purity', 'metric', 'number of loci', 'rate',  'number of mutations (ground truth)'])
metric = ['TN', 'FP', 'FN', 'TP']
c = 0  # running row label of the table
for muttype in config.muttype:
    for i, sample in enumerate(config.tissuebenchmark.samples):
        for method in config.tissuebenchmark.methods:
            print(muttype, sample, method)
            for f in config.tissuebenchmark.fractions:
                # Purity of this dilution: sample purity times fraction, rounded to 2 decimals.
                purity = round(config.tissuebenchmark.purities[i]*f, 2)
                vcf_sample = df_table[(df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['purity'] == purity)]
                ntruth = sum(vcf_sample['TRUTH'])
                # labels=[0,1] forces a 2x2 matrix, so ravel() always yields 4 values.
                tn, fp, fn, tp = confusion_matrix(vcf_sample['TRUTH'], vcf_sample[method], labels=[0,1]).ravel()
                tpfn = tp + fn
                tnfp = tn + fp
                for j, tfpn in enumerate([tn, fp, fn, tp]):
                    # TN and FP are normalised by TN+FP; FN and TP by TP+FN.
                    denominator = tnfp if j in [0, 1] else tpfn
                    # Label with metric[j] so the name follows the value being
                    # written rather than the running row counter.
                    df.loc[c, :] = [sample, muttype, method, purity, metric[j], tfpn, tfpn/denominator, ntruth]
                    c += 1

# Explicit copy: the new column is added to an independent frame, not to a
# filtered slice of the full table.
df = df[df['metric'] == 'TP'].copy()
df['TP/GT'] =  df['number of loci'].divide(df['number of mutations (ground truth)'], axis='index')

df.head(20)

In [None]:
# One point plot per (mutation type, sample): TP/GT against purity, one line
# per caller, saved under figures/tissue_benchmark/.
for muttype in config.muttype:
    for i, sample in enumerate(config.tissuebenchmark.samples):
        # sns.catplot is figure-level and creates its own figure (sized through
        # `height`), so no plt.figure() is opened beforehand; doing so only
        # left an empty extra figure behind.
        g = sns.catplot(x='purity', y='TP/GT', hue='caller', data=df[(df['sample'] == sample) & (df['mutation type'] == muttype)], kind='point', height=8,
                        order=[round(config.tissuebenchmark.purities[i]*config.tissuebenchmark.fractions[f], 2) for f in range(3)])
        plt.title('Detection rate evolution w.r.t purity in '+sample+' - '+muttype)
        plt.grid()
        plt.ylim([0.5,1])
        plt.savefig('figures/tissue_benchmark/'+sample+'_'+muttype+'_detectionrate',  bbox_inches='tight')

In [None]:
# Relative change of TP/GT between the rows with purity > 0.9 and the rows
# with purity <= 0.2, as a percentage of the purity > 0.9 value.
res = df
value_columns = ['sample', 'mutation type', 'caller', 'TP/GT', 'purity']
key_columns = ['sample', 'mutation type', 'caller']
# Index both selections by (sample, mutation type, caller) so that the
# arithmetic below is aligned on those keys.
a = res.loc[res['purity'] > 0.9, value_columns].set_index(key_columns).drop(columns='purity')
b = res.loc[res['purity'] <= 0.2, value_columns].set_index(key_columns).drop(columns='purity')
res = ((a - b) / a).reset_index()
res['TP/GT'] = res['TP/GT'] * 100
res

In [None]:
# One categorical plot per mutation type: performance attenuation factor (in
# %) of every caller for each sample, saved under figures/tissue_benchmark/.
for muttype in config.muttype:
    # sns.catplot is figure-level and creates its own figure (sized through
    # `height` and `aspect`), so no plt.figure() is opened beforehand; doing so
    # only left an empty extra figure behind.
    sns.catplot(y='TP/GT', x='sample', hue='caller', data=res[res['mutation type'] == muttype], height=8, aspect=0.8, s=15)
    # Dashed reference line at zero.
    plt.axhline(y=0, c='k', ls='--')
    plt.grid()
    plt.ylim([-15, 20])
    plt.ylabel('(TP/GT$_{\\rm{high purity}}$ - TP/GT$_{\\rm{low purity}}$) / TP/GT$_{\\rm{high purity}}$) (%) ')
    plt.title('Performance attenuation factor\n between high and low tumor purity sample - ' + muttype)
    plt.savefig('figures/tissue_benchmark/'+muttype+'_performanceattenuationfactor',  bbox_inches='tight')