In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
from collections import Counter
from matplotlib import cm
from matplotlib.colors import ListedColormap

import io
from pysam import VariantFile

from sklearn.metrics import precision_score, recall_score, average_precision_score, precision_recall_curve

In [None]:
# Build an 8-entry colormap: the first four resampled 'tab10' colours as-is,
# followed by the same four colours with their alpha channel replaced by 0.3.
# `plt.get_cmap` is used instead of `matplotlib.cm.get_cmap`, which was
# deprecated in Matplotlib 3.7 and removed in 3.9.
# NOTE: despite its name, `tab20` holds the resampled 'tab10' palette; the
# name is kept because other cells may refer to it.
tab20 = plt.get_cmap('tab10', 8)
opaque_colors = [tab20.colors[i] for i in range(4)]
# Drop the last (alpha) component of each RGBA entry and append 0.3 instead.
faded_colors = [np.array(list(tab20.colors[i][:-1]) + [0.3]) for i in range(4)]
newcmap_list = opaque_colors + faded_colors

newcmap = ListedColormap(newcmap_list, name='newcmap')
# One named colour per caller panel; indexed by caller position in the plots below.
color_list = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple']

In [None]:
def read_vcf(path):
    """Load the record table of a VCF file into a DataFrame.

    Lines starting with '##' are discarded; the remaining text (the
    '#CHROM ...' header line plus the records) is parsed as tab-separated
    values. The eight fixed VCF columns are read with explicit dtypes
    (POS as int, the others as str) and '#CHROM' is renamed to 'CHROM'.

    Parameters
    ----------
    path : str
        Path of the VCF file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record line of the file.
    """
    with open(path, 'r') as handle:
        table_text = ''.join(row for row in handle if not row.startswith('##'))

    column_types = {
        '#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
        'QUAL': str, 'FILTER': str, 'INFO': str,
    }
    table = pd.read_csv(io.StringIO(table_text), dtype=column_types, sep='\t')
    return table.rename(columns={'#CHROM': 'CHROM'})

def foo(x):
    """Extract the value of the 'CALLERS=' entry from an INFO string.

    Returns a one-element Series holding the text between the first
    'CALLERS=' and the next ';'. When 'CALLERS=' is absent the Series holds
    NaN (previously this raised IndexError), mirroring `foo2`.
    """
    pieces = x.split('CALLERS=')
    return pd.Series(pieces[1].split(';')[0] if len(pieces) > 1 else np.nan)


def foo2(x):
    """Extract the value of the 'TYPE=' entry from an INFO string.

    Returns a one-element Series holding the text between the first
    'TYPE=' and the next ';', or NaN when 'TYPE=' does not occur.

    NOTE(review): the match is a plain substring search, so a key that merely
    ends in 'TYPE=' (e.g. 'SVTYPE=') would also match - confirm that such
    keys cannot occur in the INFO fields being parsed.
    """
    pieces = x.split('TYPE=')
    return pd.Series(pieces[1].split(';')[0] if len(pieces) > 1 else np.nan)

In [None]:
# Caller names; each one is also the name of a boolean column in the
# per-sample call tables and is iterated over by the cells below.
methods = [
    'freebayes',
    'mutect2',
    'strelka2',
    'vardict',
    'varscan',
]

# Select plasma sample

In [None]:
# Identifier selecting which set of sample names is used below.
# Values handled by the next cell: '809' and '986'.
sample = "809"

In [None]:
# Sample names bound to (plasmasample1, plasmasample2, tumorsample1) for each
# supported value of `sample`.
_sample_names = {
    '809': ('809_110914', '809_030915', '809_290714-T1W'),
    '986': ('986_100215', '986_261016', '986_100215-T1W'),
}

# Fail here with a clear message instead of leaving the three names undefined
# (which would only surface later as a NameError in the loading cells).
if sample not in _sample_names:
    raise ValueError(
        "unsupported sample " + repr(sample) + "; expected one of " + str(sorted(_sample_names))
    )

plasmasample1, plasmasample2, tumorsample1 = _sample_names[sample]

# Load SNV calls for plasma sample and matching mixed samples

In [None]:
def _load_ensemble_calls(dilution):
    """Read and annotate the ensemble VCF of one mixture of `plasmasample1`.

    `dilution` is the suffix that identifies the mixture in the directory and
    file names (e.g. '1-0' or '075-06738'). On top of the columns returned by
    `read_vcf`, the result gets:
      * 'callers' and 'type', parsed from INFO with `foo` / `foo2`;
      * one boolean column per caller, True when the caller name occurs
        anywhere in the INFO string.
    """
    run = "CRC-" + plasmasample1 + "-" + dilution
    calls = read_vcf("../data/2015-07-31_" + run + "/" + run + "-ensemble-annotated.vcf")

    calls['callers'] = calls['INFO'].apply(foo)
    calls['type'] = calls['INFO'].apply(foo2)
    # Harmonise the indel labels. `.loc` is used instead of the chained
    # `calls['type'][mask] = ...`, which raises SettingWithCopyWarning and
    # does not modify the frame when pandas copy-on-write is enabled.
    calls.loc[calls['type'].isin(['Deletion', 'del']), 'type'] = 'DEL'
    calls.loc[calls['type'].isin(['Insertion', 'ins']), 'type'] = 'INS'

    for caller in ('freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2'):
        calls[caller] = calls['INFO'].str.contains(caller)
    return calls


def _index_by_position(calls, columns):
    """Return `calls[columns]` as a new frame indexed by '<CHROM>_<POS>' (index name 'CHROM_POS')."""
    position_key = calls['CHROM'].astype('str').str.cat(calls['POS'].astype('str'), sep="_")
    # assign() works on a copy, so the source table is left untouched.
    return calls[columns].assign(CHROM_POS=position_key).set_index('CHROM_POS')


vcf_pd_0 = _load_ensemble_calls('1-0')
vcf_pd_1 = _load_ensemble_calls('1-05775')
vcf_pd_2 = _load_ensemble_calls('075-06738')
vcf_pd_3 = _load_ensemble_calls('05-07701')
vcf_pd_4 = _load_ensemble_calls('025-08663')

_site_columns = ['CHROM', 'POS', 'REF', 'ALT', 'QUAL']
_call_columns = ['type', 'freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2']

# Only sample_0 keeps the FILTER column, as in the original selection.
sample_0 = _index_by_position(vcf_pd_0, _site_columns + ['FILTER'] + _call_columns)
sample_1 = _index_by_position(vcf_pd_1, _site_columns + _call_columns)
sample_2 = _index_by_position(vcf_pd_2, _site_columns + _call_columns)
sample_3 = _index_by_position(vcf_pd_3, _site_columns + _call_columns)
sample_4 = _index_by_position(vcf_pd_4, _site_columns + _call_columns)

sample_4.head()

# Estimated tumor burden of mixed samples

In [None]:
# Estimated tumor fraction of every mixture. Each estimated_tf.txt is parsed
# with read_csv and the number is taken from the first column label of the
# resulting frame.
samples_tf = {
    sample_key: float(
        pd.read_csv(
            "../data/2015-07-31_CRC-" + plasmasample1 + "-" + dilution + "/estimated_tf.txt"
        ).columns[0]
    )
    for sample_key, dilution in [
        ('sample_0', '1-0'),
        ('sample_1', '1-05775'),
        ('sample_2', '075-06738'),
        ('sample_3', '05-07701'),
        ('sample_4', '025-08663'),
    ]
}

print(samples_tf)

# Number of variants detected by each caller

In [None]:
# Number of calls flagged by each caller in every mixture (one row per
# mixture, one column per caller). Rows are relabelled from 'sample_<i>' to
# the matching value in `samples_tf`.
call_counts = {}
for si, s in enumerate([sample_0, sample_1, sample_2, sample_3, sample_4]):
    call_counts['sample_' + str(si)] = [
        int((s[method] == True).sum()) for method in methods
    ]

numbersnvs_pd = pd.DataFrame.from_dict(call_counts, orient='index', columns=methods)
numbersnvs_pd = numbersnvs_pd.rename(index=samples_tf)

numbersnvs_pd.plot(style='.-', logx=True, xlim=(10e-1, 10e-3))

numbersnvs_pd

In [None]:
# Row/column counts of three subsets of sample_0, restricted to the columns
# listed below.
shown_columns = ['QUAL', 'type', 'freebayes', 'mutect2', 'strelka2', 'varscan', 'vardict']
sample_0_view = sample_0[shown_columns]

# Calls whose QUAL string is exactly '0.0'.
print(sample_0_view[sample_0['QUAL'] == '0.0'].shape)

# Calls flagged by varscan and strelka2 and by none of the other three callers.
varscan_strelka2_only = (
    (sample_0['varscan'] == True)
    & (sample_0['strelka2'] == True)
    & (sample_0['freebayes'] == False)
    & (sample_0['vardict'] == False)
    & (sample_0['mutect2'] == False)
)
print(sample_0_view[varscan_strelka2_only].shape)

# Calls flagged by mutect2 that have no type.
mutect2_untyped = (sample_0['mutect2'] == True) & (sample_0['type'].isna())
print(sample_0_view[mutect2_untyped].shape)


In [None]:
# Calls with type NaN:
# - mutect2 is True -> 63 mutations, all with type NaN and QUAL = 0
# - the remaining NaN calls have strelka2 and varscan True and the other callers False -> 18 calls, with type NaN and QUAL = 0

In [None]:
# Same count table as above, restricted to calls whose 'type' is missing.
call_counts = {}
for si, s in enumerate([sample_0, sample_1, sample_2, sample_3, sample_4]):
    untyped = s['type'].isna()
    call_counts['sample_' + str(si)] = [
        int(((s[method] == True) & untyped).sum()) for method in methods
    ]

numbersnvs_pd = pd.DataFrame.from_dict(call_counts, orient='index', columns=methods)
numbersnvs_pd = numbersnvs_pd.rename(index=samples_tf)

numbersnvs_pd.plot(style='.-', logx=True, xlim=(10e-1, 10e-3))

numbersnvs_pd

In [None]:
# Same count table as above, restricted to calls whose 'type' equals 'SNV'.
call_counts = {}
for si, s in enumerate([sample_0, sample_1, sample_2, sample_3, sample_4]):
    typed_snv = s['type'] == 'SNV'
    call_counts['sample_' + str(si)] = [
        int(((s[method] == True) & typed_snv).sum()) for method in methods
    ]

numbersnvs_pd = pd.DataFrame.from_dict(call_counts, orient='index', columns=methods)
numbersnvs_pd = numbersnvs_pd.rename(index=samples_tf)

numbersnvs_pd.plot(style='.-', logx=True, xlim=(10e-1, 10e-3))

numbersnvs_pd

In [None]:
# Same count table as above, restricted to calls whose 'type' equals 'snp'.
call_counts = {}
for si, s in enumerate([sample_0, sample_1, sample_2, sample_3, sample_4]):
    typed_snp = s['type'] == 'snp'
    call_counts['sample_' + str(si)] = [
        int(((s[method] == True) & typed_snp).sum()) for method in methods
    ]

numbersnvs_pd = pd.DataFrame.from_dict(call_counts, orient='index', columns=methods)
numbersnvs_pd = numbersnvs_pd.rename(index=samples_tf)

numbersnvs_pd.plot(style='.-', logx=True, xlim=(10e-1, 10e-3))

numbersnvs_pd

In [None]:
# Same count table as above, restricted to calls typed 'INS' or 'DEL'.
call_counts = {}
for si, s in enumerate([sample_0, sample_1, sample_2, sample_3, sample_4]):
    typed_indel = (s['type'] == 'INS') | (s['type'] == 'DEL')
    call_counts['sample_' + str(si)] = [
        int(((s[method] == True) & typed_indel).sum()) for method in methods
    ]

numbersnvs_pd = pd.DataFrame.from_dict(call_counts, orient='index', columns=methods)
numbersnvs_pd = numbersnvs_pd.rename(index=samples_tf)

numbersnvs_pd.plot(style='.-', logx=True, xlim=(10e-1, 10e-3))

numbersnvs_pd

# Coverage variability

In [None]:
# [coverage, tumor fraction rounded to 2 decimals] for every mixture. Both
# numbers are taken from the first column label of the file as parsed by
# read_csv.
samples_cov_tf = {}
for sample_key, dilution in [
    ('sample_0', '1-0'),
    ('sample_1', '1-05775'),
    ('sample_2', '075-06738'),
    ('sample_3', '05-07701'),
    ('sample_4', '025-08663'),
]:
    mixture_dir = "../data/2015-07-31_CRC-" + plasmasample1 + "-" + dilution
    coverage_value = float(pd.read_csv(mixture_dir + "/coverage.txt").columns[0])
    burden_value = np.round(float(pd.read_csv(mixture_dir + "/estimated_tf.txt").columns[0]), 2)
    samples_cov_tf[sample_key] = [coverage_value, burden_value]

print(samples_cov_tf)

# One row per mixture after the transpose.
coverage_pd = pd.DataFrame.from_dict(samples_cov_tf).T
coverage_pd.columns = ['coverage', 'tumor burden']
sns.catplot(x='tumor burden', y='coverage', data=coverage_pd,
            order=sorted(coverage_pd['tumor burden'].unique(), reverse=True), color='magenta')

coverage_pd

# AUPRC with reference = SNV found in undiluted plasma sample by the same method

In [None]:
# SMURF predictions used as the reference call set. The directory name is
# derived from `plasmasample1` (it was hard-coded to '809_110914', i.e. it
# ignored the `sample` switch at the top of the notebook).
# NOTE(review): assumes the SMURF directory of every sample follows the
# 'NCC_CRC-<plasmasample1>-CW' pattern - confirm for samples other than 809.
_smurf_dir = '../data/SMURF/NCC_CRC-' + plasmasample1 + '-CW/'

# 'Chr' is read as text so that the comparison with '22' below does not
# depend on pandas' dtype inference for that column.
smurf_snv = pd.read_csv(_smurf_dir + 'snv-predicted.txt', sep='\t', dtype={'Chr': str})
print(smurf_snv[smurf_snv['Chr'] == '22'].shape)

smurf_indels = pd.read_csv(_smurf_dir + 'indel-predicted.txt', sep='\t', dtype={'Chr': str})
print(smurf_indels[smurf_indels['Chr'] == '22'].shape)

# Rows with Chr == '22' from both tables, indexed by '<Chr>_<START_POS_REF>'
# (index name 'CHROM_POS', the same key format as the per-sample call tables).
smurf_calls = pd.concat([smurf_snv[smurf_snv['Chr'] == '22'], smurf_indels[smurf_indels['Chr'] == '22']])
smurf_calls = smurf_calls.assign(
    CHROM_POS=smurf_calls['Chr'].astype('str').str.cat(smurf_calls['START_POS_REF'].astype('str'), sep="_")
).set_index('CHROM_POS')
#smurf_calls

In [None]:
# Score every caller on every mixture against the positions in `smurf_calls`.
#
# 1. Build `pd_methods`: for each caller, one row per position it flagged in
#    at least one mixture, with a boolean 'sample_<k>' column per mixture.
# 2. Build `y_true`: True for positions in `smurf_calls`, False for every
#    other position present in `pd_methods`.
# 3. Per caller: average precision, precision and recall for each mixture,
#    plus one precision-recall panel.
sample_keys = ['sample_0', 'sample_1', 'sample_2', 'sample_3', 'sample_4']

caller_tables = []
for method in methods:
    # REF/ALT are taken from sample_0 only; every sample contributes its QUAL
    # and the caller's flag (renamed to the sample key). `rename` returns a
    # new frame, so the sample tables are never modified through a slice.
    pieces = [sample_0[['REF', 'ALT', 'QUAL', method]].rename(columns={method: 'sample_0'})]
    for k, calls in enumerate([sample_1, sample_2, sample_3, sample_4], start=1):
        pieces.append(calls[['QUAL', method]].rename(columns={method: 'sample_' + str(k)}))

    pd_method = pd.concat(pieces, axis=1)
    pd_method.columns = ['REF', 'ALT'] + [
        prefix + str(k) for k in range(5) for prefix in ('QUAL_', 'sample_')
    ]
    # Positions missing from a sample are NaN after the outer join; `eq(True)`
    # maps them to False and yields real bool columns without depending on
    # the deprecated object-dtype downcast performed by `fillna(False)`.
    pd_method[sample_keys] = pd_method[sample_keys].eq(True)
    # Keep only positions the caller flagged in at least one mixture.
    pd_method = pd_method[pd_method[sample_keys].any(axis=1)].copy()
    pd_method['caller'] = method
    caller_tables.append(pd_method)

pd_methods = pd.concat(caller_tables, join='inner')


y_true_smurf = pd.Series(True, index=smurf_calls.index)

y_true_index = pd.Index(list(pd_methods.index)+list(y_true_smurf.index))
print(y_true_index.size)
y_true_index = y_true_index.drop_duplicates()
print(y_true_index.size)
y_true = pd.Series(False, index=y_true_index)
y_true.loc[list(y_true_smurf.index)] = True


fig, axs = plt.subplots(1, len(methods), figsize=(30, 4))
fig.suptitle('Precision-Recall curves')

baselineAUPRC = {}
auprc_tables = []
pr_tables = []
# Line transparency per mixture index (only entries 1..4 are used).
alpha_list = [0.4, 1, .75, .5, .3, .1]

for mi, method in enumerate(methods):

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')
    res_PR_df = res_df.copy()

    # Fraction of positive positions in y_true; identical for every caller.
    baselineAUPRC[method] = len(y_true[y_true == True])/len(y_true)

    # Caller flags aligned on y_true; positions the caller never reported are
    # filled with False directly by reindex, which keeps the bool dtype.
    caller_calls = pd_methods[pd_methods['caller'] == method]
    preds = {
        key: caller_calls[key].astype(bool).reindex(y_true.index, fill_value=False)
        for key in sample_keys
    }

    res_df['AUPRC'] = [average_precision_score(y_true, preds[key]) for key in sample_keys]
    res_PR_df['precision'] = [precision_score(y_true, preds[key]) for key in sample_keys]
    res_PR_df['recall'] = [recall_score(y_true, preds[key]) for key in sample_keys]
    res_PR_df = pd.melt(res_PR_df, id_vars =['tumor burden'], value_vars =['precision', 'recall'],
                    var_name='metric', value_name='value')
    res_df['caller'] = method
    res_PR_df['caller'] = method

    # sample_0 is excluded from the summary plots: its row in res_df and its
    # two melted rows (0 = precision, 5 = recall) in res_PR_df.
    res_df = res_df.drop(['sample_0'])
    res_PR_df = res_PR_df.drop([0, 5])

    # Plot Precision-Recall curve
    for i in range(1,5):
        key = 'sample_'+str(i)
        precision, recall, _ = precision_recall_curve(y_true, preds[key])
        axs[mi].plot(recall, precision, 'o-',
                              label='tf='+str(res_df['tumor burden'].loc[key])+ ', AP='+str(round(res_df['AUPRC'].loc[key], 2)),
                              c=color_list[mi], alpha=alpha_list[i])
    axs[mi].set_xlabel('Recall')
    axs[mi].set_ylabel('Precision')
    axs[mi].set_ylim([0.0, 1.05])
    axs[mi].set_xlim([0.0, 1.05])
    axs[mi].set_title(method)
    axs[mi].legend()

    auprc_tables.append(res_df)
    pr_tables.append(res_PR_df)

pd_results = pd.concat(auprc_tables, join='inner')
pd_results_PR = pd.concat(pr_tables, join='inner')
plt.show()

# sns.catplot is figure-level and creates its own figure, so no plt.figure()
# call precedes it (that only produced an extra blank figure).
sns.catplot(x="tumor burden", y="AUPRC", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)
for mi, method in enumerate(methods):
    plt.axhline(y = baselineAUPRC[method], color = sns.color_palette("tab10")[mi], linestyle = '--')
plt.ylim([0, 0.5])

sns.catplot(x="tumor burden", y="value", col="metric", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results_PR)
plt.ylim([0, 0.5])

In [None]:
# Average precision of every caller on every mixture, expressed as the
# difference to the baseline (fraction of positive positions in y_true).
sample_keys = ['sample_0', 'sample_1', 'sample_2', 'sample_3', 'sample_4']

baselineAUPRC = {}
result_tables = []

for mi, method in enumerate(methods):

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')

    baselineAUPRC[method] = len(y_true[y_true == True])/len(y_true)

    # Caller flags aligned on y_true; reindex fills unreported positions with
    # False directly, so the result keeps a bool dtype without relying on the
    # deprecated object-dtype downcast of `fillna(False)`.
    caller_calls = pd_methods[pd_methods['caller'] == method]
    res_df['AUPRC - baseline AUPRC'] = [
        average_precision_score(
            y_true,
            caller_calls[key].astype(bool).reindex(y_true.index, fill_value=False),
        ) - baselineAUPRC[method]
        for key in sample_keys
    ]
    res_df['caller'] = method
    # sample_0 is not shown in the summary plot.
    res_df = res_df.drop(['sample_0'])

    result_tables.append(res_df)

pd_results = pd.concat(result_tables, join='inner')

sns.catplot(x="tumor burden", y="AUPRC - baseline AUPRC", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)
plt.axhline(y = 0, color = 'k', linestyle = '--', label='baseline AUPRC')
#plt.ylim([0, 0.5])

In [None]:
# For every agreement threshold from 2 callers up to all of them, print
#   - the threshold and the number of positions flagged in sample_0 by at
#     least that many callers,
#   - how many of those positions are also in `smurf_calls`.
for ncallers in range(2, len(methods) + 1):
    y_true = pd_methods[['sample_0', 'caller']]
    y_true.index.name = 'CHROM_POS'
    # Per position: number of caller rows whose sample_0 flag is set.
    caller_votes = y_true.groupby(['CHROM_POS'])['sample_0'].sum()
    y_true = caller_votes >= ncallers
    list_ncallers = list(y_true[y_true].index)
    list_smurf = list(smurf_calls.index)
    print(ncallers, len(list_ncallers))
    print(len(set(list_ncallers).intersection(list_smurf)))

## SNV only

In [None]:
# Keep the SNV predictions with Chr == '22' and index them by
# '<Chr>_<START_POS_REF>' (index name 'CHROM_POS').
smurf_snv = smurf_snv[smurf_snv['Chr'] == '22']
smurf_snv = smurf_snv.assign(
    CHROM_POS=smurf_snv['Chr'].astype('str').str.cat(smurf_snv['START_POS_REF'].astype('str'), sep="_")
).set_index('CHROM_POS')
#smurf_snv

In [None]:
# SNV-only version of the caller evaluation, scored against `smurf_snv`.
#
# A call counts as an SNV prediction for mixture k when its 'type_<k>' is
# 'SNV' or missing. The original code tested `type_0.isna()` for every
# mixture (copy-paste slip) and dropped untyped calls from the PR curves
# only; both now use the same per-mixture rule as the baseline-corrected
# cell below.
sample_keys = ['sample_' + str(k) for k in range(5)]
type_keys = ['type_' + str(k) for k in range(5)]

caller_tables = []
for method in methods:
    # REF/ALT are taken from sample_0 only; every sample contributes QUAL,
    # type and the caller's flag (renamed to the sample key). `rename`
    # returns a new frame, so the sample tables are never modified via a slice.
    pieces = [sample_0[['REF', 'ALT', 'QUAL', 'type', method]].rename(columns={method: 'sample_0'})]
    for k, calls in enumerate([sample_1, sample_2, sample_3, sample_4], start=1):
        pieces.append(calls[['QUAL', 'type', method]].rename(columns={method: 'sample_' + str(k)}))

    pd_method = pd.concat(pieces, axis=1)
    pd_method.columns = ['REF', 'ALT'] + [
        prefix + str(k) for k in range(5) for prefix in ('QUAL_', 'type_', 'sample_')
    ]
    # Positions missing from a sample are NaN after the outer join; `eq(True)`
    # maps them to False and yields real bool columns without depending on
    # the deprecated object-dtype downcast performed by `fillna(False)`.
    pd_method[sample_keys] = pd_method[sample_keys].eq(True)
    # Keep only positions the caller flagged in at least one mixture.
    pd_method = pd_method[pd_method[sample_keys].any(axis=1)].copy()
    pd_method['caller'] = method
    caller_tables.append(pd_method)

pd_methods = pd.concat(caller_tables, join='inner')
print(pd_methods.shape)

# Keep rows where, for at least one mixture, the type contains 'SNV' or is
# missing. The mask is accumulated positionally because the index repeats a
# position once per caller.
keep_rows = np.zeros(len(pd_methods), dtype=bool)
for type_key in type_keys:
    type_column = pd_methods[type_key]
    keep_rows |= (type_column.str.contains('SNV', na=False) | type_column.isna()).to_numpy()
pd_methods = pd_methods[keep_rows]
print(pd_methods.shape)

y_true_smurf = pd.Series(True, index=smurf_snv.index)

y_true_index = pd.Index(list(pd_methods.index)+list(y_true_smurf.index))
print(y_true_index.size)
y_true_index = y_true_index.drop_duplicates()
print(y_true_index.size)
y_true = pd.Series(False, index=y_true_index)
y_true.loc[list(y_true_smurf.index)] = True


fig, axs = plt.subplots(1, len(methods), figsize=(30, 4))
fig.suptitle('Precision-Recall curves')

baselineAUPRC = {}
auprc_tables = []
pr_tables = []
# Line transparency per mixture index (only entries 1..4 are used).
alpha_list = [0.4, 1, .75, .5, .3, .1]

for mi, method in enumerate(methods):

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')
    res_PR_df = res_df.copy()

    # Fraction of positive positions in y_true; identical for every caller.
    baselineAUPRC[method] = len(y_true[y_true == True])/len(y_true)

    # Per-mixture SNV predictions of this caller, aligned on y_true. reindex
    # fills unreported positions with False directly (bool dtype preserved).
    caller_rows = pd_methods['caller'] == method
    preds = {}
    for sample_key, type_key in zip(sample_keys, type_keys):
        snv_rows = caller_rows & ((pd_methods[type_key] == 'SNV') | (pd_methods[type_key].isna()))
        preds[sample_key] = (
            pd_methods[snv_rows][sample_key].astype(bool).reindex(y_true.index, fill_value=False)
        )

    res_df['AUPRC'] = [average_precision_score(y_true, preds[key]) for key in sample_keys]
    res_PR_df['precision'] = [precision_score(y_true, preds[key]) for key in sample_keys]
    res_PR_df['recall'] = [recall_score(y_true, preds[key]) for key in sample_keys]
    res_PR_df = pd.melt(res_PR_df, id_vars =['tumor burden'], value_vars =['precision', 'recall'],
                    var_name='metric', value_name='value')
    res_df['caller'] = method
    res_PR_df['caller'] = method

    # sample_0 is excluded from the summary plots: its row in res_df and its
    # two melted rows (0 = precision, 5 = recall) in res_PR_df.
    res_df = res_df.drop(['sample_0'])
    res_PR_df = res_PR_df.drop([0, 5])

    # Plot Precision-Recall curve (same predictions as the scores above).
    for i in range(1,5):
        key = 'sample_'+str(i)
        precision, recall, _ = precision_recall_curve(y_true, preds[key])
        axs[mi].plot(recall, precision, 'o-',
                              label='tf='+str(res_df['tumor burden'].loc[key])+ ', AP='+str(round(res_df['AUPRC'].loc[key], 2)),
                              c=color_list[mi], alpha=alpha_list[i])
    axs[mi].set_xlabel('Recall')
    axs[mi].set_ylabel('Precision')
    axs[mi].set_ylim([0.0, 1.05])
    axs[mi].set_xlim([0.0, 1.05])
    axs[mi].set_title(method)
    axs[mi].legend()

    auprc_tables.append(res_df)
    pr_tables.append(res_PR_df)

pd_results = pd.concat(auprc_tables, join='inner')
pd_results_PR = pd.concat(pr_tables, join='inner')
plt.show()

# sns.catplot is figure-level and creates its own figure, so no plt.figure()
# call precedes it (that only produced an extra blank figure).
sns.catplot(x="tumor burden", y="AUPRC", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)
for mi, method in enumerate(methods):
    plt.axhline(y = baselineAUPRC[method], color = sns.color_palette("tab10")[mi], linestyle = '--')
plt.ylim([0, 0.5])

sns.catplot(x="tumor burden", y="value", col="metric", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results_PR)
plt.ylim([0, 0.5])

In [None]:
# SNV-only average precision of every caller on every mixture, expressed as
# the difference to the baseline (fraction of positive positions in y_true).
# A call counts for mixture k when its 'type_<k>' is 'SNV' or missing.
sample_keys = ['sample_' + str(k) for k in range(5)]
type_keys = ['type_' + str(k) for k in range(5)]

baselineAUPRC = {}
result_tables = []

for mi, method in enumerate(methods):

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')

    baselineAUPRC[method] = len(y_true[y_true == True])/len(y_true)

    caller_rows = pd_methods['caller'] == method
    score_gaps = []
    for sample_key, type_key in zip(sample_keys, type_keys):
        snv_rows = caller_rows & ((pd_methods[type_key] == 'SNV') | (pd_methods[type_key].isna()))
        # reindex fills unreported positions with False directly, so the
        # result keeps a bool dtype without relying on the deprecated
        # object-dtype downcast of `fillna(False)`.
        y_pred = pd_methods[snv_rows][sample_key].astype(bool).reindex(y_true.index, fill_value=False)
        score_gaps.append(average_precision_score(y_true, y_pred) - baselineAUPRC[method])

    res_df['AUPRC - baseline AUPRC'] = score_gaps
    res_df['caller'] = method
    # sample_0 is not shown in the summary plot.
    res_df = res_df.drop(['sample_0'])

    result_tables.append(res_df)

pd_results = pd.concat(result_tables, join='inner')

sns.catplot(x="tumor burden", y="AUPRC - baseline AUPRC", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)
plt.axhline(y = 0, color = 'k', linestyle = '--', label='baseline AUPRC')
#plt.ylim([0, 0.5])

## Indels only

In [None]:
# Keep only the chromosome 22 SMURF indel calls and index them by a
# "<Chr>_<START_POS_REF>" key named CHROM_POS.
# Written as one chained expression so that no column is assigned on a
# boolean-filtered slice (which raises SettingWithCopyWarning).
smurf_indels = (
    smurf_indels
    .loc[smurf_indels['Chr'] == '22']
    .assign(CHROM_POS=lambda calls: calls['Chr'].astype('str').str.cat(
        calls['START_POS_REF'].astype('str'), sep="_"))
    .set_index('CHROM_POS')
)
#smurf_indels

In [None]:
# Build one long table of calls: for every caller, the per-position call flag,
# QUAL and variant type in each of the five samples (sample_0 .. sample_4).
sample_frames = [sample_0, sample_1, sample_2, sample_3, sample_4]
sample_cols = ['sample_' + str(i) for i in range(len(sample_frames))]
type_cols = ['type_' + str(i) for i in range(len(sample_frames))]

per_caller = []
for method in methods:
    parts = []
    for i, frame in enumerate(sample_frames):
        # REF/ALT are taken from sample_0 only, as in the column layout
        # REF, ALT, QUAL_0, type_0, sample_0, QUAL_1, type_1, sample_1, ...
        wanted = (['REF', 'ALT'] if i == 0 else []) + ['QUAL', 'type', method]
        parts.append(frame[wanted].rename(columns={
            'QUAL': 'QUAL_' + str(i),
            'type': 'type_' + str(i),
            method: 'sample_' + str(i),
        }))

    # Outer alignment on the index: a position missing from a sample gets NaN.
    pd_method = pd.concat(parts, axis=1)
    # A position absent from a sample counts as "not called" there.
    pd_method[sample_cols] = pd_method[sample_cols].fillna(value=False)
    # Discard positions this caller did not call in any of the five samples.
    called_somewhere = pd_method[sample_cols].astype(bool).any(axis=1)
    pd_method = pd_method[called_somewhere].copy()
    pd_method['caller'] = method
    per_caller.append(pd_method)

# Single concat instead of growing the frame inside the loop.
pd_methods = pd.concat(per_caller, join='inner')
print(pd_methods.shape)

# Keep a row when any sample reports an insertion/deletion type, or when the
# type is missing in at least one sample.
call_types = pd_methods[type_cols]
has_indel_type = call_types.apply(
    lambda col: col.str.contains('INS', na=False) | col.str.contains('DEL', na=False)
).any(axis=1)
has_missing_type = call_types.isna().any(axis=1)
pd_methods = pd_methods[has_indel_type | has_missing_type]
print(pd_methods.shape)

# Ground truth for the indel benchmark: every position in the SMURF indel
# table (smurf_indels, indexed by CHROM_POS) is a positive. The SNV table was
# used here before, which scored indel calls against SNV truth.
y_true_smurf = pd.Series(True, index=smurf_indels.index)

# Evaluation universe = positions called by any caller + all truth positions.
y_true_index = pd.Index(list(pd_methods.index)+list(y_true_smurf.index))
print(y_true_index.size)
y_true_index = y_true_index.drop_duplicates()
print(y_true_index.size)

# True where the position belongs to the truth set, False elsewhere.
y_true = pd.Series(y_true_index.isin(y_true_smurf.index), index=y_true_index)


# Per-caller AUPRC, precision and recall on indel calls for each sample, plus
# one panel of precision-recall curves per caller.
auprc_frames = []
pr_frames = []
baselineAUPRC = {}

# alpha_list[i] is the line opacity of the sample_i curve (index 0 is unused).
alpha_list = [0.4, 1, .75, .5, .3, .1]

fig, axs = plt.subplots(1,5,figsize=(30, 4))
fig.suptitle('Precision-Recall curves')

for mi, method in enumerate(methods):

    # One row per sample; presumably samples_tf maps 'sample_i' to a tumor
    # fraction, shown here as a percentage -- confirm against its definition.
    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')
    res_PR_df = res_df.copy()

    # Fraction of positives in y_true, used as the baseline AUPRC.
    baselineAUPRC[method] = len(y_true[y_true == True])/len(y_true)

    # Binary predictions of this caller for sample_0 .. sample_4, restricted to
    # calls typed DEL/INS (or untyped) and aligned on the y_true positions.
    caller_calls = pd_methods[pd_methods['caller'] == method]
    y_pred = []
    for i in range(5):
        type_i = caller_calls['type_' + str(i)]
        indel_or_untyped = type_i.isin(['DEL', 'INS']) | type_i.isna()
        y_i = (caller_calls.loc[indel_or_untyped, 'sample_' + str(i)]
               .astype(bool)
               .reindex(y_true.index)
               .fillna(False)      # position not called by this caller
               .astype(bool))      # keep a real bool dtype after the fill
        y_pred.append(y_i)

    res_df['AUPRC'] = [average_precision_score(y_true, y) for y in y_pred]
    res_PR_df['precision'] = [precision_score(y_true, y) for y in y_pred]
    res_PR_df['recall'] = [recall_score(y_true, y) for y in y_pred]
    res_PR_df = pd.melt(res_PR_df, id_vars =['tumor burden'], value_vars =['precision', 'recall'],
                    var_name='metric', value_name='value')
    res_df['caller'] = method
    res_PR_df['caller'] = method

    # Drop the sample_0 rows: label 'sample_0' in res_df, and positions 0 and 5
    # (first precision row, first recall row) in the melted frame.
    res_df = res_df.drop(index='sample_0')
    res_PR_df = res_PR_df.drop(index=[0, 5])

    # Plot Precision-Recall curve for sample_1 .. sample_4, using the same
    # predictions as the AUPRC table. The former filter `type == 'I'` matched
    # none of the 'INS'/'DEL' labels.
    for i in range(1,5):
        precision, recall, _ = precision_recall_curve(y_true, y_pred[i])
        axs[mi].plot(recall, precision, 'o-',
                              label='tf='+str(res_df['tumor burden'].loc['sample_'+str(i)])+ ', AP='+str(round(average_precision_score(y_true, y_pred[i]), 2)),
                              c=color_list[mi], alpha=alpha_list[i])
    axs[mi].set_xlabel('Recall')
    axs[mi].set_ylabel('Precision')
    axs[mi].set_ylim([0.0, 1.05])
    axs[mi].set_xlim([0.0, 1.05])
    axs[mi].set_title(method)
    axs[mi].legend()

    auprc_frames.append(res_df)
    pr_frames.append(res_PR_df)

# Single concat instead of a placeholder plus a "first iteration" flag.
pd_results = pd.concat(auprc_frames, join='inner')
pd_results_PR = pd.concat(pr_frames, join='inner')
plt.show()

# sns.catplot is figure-level: it creates its own figure, so no plt.figure()
# call is made first (that only left an empty figure in the output).
burden_order = sorted(pd_results['tumor burden'].unique(), reverse=True)

# AUPRC per caller and tumor burden, highest burden first.
g_auprc = sns.catplot(x="tumor burden", y="AUPRC", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=burden_order, data=pd_results)
# One dashed baseline per caller, coloured by the caller's position in `methods`.
for mi, method in enumerate(methods):
    g_auprc.ax.axhline(y = baselineAUPRC[method], color = sns.color_palette("tab10")[mi], linestyle = '--')
g_auprc.set(ylim=(0, 0.5))

# Precision and recall, one facet per metric.
g_pr = sns.catplot(x="tumor burden", y="value", col="metric", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=burden_order, data=pd_results_PR)
g_pr.set(ylim=(0, 0.5));

In [None]:
# AUPRC of every caller on indel calls, expressed as the gain over the
# baseline AUPRC (fraction of positives in y_true), for sample_1 .. sample_4.
baselineAUPRC = {}
auprc_frames = []

for method in methods:

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')

    baselineAUPRC[method] = len(y_true[y_true == True])/len(y_true)

    caller_calls = pd_methods[pd_methods['caller'] == method]
    auprc_gain = []
    for i in range(5):
        # Keep calls typed DEL/INS (or untyped) and align them on y_true.
        type_i = caller_calls['type_' + str(i)]
        indel_or_untyped = type_i.isin(['DEL', 'INS']) | type_i.isna()
        y_i = (caller_calls.loc[indel_or_untyped, 'sample_' + str(i)]
               .astype(bool)
               .reindex(y_true.index)
               .fillna(False)      # position not called by this caller
               .astype(bool))      # keep a real bool dtype after the fill
        auprc_gain.append(average_precision_score(y_true, y_i) - baselineAUPRC[method])

    res_df['AUPRC - baseline AUPRC'] = auprc_gain
    res_df['caller'] = method
    # The sample_0 row is excluded from the summary.
    auprc_frames.append(res_df.drop(index='sample_0'))

# Single concat instead of the `pd.DataFrame.empty` placeholder (a property
# object, not an empty frame) plus a "first iteration" flag.
pd_results = pd.concat(auprc_frames, join='inner')

sns.catplot(x="tumor burden", y="AUPRC - baseline AUPRC", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)
# Dashed zero line: a point on it means the caller's AUPRC equals the baseline.
plt.axhline(y = 0, color = 'k', linestyle = '--', label='baseline AUPRC')
#plt.ylim([0, 0.5])

# AUPRC with reference = SNV found in both undiluted plasma samples by the same method

In [None]:
# SMURF predictions for the second plasma sample. The directory is derived
# from `plasmasample2` so the cell follows the patient selected at the top of
# the notebook instead of always reading patient 809.
smurf_dir_bis = '../data/SMURF/NCC_CRC-' + plasmasample2 + '-CW/'

smurf_snv_bis = pd.read_csv(smurf_dir_bis + 'snv-predicted.txt', sep='\t')
print(smurf_snv_bis[smurf_snv_bis['Chr'] == '22'].shape)

smurf_indels_bis = pd.read_csv(smurf_dir_bis + 'indel-predicted.txt', sep='\t')
print(smurf_indels_bis[smurf_indels_bis['Chr'] == '22'].shape)

# Chromosome 22 SNVs + indels, indexed by "<Chr>_<START_POS_REF>".
smurf_calls_bis = (
    pd.concat([smurf_snv_bis[smurf_snv_bis['Chr'] == '22'],
               smurf_indels_bis[smurf_indels_bis['Chr'] == '22']])
    .assign(CHROM_POS=lambda calls: calls['Chr'].astype('str').str.cat(
        calls['START_POS_REF'].astype('str'), sep="_"))
    .set_index('CHROM_POS')
)

# Truth = positions present in both SMURF call sets. Index.intersection keeps
# a deterministic order, unlike a round trip through Python sets.
y_true_index = smurf_calls.index.intersection(smurf_calls_bis.index).unique()
y_true_smurf = pd.Series(True, index=y_true_index)
print(len(list(pd_methods.index)), len(list(y_true_smurf.index)))

# NOTE(review): `pd_methods` is whatever table is in memory when this cell
# runs; in top-to-bottom order that is the table built in the previous section,
# not the one rebuilt in a later cell -- confirm this is the intended universe.
y_true_index = pd.Index(list(pd_methods.index)+list(y_true_smurf.index))
print(y_true_index.size)
y_true_index = y_true_index.drop_duplicates()
print(y_true_index.size)

# True where the position belongs to the truth set, False elsewhere.
y_true = pd.Series(y_true_index.isin(y_true_smurf.index), index=y_true_index)

In [None]:
vcf_pd_0_bis = read_vcf("../data/2015-07-31_CRC-"+plasmasample2+"-1-0/CRC-"+plasmasample2+"-1-0-ensemble-annotated.vcf")

# Replace INFO by its CALLERS=... value, then add one boolean column per
# caller. INFO is left in place: the former `drop('INFO', axis=1)` discarded
# its result and therefore never removed it.
vcf_pd_0_bis['INFO'] = vcf_pd_0_bis['INFO'].apply(foo)
for caller in ['freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2']:
    vcf_pd_0_bis[caller] = vcf_pd_0_bis['INFO'].str.contains(caller)

# "<freebayes>_<vardict>_<varscan>" flag string.
vcf_pd_0_bis['SNV callers'] = vcf_pd_0_bis['freebayes'].map(str) + '_' + vcf_pd_0_bis['vardict'].map(str) +  '_' + vcf_pd_0_bis['varscan'].map(str)

# Call table indexed by "<CHROM>_<POS>"; built by chaining so that no column
# is assigned on a slice of vcf_pd_0_bis (SettingWithCopyWarning).
sample_0_bis = (
    vcf_pd_0_bis[['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2', 'SNV callers']]
    .assign(CHROM_POS=lambda calls: calls['CHROM'].astype('str').str.cat(
        calls['POS'].astype('str'), sep="_"))
    .set_index('CHROM_POS')
)

# Compare the positions called in the two undiluted plasma samples. The date
# tag is the part after '_' in the sample name ('110914' / '030915' for 809).
date_1 = plasmasample1.split('_')[1]
date_2 = plasmasample2.split('_')[1]
sample0 = sample_0.assign(date=date_1)
sample0bis = sample_0_bis.assign(date=date_2)
sample_ref = pd.concat([sample0, sample0bis], join='inner')
l1 = list(sample_ref[sample_ref['date'] == date_1].index)
l2 = list(sample_ref[sample_ref['date'] == date_2].index)
print(len(l1), len(l2))
print(len(set(l1) ^ set(l2)))   # positions found in only one of the two samples
print(len(set(l1) & set(l2)))   # positions found in both samples

In [None]:
# Build one long table of calls: for every caller, the call flag and QUAL at
# each position in the two undiluted plasma samples and the four mixtures.
sample_sources = [('0', sample_0), ('0_bis', sample_0_bis), ('1', sample_1),
                  ('2', sample_2), ('3', sample_3), ('4', sample_4)]
sample_cols = ['sample_' + tag for tag, _ in sample_sources]
column_order = ['REF', 'ALT'] + [prefix + tag for tag, _ in sample_sources
                                 for prefix in ('QUAL_', 'sample_')]

per_caller = []
for method in methods:

    # Give every column a per-sample name *before* the concat. The former
    # `.T.drop_duplicates().T` de-duplication removed any columns with equal
    # values, so two samples with identical calls or QUAL lost a column.
    parts = [
        frame[['REF', 'ALT', 'QUAL', method]].rename(columns={
            'REF': 'REF_' + tag, 'ALT': 'ALT_' + tag,
            'QUAL': 'QUAL_' + tag, method: 'sample_' + tag})
        for tag, frame in sample_sources
    ]
    wide = pd.concat(parts, axis=1)

    # One REF/ALT per position: among the non-missing values reported by the
    # samples, keep the last one in sorted order (np.unique sorts).
    for allele in ['REF', 'ALT']:
        allele_cols = [allele + '_' + tag for tag, _ in sample_sources]
        wide[allele] = [np.unique([v for v in row if str(v) != 'nan'])[-1]
                        for row in wide[allele_cols].values]

    pd_method = wide[column_order].copy()
    # A position absent from a sample counts as "not called" there. Positions
    # this caller never called are kept.
    pd_method[sample_cols] = pd_method[sample_cols].fillna(value=False)
    print(pd_method.shape)
    pd_method['caller'] = method
    per_caller.append(pd_method)

# Single concat instead of growing the frame inside the loop.
pd_methods = pd.concat(per_caller, join='inner')

# Per-caller AUPRC, precision and recall for each sample, plus one panel of
# precision-recall curves per caller.
auprc_frames = []
pr_frames = []
baselineAUPRC = {}

# alpha_list[i] is the line opacity of the sample_i curve (index 0 is unused).
alpha_list = [0, 1, .75, .5, .2, .1]

fig, axs = plt.subplots(1,5,figsize=(30, 4))
fig.suptitle('Precision-Recall curves')

for mi, method in enumerate(methods):

    # One row per sample; presumably samples_tf maps 'sample_i' to a tumor
    # fraction, shown here as a percentage -- confirm against its definition.
    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')
    res_PR_df = res_df.copy()

    # Fraction of positives in y_true, used as the baseline AUPRC.
    baselineAUPRC[method] = len(y_true[y_true == True])/len(y_true)

    # Binary predictions of this caller for sample_0 .. sample_4, aligned on
    # the y_true positions.
    caller_calls = pd_methods[pd_methods['caller'] == method]
    y_pred = [
        (caller_calls['sample_' + str(i)]
         .astype(bool)
         .reindex(y_true.index)
         .fillna(False)      # position not present for this caller
         .astype(bool))      # keep a real bool dtype after the fill
        for i in range(5)
    ]

    res_df['AUPRC'] = [average_precision_score(y_true, y) for y in y_pred]
    res_PR_df['precision'] = [precision_score(y_true, y) for y in y_pred]
    res_PR_df['recall'] = [recall_score(y_true, y) for y in y_pred]
    res_PR_df = pd.melt(res_PR_df, id_vars =['tumor burden'], value_vars =['precision', 'recall'],
                    var_name='metric', value_name='value')
    res_df['caller'] = method
    res_PR_df['caller'] = method

    # Drop the sample_0 rows: label 'sample_0' in res_df, and positions 0 and 5
    # (first precision row, first recall row) in the melted frame.
    res_df = res_df.drop(index='sample_0')
    res_PR_df = res_PR_df.drop(index=[0, 5])

    # Plot Precision-Recall curve for sample_1 .. sample_4.
    for i in range(1,5):
        precision, recall, _ = precision_recall_curve(y_true, y_pred[i])
        axs[mi].plot(recall, precision, 'o-',
                              label='tf='+str(res_df['tumor burden'].loc['sample_'+str(i)])+ ', AP='+str(round(average_precision_score(y_true, y_pred[i]), 2)),
                              c=color_list[mi], alpha=alpha_list[i])
    axs[mi].set_xlabel('Recall')
    axs[mi].set_ylabel('Precision')
    axs[mi].set_ylim([0.0, 1.05])
    axs[mi].set_xlim([0.0, 1.05])
    axs[mi].set_title(method)
    axs[mi].legend()

    auprc_frames.append(res_df)
    pr_frames.append(res_PR_df)

# Single concat instead of the `pd.DataFrame.empty` placeholders (property
# objects, not empty frames) plus a "first iteration" flag.
pd_results = pd.concat(auprc_frames, join='inner')
pd_results_PR = pd.concat(pr_frames, join='inner')
plt.show()

# Shared x-axis order (highest tumor burden first) and caller colours.
burden_order = sorted(pd_results['tumor burden'].unique(), reverse=True)
tab10_colors = sns.color_palette("tab10")

# AUPRC per caller; the legend is drawn manually below.
sns.catplot(
    data=pd_results,
    x="tumor burden",
    y="AUPRC",
    hue="caller",
    kind="point",
    order=burden_order,
    capsize=.2,
    height=6,
    aspect=.75,
    legend=False,
)
# One dashed baseline per caller, coloured by the caller's position in `methods`.
for mi, method in enumerate(methods):
    plt.axhline(y=baselineAUPRC[method], color=tab10_colors[mi], linestyle='--')
# This line sits at y=1, outside the [0, 0.5] range set just below, so it only
# contributes the 'baseline AUPRC' entry to the legend.
plt.axhline(y=1, color='k', linestyle='--', label='baseline AUPRC')
plt.ylim([0, 0.5])
plt.legend(bbox_to_anchor=(1,1), loc="upper left")

# Precision and recall, one facet per metric.
sns.catplot(
    data=pd_results_PR,
    x="tumor burden",
    y="value",
    col="metric",
    hue="caller",
    kind="point",
    order=burden_order,
    capsize=.2,
    height=6,
    aspect=.75,
)
plt.ylim([0, 0.5])

In [None]:
# AUPRC of every caller expressed as the gain over the baseline AUPRC
# (fraction of positives in y_true), for sample_1 .. sample_4.
baselineAUPRC = {}
auprc_frames = []

for method in methods:

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')

    baselineAUPRC[method] = len(y_true[y_true == True])/len(y_true)

    caller_calls = pd_methods[pd_methods['caller'] == method]
    auprc_gain = []
    for i in range(5):
        # Binary predictions for sample_i aligned on the y_true positions.
        y_i = (caller_calls['sample_' + str(i)]
               .astype(bool)
               .reindex(y_true.index)
               .fillna(False)      # position not present for this caller
               .astype(bool))      # keep a real bool dtype after the fill
        auprc_gain.append(average_precision_score(y_true, y_i) - baselineAUPRC[method])

    res_df['AUPRC - baseline AUPRC'] = auprc_gain
    res_df['caller'] = method
    # The sample_0 row is excluded from the summary.
    auprc_frames.append(res_df.drop(index='sample_0'))

# Single concat instead of the `pd.DataFrame.empty` placeholder (a property
# object, not an empty frame) plus a "first iteration" flag.
pd_results = pd.concat(auprc_frames, join='inner')

sns.catplot(x="tumor burden", y="AUPRC - baseline AUPRC", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)
# Dashed zero line: a point on it means the caller's AUPRC equals the baseline.
plt.axhline(y = 0, color = 'k', linestyle = '--', label='baseline AUPRC')
#plt.ylim([0, 0.5])

## AUPRC with reference = SNV found in the tumor by the same method

In [None]:
# SMURF predictions for the tumor sample. The directory is derived from
# `tumorsample1` so the cell follows the patient selected at the top of the
# notebook instead of always reading patient 809.
smurf_dir_t = '../data/SMURF/NCC_CRC-' + tumorsample1 + '/'

smurf_snv_t = pd.read_csv(smurf_dir_t + 'snv-predicted.txt', sep='\t')
print(smurf_snv_t[smurf_snv_t['Chr'] == '22'].shape)

smurf_indels_t = pd.read_csv(smurf_dir_t + 'indel-predicted.txt', sep='\t')
print(smurf_indels_t[smurf_indels_t['Chr'] == '22'].shape)

# Chromosome 22 SNVs + indels, indexed by "<Chr>_<START_POS_REF>".
smurf_calls_t = (
    pd.concat([smurf_snv_t[smurf_snv_t['Chr'] == '22'],
               smurf_indels_t[smurf_indels_t['Chr'] == '22']])
    .assign(CHROM_POS=lambda calls: calls['Chr'].astype('str').str.cat(
        calls['START_POS_REF'].astype('str'), sep="_"))
    .set_index('CHROM_POS')
)

# Truth = every position in the tumor SMURF call set.
y_true_smurf = pd.Series(True, index=smurf_calls_t.index)
print(len(list(pd_methods.index)), len(list(y_true_smurf.index)))

# NOTE(review): `pd_methods` is whatever table is in memory when this cell
# runs; in top-to-bottom order that is the table built in the previous section,
# not the one rebuilt in a later cell -- confirm this is the intended universe.
y_true_index = pd.Index(list(pd_methods.index)+list(y_true_smurf.index))
print(y_true_index.size)
y_true_index = y_true_index.drop_duplicates()
print(y_true_index.size)

# True where the position belongs to the truth set, False elsewhere.
y_true = pd.Series(y_true_index.isin(y_true_smurf.index), index=y_true_index)

In [None]:
vcf_pd_h = read_vcf("../data/2015-07-31_Merged_Healthy/Merged_Healthy-ensemble-annotated.vcf")

# Replace INFO by its CALLERS=... value, then add one boolean column per
# caller. INFO is left in place: the former `drop('INFO', axis=1)` discarded
# its result and therefore never removed it.
vcf_pd_h['INFO'] = vcf_pd_h['INFO'].apply(foo)
for caller in ['freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2']:
    vcf_pd_h[caller] = vcf_pd_h['INFO'].str.contains(caller)

# "<freebayes>_<vardict>_<varscan>" flag string.
vcf_pd_h['SNV callers'] = vcf_pd_h['freebayes'].map(str) + '_' + vcf_pd_h['vardict'].map(str) +  '_' + vcf_pd_h['varscan'].map(str)

# Chromosome 22 call table indexed by "<CHROM>_<POS>"; built by chaining so
# that no column is assigned on a slice of vcf_pd_h (SettingWithCopyWarning).
sample_h = (
    vcf_pd_h[['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2', 'SNV callers']]
    .assign(CHROM_POS=lambda calls: calls['CHROM'].astype('str').str.cat(
        calls['POS'].astype('str'), sep="_"))
    .set_index('CHROM_POS')
    .loc[lambda calls: calls['CHROM'] == '22']
)
sample_h.head()

In [None]:
vcf_pd_t = read_vcf("../data/2015-07-31_NCC_CRC-"+tumorsample1+"/NCC_CRC-"+tumorsample1+"-ensemble-annotated.vcf")

vcf_pd_t['callers'] = vcf_pd_t['INFO'].apply(foo)
vcf_pd_t['type'] = vcf_pd_t['INFO'].apply(foo2)
# Harmonise the indel labels. Series.replace avoids the chained assignment
# `df['type'][mask] = ...`, which may write to a temporary copy.
vcf_pd_t['type'] = vcf_pd_t['type'].replace(
    {'Deletion': 'DEL', 'del': 'DEL', 'Insertion': 'INS', 'ins': 'INS'})

# One boolean column per caller, searched in the raw INFO string.
for caller in ['freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2']:
    vcf_pd_t[caller] = vcf_pd_t['INFO'].str.contains(caller)

# "<freebayes>_<vardict>_<varscan>" flag string.
vcf_pd_t['SNV callers'] = vcf_pd_t['freebayes'].map(str) + '_' + vcf_pd_t['vardict'].map(str) +  '_' + vcf_pd_t['varscan'].map(str)

# Chromosome 22 tumor calls indexed by "<CHROM>_<POS>"; built by chaining so
# that no column is assigned on a slice of vcf_pd_t (SettingWithCopyWarning).
sample_t = (
    vcf_pd_t[['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2', 'SNV callers']]
    .assign(CHROM_POS=lambda calls: calls['CHROM'].astype('str').str.cat(
        calls['POS'].astype('str'), sep="_"))
    .set_index('CHROM_POS')
    .loc[lambda calls: calls['CHROM'] == '22']
)
print(sample_t.shape)
# Remove positions also present in the healthy call table. A boolean mask is
# used because `.loc[list_of_labels]` repeats rows whenever a CHROM_POS label
# occurs more than once.
sample_t = sample_t[~sample_t.index.isin(sample_h.index)]
print(sample_t.shape)

# Rebuild the undiluted plasma call table (with FILTER and type columns),
# indexed by "<CHROM>_<POS>".
sample_0 = (
    vcf_pd_0[['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'FILTER', 'type', 'freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2']]
    .assign(CHROM_POS=lambda calls: calls['CHROM'].astype('str').str.cat(
        calls['POS'].astype('str'), sep="_"))
    .set_index('CHROM_POS')
)

In [None]:
# Position keys of the tumor calls (l1) and of the undiluted plasma calls (l2).
l1 = sample_t.index.tolist()
l2 = sample_0.index.tolist()
print(len(l1), len(l2))
# Number of distinct positions present in both call tables.
print(len(set(l1).intersection(l2)))

In [None]:
# Build one long table of calls: for every caller, the call flag and QUAL at
# each position in the tumor, the undiluted plasma sample and the four mixtures.
sample_sources = [('t', sample_t), ('0', sample_0), ('1', sample_1),
                  ('2', sample_2), ('3', sample_3), ('4', sample_4)]
sample_cols = ['sample_' + tag for tag, _ in sample_sources]
column_order = ['REF', 'ALT'] + [prefix + tag for tag, _ in sample_sources
                                 for prefix in ('QUAL_', 'sample_')]

per_caller = []
for method in methods:

    # Give every column a per-sample name *before* the concat. The former
    # `.T.drop_duplicates().T` de-duplication removed any columns with equal
    # values (so a sample_* column could vanish) and left several columns all
    # named 'QUAL'; they are now QUAL_t, QUAL_0, ..., QUAL_4.
    parts = [
        frame[['REF', 'ALT', 'QUAL', method]].rename(columns={
            'REF': 'REF_' + tag, 'ALT': 'ALT_' + tag,
            'QUAL': 'QUAL_' + tag, method: 'sample_' + tag})
        for tag, frame in sample_sources
    ]
    wide = pd.concat(parts, axis=1)

    # One REF/ALT per position: among the non-missing values reported by the
    # samples, keep the last one in sorted order (np.unique sorts).
    for allele in ['REF', 'ALT']:
        allele_cols = [allele + '_' + tag for tag, _ in sample_sources]
        wide[allele] = [np.unique([v for v in row if str(v) != 'nan'])[-1]
                        for row in wide[allele_cols].values]

    pd_method = wide[column_order].copy()
    # A position absent from a sample counts as "not called" there. Positions
    # this caller never called are kept.
    pd_method[sample_cols] = pd_method[sample_cols].fillna(value=False)
    print(pd_method.shape)
    pd_method['caller'] = method
    per_caller.append(pd_method)

# Single concat instead of the `pd.DataFrame.empty` placeholder (a property
# object, not an empty frame) plus a "first iteration" flag.
pd_methods = pd.concat(per_caller, join='inner')


# Per-caller AUPRC, precision and recall for each sample, plus one panel of
# precision-recall curves per caller.
auprc_frames = []
pr_frames = []
baselineAUPRC = {}

# alpha_list[i] is the line opacity of the sample_i curve (index 0 is unused).
alpha_list = [0, 1, .75, .5, .3, .1]

fig, axs = plt.subplots(1,5,figsize=(30, 4))
fig.suptitle('Precision-Recall curves')

for mi, method in enumerate(methods):

    # One row per sample; presumably samples_tf maps 'sample_i' to a tumor
    # fraction, shown here as a percentage -- confirm against its definition.
    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')
    res_PR_df = res_df.copy()

    # Fraction of positives in y_true, used as the baseline AUPRC.
    baselineAUPRC[method] = len(y_true[y_true == True])/len(y_true)

    # Binary predictions of this caller for sample_0 .. sample_4, aligned on
    # the y_true positions.
    caller_calls = pd_methods[pd_methods['caller'] == method]
    y_pred = [
        (caller_calls['sample_' + str(i)]
         .astype(bool)
         .reindex(y_true.index)
         .fillna(False)      # position not present for this caller
         .astype(bool))      # keep a real bool dtype after the fill
        for i in range(5)
    ]

    res_df['AUPRC'] = [average_precision_score(y_true, y) for y in y_pred]
    res_PR_df['precision'] = [precision_score(y_true, y) for y in y_pred]
    res_PR_df['recall'] = [recall_score(y_true, y) for y in y_pred]
    res_PR_df = pd.melt(res_PR_df, id_vars =['tumor burden'], value_vars =['precision', 'recall'],
                    var_name='metric', value_name='value')
    res_df['caller'] = method
    res_PR_df['caller'] = method

    # Drop the sample_0 rows: label 'sample_0' in res_df, and positions 0 and 5
    # (first precision row, first recall row) in the melted frame.
    res_df = res_df.drop(index='sample_0')
    res_PR_df = res_PR_df.drop(index=[0, 5])

    # Plot Precision-Recall curve for sample_1 .. sample_4.
    for i in range(1,5):
        precision, recall, _ = precision_recall_curve(y_true, y_pred[i])
        axs[mi].plot(recall, precision, 'o-',
                              label='tf='+str(res_df['tumor burden'].loc['sample_'+str(i)])+ ', AP='+str(round(average_precision_score(y_true, y_pred[i]), 2)),
                              c=color_list[mi], alpha=alpha_list[i])
    axs[mi].set_xlabel('Recall')
    axs[mi].set_ylabel('Precision')
    axs[mi].set_ylim([0.0, 1.05])
    axs[mi].set_xlim([0.0, 1.05])
    axs[mi].set_title(method)
    axs[mi].legend()

    auprc_frames.append(res_df)
    pr_frames.append(res_PR_df)

# Single concat instead of growing both frames inside the loop.
pd_results = pd.concat(auprc_frames, join='inner')
pd_results_PR = pd.concat(pr_frames, join='inner')
plt.show()

# sns.catplot is figure-level: it creates its own figure, so no plt.figure()
# call is made first (that only left an empty figure in the output).
burden_order = sorted(pd_results['tumor burden'].unique(), reverse=True)

# AUPRC per caller and tumor burden, highest burden first.
g_auprc = sns.catplot(x="tumor burden", y="AUPRC", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=burden_order, data=pd_results)
# One dashed baseline per caller, coloured by the caller's position in `methods`.
for mi, method in enumerate(methods):
    g_auprc.ax.axhline(y = baselineAUPRC[method], color = sns.color_palette("tab10")[mi], linestyle = '--')
g_auprc.set(ylim=(0, 0.5))

# Precision and recall, one facet per metric.
g = sns.catplot(x="tumor burden", y="value", col="metric", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=burden_order, data=pd_results_PR)
g.set(ylim=(0, 0.5));


In [None]:
# AUPRC of every caller expressed as the gain over the baseline AUPRC
# (fraction of positives in y_true), for sample_1 .. sample_4.
baselineAUPRC = {}
auprc_frames = []

for method in methods:

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')

    baselineAUPRC[method] = len(y_true[y_true == True])/len(y_true)

    caller_calls = pd_methods[pd_methods['caller'] == method]
    auprc_gain = []
    for i in range(5):
        # Binary predictions for sample_i aligned on the y_true positions.
        y_i = (caller_calls['sample_' + str(i)]
               .astype(bool)
               .reindex(y_true.index)
               .fillna(False)      # position not present for this caller
               .astype(bool))      # keep a real bool dtype after the fill
        auprc_gain.append(average_precision_score(y_true, y_i) - baselineAUPRC[method])

    res_df['AUPRC - baseline AUPRC'] = auprc_gain
    res_df['caller'] = method
    # The sample_0 row is excluded from the summary.
    auprc_frames.append(res_df.drop(index='sample_0'))

# Single concat instead of the `pd.DataFrame.empty` placeholder (a property
# object, not an empty frame) plus a "first iteration" flag.
pd_results = pd.concat(auprc_frames, join='inner')

sns.catplot(x="tumor burden", y="AUPRC - baseline AUPRC", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)
# Dashed zero line: a point on it means the caller's AUPRC equals the baseline.
plt.axhline(y = 0, color = 'k', linestyle = '--', label='baseline AUPRC')
#plt.ylim([0, 0.5])

- Histograms of true allele frequencies in each tumor sample. Note how increasing admixture increases the prevalence of low-frequency variants.
- Benchmarking results for germline SNVs
- Benchmarking results for somatic SNVs on exome data.
- averaged over the four replicates

- add fake mutations on healthy mixtures