In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
from collections import Counter
from matplotlib import cm
from matplotlib.colors import ListedColormap

import io
from pysam import VariantFile

from sklearn.metrics import precision_score, recall_score, average_precision_score, precision_recall_curve

In [None]:
# Build an 8-entry colormap: four base colours followed by the same four
# colours at 30% opacity.
# NOTE: the variable keeps its historical name `tab20`, but it holds the
# 'tab10' palette resampled to 8 entries.
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` takes the same (name, lut) arguments.
tab20 = plt.get_cmap('tab10', 8)
base_colors = [tab20.colors[idx] for idx in range(4)]
# Replace the last (alpha) component of each base colour by 0.3.
faded_colors = [np.array(list(color[:-1]) + [0.3]) for color in base_colors]
newcmap_list = base_colors + faded_colors

newcmap = ListedColormap(newcmap_list, name='newcmap')

In [None]:
def read_vcf(path):
    """Load the record table of a VCF file into a DataFrame.

    Lines starting with '##' (meta-information) are skipped; the remaining
    text (column header line plus records) is parsed as tab-separated values.
    POS is read as int, the other fixed VCF columns as str, and the
    '#CHROM' column is renamed to 'CHROM'.

    Parameters
    ----------
    path : str
        Path to an uncompressed, text-mode VCF file.

    Returns
    -------
    pandas.DataFrame
        One row per VCF record.
    """
    column_types = {
        '#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
        'QUAL': str, 'FILTER': str, 'INFO': str,
    }
    with open(path, 'r') as handle:
        body = ''.join(row for row in handle if not row.startswith('##'))
    table = pd.read_csv(io.StringIO(body), sep='\t', dtype=column_types)
    return table.rename(columns={'#CHROM': 'CHROM'})

# Dilution effect on 809_110914

## Load data

In [None]:
# Extract the value of the CALLERS key from a VCF INFO string (returned as a
# one-element Series). Later cells call `foo` as well.
foo = lambda x: pd.Series(x.split('CALLERS=')[1].split(';')[0])

# Ensemble VCFs of the dilution series, ordered sample_0 .. sample_3.
vcf_paths = [
    "../data/2015-07-31_CRC-809_110914-filter-1-0/CRC-809_110914-filter-1-0-ensemble-annotated.vcf",
    "../data/2015-07-31_CRC-809_110914-filter-05-05/CRC-809_110914-filter-05-05-ensemble-annotated.vcf",
    "../data/2015-07-31_CRC-809_110914-filter-005-095/CRC-809_110914-filter-005-095-ensemble-annotated.vcf",
    "../data/2015-07-31_CRC-809_110914-filter-001-099/CRC-809_110914-filter-001-099-ensemble-annotated.vcf",
]
caller_names = ['freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2']
sample_columns = ['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2', 'SNV callers']

vcf_tables = []
sample_tables = []
for vcf_path in vcf_paths:
    vcf = read_vcf(vcf_path)

    # Reduce INFO to the CALLERS value, then flag each caller named in it.
    vcf['INFO'] = vcf['INFO'].apply(foo)
    for caller_name in caller_names:
        vcf[caller_name] = vcf['INFO'].str.contains(caller_name)
    # The original cell called `vcf.drop('INFO', axis=1)` and discarded the
    # result, so INFO was never removed. The no-op is dropped and the column
    # stays, leaving the vcf_pd_* frames exactly as before.

    vcf['SNV callers'] = vcf['freebayes'].map(str) + '_' + vcf['vardict'].map(str) + '_' + vcf['varscan'].map(str)

    # Explicit copy so that adding CHROM_POS does not write into a slice of
    # `vcf` (this raised SettingWithCopyWarning before).
    sample = vcf[sample_columns].copy()
    sample['CHROM_POS'] = sample['CHROM'].astype('str').str.cat(sample['POS'].astype('str'), sep="_")
    sample = sample.set_index('CHROM_POS')

    vcf_tables.append(vcf)
    sample_tables.append(sample)

vcf_pd_0, vcf_pd_1, vcf_pd_2, vcf_pd_3 = vcf_tables
sample_0, sample_1, sample_2, sample_3 = sample_tables

sample_3.head()

## Estimated tumor burden of mixed samples

In [None]:
# Tumor fraction per sample. sample_0 is hard-coded; for the mixtures the
# value is the first column label pandas reads from `estimated_tf.txt`.
tf_files = {
    'sample_1': "../data/2015-07-31_CRC-809_110914-filter-05-05/estimated_tf.txt",
    'sample_2': "../data/2015-07-31_CRC-809_110914-filter-005-095/estimated_tf.txt",
    'sample_3': "../data/2015-07-31_CRC-809_110914-filter-001-099/estimated_tf.txt",
}

samples_tf = {'sample_0': 0.47}
for sample_name, tf_path in tf_files.items():
    header_labels = list(pd.read_csv(tf_path).columns)
    samples_tf[sample_name] = float(header_labels[0])

print(samples_tf)

## Number of SNVs detected per caller

In [None]:
# Count, for every sample and caller, the variants flagged by that caller.
snv_methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']
dilution_samples = {
    'sample_0': sample_0,
    'sample_1': sample_1,
    'sample_2': sample_2,
    'sample_3': sample_3,
}

# Build the table once. The original started from `pd.DataFrame.empty` (a
# property object, not a frame), grew it row by row, re-applied the rename on
# every iteration and carried a `si == 4` branch that could never run.
numbersnvs_pd = pd.DataFrame.from_dict(
    {
        sample_name: [sample[sample[m] == True].shape[0] for m in snv_methods]
        for sample_name, sample in dilution_samples.items()
    },
    orient='index',
    columns=snv_methods,
)
# Replace the sample labels by their tumor fraction so it becomes the x axis.
numbersnvs_pd = numbersnvs_pd.rename(index=samples_tf)

# Log-scaled x axis; xlim runs from 10e-1 (= 1.0) down to 10e-4 (= 0.001),
# i.e. the axis is reversed.
numbersnvs_pd.plot(style='.-', logx=True, xlim=(10e-1, 10e-4))

numbersnvs_pd

## Precision, recall and AUPRC

In [None]:
# Build `pd_methods`: for every caller, one row per position with the caller's
# call status in each of the four dilution samples; per-caller tables are stacked.
# NOTE: `pd.DataFrame.empty` is the `empty` property object, not an empty frame;
# it only serves as a placeholder and is overwritten on the first iteration.
pd_methods = pd.DataFrame.empty
count = 0  # 0 until the first caller's table has been stored

for method in ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']:
    print(method)

    # Per-sample view: REF/ALT/QUAL plus this caller's flag, renamed to the sample name.
    s0 = sample_0[['REF', 'ALT', 'QUAL', method]]
    s0.rename(columns = {method:'sample_0'},  inplace = True)
    s1 = sample_1[['REF', 'ALT', 'QUAL', method]]
    s1.rename(columns = {method:'sample_1'},  inplace = True)
    s2 = sample_2[['REF', 'ALT', 'QUAL', method]]
    s2.rename(columns = {method:'sample_2'},  inplace = True)
    s3 = sample_3[['REF', 'ALT', 'QUAL', method]]
    s3.rename(columns = {method:'sample_3'},  inplace = True)

    # Side-by-side concat on the CHROM_POS index; REF, ALT and QUAL now each
    # appear four times (duplicate column labels).
    pd_method = pd.concat([s0, s1, s2, s3], axis=1)

    # Per position: collect the non-NaN REF (resp. ALT) values across samples and
    # keep the last one in np.unique's sorted order. Because the label is
    # duplicated, the assignment presumably fills every REF (resp. ALT) column.
    pd_method['REF'] = [list(np.unique([i for i in list(ai) if str(i) != 'nan']))[-1] for ai in list(pd_method['REF'].values)]
    pd_method['ALT'] = [list(np.unique([i for i in list(ai) if str(i) != 'nan']))[-1] for ai in list(pd_method['ALT'].values)]
    # Transpose, drop value-identical rows (= duplicate columns), transpose back.
    # NOTE(review): any QUAL/sample column whose values equal another column's is
    # dropped too, and the 10-name assignment below would then fail — confirm
    # this cannot happen with the real data.
    pd_method = pd_method.T.drop_duplicates().T
    pd_method.columns = ['REF', 'ALT', 'QUAL_0', 'sample_0', 'QUAL_1', 'sample_1', 'QUAL_2', 'sample_2',  'QUAL_3', 'sample_3']
    pd_method.head()  # result is discarded inside the loop

    # Positions missing from a sample are NaN after the concat: treat as "not called".
    pd_method[['sample_0', 'sample_1', 'sample_2', 'sample_3']] = pd_method[['sample_0','sample_1', 'sample_2', 'sample_3']].fillna(value=False)
    # Remove positions where the flags of all four samples sum to 0 (no call anywhere).
    pd_method.drop(pd_method[pd_method['sample_0'] + pd_method['sample_1'] + pd_method['sample_2'] + pd_method['sample_3'] == False].index, axis=0, inplace=True)
    pd_method[['REF', 'sample_0', 'sample_1', 'sample_2', 'sample_3']].head()  # result is discarded
    pd_method['caller'] = method

    # Stack the per-caller tables row-wise.
    if count == 0:
        pd_methods = pd_method
        count = 1
    else:
        pd_methods = pd.concat([pd_methods, pd_method], join='inner')
print(pd_methods.shape)

# Per-caller precision and recall of each diluted sample, taking the caller's
# own calls in the undiluted sample (sample_0) as ground truth.
diluted_names = ['sample_1', 'sample_2', 'sample_3']
per_caller_results = []

for method in ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']:
    print(method)

    # Select this caller's rows once instead of re-filtering for every score.
    caller_calls = pd_methods[pd_methods['caller'] == method]
    y_true = caller_calls['sample_0'].astype(bool).values
    y_pred = {name: caller_calls[name].astype(bool).values for name in diluted_names}

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')
    # sample_0 is the reference itself, so its scores are set to 1.
    res_df['precision'] = [1] + [precision_score(y_true, y_pred[name]) for name in diluted_names]
    res_df['recall'] = [1] + [recall_score(y_true, y_pred[name]) for name in diluted_names]
    # Long format: one row per (tumor burden, metric).
    res_df = pd.melt(res_df, id_vars=['tumor burden'], value_vars=['precision', 'recall'],
                     var_name='metric', value_name='value')
    res_df['caller'] = method
    per_caller_results.append(res_df)

# One concat at the end replaces growing the frame inside the loop from a
# `pd.DataFrame.empty` placeholder (which is a property object, not a frame).
pd_results = pd.concat(per_caller_results, join='inner')

print(pd_results.head())
print(pd_results.shape)

g = sns.catplot(x="tumor burden", y="value", col="metric", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)


In [None]:
# Consensus ground truth: a position is positive when at least two callers
# called it in sample_0 (positions with exactly one caller are reset to 0).
y_true = pd_methods[['sample_0', 'caller']]
y_true.index.name = 'CHROM_POS'
y_true = y_true.groupby(['CHROM_POS'])['sample_0'].sum()
y_true[y_true == 1] = 0
y_true = y_true.astype(bool)
print(y_true.shape[0])

diluted_names = ['sample_1', 'sample_2', 'sample_3']
per_caller_results = []

for method in ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']:
    print(method)

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')

    caller_calls = pd_methods[pd_methods['caller'] == method]
    # Align the caller's calls on the consensus positions. `fill_value=False`
    # keeps a boolean dtype for positions the caller never reported, instead of
    # relying on `fillna` to downcast an object column back to bool. The former
    # `.squeeze()` is dropped: on a one-element Series it returns a scalar.
    y_pred = {
        name: caller_calls[name].astype(bool).reindex(y_true.index, fill_value=False)
        for name in diluted_names
    }

    # NOTE(review): the leading 1 assumes sample_0 scores perfectly, but the
    # truth here is the multi-caller consensus rather than this caller's own
    # sample_0 calls — confirm whether sample_0 should be scored explicitly.
    res_df['precision'] = [1] + [precision_score(y_true.values, y_pred[name].values) for name in diluted_names]
    res_df['recall'] = [1] + [recall_score(y_true.values, y_pred[name].values) for name in diluted_names]
    res_df = pd.melt(res_df, id_vars=['tumor burden'], value_vars=['precision', 'recall'],
                     var_name='metric', value_name='value')
    res_df['caller'] = method
    per_caller_results.append(res_df)

# One concat at the end replaces growing the frame inside the loop from a
# `pd.DataFrame.empty` placeholder (which is a property object, not a frame).
pd_results = pd.concat(per_caller_results, join='inner')

print(pd_results.head())
print(pd_results.shape)

g = sns.catplot(x="tumor burden", y="value", col="metric", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)


In [None]:
# Rebuild `pd_methods` (same construction as in the precision/recall cell): for
# every caller, one row per position with the call status in each dilution sample.
# NOTE: `pd.DataFrame.empty` is the `empty` property object, not an empty frame;
# it only serves as a placeholder and is overwritten on the first iteration.
pd_methods = pd.DataFrame.empty
count = 0  # 0 until the first caller's table has been stored

# One colour per caller, indexed by the caller's position in the list below;
# used for the precision-recall curves later in this cell.
color_list = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple']

# `mi` is not used inside this loop.
for mi, method in enumerate(['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']):
    print(method)

    # Per-sample view: REF/ALT/QUAL plus this caller's flag, renamed to the sample name.
    s0 = sample_0[['REF', 'ALT', 'QUAL', method]]
    s0.rename(columns = {method:'sample_0'},  inplace = True)
    s1 = sample_1[['REF', 'ALT', 'QUAL', method]]
    s1.rename(columns = {method:'sample_1'},  inplace = True)
    s2 = sample_2[['REF', 'ALT', 'QUAL', method]]
    s2.rename(columns = {method:'sample_2'},  inplace = True)
    s3 = sample_3[['REF', 'ALT', 'QUAL', method]]
    s3.rename(columns = {method:'sample_3'},  inplace = True)

    # Side-by-side concat on the CHROM_POS index; REF, ALT and QUAL now each
    # appear four times (duplicate column labels).
    pd_method = pd.concat([s0, s1, s2, s3], axis=1)

    # Per position: collect the non-NaN REF (resp. ALT) values across samples and
    # keep the last one in np.unique's sorted order. Because the label is
    # duplicated, the assignment presumably fills every REF (resp. ALT) column.
    pd_method['REF'] = [list(np.unique([i for i in list(ai) if str(i) != 'nan']))[-1] for ai in list(pd_method['REF'].values)]
    pd_method['ALT'] = [list(np.unique([i for i in list(ai) if str(i) != 'nan']))[-1] for ai in list(pd_method['ALT'].values)]
    # Transpose, drop value-identical rows (= duplicate columns), transpose back.
    # NOTE(review): any QUAL/sample column whose values equal another column's is
    # dropped too, and the 10-name assignment below would then fail — confirm
    # this cannot happen with the real data.
    pd_method = pd_method.T.drop_duplicates().T
    pd_method.columns = ['REF', 'ALT', 'QUAL_0', 'sample_0', 'QUAL_1', 'sample_1', 'QUAL_2', 'sample_2',  'QUAL_3', 'sample_3']
    pd_method.head()  # result is discarded inside the loop

    # Positions missing from a sample are NaN after the concat: treat as "not called".
    pd_method[['sample_0', 'sample_1', 'sample_2', 'sample_3']] = pd_method[['sample_0','sample_1', 'sample_2', 'sample_3']].fillna(value=False)
    # Remove positions where the flags of all four samples sum to 0 (no call anywhere).
    pd_method.drop(pd_method[pd_method['sample_0'] + pd_method['sample_1'] + pd_method['sample_2'] + pd_method['sample_3'] == False].index, axis=0, inplace=True)
    pd_method[['REF', 'sample_0', 'sample_1', 'sample_2', 'sample_3']].head()  # result is discarded
    pd_method['caller'] = method

    # Stack the per-caller tables row-wise.
    if count == 0:
        pd_methods = pd_method
        count = 1
    else:
        pd_methods = pd.concat([pd_methods, pd_method], join='inner')
print(pd_methods.shape)

# Precision-recall curves and average precision (AP) per caller and sample,
# scored against a multi-caller consensus on sample_0.
fig, axs = plt.subplots(3,2,figsize=(10, 15))
fig.suptitle('Precision-Recall curves')

# Consensus ground truth: a position is positive when at least three callers
# called it in sample_0 (sums of 2 or fewer are reset to 0).
y_true = pd_methods[['sample_0', 'caller']]
y_true.index.name = 'CHROM_POS'
y_true = y_true.groupby(['CHROM_POS'])['sample_0'].sum()
y_true[y_true <= 2] = 0
y_true = y_true.astype(bool)
print(y_true.shape[0])

sample_names = ['sample_0', 'sample_1', 'sample_2', 'sample_3']
alpha_list = [1, .75, .5, .2]  # curve opacity per sample, in sample order
per_caller_results = []

for mi, method in enumerate(['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']):
    print(method)
    ax = axs[mi//2, mi%2]

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')
    caller_calls = pd_methods[pd_methods['caller'] == method]

    ap_scores = []
    for sample_name, alpha in zip(sample_names, alpha_list):
        # Align on the consensus positions; `fill_value=False` keeps a boolean
        # dtype for positions this caller never reported (the original relied
        # on `fillna` downcasting, and its `.squeeze()` could yield a scalar).
        y_i = caller_calls[sample_name].astype(bool).reindex(y_true.index, fill_value=False)
        # AP is computed once and reused for both the table and the legend.
        ap = average_precision_score(y_true, y_i)
        ap_scores.append(ap)

        precision, recall, _ = precision_recall_curve(y_true, y_i)
        ax.plot(recall, precision, 'o-',
                label='tf='+str(res_df['tumor burden'].loc[sample_name])+ ', AP='+str(round(ap, 2)),
                c=color_list[mi], alpha=alpha)

    # NOTE(review): the column label 'AUPCR' looks like a typo for 'AUPRC'; it is
    # kept because the plot below (and possibly other cells) refer to it by name.
    res_df['AUPCR'] = ap_scores
    res_df['caller'] = method

    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_ylim([0.0, 1.05])
    ax.set_xlim([0.0, 1.05])
    ax.set_title(method)
    ax.legend()

    per_caller_results.append(res_df)

# One concat at the end replaces growing the frame inside the loop from a
# `pd.DataFrame.empty` placeholder (which is a property object, not a frame).
pd_results = pd.concat(per_caller_results, join='inner')
plt.show()

print(pd_results.head())
print(pd_results.shape)

g = sns.catplot(x="tumor burden", y="AUPCR", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)


## 2 liquid biopsy samples as reference

In [None]:
# Second undiluted plasma sample of patient 809 (date 030915).
vcf_pd_0_bis = read_vcf("../data/2015-07-31_CRC-809_030915-filter-1-0/CRC-809_030915-filter-1-0-ensemble-annotated.vcf")

# `foo` (defined in the first data-loading cell) reduces INFO to its CALLERS
# value; each caller is then flagged by substring match.
vcf_pd_0_bis['INFO'] = vcf_pd_0_bis['INFO'].apply(foo)
for caller_name in ['freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2']:
    vcf_pd_0_bis[caller_name] = vcf_pd_0_bis['INFO'].str.contains(caller_name)
# The original `vcf_pd_0_bis.drop('INFO', axis=1)` discarded its result, so
# INFO was never removed. The no-op is dropped and the column stays.

vcf_pd_0_bis['SNV callers'] = vcf_pd_0_bis['freebayes'].map(str) + '_' + vcf_pd_0_bis['vardict'].map(str) + '_' + vcf_pd_0_bis['varscan'].map(str)

# Explicit copy so that adding CHROM_POS does not write into a slice of
# `vcf_pd_0_bis` (this raised SettingWithCopyWarning before).
sample_0_bis = vcf_pd_0_bis[['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2', 'SNV callers']].copy()
sample_0_bis['CHROM_POS'] = sample_0_bis['CHROM'].astype('str').str.cat(sample_0_bis['POS'].astype('str'), sep="_")
sample_0_bis = sample_0_bis.set_index('CHROM_POS')
sample_0_bis.head()

In [None]:
# Tag both undiluted samples with their collection date and stack them.
sample0 = sample_0.assign(date='110914')
sample0bis = sample_0_bis.assign(date='030915')
sample_ref = pd.concat([sample0, sample0bis], join='inner')

# Positions (CHROM_POS labels) reported at each date.
from_110914 = (sample_ref['date'] == '110914').to_numpy()
from_030915 = (sample_ref['date'] == '030915').to_numpy()
l1 = sample_ref.index[from_110914].tolist()
l2 = sample_ref.index[from_030915].tolist()
print(len(l1), len(l2))

positions_110914 = set(l1)
positions_030915 = set(l2)
# Positions seen at exactly one date, then positions seen at both dates.
print(len(positions_110914.symmetric_difference(positions_030915)))
print(len(positions_110914.intersection(positions_030915)))

In [None]:
# Rows of both dates at the positions reported in both samples. The labels are
# sorted so the row order no longer depends on set iteration order, which
# varies between kernel sessions for strings.
sample_ref.loc[sorted(set(l1) & set(l2))]

In [None]:
# Rebuild `pd_methods`, restricted to the positions found in both `l1` and `l2`
# (built in an earlier cell): for every caller, one row per position with the
# call status in each dilution sample.
# NOTE: `pd.DataFrame.empty` is the `empty` property object, not an empty frame;
# it only serves as a placeholder and is overwritten on the first iteration.
pd_methods = pd.DataFrame.empty
count = 0  # 0 until the first caller's table has been stored

for method in ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']:
    print(method)

    # Per-sample view reindexed to the shared positions (a position absent from
    # a sample becomes an all-NaN row), with the caller's flag renamed to the
    # sample name.
    s0 = sample_0[['REF', 'ALT', 'QUAL', method]].reindex(list(set(set(l1) & set(l2))))
    s0.rename(columns = {method:'sample_0'},  inplace = True)
    s1 = sample_1[['REF', 'ALT', 'QUAL', method]].reindex(list(set(set(l1) & set(l2))))
    s1.rename(columns = {method:'sample_1'},  inplace = True)
    s2 = sample_2[['REF', 'ALT', 'QUAL', method]].reindex(list(set(set(l1) & set(l2))))
    s2.rename(columns = {method:'sample_2'},  inplace = True)
    s3 = sample_3[['REF', 'ALT', 'QUAL', method]].reindex(list(set(set(l1) & set(l2))))
    s3.rename(columns = {method:'sample_3'},  inplace = True)

    # Side-by-side concat on the CHROM_POS index; REF, ALT and QUAL now each
    # appear four times (duplicate column labels).
    pd_method = pd.concat([s0, s1, s2, s3], axis=1)

    # Per position: collect the non-NaN REF (resp. ALT) values across samples and
    # keep the last one in np.unique's sorted order. Because the label is
    # duplicated, the assignment presumably fills every REF (resp. ALT) column.
    pd_method['REF'] = [list(np.unique([i for i in list(ai) if str(i) != 'nan']))[-1] for ai in list(pd_method['REF'].values)]
    pd_method['ALT'] = [list(np.unique([i for i in list(ai) if str(i) != 'nan']))[-1] for ai in list(pd_method['ALT'].values)]
    # Transpose, drop value-identical rows (= duplicate columns), transpose back.
    # NOTE(review): any QUAL/sample column whose values equal another column's is
    # dropped too, and the 10-name assignment below would then fail. All samples
    # share the same index here, which makes identical columns more likely —
    # confirm this cannot happen with the real data.
    pd_method = pd_method.T.drop_duplicates().T
    pd_method.columns = ['REF', 'ALT', 'QUAL_0', 'sample_0', 'QUAL_1', 'sample_1', 'QUAL_2', 'sample_2',  'QUAL_3', 'sample_3']
    pd_method.head()  # result is discarded inside the loop

    # Positions missing from a sample are NaN after the reindex: treat as "not called".
    pd_method[['sample_0', 'sample_1', 'sample_2', 'sample_3']] = pd_method[['sample_0','sample_1', 'sample_2', 'sample_3']].fillna(value=False)
    # Remove positions where the flags of all four samples sum to 0 (no call anywhere).
    pd_method.drop(pd_method[pd_method['sample_0'] + pd_method['sample_1'] + pd_method['sample_2'] + pd_method['sample_3'] == False].index, axis=0, inplace=True)
    pd_method[['REF', 'sample_0', 'sample_1', 'sample_2', 'sample_3']].head()  # result is discarded
    pd_method['caller'] = method

    # Stack the per-caller tables row-wise.
    if count == 0:
        pd_methods = pd_method
        count = 1
    else:
        pd_methods = pd.concat([pd_methods, pd_method], join='inner')
print(pd_methods.shape)

# Per-caller precision and recall of each diluted sample, taking the caller's
# own calls in the undiluted sample (sample_0) as ground truth.
diluted_names = ['sample_1', 'sample_2', 'sample_3']
per_caller_results = []

for method in ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']:
    print(method)

    # Select this caller's rows once instead of re-filtering for every score.
    caller_calls = pd_methods[pd_methods['caller'] == method]
    y_true = caller_calls['sample_0'].astype(bool).values
    y_pred = {name: caller_calls[name].astype(bool).values for name in diluted_names}

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')
    # sample_0 is the reference itself, so its scores are set to 1.
    res_df['precision'] = [1] + [precision_score(y_true, y_pred[name]) for name in diluted_names]
    res_df['recall'] = [1] + [recall_score(y_true, y_pred[name]) for name in diluted_names]
    # Long format: one row per (tumor burden, metric).
    res_df = pd.melt(res_df, id_vars=['tumor burden'], value_vars=['precision', 'recall'],
                     var_name='metric', value_name='value')
    res_df['caller'] = method
    per_caller_results.append(res_df)

# One concat at the end replaces growing the frame inside the loop from a
# `pd.DataFrame.empty` placeholder (which is a property object, not a frame).
pd_results = pd.concat(per_caller_results, join='inner')

print(pd_results.head())
print(pd_results.shape)

g = sns.catplot(x="tumor burden", y="value", col="metric", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)

## Tumor reference

In [None]:
# Extract the value of the CALLERS key from a VCF INFO string (returned as a
# one-element Series).
foo = lambda x: pd.Series(x.split('CALLERS=')[1].split(';')[0])
vcf_pd_t = read_vcf("../data/2015-07-31_NCC_CRC-809_290714-T1W/NCC_CRC-809_290714-T1W-ensemble-annotated.vcf")

# Reduce INFO to the CALLERS value, then flag each caller named in it.
vcf_pd_t['INFO'] = vcf_pd_t['INFO'].apply(foo)
for caller_name in ['freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2']:
    vcf_pd_t[caller_name] = vcf_pd_t['INFO'].str.contains(caller_name)
# The original `vcf_pd_t.drop('INFO', axis=1)` discarded its result, so INFO
# was never removed. The no-op is dropped and the column stays.

vcf_pd_t['SNV callers'] = vcf_pd_t['freebayes'].map(str) + '_' + vcf_pd_t['vardict'].map(str) + '_' + vcf_pd_t['varscan'].map(str)

# Explicit copy so that adding CHROM_POS does not write into a slice of
# `vcf_pd_t` (this raised SettingWithCopyWarning before).
sample_t = vcf_pd_t[['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2', 'SNV callers']].copy()
sample_t['CHROM_POS'] = sample_t['CHROM'].astype('str').str.cat(sample_t['POS'].astype('str'), sep="_")
sample_t = sample_t.set_index('CHROM_POS')
# Keep chromosome 22 only.
# NOTE(review): presumably the plasma VCFs are restricted to chr22 as well —
# confirm, otherwise off-chromosome plasma calls count as false positives.
sample_t = sample_t[sample_t['CHROM'] == '22']
sample_t.head()

In [None]:
# Positions reported in the tumor (l1) and in the undiluted plasma sample (l2).
l1 = sample_t.index.tolist()
l2 = sample_0.index.tolist()
print(len(l1), len(l2))
# Number of positions reported in both.
shared_positions = set(l1).intersection(l2)
print(len(shared_positions))

In [None]:
# NOTE(review): displays the whole `pd_method` frame left over from the last
# loop iteration of an earlier cell (hidden state) — consider removing this
# cell or showing `.head()` only.
pd_method

In [None]:
# Rebuild `pd_methods` with the tumor sample added: for every caller, one row per
# position with the call status in the tumor (sample_t) and in each dilution sample.
# NOTE: `pd.DataFrame.empty` is the `empty` property object, not an empty frame;
# it only serves as a placeholder and is overwritten on the first iteration.
pd_methods = pd.DataFrame.empty
count = 0  # 0 until the first caller's table has been stored

for method in ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']:
    print(method)
    
    # Per-sample view: REF/ALT/QUAL plus this caller's flag, renamed to the sample name.
    st = sample_t[['REF', 'ALT', 'QUAL', method]]
    st.rename(columns = {method:'sample_t'},  inplace = True)
    s0 = sample_0[['REF', 'ALT', 'QUAL', method]]
    s0.rename(columns = {method:'sample_0'},  inplace = True)
    s1 = sample_1[['REF', 'ALT', 'QUAL', method]]
    s1.rename(columns = {method:'sample_1'},  inplace = True)
    s2 = sample_2[['REF', 'ALT', 'QUAL', method]]
    s2.rename(columns = {method:'sample_2'},  inplace = True)
    s3 = sample_3[['REF', 'ALT', 'QUAL', method]]
    s3.rename(columns = {method:'sample_3'},  inplace = True)

    # Side-by-side concat on the CHROM_POS index; REF, ALT and QUAL now each
    # appear five times (duplicate column labels).
    pd_method = pd.concat([st, s0, s1, s2, s3], axis=1)

    # Per position: collect the non-NaN REF (resp. ALT) values across samples and
    # keep the last one in np.unique's sorted order. Because the label is
    # duplicated, the assignment presumably fills every REF (resp. ALT) column.
    pd_method['REF'] = [list(np.unique([i for i in list(ai) if str(i) != 'nan']))[-1] for ai in list(pd_method['REF'].values)]
    pd_method['ALT'] = [list(np.unique([i for i in list(ai) if str(i) != 'nan']))[-1] for ai in list(pd_method['ALT'].values)]
    # Transpose, drop value-identical rows (= duplicate columns), transpose back.
    # NOTE(review): any QUAL/sample column whose values equal another column's is
    # dropped too, and the 12-name assignment below would then fail — confirm
    # this cannot happen with the real data.
    pd_method = pd_method.T.drop_duplicates().T
    pd_method.columns = ['REF', 'ALT', 'QUAL_t', 'sample_t', 'QUAL_0', 'sample_0', 'QUAL_1', 'sample_1', 'QUAL_2', 'sample_2',  'QUAL_3', 'sample_3']
    pd_method.head()  # result is discarded inside the loop

    # Positions missing from a sample are NaN after the concat: treat as "not called".
    pd_method[['sample_t', 'sample_0', 'sample_1', 'sample_2', 'sample_3']] = pd_method[['sample_t', 'sample_0','sample_1', 'sample_2', 'sample_3']].fillna(value=False)
    # Remove positions where the flags of all five samples sum to 0 (no call anywhere).
    pd_method.drop(pd_method[pd_method['sample_t'] + pd_method['sample_0'] + pd_method['sample_1'] + pd_method['sample_2'] + pd_method['sample_3'] == False].index, axis=0, inplace=True)
    pd_method[['REF', 'sample_t', 'sample_0', 'sample_1', 'sample_2', 'sample_3']].head()  # result is discarded
    pd_method['caller'] = method

    # Stack the per-caller tables row-wise.
    if count == 0:
        pd_methods = pd_method
        count = 1
    else:
        pd_methods = pd.concat([pd_methods, pd_method], join='inner')
print(pd_methods.shape)

# Per-caller precision and recall of every plasma sample, taking the caller's
# calls in the tumor sample (sample_t) as ground truth.
plasma_names = ['sample_0', 'sample_1', 'sample_2', 'sample_3']
per_caller_results = []

for method in ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']:
    print(method)

    # Select this caller's rows once instead of re-filtering for every score.
    caller_calls = pd_methods[pd_methods['caller'] == method]
    y_true = caller_calls['sample_t'].astype(bool).values
    y_pred = {name: caller_calls[name].astype(bool).values for name in plasma_names}

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')
    res_df['precision'] = [precision_score(y_true, y_pred[name]) for name in plasma_names]
    res_df['recall'] = [recall_score(y_true, y_pred[name]) for name in plasma_names]
    # Long format: one row per (tumor burden, metric).
    res_df = pd.melt(res_df, id_vars=['tumor burden'], value_vars=['precision', 'recall'],
                     var_name='metric', value_name='value')
    res_df['caller'] = method
    per_caller_results.append(res_df)

# One concat at the end replaces growing the frame inside the loop from a
# `pd.DataFrame.empty` placeholder (which is a property object, not a frame).
pd_results = pd.concat(per_caller_results, join='inner')

print(pd_results.head())
print(pd_results.shape)

g = sns.catplot(x="tumor burden", y="value", col="metric", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)
    

- Histograms of true allele frequencies in each tumor sample. Note how increasing admixture increases the prevalence of low-frequency variants.
- Benchmarking results for germline SNVs
- Benchmarking results for somatic SNVs on exome data.
- averaged over the four replicates

- add fake mutations on healthy mixtures

# Dilution effect on 986_100215

In [None]:
# Extract the value of the CALLERS key from a VCF INFO string (returned as a
# one-element Series).
foo = lambda x: pd.Series(x.split('CALLERS=')[1].split(';')[0])

# Ensemble VCFs of the dilution series, ordered sample_0 .. sample_3.
vcf_paths = [
    "../data/2015-07-31_CRC-986_100215-filter-1-0/CRC-986_100215-filter-1-0-ensemble-annotated.vcf",
    "../data/2015-07-31_CRC-986_100215-filter-05-05/CRC-986_100215-filter-05-05-ensemble-annotated.vcf",
    "../data/2015-07-31_CRC-986_100215-filter-005-095/CRC-986_100215-filter-005-095-ensemble-annotated.vcf",
    "../data/2015-07-31_CRC-986_100215-filter-001-099/CRC-986_100215-filter-001-099-ensemble-annotated.vcf",
]
caller_names = ['freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2']
sample_columns = ['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2', 'SNV callers']

vcf_tables = []
sample_tables = []
for vcf_path in vcf_paths:
    vcf = read_vcf(vcf_path)

    # Reduce INFO to the CALLERS value, then flag each caller named in it.
    vcf['INFO'] = vcf['INFO'].apply(foo)
    for caller_name in caller_names:
        vcf[caller_name] = vcf['INFO'].str.contains(caller_name)
    # The original cell called `vcf.drop('INFO', axis=1)` and discarded the
    # result, so INFO was never removed. The no-op is dropped and the column
    # stays, leaving the vcf_pd_* frames exactly as before.

    vcf['SNV callers'] = vcf['freebayes'].map(str) + '_' + vcf['vardict'].map(str) + '_' + vcf['varscan'].map(str)

    # Explicit copy so that adding CHROM_POS does not write into a slice of
    # `vcf` (this raised SettingWithCopyWarning before).
    sample = vcf[sample_columns].copy()
    sample['CHROM_POS'] = sample['CHROM'].astype('str').str.cat(sample['POS'].astype('str'), sep="_")
    sample = sample.set_index('CHROM_POS')

    vcf_tables.append(vcf)
    sample_tables.append(sample)

vcf_pd_0, vcf_pd_1, vcf_pd_2, vcf_pd_3 = vcf_tables
sample_0, sample_1, sample_2, sample_3 = sample_tables

sample_3.head()

In [None]:
# Tumor fraction per sample. sample_0 is hard-coded; for the mixtures the
# value is the first column label pandas reads from `estimated_tf.txt`.
tf_files = {
    'sample_1': "../data/2015-07-31_CRC-986_100215-filter-05-05/estimated_tf.txt",
    'sample_2': "../data/2015-07-31_CRC-986_100215-filter-005-095/estimated_tf.txt",
    'sample_3': "../data/2015-07-31_CRC-986_100215-filter-001-099/estimated_tf.txt",
}

samples_tf = {'sample_0': 0.42}
for sample_name, tf_path in tf_files.items():
    header_labels = list(pd.read_csv(tf_path).columns)
    samples_tf[sample_name] = float(header_labels[0])

print(samples_tf)

In [None]:
# Count, for every sample and caller, the variants flagged by that caller.
snv_methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']
dilution_samples = {
    'sample_0': sample_0,
    'sample_1': sample_1,
    'sample_2': sample_2,
    'sample_3': sample_3,
}

# Build the table once. The original started from `pd.DataFrame.empty` (a
# property object, not a frame), grew it row by row and carried a `si == 4`
# branch that could never run.
numbersnvs_pd = pd.DataFrame.from_dict(
    {
        sample_name: [sample[sample[m] == True].shape[0] for m in snv_methods]
        for sample_name, sample in dilution_samples.items()
    },
    orient='index',
    columns=snv_methods,
)

numbersnvs_pd.plot(style='.-')

numbersnvs_pd

In [None]:
# Build `pd_methods`: for every caller, one row per position with the caller's
# call status in each of the four dilution samples; per-caller tables are stacked.
# NOTE: `pd.DataFrame.empty` is the `empty` property object, not an empty frame;
# it only serves as a placeholder and is overwritten on the first iteration.
pd_methods = pd.DataFrame.empty
count = 0  # 0 until the first caller's table has been stored

for method in ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']:
    print(method)

    # Per-sample view: REF/ALT/QUAL plus this caller's flag, renamed to the sample name.
    s0 = sample_0[['REF', 'ALT', 'QUAL', method]]
    s0.rename(columns = {method:'sample_0'},  inplace = True)
    s1 = sample_1[['REF', 'ALT', 'QUAL', method]]
    s1.rename(columns = {method:'sample_1'},  inplace = True)
    s2 = sample_2[['REF', 'ALT', 'QUAL', method]]
    s2.rename(columns = {method:'sample_2'},  inplace = True)
    s3 = sample_3[['REF', 'ALT', 'QUAL', method]]
    s3.rename(columns = {method:'sample_3'},  inplace = True)

    # Side-by-side concat on the CHROM_POS index; REF, ALT and QUAL now each
    # appear four times (duplicate column labels).
    pd_method = pd.concat([s0, s1, s2, s3], axis=1)

    # Per position: collect the non-NaN REF (resp. ALT) values across samples and
    # keep the last one in np.unique's sorted order. Because the label is
    # duplicated, the assignment presumably fills every REF (resp. ALT) column.
    pd_method['REF'] = [list(np.unique([i for i in list(ai) if str(i) != 'nan']))[-1] for ai in list(pd_method['REF'].values)]
    pd_method['ALT'] = [list(np.unique([i for i in list(ai) if str(i) != 'nan']))[-1] for ai in list(pd_method['ALT'].values)]
    # Transpose, drop value-identical rows (= duplicate columns), transpose back.
    # NOTE(review): any QUAL/sample column whose values equal another column's is
    # dropped too, and the 10-name assignment below would then fail — confirm
    # this cannot happen with the real data.
    pd_method = pd_method.T.drop_duplicates().T
    pd_method.columns = ['REF', 'ALT', 'QUAL_0', 'sample_0', 'QUAL_1', 'sample_1', 'QUAL_2', 'sample_2',  'QUAL_3', 'sample_3']
    pd_method.head()  # result is discarded inside the loop

    # Positions missing from a sample are NaN after the concat: treat as "not called".
    pd_method[['sample_0', 'sample_1', 'sample_2', 'sample_3']] = pd_method[['sample_0','sample_1', 'sample_2', 'sample_3']].fillna(value=False)
    # Remove positions where the flags of all four samples sum to 0 (no call anywhere).
    pd_method.drop(pd_method[pd_method['sample_0'] + pd_method['sample_1'] + pd_method['sample_2'] + pd_method['sample_3'] == False].index, axis=0, inplace=True)
    pd_method[['REF', 'sample_0', 'sample_1', 'sample_2', 'sample_3']].head()  # result is discarded
    pd_method['caller'] = method

    # Stack the per-caller tables row-wise.
    if count == 0:
        pd_methods = pd_method
        count = 1
    else:
        pd_methods = pd.concat([pd_methods, pd_method], join='inner')
print(pd_methods.shape)

pd_results = pd.DataFrame.empty
count = 0

for method in ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']:
    print(method)
    
    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')

    y_true = pd_methods[pd_methods['caller'] == method]['sample_0'].astype(bool).values

    res_df['precision'] = [1,
                      precision_score(y_true, pd_methods[pd_methods['caller'] == method]['sample_1'].astype(bool).values),
                      precision_score(y_true, pd_methods[pd_methods['caller'] == method]['sample_2'].astype(bool).values),
                      precision_score(y_true, pd_methods[pd_methods['caller'] == method]['sample_3'].astype(bool).values),
                     ]

    res_df['recall'] = [1,
                      recall_score(y_true, pd_methods[pd_methods['caller'] == method]['sample_1'].astype(bool).values),
                      recall_score(y_true, pd_methods[pd_methods['caller'] == method]['sample_2'].astype(bool).values),
                      recall_score(y_true, pd_methods[pd_methods['caller'] == method]['sample_3'].astype(bool).values),
                     ]
    res_df = pd.melt(res_df, id_vars =['tumor burden'], value_vars =['precision', 'recall'],
                    var_name='metric', value_name='value')
    res_df['caller'] = method

    if count == 0:
        pd_results = res_df
        count = 1
    else:
        pd_results = pd.concat([pd_results, res_df], join='inner')

print(pd_results.head())
print(pd_results.shape)

g = sns.catplot(x="tumor burden", y="value", col="metric", hue="caller",  
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)
    

In [None]:
# Consensus reference: a position is a true variant when at least two callers
# reported it on sample_0. Each caller's diluted calls are scored against it.
caller_names = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']

# pd_methods holds one row per (position, caller); summing the boolean
# sample_0 flag per position gives the number of callers that reported it.
caller_votes = (
    pd_methods['sample_0'].astype(bool)
    .groupby(level=0).sum()
    .rename_axis('CHROM_POS')
)
y_true = caller_votes >= 2
print(y_true.shape[0])

result_frames = []
for method in caller_names:
    print(method)

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')

    calls = pd_methods[pd_methods['caller'] == method]
    # Align this caller's calls on the consensus positions. Positions the
    # caller never reported are filled with False directly, which keeps the
    # dtype boolean rather than relying on fillna to downcast an object column.
    diluted_calls = [
        calls[sample_col].astype(bool).reindex(y_true.index, fill_value=False)
        for sample_col in ['sample_1', 'sample_2', 'sample_3']
    ]

    # NOTE(review): the sample_0 entry is hard-coded to 1 although the
    # reference here is the multi-caller consensus, not the caller's own
    # sample_0 calls - confirm this is intended.
    res_df['precision'] = [1] + [precision_score(y_true.values, y_pred.values) for y_pred in diluted_calls]
    res_df['recall'] = [1] + [recall_score(y_true.values, y_pred.values) for y_pred in diluted_calls]

    res_df = pd.melt(res_df, id_vars=['tumor burden'], value_vars=['precision', 'recall'],
                     var_name='metric', value_name='value')
    res_df['caller'] = method
    result_frames.append(res_df)

pd_results = pd.concat(result_frames, join='inner')

print(pd_results.head())
print(pd_results.shape)

# One panel per metric, one line per caller, tumor burden in decreasing order.
g = sns.catplot(x="tumor burden", y="value", col="metric", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)


In [None]:
# Same per-caller dilution analysis as before, restricted to the positions
# present in both l1 and l2.
# NOTE(review): l1 and l2 are defined in another cell; presumably they hold
# CHROM_POS keys - verify against where they are built.
caller_names = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']
dilution_frames = [sample_0, sample_1, sample_2, sample_3]
sample_cols = ['sample_' + str(i) for i in range(len(dilution_frames))]

# The intersection does not depend on the caller, so compute it once.
shared_positions = list(set(l1) & set(l2))

method_frames = []
for method in caller_names:
    print(method)

    # Restrict every sample to the shared positions and suffix its columns
    # with the sample number before joining, so the joined frame has unique
    # column names and no value-based column de-duplication is needed.
    joined = pd.concat(
        [
            s[['REF', 'ALT', 'QUAL', method]].reindex(shared_positions).rename(columns={
                'REF': 'REF_' + str(i),
                'ALT': 'ALT_' + str(i),
                'QUAL': 'QUAL_' + str(i),
                method: 'sample_' + str(i),
            })
            for i, s in enumerate(dilution_frames)
        ],
        axis=1,
    )

    pd_method = pd.DataFrame(index=joined.index)
    for field in ['REF', 'ALT']:
        field_cols = [field + '_' + str(i) for i in range(len(dilution_frames))]
        collapsed = []
        for row in joined[field_cols].values:
            observed = [allele for allele in row if str(allele) != 'nan']
            # Last of the sorted unique non-missing alleles; the string 'nan'
            # marks positions that have no allele in any of the samples.
            collapsed.append(np.unique(observed)[-1] if observed else 'nan')
        pd_method[field] = collapsed
    for i, sample_col in enumerate(sample_cols):
        pd_method['QUAL_' + str(i)] = joined['QUAL_' + str(i)]
        # Positions missing from a sample are NaN after reindexing and count
        # as "not called"; eq(True) yields a clean boolean column.
        pd_method[sample_col] = joined[sample_col].eq(True)

    # Keep only positions this caller reported in at least one sample.
    pd_method = pd_method[pd_method[sample_cols].any(axis=1)].copy()
    pd_method['caller'] = method
    method_frames.append(pd_method)

pd_methods = pd.concat(method_frames, join='inner')
print(pd_methods.shape)

result_frames = []
for method in caller_names:
    print(method)

    res_df = (100*pd.Series(samples_tf)).round(decimals=2).to_frame(name='tumor burden')

    calls = pd_methods[pd_methods['caller'] == method]
    # Reference labels: this caller's own calls on sample_0.
    y_true = calls['sample_0'].astype(bool).values
    diluted_calls = [calls[sample_col].astype(bool).values for sample_col in sample_cols[1:]]

    # sample_0 is compared with itself, hence the fixed leading 1.
    res_df['precision'] = [1] + [precision_score(y_true, y_pred) for y_pred in diluted_calls]
    res_df['recall'] = [1] + [recall_score(y_true, y_pred) for y_pred in diluted_calls]

    res_df = pd.melt(res_df, id_vars=['tumor burden'], value_vars=['precision', 'recall'],
                     var_name='metric', value_name='value')
    res_df['caller'] = method
    result_frames.append(res_df)

pd_results = pd.concat(result_frames, join='inner')

print(pd_results.head())
print(pd_results.shape)

# One panel per metric, one line per caller, tumor burden in decreasing order.
g = sns.catplot(x="tumor burden", y="value", col="metric", hue="caller",
                capsize=.2, height=6, aspect=.75,
                kind="point", order=sorted(pd_results['tumor burden'].unique(), reverse=True), data=pd_results)