In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
from collections import Counter
from matplotlib import cm
from matplotlib.colors import ListedColormap

import io
from pysam import VariantFile

from sklearn.metrics import precision_score, recall_score, average_precision_score, precision_recall_curve

In [None]:
# Build an 8-colour palette: four fully opaque base colours followed by the
# same four colours at 30 % opacity.
# `plt.get_cmap` is used instead of `cm.get_cmap`, which was deprecated in
# matplotlib 3.7 and removed in 3.9; both resample the named map to 8 entries.
# NOTE: despite its name, `tab20` holds the 'tab10' map resampled to 8 colours.
tab20 = plt.get_cmap('tab10', 8)
opaque_colors = [tab20.colors[i] for i in range(4)]
# Drop the alpha channel of each RGBA entry and replace it with 0.3.
faded_colors = [np.array(list(rgba[:-1]) + [0.3]) for rgba in opaque_colors]
newcmap_list = opaque_colors + faded_colors

newcmap = ListedColormap(newcmap_list, name='newcmap')
color_list = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple']

In [None]:
# Load the SNV benchmark table and keep, per site, the sample name, the TRUTH
# label, the five per-caller FILTER columns and the `predict` column (renamed
# to 'smurf'). Rows are indexed by "<chrom>_<pos>".
snv_raw = pd.read_csv('../data/SMURF benchmark/snv-smurf-test20-v7-new5-maxdepth-2.tsv', sep='\t')
# Optional restriction to the two T20 samples (left disabled, as in the original):
#snv_raw = snv_raw[(snv_raw['Sample_Name'] == 'icgc_cll_T20_tumour') | (snv_raw['Sample_Name'] == 'icgc_mbl_T20_tumour')]

# Explicit old-name -> new-name mapping instead of a positional rename, so the
# result does not silently depend on column order.
snv_column_names = {
    'FILTER_Mutect2': 'mutect2',
    'FILTER_Freebayes': 'freebayes',
    'FILTER_Vardict': 'vardict',
    'FILTER_Varscan': 'varscan',
    'FILTER_Strelka2': 'strelka2',
    'predict': 'smurf',
}
# Built as one chain on the raw frame: no column is assigned on a slice
# (avoids SettingWithCopyWarning) and nothing is mutated in place.
df_snv = (
    snv_raw
    .assign(CHROM_POS=snv_raw['X.CHROM'].astype(str).str.cat(snv_raw['POS'].astype(str), sep="_"))
    [['CHROM_POS', 'Sample_Name', 'TRUTH'] + list(snv_column_names)]
    .rename(columns=snv_column_names)
    .set_index('CHROM_POS')
)
print(df_snv.shape)
df_snv.head()

In [None]:
# Load the indel benchmark table and keep, per site, the sample name, the TRUTH
# label, the five per-caller FILTER columns and the `predict` column (renamed
# to 'smurf'). Rows are indexed by "<chrom>_<pos>".
ind_raw = pd.read_csv('../data/SMURF benchmark/indel-smurf-test20-v7-new5-maxdepth-2.tsv', sep='\t')
# Optional restriction to the two T20 samples (left disabled, as in the original):
#ind_raw = ind_raw[(ind_raw['Sample_Name'] == 'icgc_cll_T20_tumour') | (ind_raw['Sample_Name'] == 'icgc_mbl_T20_tumour')]

# Explicit old-name -> new-name mapping instead of a positional rename, so the
# result does not silently depend on column order.
ind_column_names = {
    'FILTER_Mutect2': 'mutect2',
    'FILTER_Freebayes': 'freebayes',
    'FILTER_Vardict': 'vardict',
    'FILTER_Varscan': 'varscan',
    'FILTER_Strelka2': 'strelka2',
    'predict': 'smurf',
}
# Built as one chain on the raw frame: no column is assigned on a slice
# (avoids SettingWithCopyWarning) and nothing is mutated in place.
df_ind = (
    ind_raw
    .assign(CHROM_POS=ind_raw['X.CHROM'].astype(str).str.cat(ind_raw['POS'].astype(str), sep="_"))
    [['CHROM_POS', 'Sample_Name', 'TRUTH'] + list(ind_column_names)]
    .rename(columns=ind_column_names)
    .set_index('CHROM_POS')
)
print(df_ind.shape)
df_ind.head()

# Ground truths

In [None]:
# Score every individual caller against the TRUTH labels, per sample (SNVs).
# NOTE: the value stored under 'AUPRC' is the average precision MINUS the
# fraction of TRUTH-positive sites in that sample, i.e. the gain over the
# prevalence baseline, not the raw AUPRC.
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']
samples = ['icgc_cll_T20_tumour', 'icgc_mbl_T20_tumour']

snv_rows = []
for m in methods:
    for sample in samples:
        # Filter once per (caller, sample) instead of once per use.
        sample_calls = df_snv[df_snv['Sample_Name'] == sample]
        truth = sample_calls['TRUTH']
        baseline = len(truth[truth == True]) / len(truth)
        snv_rows.append({
            'AUPRC': average_precision_score(truth, sample_calls[m]) - baseline,
            'tumor burden': 0.2,
            'caller': m,
            'sample': sample,
        })
pd_results = pd.DataFrame(snv_rows, columns=['AUPRC', 'tumor burden', 'caller', 'sample'])

# catplot is figure-level and creates its own figure; a preceding plt.figure()
# would only leave an empty figure in the output.
sns.catplot(x="sample", y="AUPRC", hue="caller",
            capsize=.2, height=6, aspect=.75, kind="point", data=pd_results)
plt.ylim([0, 1])
plt.title('SNV')


# Score every individual caller against the TRUTH labels, per sample (indels).
# Uses `methods` and `samples` defined at the top of this cell.
# NOTE: the value stored under 'AUPRC' is the average precision MINUS the
# fraction of TRUTH-positive sites in that sample (gain over the prevalence
# baseline). The previous version had a doubled minus sign ("... -" followed
# by "- len(...)"), which ADDED the baseline instead of subtracting it and
# made the indel numbers inconsistent with the SNV ones.
indel_rows = []
for m in methods:
    for sample in samples:
        # Filter once per (caller, sample) instead of once per use.
        sample_calls = df_ind[df_ind['Sample_Name'] == sample]
        truth = sample_calls['TRUTH']
        baseline = len(truth[truth == True]) / len(truth)
        indel_rows.append({
            'AUPRC': average_precision_score(truth, sample_calls[m]) - baseline,
            'tumor burden': 0.2,
            'caller': m,
            'sample': sample,
        })
pd_results = pd.DataFrame(indel_rows, columns=['AUPRC', 'tumor burden', 'caller', 'sample'])

# catplot is figure-level and creates its own figure; a preceding plt.figure()
# would only leave an empty figure in the output.
sns.catplot(x="sample", y="AUPRC", hue="caller",
            capsize=.2, height=6, aspect=.75, kind="point", data=pd_results)
plt.ylim([0, 1])
plt.title('Indels')

# Pseudo ground truth = SMURF output

In [None]:
# Score each caller's calls on the T20 samples against the 'smurf' column of a
# reference sample, used here as pseudo ground truth (SNVs only).
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']
samples = ['icgc_cll_T20_tumour', 'icgc_mbl_T20_tumour']
# Sample-name infix of each reference and the tumour burden it is plotted under;
# '' yields the sample name without a T<nn>_ infix.
refs = ['T20_', 'T40_', '']
burdens = [0.2, 0.4, 1]

# --- Part 1: reference taken at every purity ------------------------------
rows = []
for ref, burden in zip(refs, burdens):
    for m in methods:
        for sample in samples:
            # 'icgc_cll_T20_tumour' -> 'icgc_cll_' + ref + 'tumour'
            ref_name = sample.split('T')[0] + ref + 'tumour'
            y_true = df_snv[df_snv['Sample_Name'] == ref_name]['smurf']
            # Align the caller's T20 calls on the reference sites; sites the
            # caller has no row for count as "not called".
            y_score = df_snv[df_snv['Sample_Name'] == sample][m].reindex(y_true.index).fillna(False)
            rows.append({
                'AUPRC': average_precision_score(y_true, y_score),
                'tumor burden': burden,
                'caller': m,
                'sample': sample,
            })
pd_results = pd.DataFrame(rows, columns=['AUPRC', 'tumor burden', 'caller', 'sample'])

# catplot is figure-level and opens its own figure, so no plt.figure() is needed.
sns.catplot(x="tumor burden", y="AUPRC", hue="caller", col='sample',
            capsize=.2, height=6, aspect=.75, kind="point", data=pd_results)
plt.ylim([0, 1])
print(pd_results)

# --- Part 2: reference = the sample without a T<nn>_ infix only ------------
rows = []
for m in methods:
    for sample in samples:
        ref_name = sample.split('T')[0] + 'tumour'
        y_true = df_snv[df_snv['Sample_Name'] == ref_name]['smurf']
        y_score = df_snv[df_snv['Sample_Name'] == sample][m].reindex(y_true.index).fillna(False)
        rows.append({
            'AUPRC': average_precision_score(y_true, y_score),
            'tumor burden': 0.2,  # burden of the evaluated (T20) sample
            'caller': m,
            'sample': sample,
        })
pd_results = pd.DataFrame(rows, columns=['AUPRC', 'tumor burden', 'caller', 'sample'])

sns.catplot(x="sample", y="AUPRC", hue="caller",
            capsize=.2, height=6, aspect=.75, kind="point", data=pd_results)
plt.ylim([0, 1])
print(pd_results)

# The disabled indel variant that used to follow here as a triple-quoted string
# was removed: as the last expression of the cell it was echoed verbatim as the
# cell output, and it could not run anyway (`aux.append()` had no argument).

# Pseudo ground truth = each caller's own calls on the reference sample

In [None]:
# Score each caller's calls on the T20 samples against the SAME caller's calls
# on a reference sample, used here as pseudo ground truth (SNVs only).
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']
samples = ['icgc_cll_T20_tumour', 'icgc_mbl_T20_tumour']
# Sample-name infix of each reference and the tumour burden it is plotted under;
# '' yields the sample name without a T<nn>_ infix.
refs = ['T20_', 'T40_', '']
burdens = [0.2, 0.4, 1]

rows = []
for ref, burden in zip(refs, burdens):
    for m in methods:
        for sample in samples:
            # 'icgc_cll_T20_tumour' -> 'icgc_cll_' + ref + 'tumour'
            ref_name = sample.split('T')[0] + ref + 'tumour'
            y_true = df_snv[df_snv['Sample_Name'] == ref_name][m]
            # Align the caller's T20 calls on the reference sites; sites the
            # caller has no row for count as "not called".
            y_score = df_snv[df_snv['Sample_Name'] == sample][m].reindex(y_true.index).fillna(False)
            rows.append({
                'AUPRC': average_precision_score(y_true, y_score),
                'tumor burden': burden,
                'caller': m,
                'sample': sample,
            })
pd_results = pd.DataFrame(rows, columns=['AUPRC', 'tumor burden', 'caller', 'sample'])

# catplot is figure-level and opens its own figure, so no plt.figure() is needed.
sns.catplot(x="tumor burden", y="AUPRC", hue="caller", col='sample',
            capsize=.2, height=6, aspect=.75, kind="point", data=pd_results)
plt.ylim([0, 1])
print(pd_results)

# The disabled indel variant that used to follow here as a triple-quoted string
# was removed: as the last expression of the cell it was echoed verbatim as the
# cell output, and it could not run anyway (`aux.append()` had no argument).

In [None]:
# Compare three choices of reference labels for scoring the callers' T20 SNV
# calls, at each reference purity:
#   ref 0: the 'smurf' column of the reference sample
#   ref 1: sites called by at least 3 of the 5 callers on the reference sample
#   ref 2: the evaluated caller's own calls on the reference sample
# NOTE(review): ref 0 is labelled 'ground truth' but reads the 'smurf' column,
# not 'TRUTH' -- confirm which one is intended.
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']
samples = ['icgc_cll_T20_tumour', 'icgc_mbl_T20_tumour']
purities = ['T20_', 'T40_', '']
burdens = [0.2, 0.4, 1]  # tumour burden plotted for each entry of `purities`
reference_labels = ['ground truth', 'smurf\n on 3 or more methods', 'smurf\n on given method']

rows = []
for ref, reference in enumerate(reference_labels):
    for purity, burden in zip(purities, burdens):
        for m in methods:
            for sample in samples:
                # 'icgc_cll_T20_tumour' -> 'icgc_cll_' + purity + 'tumour'
                ref_calls = df_snv[df_snv['Sample_Name'] == sample.split('T')[0] + purity + 'tumour']
                if ref == 0:
                    y_true = ref_calls['smurf'].astype(bool)
                elif ref == 1:
                    # Consensus: positive when 3 or more callers flag the site.
                    y_true = ref_calls[methods].sum(axis=1) >= 3
                else:
                    y_true = ref_calls[m].astype(bool)
                # Keep the first row per site on both sides, as the indel cell
                # does: reindex() raises on duplicate labels in the scored series.
                y_true = y_true[~y_true.index.duplicated(keep='first')]
                y_score = df_snv[df_snv['Sample_Name'] == sample][m]
                y_score = y_score[~y_score.index.duplicated(keep='first')]
                rows.append({
                    'AUPRC': average_precision_score(y_true, y_score.reindex(y_true.index).fillna(False)),
                    'tumor burden': burden,
                    'caller': m,
                    'sample': sample,
                    'reference': reference,
                })
pd_results = pd.DataFrame(rows, columns=['AUPRC', 'tumor burden', 'caller', 'sample', 'reference'])

# One figure per sample, one facet per reference, highest burden first.
for sample in samples:
    sns.catplot(x="tumor burden", y="AUPRC", hue="caller", col='reference',
                capsize=.2, height=4, aspect=1.5, kind="point",
                order=sorted(np.unique(pd_results['tumor burden'].values), reverse=True),
                data=pd_results[pd_results['sample'] == sample])
    plt.ylim([0, 1])

# The disabled indel variant that used to follow here as a triple-quoted string
# was removed: as the last expression of the cell it was echoed verbatim as the
# cell output, and it could not run anyway (`aux.append()` had no argument).

In [None]:
# Compare three choices of reference labels for scoring the callers' T20 indel
# calls, at each reference purity:
#   ref 0: the 'smurf' column of the reference sample
#   ref 1: sites called by at least 3 of the 5 callers on the reference sample
#   ref 2: the evaluated caller's own calls on the reference sample
# NOTE(review): ref 0 is labelled 'ground truth' but reads the 'smurf' column,
# not 'TRUTH' -- confirm which one is intended.
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']
samples = ['icgc_cll_T20_tumour', 'icgc_mbl_T20_tumour']
purities = ['T20_', 'T40_', '']
burdens = [0.2, 0.4, 1]  # tumour burden plotted for each entry of `purities`
reference_labels = ['ground truth', 'smurf\n on 3 or more methods', 'smurf\n on given method']

rows = []
# Fraction of positive reference labels (AUPRC of a random classifier) for
# reference 0, keyed by (sample, tumour burden).
baseline_by_sample_burden = {}
for ref, reference in enumerate(reference_labels):
    for purity, burden in zip(purities, burdens):
        for m in methods:
            for sample in samples:
                # 'icgc_cll_T20_tumour' -> 'icgc_cll_' + purity + 'tumour'
                ref_calls = df_ind[df_ind['Sample_Name'] == sample.split('T')[0] + purity + 'tumour']
                if ref == 0:
                    y_true = ref_calls['smurf'].astype(bool)
                elif ref == 1:
                    # Consensus: positive when 3 or more callers flag the site.
                    y_true = ref_calls[methods].sum(axis=1) >= 3
                else:
                    y_true = ref_calls[m].astype(bool)
                # Keep the first row per site on both sides: reindex() raises
                # on duplicate labels in the scored series.
                y_true = y_true[~y_true.index.duplicated(keep='first')]
                if ref == 0:
                    baseline_by_sample_burden[(sample, float(burden))] = len(y_true[y_true == True]) / len(y_true)
                a = df_ind[df_ind['Sample_Name'] == sample][m]
                a = a[~a.index.duplicated(keep='first')]
                rows.append({
                    'AUPRC': average_precision_score(y_true, a.reindex(y_true.index).fillna(False)),
                    'tumor burden': burden,
                    'caller': m,
                    'sample': sample,
                    'reference': reference,
                })
pd_results = pd.DataFrame(rows, columns=['AUPRC', 'tumor burden', 'caller', 'sample', 'reference'])

# One figure per sample, one facet per reference. No baseline is drawn here:
# it differs per facet (and per caller for the last reference), and the former
# axhline(y=0) labelled 'baseline AUPRC' did not correspond to any computed value.
for sample in samples:
    sns.catplot(x="tumor burden", y="AUPRC", hue="caller", col='reference',
                capsize=.2, height=4, aspect=1.5, kind="point",
                data=pd_results[pd_results['sample'] == sample])
    plt.ylim([0, 1])

# Reference 0 only, with its baseline drawn per tumour burden. The former code
# referenced an undefined name `bas` here and raised NameError.
# Ascending order is seaborn's default for a numeric categorical axis; passing
# it explicitly fixes category i at x position i.
burden_order = sorted(np.unique(pd_results['tumor burden'].values))
for sample in samples:
    grid = sns.catplot(x="tumor burden", y="AUPRC", hue="caller",
                       capsize=.2, height=4, aspect=1.5, kind="point",
                       order=burden_order,
                       data=pd_results[(pd_results['sample'] == sample) & (pd_results['reference'] == 'ground truth')])
    ax = grid.axes.flat[0]
    for pos, burden in enumerate(burden_order):
        ax.hlines(baseline_by_sample_burden[(sample, float(burden))], pos - 0.4, pos + 0.4,
                  colors='k', linestyles='--',
                  label='baseline AUPRC' if pos == 0 else '_nolegend_')
    ax.set_ylim([0, 1])

# The disabled variant that used to follow here as a triple-quoted string was
# removed: as the last expression of the cell it was echoed verbatim as the
# cell output, and it could not run anyway (`aux.append()` had no argument).

In [None]:
# Score each caller's T20 SNV calls against the TRUTH labels of the reference
# sample at each purity, and plot AUPRC per tumour burden with the prevalence
# baseline (fraction of TRUTH-positive sites) of that same reference.
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']
samples = ['icgc_cll_T20_tumour', 'icgc_mbl_T20_tumour']
purities = ['T20_', 'T40_', '']
burdens = [0.2, 0.4, 1]  # tumour burden plotted for each entry of `purities`

rows = []
# Kept for backward compatibility: one value per sample, holding the baseline
# of the LAST purity processed (it is overwritten on every purity).
baselineAUPRC_dict = {}
# Baseline per (sample, tumour burden); this is what the plot uses, so each
# burden is compared with the baseline of its own reference.
baseline_by_sample_burden = {}

for sample in samples:
    for purity, burden in zip(purities, burdens):
        # 'icgc_cll_T20_tumour' -> 'icgc_cll_' + purity + 'tumour'
        y_true = df_snv[df_snv['Sample_Name'] == sample.split('T')[0] + purity + 'tumour']['TRUTH'].astype(bool)
        n_duplicates = sum(y_true.index.duplicated())
        # Keep the first row per site: reindex() below needs unique labels.
        y_true = y_true[~y_true.index.duplicated(keep='first')]
        baselineAUPRC = len(y_true[y_true == True]) / len(y_true)
        print(sample, burden, 'duplicated sites dropped:', n_duplicates, 'baseline AUPRC:', baselineAUPRC)
        baselineAUPRC_dict[sample] = baselineAUPRC
        baseline_by_sample_burden[(sample, float(burden))] = baselineAUPRC
        for m in methods:
            a = df_snv[df_snv['Sample_Name'] == sample][m]
            a = a[~a.index.duplicated(keep='first')]
            rows.append({
                # Sites of the reference that the caller has no row for count as "not called".
                'AUPRC': average_precision_score(y_true, a.reindex(y_true.index).fillna(False)),
                'tumor burden': burden,
                'caller': m,
                'sample': sample,
            })
pd_results = pd.DataFrame(rows, columns=['AUPRC', 'tumor burden', 'caller', 'sample'])

# Highest burden first; with an explicit order, category i sits at x position i.
burden_order = sorted(np.unique(pd_results['tumor burden'].values), reverse=True)
for sample in samples:
    grid = sns.catplot(x="tumor burden", y="AUPRC", hue="caller",
                       capsize=.2, height=4, aspect=1.5, kind="point",
                       order=burden_order,
                       data=pd_results[(pd_results['sample'] == sample)])
    ax = grid.axes.flat[0]
    for pos, burden in enumerate(burden_order):
        ax.hlines(baseline_by_sample_burden[(sample, float(burden))], pos - 0.4, pos + 0.4,
                  colors='k', linestyles='--',
                  label='baseline AUPRC' if pos == 0 else '_nolegend_')
    ax.set_ylim([0, 1])

In [None]:
# Score each caller's T20 indel calls against the TRUTH labels of the reference
# sample at each purity, and plot AUPRC per tumour burden with the prevalence
# baseline (fraction of TRUTH-positive sites) of that same reference.
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']
samples = ['icgc_cll_T20_tumour', 'icgc_mbl_T20_tumour']
purities = ['T20_', 'T40_', '']
burdens = [0.2, 0.4, 1]  # tumour burden plotted for each entry of `purities`

rows = []
# Kept for backward compatibility: one value per sample, holding the baseline
# of the LAST purity processed (it is overwritten on every purity).
baselineAUPRC_dict = {}
# Baseline per (sample, tumour burden); this is what the plot uses, so each
# burden is compared with the baseline of its own reference.
baseline_by_sample_burden = {}

for sample in samples:
    for purity, burden in zip(purities, burdens):
        # 'icgc_cll_T20_tumour' -> 'icgc_cll_' + purity + 'tumour'
        y_true = df_ind[df_ind['Sample_Name'] == sample.split('T')[0] + purity + 'tumour']['TRUTH'].astype(bool)
        n_duplicates = sum(y_true.index.duplicated())
        # Keep the first row per site: reindex() below needs unique labels.
        y_true = y_true[~y_true.index.duplicated(keep='first')]
        baselineAUPRC = len(y_true[y_true == True]) / len(y_true)
        print(sample, burden, 'duplicated sites dropped:', n_duplicates, 'baseline AUPRC:', baselineAUPRC)
        baselineAUPRC_dict[sample] = baselineAUPRC
        baseline_by_sample_burden[(sample, float(burden))] = baselineAUPRC
        for m in methods:
            a = df_ind[df_ind['Sample_Name'] == sample][m]
            a = a[~a.index.duplicated(keep='first')]
            rows.append({
                # Sites of the reference that the caller has no row for count as "not called".
                'AUPRC': average_precision_score(y_true, a.reindex(y_true.index).fillna(False)),
                'tumor burden': burden,
                'caller': m,
                'sample': sample,
            })
pd_results = pd.DataFrame(rows, columns=['AUPRC', 'tumor burden', 'caller', 'sample'])

# Highest burden first; with an explicit order, category i sits at x position i.
burden_order = sorted(np.unique(pd_results['tumor burden'].values), reverse=True)
for sample in samples:
    grid = sns.catplot(x="tumor burden", y="AUPRC", hue="caller",
                       capsize=.2, height=4, aspect=1.5, kind="point",
                       order=burden_order,
                       data=pd_results[(pd_results['sample'] == sample)])
    ax = grid.axes.flat[0]
    for pos, burden in enumerate(burden_order):
        ax.hlines(baseline_by_sample_burden[(sample, float(burden))], pos - 0.4, pos + 0.4,
                  colors='k', linestyles='--',
                  label='baseline AUPRC' if pos == 0 else '_nolegend_')
    ax.set_ylim([0, 1])