In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
from collections import Counter
from matplotlib import cm
from matplotlib.colors import ListedColormap

import io
from pysam import VariantFile

from sklearn.metrics import precision_score, recall_score, average_precision_score, precision_recall_curve

In [None]:
# Build an 8-entry colormap: four base colours followed by the same four
# colours at 30% opacity, plus the named colour cycle used for per-caller lines.
#
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` accepts the same (name, lut) arguments and works on both old
# and new releases.
# NOTE(review): the variable is called `tab20` but holds 'tab10' resampled to
# 8 entries; the name is kept so that any other cell referring to it still works.
tab20 = plt.get_cmap('tab10', 8)

# Entries 0-3 are taken as-is; entries 4-7 reuse the same RGB values with the
# last (alpha) channel replaced by 0.3.
newcmap_list = (
    [tab20.colors[i] for i in range(4)]
    + [np.array(list(tab20.colors[i][:-1]) + [0.3]) for i in range(4)]
)

newcmap = ListedColormap(newcmap_list, name='newcmap')
# One named colour per variant caller (indexed in the same order as `methods`).
color_list = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple']

In [None]:
# Load the SNV benchmark table, keep the two T20 tumour samples, and reshape it
# into one row per call keyed by "<chrom>_<pos>" with one column per caller.
df_snv = (
    pd.read_csv('../data/SMURF benchmark/snv-combined-test20-v7-2.csv')
    # Row filter (sample of interest) and column selection in a single step.
    .loc[
        lambda calls: calls['Sample_Name'].isin(['icgc_cll_T20_tumour', 'icgc_mbl_T20_tumour']),
        ['X.CHROM', 'POS', 'Sample_Name', 'TRUTH',
         'FILTER_Mutect2', 'FILTER_Freebayes', 'FILTER_Vardict', 'FILTER_Varscan', 'FILTER_Strelka2'],
    ]
    # Combine chromosome and position into a single string key.
    .assign(CHROM_POS=lambda calls: calls['X.CHROM'].astype(str).str.cat(calls['POS'].astype(str), sep="_"))
    .drop(columns=['X.CHROM', 'POS'])
    # Short caller names are what the metric cells use as column labels.
    .rename(columns={
        'FILTER_Mutect2': "mutect2",
        'FILTER_Freebayes': 'freebayes',
        'FILTER_Vardict': 'vardict',
        'FILTER_Varscan': 'varscan',
        'FILTER_Strelka2': 'strelka2',
    })
    .set_index('CHROM_POS')
)
print(df_snv.shape)
# Sanity check if needed: df_snv.index[df_snv.index.duplicated()] lists any
# CHROM_POS keys that occur more than once in the index.
df_snv.head()

In [None]:
# Score every variant caller against the SNV truth labels, separately for each
# sample, then plot precision-recall curves and summary point plots.
# NOTE(review): precision_score / recall_score need label-like predictions, so
# the caller columns are assumed to hold binary calls — confirm against the CSV.
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']
samples = ['icgc_cll_T20_tumour', 'icgc_mbl_T20_tumour']

# Filter the call table once per sample instead of once per metric call.
calls_by_sample = {sample: df_snv[df_snv['Sample_Name'] == sample] for sample in samples}

# One record per (caller, sample) pair, caller-major, so the row order is the
# same as iterating `for m in methods for sample in samples`.
auprc_records = []
pr_records = []
ap_by_pair = {}  # (caller, sample) -> average precision, reused for the legends
for m in methods:
    for sample in samples:
        truth = calls_by_sample[sample]['TRUTH']
        calls = calls_by_sample[sample][m]
        ap = average_precision_score(truth, calls)
        ap_by_pair[(m, sample)] = ap
        auprc_records.append({'AUPRC': ap, 'tumor burden': 0.2, 'caller': m, 'sample': sample})
        pr_records.append({'precision': precision_score(truth, calls),
                           'recall': recall_score(truth, calls),
                           'tumor burden': 0.2, 'caller': m, 'sample': sample})

pd_results = pd.DataFrame(auprc_records, columns=['AUPRC', 'tumor burden', 'caller', 'sample'])
pd_results_PR = pd.DataFrame(pr_records, columns=['precision', 'recall', 'tumor burden', 'caller', 'sample'])
# Long format: one row per (caller, sample, metric) for the faceted plot below.
pd_results_PR = pd.melt(pd_results_PR, id_vars=['tumor burden', 'caller', 'sample'],
                        value_vars=['precision', 'recall'],
                        var_name='metric', value_name='value')

# Plot Precision-Recall curve
for sample in samples:
    print(sample)
    sample_calls = calls_by_sample[sample]
    plt.figure()
    plt.title('Precision-Recall curves for sample ' + sample)
    for i, mi in enumerate(methods):
        # The thresholds are not needed; a named throwaway avoids rebinding `_`,
        # which IPython uses for the last cell output.
        precision, recall, _thresholds = precision_recall_curve(sample_calls['TRUTH'], sample_calls[mi])
        plt.plot(recall, precision, 'o-',
                 label=mi + ', AP=' + str(round(ap_by_pair[(mi, sample)], 2)),
                 c=color_list[i])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.05])
    plt.legend()
    plt.show()

print(pd_results.head())
print(pd_results.shape)
print(pd_results_PR.head())
print(pd_results_PR.shape)

# sns.catplot is figure-level and creates its own figure, so no plt.figure()
# call precedes it (that would only emit an empty extra figure). Axis limits are
# set through the returned FacetGrid so they apply to every facet.
auprc_grid = sns.catplot(x="sample", y="AUPRC", hue="caller",
                         capsize=.2, height=6, aspect=.75, kind="point", data=pd_results)
auprc_grid.set(ylim=(0, 1))

pr_grid = sns.catplot(x="tumor burden", y="value", col="metric", hue="caller",
                      capsize=.2, height=6, aspect=.75, kind="point", data=pd_results_PR)
pr_grid.set(ylim=(0, 1))
plt.show()


In [None]:
# Load the indel benchmark table, keep the two T20 tumour samples, and reshape
# it into one row per call keyed by "<chrom>_<pos>" with one column per caller.
df_ind = (
    pd.read_csv('../data/SMURF benchmark/indel-combined-test20-v7-2.csv')
    # Row filter (sample of interest) and column selection in a single step.
    .loc[
        lambda calls: calls['Sample_Name'].isin(['icgc_cll_T20_tumour', 'icgc_mbl_T20_tumour']),
        ['X.CHROM', 'POS', 'Sample_Name', 'TRUTH',
         'FILTER_Mutect2', 'FILTER_Freebayes', 'FILTER_Vardict', 'FILTER_Varscan', 'FILTER_Strelka2'],
    ]
    # Combine chromosome and position into a single string key.
    .assign(CHROM_POS=lambda calls: calls['X.CHROM'].astype(str).str.cat(calls['POS'].astype(str), sep="_"))
    .drop(columns=['X.CHROM', 'POS'])
    # Short caller names are what the metric cells use as column labels.
    .rename(columns={
        'FILTER_Mutect2': "mutect2",
        'FILTER_Freebayes': 'freebayes',
        'FILTER_Vardict': 'vardict',
        'FILTER_Varscan': 'varscan',
        'FILTER_Strelka2': 'strelka2',
    })
    .set_index('CHROM_POS')
)
print(df_ind.shape)
df_ind.head()

In [None]:
# Score every variant caller against the indel truth labels, separately for
# each sample, then plot precision-recall curves and summary point plots.
# NOTE(review): precision_score / recall_score need label-like predictions, so
# the caller columns are assumed to hold binary calls — confirm against the CSV.
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']
samples = ['icgc_cll_T20_tumour', 'icgc_mbl_T20_tumour']

# Filter the call table once per sample instead of once per metric call.
calls_by_sample = {sample: df_ind[df_ind['Sample_Name'] == sample] for sample in samples}

# One record per (caller, sample) pair, caller-major, so the row order is the
# same as iterating `for m in methods for sample in samples`.
auprc_records = []
pr_records = []
ap_by_pair = {}  # (caller, sample) -> average precision, reused for the legends
for m in methods:
    for sample in samples:
        truth = calls_by_sample[sample]['TRUTH']
        calls = calls_by_sample[sample][m]
        ap = average_precision_score(truth, calls)
        ap_by_pair[(m, sample)] = ap
        auprc_records.append({'AUPRC': ap, 'tumor burden': 0.2, 'caller': m, 'sample': sample})
        pr_records.append({'precision': precision_score(truth, calls),
                           'recall': recall_score(truth, calls),
                           'tumor burden': 0.2, 'caller': m, 'sample': sample})

pd_results = pd.DataFrame(auprc_records, columns=['AUPRC', 'tumor burden', 'caller', 'sample'])
pd_results_PR = pd.DataFrame(pr_records, columns=['precision', 'recall', 'tumor burden', 'caller', 'sample'])
# Long format: one row per (caller, sample, metric) for the faceted plot below.
pd_results_PR = pd.melt(pd_results_PR, id_vars=['tumor burden', 'caller', 'sample'],
                        value_vars=['precision', 'recall'],
                        var_name='metric', value_name='value')

# Plot Precision-Recall curve
for sample in samples:
    print(sample)
    sample_calls = calls_by_sample[sample]
    plt.figure()
    plt.title('Precision-Recall curves for sample ' + sample)
    for i, mi in enumerate(methods):
        # The thresholds are not needed; a named throwaway avoids rebinding `_`,
        # which IPython uses for the last cell output.
        precision, recall, _thresholds = precision_recall_curve(sample_calls['TRUTH'], sample_calls[mi])
        plt.plot(recall, precision, 'o-',
                 label=mi + ', AP=' + str(round(ap_by_pair[(mi, sample)], 2)),
                 c=color_list[i])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.05])
    plt.legend()
    plt.show()

print(pd_results.head())
print(pd_results.shape)
print(pd_results_PR.head())
print(pd_results_PR.shape)

# sns.catplot is figure-level and creates its own figure, so no plt.figure()
# call precedes it (that would only emit an empty extra figure). Axis limits are
# set through the returned FacetGrid so they apply to every facet.
auprc_grid = sns.catplot(x="sample", y="AUPRC", hue="caller",
                         capsize=.2, height=6, aspect=.75, kind="point", data=pd_results)
auprc_grid.set(ylim=(0, 1))

pr_grid = sns.catplot(x="tumor burden", y="value", col="metric", hue="caller",
                      capsize=.2, height=6, aspect=.75, kind="point", data=pd_results_PR)
pr_grid.set(ylim=(0, 1))
plt.show()
