# Tissue biopsy benchmark

In [None]:
# Imports

import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
import warnings
from sklearn.metrics import precision_recall_curve, f1_score, average_precision_score
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix

# Move one level up unless the notebook is already running from a directory
# whose name ends with 'cfdna_snv_benchmark'; the relative paths used below
# ('config/', 'figures/...', the `utils` package) are resolved from there.
launch_dir = os.getcwd()
if not launch_dir.endswith('cfdna_snv_benchmark'):
    os.chdir('../')
print('Current working directory: {}'.format(os.getcwd()))

from utils.config import Config
from utils.viz import *
from utils.table import *
from utils.metrics import *
from utils.venn import venn6, get_labels

In [None]:
# Config and display parameters

config = Config("config/", "config_viz.yaml")
set_display_params(config)

# caller name -> plotting colour (colours are taken position by position)
color_dict = {name: config.colors[pos] for pos, name in enumerate(config.methods)}
# dilution fraction -> line opacity, decreasing by 0.3 for each successive fraction
alpha_dict = {fraction: 1-rank*0.3 for rank, fraction in enumerate(config.tissuebenchmark.fractions)}

print(color_dict)
print(alpha_dict)

In [None]:
# Build the call table used by every cell below. Later cells read the columns
# 'sample', 'mutation type', 'purity', 'TRUTH', 'CHROM_POS', one column per
# caller and one '<caller>_score' column per caller.
df_table = get_call_table_tissue(config)
print(df_table.shape)
# Preview of the first rows (displayed as the cell output).
df_table.head()

# Precision - Recall curve

In [None]:
# One precision-recall figure per (mutation type, sample): one curve for every
# caller at every dilution fraction, plus the fraction of ground-truth
# positives at each dilution drawn as a dashed horizontal baseline.
for muttype in config.muttype:
    for i, sample in enumerate(config.tissuebenchmark.samples):
        fig, ax = plt.subplots(figsize=(10,10))
        baseline_dict = {}
        for f in config.tissuebenchmark.fractions:
            # The table slice is identical for every caller at this dilution: select it once.
            vcf_sample = df_table[(df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['purity'] == round(config.tissuebenchmark.purities[i]*f, 2))]
            truth = vcf_sample['TRUTH']
            for method in config.tissuebenchmark.methods:
                # Missing scores are replaced by 0. fillna() returns a new Series here:
                # filling in place on a column of a sliced frame is chained assignment
                # and is silently lost when pandas Copy-on-Write is active.
                scores = vcf_sample[method + '_score'].fillna(0)
                precision, recall, _ = precision_recall_curve(truth, scores)
                # Only the curves drawn for f == 1 carry the caller name as their label.
                estimator_name = method if f == 1 else ''
                plot_pr_curve(precision, recall, estimator_name=estimator_name, f1_score=None, figax=(fig, ax), kwargs={'color':color_dict[method], 'alpha':alpha_dict[f], 'lw':2})
            # Baseline = share of loci labelled TRUTH at this dilution.
            baseline_dict[f] = len(truth[truth])/len(truth)
            plt.axhline(y=baseline_dict[f], c='k', ls='--', alpha=alpha_dict[f])
        handles, labels = plt.gca().get_legend_handles_labels()
        # Legend = caller curves + one solid proxy line per purity + one dashed proxy line per baseline.
        list_lines = [Line2D([0], [0], color='black', alpha=alpha_dict[f], label='tumor purity = {:.2f}%'.format(round(100*config.tissuebenchmark.purities[i]*f, 2))) for f in config.tissuebenchmark.fractions]
        baseline_lines = [Line2D([0], [0], color='black', ls='--', alpha=alpha_dict[f], label="baseline tf {:.2f}% = {:.2f}".format(round(100*config.tissuebenchmark.purities[i]*f, 2), baseline_dict[f])) for f in config.tissuebenchmark.fractions]
        legend_list = handles + list_lines + baseline_lines
        plt.legend(bbox_to_anchor=(1, 1), loc="upper left", handles=legend_list)
        plt.title("Precision Recall curve for {} calling in {}".format(muttype, sample))
        plt.xlim([-0.01, 1.01])
        plt.ylim([-0.01, 1.01])
        plt.savefig('figures/tissue_benchmark/'+sample+'_'+muttype+'_prcurve',  bbox_inches='tight')

# Baselines of the last figure drawn (displayed as the cell output).
baseline_dict

# AUPRC plots


In [None]:
# For every mutation type, collect the average precision (AUPRC) of each caller
# at each tumor purity, add the positive-rate baseline as an extra "caller",
# then draw one point plot per sample.
for muttype in config.muttype:
    aux_auprc = []
    aux_method = []
    aux_sample = []
    aux_tp = []
    for i, sample in enumerate(config.tissuebenchmark.samples):
        for f in config.tissuebenchmark.fractions:
            purity_pct = round(100*config.tissuebenchmark.purities[i]*f, 2)
            # The table slice is identical for every caller at this dilution: select it once.
            vcf_sample = df_table[(df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['purity'] == round(config.tissuebenchmark.purities[i]*f, 2))]
            truth = vcf_sample['TRUTH']
            for method in config.tissuebenchmark.methods:
                # Missing scores are replaced by 0. fillna() returns a new Series here:
                # filling in place on a column of a sliced frame is chained assignment
                # and is silently lost when pandas Copy-on-Write is active.
                scores = vcf_sample[method + '_score'].fillna(0)
                aux_auprc.append(average_precision_score(truth, scores))
                aux_method.append(method)
                aux_sample.append(sample)
                aux_tp.append(purity_pct)
            # baseline: share of loci labelled TRUTH at this dilution
            aux_auprc.append(len(truth[truth])/len(truth))
            aux_method.append('baseline')
            aux_sample.append(sample)
            aux_tp.append(purity_pct)

    results_df = pd.DataFrame({'AUPRC': aux_auprc, 'tumor purity': aux_tp, 'caller': aux_method, 'sample': aux_sample})

    # `palette` is the seaborn argument that maps hue levels to colours; keying it
    # by caller name keeps each caller's colour and draws the baseline in black.
    caller_palette = {**color_dict, 'baseline': 'k'}
    for si, sample in enumerate(config.tissuebenchmark.samples):
        sample_results = results_df[results_df['sample'] == sample]
        sns.catplot(x="tumor purity", y="AUPRC", hue="caller",
                    capsize=.2, height=4, aspect=1.5, kind="point", palette=caller_palette,
                    # highest purity first on the x axis
                    order=sorted(sample_results['tumor purity'].unique(), reverse=True),
                    data=sample_results)
        plt.ylim([0, 1])
        plt.title("AUPRC score for {} calling in {}".format(muttype, sample))
        plt.savefig('figures/tissue_benchmark/'+sample+'_'+muttype+'_auprcscore',  bbox_inches='tight')

# Venn diagram

Readability issue

In [None]:
# Six-set Venn diagram: ground truth versus the loci called by each caller.
# NOTE: `vcf_sample` is whatever table slice the previously executed cell left
# behind (its last sample / mutation type / purity), so run that cell first.
truth_loci = list(vcf_sample['TRUTH'][vcf_sample['TRUTH']].index)
# Index each caller's own column (`m`); the loop variable `method` leaked from an
# earlier cell would make every caller set identical.
called_loci = [list(vcf_sample[m][vcf_sample[m]].index) for m in config.methods]
labels = get_labels([truth_loci] + called_loci, fill=["number"])

venn6(labels, names=['TRUTH'] + config.methods)

# Change in prediction with dilution

In [None]:
# Keep only the ground-truth loci, with the locus/sample description and the
# call of each of the five callers.
truth_columns = ['CHROM_POS', 'sample', 'purity', 'mutation type', 'TRUTH', 'freebayes','mutect2', 'strelka2', 'vardict', 'varscan']
is_truth = df_table['TRUTH'] == True
df = df_table[is_truth][truth_columns]
# NOTE(review): `unstack` is referenced but not called, so the cell only
# displays the bound method - confirm whether a reshape was intended.
df.unstack

In [None]:
# For every (sample, mutation type), show how each caller's decision on
# ground-truth loci changes across dilutions. Loci are grouped by their
# called / not-called pattern and each pattern is drawn as one line whose
# width is the number of loci sharing it.
for i, sample in enumerate(config.tissuebenchmark.samples):
    # Diluted purities of this sample and the matching column labels.
    purity_values = [round(config.tissuebenchmark.purities[i]*f, 2) for f in config.tissuebenchmark.fractions]
    purity_cols = [str(p) for p in purity_values]
    for muttype in config.muttype:
        fig, ax = plt.subplots(figsize=(15,8))
        for m, method in enumerate(config.methods):
            # Ground-truth loci called by `method` at any purity, indexed by locus.
            change_df_aux = df_table[(df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['TRUTH']) & (df_table[method])][['CHROM_POS', 'TRUTH', 'purity'] + [method]].set_index(['CHROM_POS'])
            change_df = pd.DataFrame(index=change_df_aux.index.unique())
            change_df['TRUTH'] = change_df_aux['TRUTH'].groupby(level=0).first()
            # change_df_aux only holds called loci, so a locus is called at a purity
            # exactly when it has a row there. Whole columns are assigned at once;
            # writing cell by cell through `df.loc[row][col] = ...` is chained
            # assignment and may only modify a temporary copy.
            for purity, col in zip(purity_values, purity_cols):
                called_loci = change_df_aux.index[change_df_aux['purity'] == purity]
                change_df[col] = change_df.index.isin(called_loci)
            # Keep the loci whose call changes with purity: called at some purities but not all.
            n_called = change_df[purity_cols].sum(axis=1)
            change_df = change_df[(n_called > 0) & (n_called < len(purity_cols))]
            change_df = change_df.astype(int)
            # One row per distinct pattern, with the number of loci in column 'size'.
            change_df = change_df.groupby(change_df.columns.tolist(),as_index=False).size()
            print(method)
            print(change_df)

            c = 0.7
            for idx, row in change_df.iterrows():
                # Frequent patterns are drawn more opaque. The raw value is negative for
                # a pattern of size 1 (0.7 - 1) and matplotlib only accepts an alpha in
                # [0, 1], so keep it at 0.1 or above.
                line_alpha = max(c-max(1/row['size'], 0.2), 0.1)
                plt.plot(row[['TRUTH'] + purity_cols], lw=row['size'], label=method, c=config.colors[m], alpha=line_alpha)
        # One legend entry per caller instead of one per plotted pattern.
        custom_lines = [Line2D([0], [0], color=config.colors[m], lw=2) for m in range(len(config.methods))]
        plt.legend(custom_lines, config.methods)

        plt.xlabel('purity')
        plt.ylabel('mutation calling output prediction')
        plt.title('Change in mutation prediction w.r.t purity in {} for {}'.format(sample, muttype))

        ax.set_yticks([0,1])
        ax.set_yticklabels(['False','True'])
        plt.show()

# True Positive, False Negative and False Positive Mutations

In [None]:
# Long-format confusion-matrix table: one row per
# (mutation type, sample, caller, purity, metric).
df = pd.DataFrame(columns=['sample', 'mutation type', 'caller', 'purity', 'metric', 'number of loci', 'rate',  'number of mutations (ground truth)'])
metric = ['TN', 'FP', 'FN', 'TP']
c = 0  # index of the next row to write
for muttype in config.muttype:
    for i, sample in enumerate(config.tissuebenchmark.samples):
        for method in config.tissuebenchmark.methods:
            print(muttype, sample, method)
            for f in config.tissuebenchmark.fractions:
                purity = round(config.tissuebenchmark.purities[i]*f, 2)
                at_purity = (df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['purity'] == purity)
                vcf_sample = df_table[at_purity]
                ntruth = sum(vcf_sample['TRUTH'])
                tn, fp, fn, tp = confusion_matrix(vcf_sample['TRUTH'], vcf_sample[method], labels=[0,1]).ravel()
                tpfn = tp + fn
                tnfp = tn + fp
                # TN and FP are expressed as a rate over the negatives (tn + fp),
                # FN and TP as a rate over the positives (tp + fn).
                for name, count, total in zip(metric, (tn, fp, fn, tp), (tnfp, tnfp, tpfn, tpfn)):
                    df.loc[c, :] = [sample, muttype, method, purity, name, count, count/total, ntruth]
                    c += 1
df.head(20)

In [None]:
def _annotate_bar_counts(axes, data, callers, purities):
    """Write the number of loci above every bar of a caller-by-purity bar plot.

    Bars are drawn hue-major: `axes.containers` holds one group of bars per
    purity (in `purities` order), and inside a group the bars follow the
    x-axis order (`callers`). Pairing bars through the containers keeps every
    annotation on the bar it describes.
    """
    for purity, container in zip(purities, axes.containers):
        for caller, bar in zip(callers, container):
            n_loci = data[(data['caller'] == caller) & (data['purity'] == purity)]['number of loci'].values[0]
            axes.annotate(n_loci,
                          (bar.get_x() + bar.get_width() / 2,
                           bar.get_height()), ha='center', va='center',
                          size=15, xytext=(0, 8),
                          textcoords='offset points')


# One bar plot per (mutation type, sample, metric): rate per caller, coloured by
# purity, each bar annotated with the underlying number of loci.
rate_plots = [('TP', 'True Positive'), ('FP', 'False Positive'), ('FN', 'False Negative')]
for muttype in config.muttype:
    for s, sample in enumerate(config.tissuebenchmark.samples):
        # Explicit x and hue orders, shared by the plot and by the annotations.
        callers = list(config.tissuebenchmark.methods)
        purities = [round(config.tissuebenchmark.purities[s]*f, 2) for f in config.tissuebenchmark.fractions]
        for metric_name, metric_title in rate_plots:
            data = df[(df['sample'] == sample) & (df['mutation type'] == muttype) & (df['metric'] == metric_name)]
            plt.figure(figsize=(10,8))
            plots = sns.barplot(x="caller", y="rate", hue="purity", data=data, ci=None, palette='Blues_r',
                                order=callers, hue_order=purities)
            plt.suptitle(metric_title+' Rate - '+sample+ ' - '+muttype)
            plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0, title='purity')
            plt.ylim([0,1])
            _annotate_bar_counts(plots, data, callers, purities)
            plt.savefig('figures/tissue_benchmark/'+sample+'_'+muttype+'_'+metric_name+'rate',  bbox_inches='tight')


# Stacked TP, FN, FP, TN

In [None]:
# Wide-format confusion-matrix table: one row per
# (mutation type, sample, caller, purity) with the four counts as columns.
# The last column holds the true negatives, so it is named 'TN'
# (naming it 'FN' again would create two columns with the same name).
df = pd.DataFrame(columns=['sample', 'mutation type', 'caller', 'purity', 'TP', 'FN', 'FP', 'TN'])
c = 0  # index of the next row to write
for muttype in config.muttype:
    for i, sample in enumerate(config.tissuebenchmark.samples):
        for method in config.tissuebenchmark.methods:
            print(muttype, sample, method)
            for f in config.tissuebenchmark.fractions:
                purity = round(config.tissuebenchmark.purities[i]*f, 2)
                vcf_sample = df_table[(df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['purity'] == purity)]
                tn, fp, fn, tp = confusion_matrix(vcf_sample['TRUTH'], vcf_sample[method], labels=[0,1]).ravel()
                # Values are written in the same order as the columns: TP, FN, FP, TN.
                df.loc[c, :] = [sample, muttype, method, purity, tp, fn, fp, tn]
                c += 1
df.head()

In [None]:
# Long-format confusion-matrix table keyed by 'caller_purity', with one extra
# all-zero '<caller>_0' row after each caller.
df = pd.DataFrame(columns=['sample', 'mutation type', 'caller', 'purity',  'caller_purity',  'metric', 'number of loci', 'rate'])
metric = ['TN', 'FP', 'FN', 'TP']
c = 0  # number of metric rows written so far
d = 0  # number of extra '<caller>_0' rows written so far
for muttype in config.muttype:
    for i, sample in enumerate(config.tissuebenchmark.samples):
        for method in config.tissuebenchmark.methods:
            print(muttype, sample, method)
            for f in config.tissuebenchmark.fractions:
                purity = round(config.tissuebenchmark.purities[i]*f, 2)
                at_purity = (df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['purity'] == purity)
                vcf_sample = df_table[at_purity]
                tn, fp, fn, tp = confusion_matrix(vcf_sample['TRUTH'], vcf_sample[method], labels=[0,1]).ravel()
                tpfn = tp + fn
                tnfp = tn + fp
                # TN and FP are expressed as a rate over the negatives (tn + fp),
                # FN and TP as a rate over the positives (tp + fn).
                for name, count, total in zip(metric, (tn, fp, fn, tp), (tnfp, tnfp, tpfn, tpfn)):
                    df.loc[c+d, :] = [sample, muttype, method, purity, method + '_' + str(purity), name, count, count/total]
                    c += 1

            # Extra all-zero row written once per caller, after its last purity.
            df.loc[c+d, :] = [sample, muttype, method, 0, method+'_0', 'TN', 0, 0]
            d += 1
df.head(30)

In [None]:
# One bar plot per (mutation type, sample): number of loci per caller_purity,
# coloured by metric, with the metrics drawn at the same x position (dodge off).
for muttype in config.muttype:
    for s, sample in enumerate(config.tissuebenchmark.samples):
        in_sample = (df['sample'] == sample) & (df['mutation type'] == muttype)
        plt.figure(figsize=(10,4))
        bar = sns.barplot(x="caller_purity", y="number of loci", hue="metric", data=df[in_sample],
                    palette=['lightcoral', 'red', 'lightgreen', 'green'],
                    dodge= False)
        bar.set_xticklabels(bar.get_xticklabels(), rotation=90, horizontalalignment='right')
        labels = bar.get_xticklabels()
        # Blank every fourth tick label.
        labels = [tick if (pos % 4 != 0) else '' for pos, tick in enumerate(labels, start=1)]
        bar.set_xticklabels(labels)
        plt.suptitle('Predictions - '+sample+ ' - '+muttype)
        plt.legend(bbox_to_anchor=(1.01, 1))
        plt.savefig('figures/tissue_benchmark/'+sample+'_'+muttype+'_predictions',  bbox_inches='tight')

# Number of mutations

In [None]:
# Per (mutation type, sample, caller, purity): number of loci the caller
# reports and number of ground-truth mutations.
df = pd.DataFrame(columns=['sample', 'mutation type', 'caller', 'purity', 'number of loci called', 'number of mutations (ground truth)'])
metric = ['TN', 'FP', 'FN', 'TP']
c = 0  # index of the next row to write
for muttype in config.muttype:
    for i, sample in enumerate(config.tissuebenchmark.samples):
        for method in config.tissuebenchmark.methods:
            print(muttype, sample, method)
            for f in config.tissuebenchmark.fractions:
                purity = round(config.tissuebenchmark.purities[i]*f, 2)
                at_purity = (df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['purity'] == purity)
                vcf_sample = df_table[at_purity]
                # Summing the TRUTH and caller columns counts their True entries.
                ntruth = sum(vcf_sample['TRUTH'])
                ncaller = sum(vcf_sample[method])
                df.loc[c, :] = [sample, muttype, method, purity, ncaller, ntruth]
                c += 1
df.head()

In [None]:
# One bar plot per (mutation type, sample): number of loci called by each
# caller at each purity (log scale), with the number of ground-truth mutations
# overlaid as a red line across the bars of each caller.
for muttype in config.muttype:
    for s, sample in enumerate(config.tissuebenchmark.samples):
        # Explicit x and hue orders, shared by the plot and by the overlay.
        callers = list(config.tissuebenchmark.methods)
        purities = [round(config.tissuebenchmark.purities[s]*f, 2) for f in config.tissuebenchmark.fractions]
        data = df[(df['mutation type'] == muttype) & (df['sample'] == sample)]
        plt.figure(figsize=(10,8))
        ax = sns.barplot(x="caller", y="number of loci called", hue="purity", data=data, ci=None, palette="Greys_r", log=True,
                         order=callers, hue_order=purities)
        leg1 = plt.legend(title='purity', loc=2)

        # Read the bars from the axes just drawn. `ax.containers` holds one group of
        # bars per purity (hue order) and, inside a group, one bar per caller
        # (x order), so bar `m` of each group belongs to caller `m`.
        for m, caller in enumerate(callers):
            xcoords, ycoords = [], []
            for purity, container in zip(purities, ax.containers):
                patch = container[m]
                xcoords.append(patch.get_x() + (patch.get_width() / 2))
                ycoords.append(data[(data['caller'] == caller) & (data['purity'] == purity)]['number of mutations (ground truth)'].values[0])
            if m == 0:
                # Handle reused below for the legend entry of the red lines.
                redline = ax.plot(xcoords, ycoords, 'ro-', label='Number of ground truths mutations')
            else:
                ax.plot(xcoords, ycoords, 'ro-')
        # NOTE(review): the y axis is logarithmic, where a lower limit of 0 is not
        # valid; matplotlib is expected to warn and keep the current lower limit.
        plt.ylim([0,15000])
        plt.legend(redline, ['# ground truths mutations'], loc='upper right', bbox_to_anchor=(1, 1))
        # plt.legend replaced the purity legend: add it back as a separate artist.
        plt.gca().add_artist(leg1)
        plt.suptitle('Number of mutations called - '+sample+' - '+muttype)
    

# Ratio performance attenuation

In [None]:
# Rebuild the long-format confusion-matrix table, then keep the true positives
# and express them as a fraction of the ground-truth mutations ('TP/GT').
df = pd.DataFrame(columns=['sample', 'mutation type', 'caller', 'purity', 'metric', 'number of loci', 'rate',  'number of mutations (ground truth)'])
metric = ['TN', 'FP', 'FN', 'TP']
c = 0  # index of the next row to write
for muttype in config.muttype:
    for i, sample in enumerate(config.tissuebenchmark.samples):
        for method in config.tissuebenchmark.methods:
            print(muttype, sample, method)
            for f in config.tissuebenchmark.fractions:
                purity = round(config.tissuebenchmark.purities[i]*f, 2)
                at_purity = (df_table['sample'] == sample) & (df_table['mutation type'] == muttype) & (df_table['purity'] == purity)
                vcf_sample = df_table[at_purity]
                ntruth = sum(vcf_sample['TRUTH'])
                tn, fp, fn, tp = confusion_matrix(vcf_sample['TRUTH'], vcf_sample[method], labels=[0,1]).ravel()
                tpfn = tp + fn
                tnfp = tn + fp
                # TN and FP are expressed as a rate over the negatives (tn + fp),
                # FN and TP as a rate over the positives (tp + fn).
                for name, count, total in zip(metric, (tn, fp, fn, tp), (tnfp, tnfp, tpfn, tpfn)):
                    df.loc[c, :] = [sample, muttype, method, purity, name, count, count/total, ntruth]
                    c += 1

# Only the true-positive rows are needed for the detection ratio.
is_tp = df['metric'] == 'TP'
df = df[is_tp]
df['TP/GT'] =  df['number of loci'].divide(df['number of mutations (ground truth)'], axis='index')

df.head(20)

In [None]:
# One point plot per (mutation type, sample): detected fraction of the
# ground-truth mutations (TP/GT) per caller as a function of purity.
for muttype in config.muttype:
    for i, sample in enumerate(config.tissuebenchmark.samples):
        # x axis follows the dilution order of the configuration.
        purities = [round(config.tissuebenchmark.purities[i]*f, 2) for f in config.tissuebenchmark.fractions]
        # catplot creates its own figure (sized through height/aspect), so no
        # plt.figure() call is made here: it would only leave an empty figure behind.
        g = sns.catplot(x='purity', y='TP/GT', hue='caller', data=df[(df['sample'] == sample) & (df['mutation type'] == muttype)], kind='point', height=8,
                        order=purities)
        plt.title('Detection rate evolution w.r.t purity in '+sample+' - '+muttype)
        plt.grid()
        plt.ylim([0.5,1])
        plt.savefig('figures/tissue_benchmark/'+sample+'_'+muttype+'_detectionrate',  bbox_inches='tight')

In [None]:
# Relative drop (in %) of TP/GT between the rows with purity > 0.9 and the rows
# with purity <= 0.2, per (sample, mutation type, caller).
res = df
group_keys = ['sample', 'mutation type', 'caller']
detection = res[group_keys + ['TP/GT', 'purity']]
# a: rows with purity above 0.9; b: rows with purity at most 0.2.
# Both are indexed by the group keys so that the subtraction aligns on them.
a = detection[res['purity'] > 0.9].set_index(group_keys).drop('purity', axis=1)
b = detection[res['purity'] <= 0.2].set_index(group_keys).drop('purity', axis=1)
res = (a-b).divide(a).reset_index()
res['TP/GT'] = 100*res['TP/GT']
res

In [None]:
# One categorical plot per mutation type: performance attenuation (in %) of
# each caller between the high- and low-purity versions of every sample.
for muttype in config.muttype:
    # catplot creates its own figure (sized through height/aspect), so no
    # plt.figure() call is made here: it would only leave an empty figure behind.
    sns.catplot(y='TP/GT', x='sample', hue='caller', data=res[res['mutation type'] == muttype], height=8, aspect=0.8, s=15)
    # Zero line: no difference between high and low purity.
    plt.axhline(y=0, c='k', ls='--')
    plt.grid()
    plt.ylim([-15, 20])
    plt.ylabel('(TP/GT$_{\\rm{high purity}}$ - TP/GT$_{\\rm{low purity}}$) / TP/GT$_{\\rm{high purity}}$) (%) ')
    plt.title('Performance attenuation factor\n between high and low tumor purity sample - ' + muttype)
    plt.savefig('figures/tissue_benchmark/'+muttype+'_performanceattenuationfactor',  bbox_inches='tight')