# Series analysis with respect to VAF


Analysis updates:
- retrieve calls rejected only because VAF<0.01 (bcbio lower acceptable threshold) for Mutect2, Strelka2 and Vardict
- correct for germline mutations using GATK haplotype calls
- get PR curves for sinvict by encoding the 6 files as thresholds. Assumption: linear filters
- plotting: PR curves stop at 10e-2 on the left
- ground truths:

    1) Consensus: build using
        majority of 5/8 callers for SNV and 3/5 callers for INDELS


    2) Ranked mutations: metascore built using
        weighted sum of normalised scores between 0 and 1 for each caller
        with weights = inversely proportional to the number of calls made by the caller (if a caller calls few mutations: higher weight; if it makes many calls: lower weight)
        threshold = 1/ncallers
        interpretation: if 1 caller is sure (score = 1) of calling this position, add it to GT
        interpretation: if 2 callers are quite sure of calling this position (score > 0.5 each), add it to GT
        
- integrate VAF approx

    1) mixture with VAF instead of tumor burden
    
    2) correct for mutations not present in diluted samples (vaf = 0)
    
    3) pool patients together using VAF

In [None]:
# Imports

%load_ext autoreload
%autoreload 2

import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
import warnings
from sklearn.metrics import precision_recall_curve, f1_score, average_precision_score
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix
from tqdm.notebook import trange, tqdm

# Step one directory up unless the current path already ends with the repository name
_in_repo_root = os.getcwd().endswith('cfdna_snv_benchmark')
if not _in_repo_root:
    os.chdir('../')
print('Current working directory: {}'.format(os.getcwd()))

from utils.config import Config
from utils.viz import *
from utils.table import *
from utils.metrics import *
from utils.calltable import *
from utils.calltableseries import *
from utils.groundtruth import *
from utils.metricsseries import *
from utils.venn import venn6, get_labels

In [None]:
# Config and display parameters

# Build the project Config from the "config/" folder and the "config_viz.yaml" file
config = Config("config/", "config_viz.yaml")
# Hand the config to the project's display helper, then list the configured methods
set_display_params(config)
print(config.methods)

In [None]:
# Run parameters: mixtures, chromosome selection, caching flags and plot styling

# All mixture identifiers, and the one analysed in this notebook
mixtureids = [
    'CRC-123_310715-CW-T_CRC-123_121115-CW-T',
    'CRC-1014_180816-CW-T_CRC-1014_090516-CW-T',
    'CRC-986_100215-CW-T_CRC-986_300316-CW-T',
]
mixtureid = 'CRC-986_100215-CW-T_CRC-986_300316-CW-T'
#mixtureid = 'CRC-1014_180816-CW-T_CRC-1014_090516-CW-T'
#mixtureid = 'CRC-123_310715-CW-T_CRC-123_121115-CW-T'
chrom = 'all'

# Flags and filter forwarded to the call-table loaders
reload = False
save = True
filterparam = 'all'
fixedvars = ['coverage', 'ctdna']

# Mutation types and metrics of interest
muttypes = ['snv', 'indel']
metrics = ['auprc', 'precision', 'recall']

# Plot styling; one colour per method, paired by position in the config lists
markers = ['o', '^', 'X']
linestyles = ['-'] * 3
color_dict = {method: config.colors[idx] for idx, method in enumerate(config.methods)}

# Part I: Load/Generate call tables, Generate Ground truths and Compute metrics per patient

In [None]:
# Load the SNV call-table series of one mixture, attach a ground truth and compute the AUPRC curve.
# Commented-out lines are the indel / snp variants and the loops over fixedvars, mixtureids and muttypes.
fixedvar = 'coverage'
#for fixedvar in fixedvars:
# Each seriesorder tuple ends up in the sample names built below as '<first>x' and '<second>x'
# (presumably the depths of the two mixed samples -- TODO confirm).
# NOTE: xaxis is only used by the commented-out metric calls; the active call passes xaxis='vaf'.
if fixedvar == 'coverage':
    seriesorder = [(1000, 0), (1000, 1000), (750, 1250), (500, 1500), (400, 1600), (200, 1800), (100, 1900)]
    xaxis = 'tumor burden'
elif fixedvar == 'ctdna':
    seriesorder = [(1000, 0), (1000, 1000), (1000, 1900)]
    xaxis = 'coverage'
#for mixtureid in mixtureids:
print('############# {} ############'.format(mixtureid))
# Container for the call tables; only the 'snv' entry is filled in below
calltables = {'sampleid':[], 'tf':[], 'cov':[], 'snv':[], 'indel':[], 'snp':[]}
# get_calltableseries returns the call table and an auxiliary table (aux) indexed by sample name
calltable_snv, aux = get_calltableseries(config, mixtureid, 'all', muttype='snv', filterparam=filterparam, reload=reload, save=save, diltype='mixture_wes', concat='vaf')
#calltable_indel, aux = get_calltableseries(config, mixtureid, 'all', muttype='indel', filterparam=filterparam, reload=reload, save=save, diltype='mixture_wes', concat='vaf')
#calltable_snp, aux = get_calltableseries(config, mixtureid, 'all', muttype='snp', filterparam=filterparam, reload=reload, save=save, diltype='mixture_wes', concat='vaf')
#print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
print(aux)
# Only chromosome Y rows are dropped (the chromosome X filter is disabled)
#calltable_snv = calltable_snv[calltable_snv['chrom'] != 'X']
calltable_snv = calltable_snv[calltable_snv['chrom'] != 'Y']
#calltable_indel = calltable_indel[calltable_indel['chrom'] != 'X']
#calltable_indel = calltable_indel[calltable_indel['chrom'] != 'Y']
#calltable_snp = calltable_snp[calltable_snp['chrom'] != 'X']
#calltable_snp = calltable_snp[calltable_snp['chrom'] != 'Y']
#calltable_snv['chrom'] = calltable_snv['chrom'].astype(int)
#calltable_indel['chrom'] = calltable_indel['chrom'].astype(int)
#calltable_snp['chrom'] = calltable_snp['chrom'].astype(int)
# The mixture id is '<a>_<b>_<c>_<d>': the first two fields name one sample, the rest the other
plasmasample = '_'.join(mixtureid.split('_')[:2])
print(plasmasample)
healthysample = '_'.join(mixtureid.split('_')[2:])
print(healthysample)
calltables['snv'].append(calltable_snv)
#calltables['indel'].append(calltable_indel)
#calltables['snp'].append(calltable_snp)
# NOTE: 'sampleid' is replaced by a plain string here (it was initialised as a list)
calltables['sampleid'] = mixtureid 
#calltables['tf'] = np.unique([cn.split('_')[0] for cn in list(calltable_snv.columns)])[:-5].astype(float)
# Collapse the single-element list into one DataFrame
calltables['snv'] = pd.concat(calltables['snv'])
#calltables['indel'] = pd.concat(calltables['indel'])
#calltables['snp'] = pd.concat(calltables['snp'])
# Rows of aux selected (and ordered) by the sample names derived from seriesorder
dilutionseries = aux.T[['mixture_chrall_' + '_'.join(mixtureid.split('_')[:2]) + '_' + str(s[0]) + 'x_' + '_'.join(mixtureid.split('_')[2:4]) + '_' + str(s[1]) + 'x' for s in seriesorder]].T
#for muttype in muttypes:
muttype = 'snv'
# Ground-truth method id handed to the project helpers: 3 for SNVs, 2 otherwise
if muttype == 'snv':
    gtm = 3
else:  # elif muttype == 'indel':
    gtm = 2
print(max(aux['tf']))
refsample = 'undiluted'
# For the CRC-986 mixture the ground truth comes from matched tissue calls instead of method gtm
# NOTE(review): the tissue path points at an SNV calls file whatever muttype is -- confirm before enabling indels
if mixtureid ==  'CRC-986_100215-CW-T_CRC-986_300316-CW-T':
    gtm = 3
    refsample = 'tissue'
    #calltablesseries  = generate_groundtruth(config, calltables[muttype], aux['tf'], ground_truth_method='tissue', muttype=muttype,
    #                                    matchedtissuepath=os.path.join('data', 'matchedtissue', 'NCC_CRC-986_100215-T1W', 'calls', 'NCC_CRC-986_100215-T1W_snv_calls_PASS_exome.csv'))
    calltablesseries  = generate_groundtruth(config, calltables[muttype], aux['tf'], ground_truth_method='tissue', muttype=muttype,
                                        matchedtissuepath=os.path.join('data', 'matchedtissue_ultradeep', '986_100215_T1-E', 'calls', '986_100215_T1-E_snv_calls_all.csv'))
else:
    calltablesseries = generate_groundtruth(config, calltables[muttype], aux['tf'], ground_truth_method=gtm, muttype=muttype)
print(calltablesseries)
# AUPRC over the dilution series; the recall / precision / figure variants are disabled below
results_auprc_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='auprc', ground_truth_method=gtm,
                                 refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis='vaf', diltype='mixture_wes')
#results_recall_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='recall', ground_truth_method=gtm,
#                                       refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')
#results_precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='precision', ground_truth_method=gtm,
#                                      refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')
#figure_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, xy='pr', ground_truth_method=gtm,
#                    refsample=refsample, muttype=muttype.upper(), methods=None, fixedvar=fixedvar, save=save, diltype='mixture_wes')

In [None]:
# Shorthand for the SNV call table used by the following cells; show its first rows
calltables_snv = calltables['snv']
calltables_snv.head()

In [None]:
# Ground-truth rows for which every method's '<method>_score' is missing
_score_cols = [m + '_score' for m in config.methods]
_is_truth = calltables_snv['truth'] == True
_unscored = calltables_snv[_score_cols].isna().all(axis=1)
calltables_snv[_is_truth & _unscored]

In [None]:
# Average precision of strelka2 for each 'sampletf' value, in ascending order
# (missing scores are treated as 0)
_sorted_tfs = np.sort(calltables_snv['sampletf'].unique())
for tf in _sorted_tfs:
    A = calltables_snv.loc[calltables_snv['sampletf'] == tf]
    _strelka2_scores = A['strelka2_score'].fillna(0)
    print(tf, average_precision_score(A['truth'], _strelka2_scores))

In [None]:
# Number of distinct index labels among the ground-truth rows
_truth_only = calltables_snv.loc[calltables_snv['truth'] == True, 'truth']
len(set(_truth_only.index))

In [None]:
# AUPRC of every caller inside fixed VAF bins, plotted against the bin midpoint.

# VAF bin edges (ascending after the reversal). Grids tried previously:
#   [1., 0.1, 0.075, 0.05, 0.025, 0.01, 0.009, 0.008, 0.007, 0.006, 0.005, 0.004, 0.003, 0.002, 0.001]
#   [1., 0.4, 0.3, 0.2, 0.15, 0.1, 0.075, 0.05, 0.04, 0.03, 0.02, 0.01, 0.001]
vafranges = [1., 0.4, 0.3, 0.2, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01, 0]
vafranges = vafranges[::-1]

# Drop the len(mixtureids) samples with the highest 'sampletf'.
# The index must start at 1: with i = 0, sorted[-0] is sorted[0], which removed the
# *lowest* value instead of the third highest.
sorted_tfs = np.sort(calltables_snv['sampletf'].unique())
cond = True
for i in range(1, len(mixtureids) + 1):
    cond = cond & (calltables_snv['sampletf'] != sorted_tfs[-i])
calltable_snv = calltables_snv[cond]

res = {}
for method in config.methods:
    print(method)
    score_col = method + '_score'
    x, y = [], []
    for vi in range(1, len(vafranges)):
        vaf_lo, vaf_hi = vafranges[vi-1], vafranges[vi]
        # Rows whose median VAF falls in [vaf_lo, vaf_hi)
        aux = calltable_snv[(calltable_snv['median_vaf'] >= vaf_lo) & (calltable_snv['median_vaf'] < vaf_hi)]
        if method == 'varnet':
            aux = aux[aux['sampleid'] == 'CRC-986_100215-CW-T_CRC-986_300316-CW-T']
        print(aux['truth'].sum(), aux.shape[0], vaf_lo, vaf_hi)
        # Skip bins without rows or where the caller has no score at all
        if aux.empty or aux[score_col].isna().all():
            continue
        x.append((vaf_lo + vaf_hi)/2)
        # Missing scores count as 0, i.e. "not called"
        y.append(average_precision_score(aux['truth'], aux[score_col].fillna(0)))
        #y.append(roc_auc_score(aux['truth'], aux[score_col].fillna(0)))
    res[method] = {'x': x, 'y': y}

color_dict = {config.methods[i]: config.colors[i] for i in range(len(config.methods))}
plt.figure(figsize=(15,8))
for k, v in res.items():
    plt.plot(v['x'], v['y'], marker='o', label=k, color=color_dict[k])
plt.gca().invert_xaxis()
plt.xscale("log")
#plt.ylim([0.4, 1])
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
plt.show()

In [None]:
# Recall at the max-F1 operating point of every caller inside fixed VAF bins.
import matplotlib.ticker

# VAF bin edges (ascending after the reversal). Grids tried previously:
#   [1., 0.1, 0.075, 0.05, 0.025, 0.01, 0.009, 0.008, 0.007, 0.006, 0.005, 0.004, 0.003, 0.002, 0.001]
#   [1., 0.4, 0.3, 0.2, 0.15, 0.1, 0.075, 0.05, 0.04, 0.03, 0.02, 0.01, 0.001]
vafranges = [1., 0.4, 0.3, 0.2, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01, 0.0075, 0.005, 0.004, 0.003, 0.002, 0.001]
vafranges = vafranges[::-1]

# Drop the len(mixtureids) samples with the highest 'sampletf'.
# The index must start at 1: with i = 0, sorted[-0] is sorted[0], which removed the
# *lowest* value instead of the third highest.
sorted_tfs = np.sort(calltables_snv['sampletf'].unique())
cond = True
for i in range(1, len(mixtureids) + 1):
    cond = cond & (calltables_snv['sampletf'] != sorted_tfs[-i])
calltable_snv = calltables_snv[cond]

res = {}
for method in config.methods:
    print(method)
    score_col = method + '_score'
    x, y = [], []
    for vi in range(1, len(vafranges)):
        vaf_lo, vaf_hi = vafranges[vi-1], vafranges[vi]
        # Rows whose median VAF falls in [vaf_lo, vaf_hi)
        aux = calltable_snv[(calltable_snv['median_vaf'] >= vaf_lo) & (calltable_snv['median_vaf'] < vaf_hi)]
        #if method == 'varnet':
            #aux = aux[aux['sampleid'].str.contains('CRC-986_100215-CW-T')]
        print(aux['truth'].sum(), aux.shape[0])
        # Skip bins without rows or where the caller has no score at all
        if aux.empty or aux[score_col].isna().all():
            continue
        precision, recall, thresholds = precision_recall_curve(aux['truth'], aux[score_col].fillna(0))
        # 0/0 (precision == recall == 0) yields NaN, which the nan-aware reductions ignore
        with np.errstate(divide='ignore', invalid='ignore'):
            f1list = 2*(precision * recall)/(precision + recall)
        best_f1 = np.nanmax(f1list)
        # Only keep bins where the caller reaches a minimal F1
        if best_f1 >= 0.1:
            print(method, best_f1, vaf_lo, vaf_hi)
            x.append((vaf_lo + vaf_hi)/2)
            # First index reaching the max F1 (same pick as list.index on the max)
            y.append(recall[np.nanargmax(f1list)])
        #y.append(average_precision_score(aux['truth'], aux[score_col].fillna(0)))
        #y.append(roc_auc_score(aux['truth'], aux[score_col].fillna(0)))
    res[method] = {'x': x, 'y': y}

color_dict = {config.methods[i]: config.colors[i] for i in range(len(config.methods))}
fig, ax = plt.subplots(1, 1, figsize=(20,8))
for k, v in res.items():
    plt.plot(v['x'], v['y'], marker='o', label=k, color=color_dict[k])
plt.gca().invert_xaxis()
plt.xscale("log")
ax.set_xticks(np.array(vafranges).round(2))
# Plain numbers instead of powers of ten on the log axis
ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
ax.get_xaxis().set_minor_formatter(matplotlib.ticker.NullFormatter())
xlim = ax.set_xlim(1, 0.002)
#plt.ylim([0.4, 1])
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
plt.show()

In [None]:
# AUPRC of every caller as a function of local depth of coverage ('median_totcov').
import matplotlib.ticker

# Leave out the sample with the highest 'sampletf'
calltables_snv_vaf = calltables_snv[calltables_snv['sampletf'] != calltables_snv['sampletf'].unique().max()]
# Bin edges are the quintiles of the depth observed at ground-truth rows
out, bins = pd.qcut(calltables_snv_vaf[calltables_snv_vaf['truth']==True]['median_totcov'], q=5, retbins=True)
vafranges = list(bins)  # despite the name, these are depth-of-coverage edges in this cell
print(vafranges)

res = {}
for method in config.methods:
    print(method)
    score_col = method + '_score'
    x, y = [], []
    for vi in range(1, len(vafranges)):
        cov_lo, cov_hi = vafranges[vi-1], vafranges[vi]
        # Rows whose depth falls in [cov_lo, cov_hi).
        # NOTE(review): the upper edge is exclusive, so rows at the maximum edge are in no bin -- confirm intended
        aux = calltables_snv_vaf[(calltables_snv_vaf['median_totcov'] >= cov_lo) & (calltables_snv_vaf['median_totcov'] < cov_hi)]
        if method == 'varnet':
            aux = aux[aux['sampleid'].str.contains('CRC-986_100215-CW-T')]
        print(cov_lo, cov_hi, aux.shape[0])
        # An empty bin cannot be scored; skip it, as the sibling VAF cells do
        if aux.empty:
            continue
        x.append((cov_lo + cov_hi)/2)
        # Missing scores count as 0, i.e. "not called"
        y.append(average_precision_score(aux['truth'], aux[score_col].fillna(0)))
    res[method] = {'x': x, 'y': y}

color_dict = {config.methods[i]: config.colors[i] for i in range(len(config.methods))}
fig, ax = plt.subplots(1, 1, figsize=(12,8))
for k, v in res.items():
    plt.plot(v['x'], v['y'], marker='o', label=k, color=color_dict[k])
# Reference line computed from the last bin left in `aux` by the loop above
# NOTE(review): 1 / (number of positives) is not the positive prevalence -- confirm this is the intended baseline
plt.axhline(y=1/aux['truth'].sum(), ls='--', c='k')
#ax.set_xticks([500*np.round(bins/500, 1)])
#xlim = ax.set_xlim(100, 1000)
#ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
#ax.get_xaxis().set_minor_formatter(matplotlib.ticker.NullFormatter())
plt.xlabel('Local depth of coverage')
plt.ylabel('AUPRC score')
plt.title("Callers' AUPRC performance w.r.t local depth of coverage", pad=50)
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
# makedirs also creates missing parents and tolerates an existing directory
os.makedirs(os.path.join(*config.outputpath, 'figure2c'), exist_ok=True)
#plt.savefig(os.path.join(*config.outputpath, 'figure2c', 'depthanalysis_auprc_986.svg'), bbox_inches='tight')
plt.show()

In [None]:
# AUPRC of every caller (varnet excluded) inside VAF decile bins.
import matplotlib.ticker

# Leave out the sample with the highest 'sampletf'
sorted_tfs = np.sort(calltables_snv['sampletf'].unique())
cond = calltables_snv['sampletf'] != sorted_tfs[-1]
calltables_snv_vaf = calltables_snv[cond]
# Bin edges are the deciles of the median VAF observed at ground-truth rows
out, bins = pd.qcut(calltables_snv_vaf[calltables_snv_vaf['truth']==True]['median_vaf'], q=10, retbins=True)
vafranges = list(bins)
print(vafranges)
# Raise a zero lower edge to 0.001 (the x axis below is logarithmic)
if vafranges[0] == 0:
    vafranges[0] = 0.001

res = {}
for method in config.methods:
    print(method)
    score_col = method + '_score'
    x, y = [], []
    for vi in range(1, len(vafranges)):
        vaf_lo, vaf_hi = vafranges[vi-1], vafranges[vi]
        # Rows whose median VAF falls in [vaf_lo, vaf_hi)
        aux = calltables_snv_vaf[(calltables_snv_vaf['median_vaf'] >= vaf_lo) & (calltables_snv_vaf['median_vaf'] < vaf_hi)]
        # varnet is left out of this figure; empty bins cannot be scored
        if aux.empty or method == 'varnet':
            continue
        print(vaf_lo, vaf_hi, aux.shape[0])
        x.append((vaf_lo + vaf_hi)/2)
        # Missing scores count as 0, i.e. "not called"
        y.append(average_precision_score(aux['truth'], aux[score_col].fillna(0)))
    res[method] = {'x': x, 'y': y}

color_dict = {config.methods[i]: config.colors[i] for i in range(len(config.methods))}
fig, ax = plt.subplots(1, 1, figsize=(22,8))
for k, v in res.items():
    plt.plot(v['x'], v['y'], marker='o', markersize=10, label=k, color=color_dict[k])
plt.gca().invert_xaxis()
# Reference line computed from the last bin left in `aux` by the loop above
# NOTE(review): 1 / (number of positives) is not the positive prevalence -- confirm this is the intended baseline
plt.axhline(y=1/aux['truth'].sum(), ls='--', c='k', label='baseline AUPRC')
#plt.xlim([0.5, 0.001])
plt.xscale("log")
# Ticks at the quantile edges: 2 decimals above 0.025, 3 decimals below
ax.set_xticks([b.round(2) if b > 0.025 else b.round(3) for b in bins])
xlim = ax.set_xlim(0.75, 0.006)
# Plain numbers instead of powers of ten on the log axis
ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
ax.get_xaxis().set_minor_formatter(matplotlib.ticker.NullFormatter())
plt.xlabel('Variant Allele Frequency (VAF)')
plt.ylabel('AUPRC score')
plt.title("Callers' AUPRC performance w.r.t VAF", pad=50)
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
# makedirs also creates missing parents and tolerates an existing directory
os.makedirs(os.path.join(*config.outputpath, 'figure2c'), exist_ok=True)
#plt.savefig(os.path.join(*config.outputpath, 'figure2c', 'vafanalysis_auprc_986_ultradeep.svg'), bbox_inches='tight')
plt.show()

In [None]:
# Recall at the max-F1 operating point of every caller (varnet excluded) inside VAF decile bins.
import matplotlib.ticker

# Drop the len(mixtureids) samples with the highest 'sampletf'.
# The index must start at 1: with i = 0, sorted[-0] is sorted[0], which removed the
# *lowest* value instead of the third highest.
sorted_tfs = np.sort(calltables_snv['sampletf'].unique())
cond = True
for i in range(1, len(mixtureids) + 1):
    cond = cond & (calltables_snv['sampletf'] != sorted_tfs[-i])
calltables_snv_vaf = calltables_snv[cond]
# Bin edges are the deciles of the median VAF observed at ground-truth rows
out, bins = pd.qcut(calltables_snv_vaf[calltables_snv_vaf['truth']==True]['median_vaf'], q=10, retbins=True)
vafranges = list(bins)
print(vafranges)

res = {}
for method in config.methods:
    print(method)
    score_col = method + '_score'
    x, y = [], []
    for vi in range(1, len(vafranges)):
        vaf_lo, vaf_hi = vafranges[vi-1], vafranges[vi]
        # Rows whose median VAF falls in [vaf_lo, vaf_hi)
        aux = calltables_snv_vaf[(calltables_snv_vaf['median_vaf'] >= vaf_lo) & (calltables_snv_vaf['median_vaf'] < vaf_hi)]
        # An empty bin has no PR curve
        if aux.empty:
            continue
        precision, recall, thresholds = precision_recall_curve(aux['truth'], aux[score_col].fillna(0))
        # 0/0 (precision == recall == 0) yields NaN, which the nan-aware reductions ignore
        with np.errstate(divide='ignore', invalid='ignore'):
            f1list = 2*(precision * recall)/(precision + recall)
        best_f1 = np.nanmax(f1list)
        # Only keep bins where the caller reaches a minimal F1; varnet is left out of this figure
        if best_f1 >= 0.05 and method != 'varnet':
            print(method, best_f1, vaf_lo, vaf_hi)
            x.append((vaf_lo + vaf_hi)/2)
            # First index reaching the max F1 (same pick as list.index on the max)
            y.append(recall[np.nanargmax(f1list)])
    res[method] = {'x': x, 'y': y}

color_dict = {config.methods[i]: config.colors[i] for i in range(len(config.methods))}
fig, ax = plt.subplots(1, 1, figsize=(22,8))
for k, v in res.items():
    # Callers without any retained bin are not drawn
    if v['x'] != []:
        print(k)
        plt.plot(v['x'], v['y'], marker='s', markersize=10, label=k, color=color_dict[k])
plt.gca().invert_xaxis()
plt.xscale("log")
# Reference line computed from the last bin left in `aux` by the loop above
plt.axhline(y=1/aux['truth'].sum(), ls='--', c='k')
#plt.xlim([0.5, 0.01])
ax.set_xticks(bins.round(3))
xlim = ax.set_xlim(0.75, 0.006)
# Plain numbers instead of powers of ten on the log axis
ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
ax.get_xaxis().set_minor_formatter(matplotlib.ticker.NullFormatter())
plt.xlabel('Variant Allele Frequency (VAF)')
plt.ylabel('Recall value corresponding to max F1-score (when ≥ 0.05)')
plt.title("Callers' recall of max F1-score w.r.t VAF", pad=50)
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
# makedirs also creates missing parents and tolerates an existing directory
os.makedirs(os.path.join(*config.outputpath, 'figure2c'), exist_ok=True)
#plt.savefig(os.path.join(*config.outputpath, 'figure2c', 'vafanalysis_recallmaxf1_986.svg'), bbox_inches='tight')
plt.show()

In [None]:
# Recall at the max-F1 operating point (ties broken by highest recall) of every caller
# (varnet excluded) inside VAF decile bins.
import matplotlib.ticker

# Drop the len(mixtureids) samples with the highest 'sampletf'.
# The index must start at 1: with i = 0, sorted[-0] is sorted[0], which removed the
# *lowest* value instead of the third highest.
sorted_tfs = np.sort(calltables_snv['sampletf'].unique())
cond = True
for i in range(1, len(mixtureids) + 1):
    cond = cond & (calltables_snv['sampletf'] != sorted_tfs[-i])
calltables_snv_vaf = calltables_snv[cond]
# Bin edges are the deciles of the median VAF observed at ground-truth rows
out, bins = pd.qcut(calltables_snv_vaf[calltables_snv_vaf['truth']==True]['median_vaf'], q=10, retbins=True)
vafranges = list(bins)
print(vafranges)

res = {}
for method in config.methods:
    print(method)
    score_col = method + '_score'
    x, y = [], []
    for vi in range(1, len(vafranges)):
        vaf_lo, vaf_hi = vafranges[vi-1], vafranges[vi]
        # Rows whose median VAF falls in [vaf_lo, vaf_hi)
        aux = calltables_snv_vaf[(calltables_snv_vaf['median_vaf'] >= vaf_lo) & (calltables_snv_vaf['median_vaf'] < vaf_hi)]
        # An empty bin has no PR curve
        if aux.empty:
            continue
        precision, recall, thresholds = precision_recall_curve(aux['truth'], aux[score_col].fillna(0))
        # 0/0 (precision == recall == 0) yields NaN, which the nan-aware reductions ignore
        with np.errstate(divide='ignore', invalid='ignore'):
            f1list = 2*(precision * recall)/(precision + recall)
        best_f1 = np.nanmax(f1list)
        # Only keep bins where the caller reaches a minimal F1; varnet is left out of this figure
        if best_f1 >= 0.05 and method != 'varnet':
            # Among all indices reaching the max F1, take the one with the highest recall.
            # idxa used to be computed and then ignored in favour of the first max-F1 index.
            ar = np.flatnonzero(f1list == best_f1)
            idxa = ar[np.argmax(recall[ar])]
            print(method, best_f1, vaf_lo, vaf_hi)
            x.append((vaf_lo + vaf_hi)/2)
            y.append(recall[idxa])
    res[method] = {'x': x, 'y': y}

color_dict = {config.methods[i]: config.colors[i] for i in range(len(config.methods))}
fig, ax = plt.subplots(1, 1, figsize=(22,8))
for k, v in res.items():
    # Callers without any retained bin are not drawn
    if v['x'] != []:
        print(k)
        plt.plot(v['x'], v['y'], marker='s', markersize=10, label=k, color=color_dict[k])
plt.gca().invert_xaxis()
plt.xscale("log")
#plt.axhline(y=1/aux['truth'].sum(), ls='--', c='k')
#plt.xlim([0.5, 0.01])
ax.set_xticks(bins.round(3))
xlim = ax.set_xlim(0.75, 0.006)
# Plain numbers instead of powers of ten on the log axis
ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
ax.get_xaxis().set_minor_formatter(matplotlib.ticker.NullFormatter())
plt.xlabel('Variant Allele Frequency (VAF)')
plt.ylabel('Recall value corresponding to max F1-score (when ≥ 0.05)')
plt.title("Callers' recall of max F1-score w.r.t VAF", pad=50)
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
# makedirs also creates missing parents and tolerates an existing directory
os.makedirs(os.path.join(*config.outputpath, 'figure2c'), exist_ok=True)
#plt.savefig(os.path.join(*config.outputpath, 'figure2c', 'vafanalysis_recallmaxf1_986.svg'), bbox_inches='tight')
plt.show()

In [None]:
# Display the bin edges returned by the most recently executed pd.qcut cell
bins

In [None]:
# Recall of every caller (varnet excluded) at the operating point whose precision is
# closest to `precisionthreshold`, inside VAF decile bins. The figure is saved as SVG.
import matplotlib.ticker

precisionthreshold = 0.1

# Drop the len(mixtureids) samples with the highest 'sampletf'.
# The index must start at 1: with i = 0, sorted[-0] is sorted[0], which removed the
# *lowest* value instead of the third highest.
sorted_tfs = np.sort(calltables_snv['sampletf'].unique())
cond = True
for i in range(1, len(mixtureids) + 1):
    cond = cond & (calltables_snv['sampletf'] != sorted_tfs[-i])
calltables_snv_vaf = calltables_snv[cond]
# Bin edges are the deciles of the median VAF observed at ground-truth rows
out, bins = pd.qcut(calltables_snv_vaf[calltables_snv_vaf['truth']==True]['median_vaf'], q=10, retbins=True)
vafranges = list(bins)
print(vafranges)

res = {}
for method in config.methods:
    print(method)
    score_col = method + '_score'
    x, y = [], []
    for vi in range(1, len(vafranges)):
        vaf_lo, vaf_hi = vafranges[vi-1], vafranges[vi]
        # Rows whose median VAF falls in [vaf_lo, vaf_hi)
        aux = calltables_snv_vaf[(calltables_snv_vaf['median_vaf'] >= vaf_lo) & (calltables_snv_vaf['median_vaf'] < vaf_hi)]
        # An empty bin has no PR curve
        if aux.empty:
            continue
        precision, recall, thresholds = precision_recall_curve(aux['truth'], aux[score_col].fillna(0))
        # Operating points whose precision is closest to the target; ties go to the highest recall
        gap = np.abs(precision - precisionthreshold)
        ar = np.flatnonzero(gap == gap.min())
        idxa = ar[np.argmax(recall[ar])]
        print(np.round(precision[idxa], 3), np.round(recall[idxa], 3), np.round(vaf_lo, 3), np.round(vaf_hi, 3))
        # Keep the point only if its precision is within 0.05 below the target and recall is non-zero
        if precision[idxa] >= precisionthreshold-0.05 and recall[idxa] != 0 and method != 'varnet':
            x.append((vaf_lo + vaf_hi)/2)
            y.append(recall[idxa])
    res[method] = {'x': x, 'y': y}

color_dict = {config.methods[i]: config.colors[i] for i in range(len(config.methods))}
fig, ax = plt.subplots(1, 1, figsize=(20,8))
for k, v in res.items():
    # strelka2 is drawn with a larger marker than the other callers
    plt.plot(v['x'], v['y'], marker='o', markersize=15 if k == 'strelka2' else 10, label=k, color=color_dict[k])
plt.gca().invert_xaxis()
plt.xscale("log")
# Reference line computed from the last bin left in `aux` by the loop above
plt.axhline(y=1/aux['truth'].sum(), ls='--', c='k')
#plt.xlim([0.5, 0.01])
ax.set_xticks(bins.round(2))
xlim = ax.set_xlim(0.75, 0.006)
# Plain numbers instead of powers of ten on the log axis
ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
ax.get_xaxis().set_minor_formatter(matplotlib.ticker.NullFormatter())
plt.xlabel('Variant Allele Frequency (VAF)')
plt.ylabel('Recall value for precision fixed at '+str(precisionthreshold))
plt.title("Callers' recall when precision = {} w.r.t VAF".format(precisionthreshold), pad=50)
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
# makedirs also creates missing parents and tolerates an existing directory
os.makedirs(os.path.join(*config.outputpath, 'figure2c'), exist_ok=True)
plt.savefig(os.path.join(*config.outputpath, 'figure2c', 'vafanalysis_recallatfixedprecision_'+str(precisionthreshold)+'_986_ultradeep.svg'), bbox_inches='tight')
plt.show()

In [None]:
# Scratch check of the index selection, run on the `precision` / `recall` arrays
# left behind by the previously executed PR-curve cell
a = [abs(p - precisionthreshold) for p in precision]
# Indices whose precision is closest to the target
ar = np.argwhere(a == np.amin(a)).flatten()
print(ar)
# Among those, the first index with the highest recall
br = [recall[j] for j in ar]
idx = ar[int(np.argmax(br))]
idx

In [None]:
# Average precision of every caller per VAF bin, computed on the SNV call table
# with the rows of the highest 'sampletf' value removed, then plotted on a
# reversed log-scale VAF axis.

# Bin edges in ascending order; consecutive edges delimit a half-open bin [low, high).
vafranges = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.025, 0.05, 0.075, 0.1, 1.]
res = {}
snv_table = calltables['snv']
calltable_snv = snv_table[snv_table['sampletf'] != snv_table['sampletf'].unique().max()]
for method in config.methods:
    print(method)
    x, y = [], []
    for vaf_low, vaf_high in zip(vafranges[:-1], vafranges[1:]):
        in_bin = (calltable_snv['median_vaf'] >= vaf_low) & (calltable_snv['median_vaf'] < vaf_high)
        aux = calltable_snv[in_bin]
        if aux.empty:
            continue
        # Calls without a score for this caller are scored 0.
        scores = aux[method + '_score'].fillna(0)
        # PR curve and F1 values are computed but not plotted in this cell.
        precision, recall, thresholds = precision_recall_curve(aux['truth'], scores)
        f1list = 2*(precision * recall)/(precision + recall)
        # Each bin is represented by its midpoint on the x axis.
        x.append((vaf_low + vaf_high)/2)
        y.append(average_precision_score(aux['truth'], scores))
    res[method] = {'x': x, 'y': y}
color_dict = {m: config.colors[i] for i, m in enumerate(config.methods)}
plt.figure(figsize=(15,8))
for caller, curve in res.items():
    plt.plot(curve['x'], curve['y'], marker='o', label=caller, color=color_dict[caller])
plt.gca().invert_xaxis()
plt.xscale("log")
plt.legend()
plt.show()

In [None]:
# Average precision of every caller per VAF bin, computed separately for each
# 'sampletf' value of the SNV call table (rows of the highest 'sampletf' are
# excluded). Result layout: res[method][tf] = {'x': bin midpoints, 'y': AP}.

# Bin edges in ascending order; consecutive edges delimit a half-open bin [low, high).
vafranges = [1., 0.1, 0.075, 0.05, 0.025, 0.01, 0.009, 0.008, 0.007, 0.006, 0.005, 0.004, 0.003, 0.002, 0.001]
vafranges = vafranges[::-1]
res = {}
calltable_snv = calltables['snv'][calltables['snv']['sampletf'] != calltables['snv']['sampletf'].unique().max()]
# Take the tumour fractions from the table analysed here, not from whatever
# `aux` happens to hold from a previously executed cell.
sampletfs = calltable_snv['sampletf'].unique()
for method in config.methods:
    print(method)
    res[method] = {}
    for tf in sampletfs:
        # Fresh lists for every tumour fraction: sharing one pair of lists would
        # make every res[method][tf] point at the same, ever-growing curve.
        x, y = [], []
        for vaf_low, vaf_high in zip(vafranges[:-1], vafranges[1:]):
            aux = calltable_snv[(calltable_snv['median_vaf'] >= vaf_low) & (calltable_snv['median_vaf'] < vaf_high)]
            aux_tf = aux[aux['sampletf'] == tf]
            if aux_tf.empty:
                continue
            # Each bin is represented by its midpoint; missing scores count as 0.
            x.append((vaf_low + vaf_high)/2)
            y.append(average_precision_score(aux_tf['truth'], aux_tf[method + '_score'].fillna(0)))
        res[method][tf] = {'x': x, 'y': y}
color_dict = {config.methods[i]: config.colors[i] for i in range(len(config.methods))}
plt.figure(figsize=(15,8))
# One line per (caller, tumour fraction); all lines of a caller share its colour.
for k, v in res.items():
    for vk, vv in v.items():
        plt.plot(vv['x'], vv['y'], color=color_dict[k])
plt.gca().invert_xaxis()
plt.xscale("log")
plt.show()

In [None]:
# Re-plot the per-tumour-fraction curves stored in `res`, using the rank of the
# tumour fraction among all sorted 'sampletf' values to choose the line width.
lwlist = list(np.sort(calltables['snv']['sampletf'].unique()))
plt.figure(figsize=(15,8))
for caller, curves_by_tf in res.items():
    caller_colour = color_dict[caller]
    for tf_value, curve in curves_by_tf.items():
        # Width grows by one every two ranks: ceil(rank / 2).
        # NOTE(review): rank 0 (smallest 'sampletf') yields lw=0 - confirm this is intended.
        rank = lwlist.index(tf_value)
        width = int(np.ceil(rank/2))
        plt.plot(curve['x'], curve['y'], lw=width, color=caller_colour)
plt.gca().invert_xaxis()
plt.xscale("log")
plt.show()

In [None]:
# Display `caux` as the cell output.
# NOTE(review): `caux` is not assigned anywhere in this part of the notebook - presumably set by an earlier cell; confirm.
caux

In [None]:
# Display `calltables` as the cell output (it is indexed with the key 'snv' in the cells above).
calltables

# Part II: Group per VAF

In [None]:
# Display `calltablesseries`, the table the following cells add VAF columns to.
calltablesseries

In [None]:
# For every dilution, add a '<tf>_vaf' column holding the row-wise median
# (NaNs skipped) of the per-caller '<tf>_<method>_vaf' columns.
for tf_value in dilutionseries['tf'].values:
    # Column prefix: the tumour fraction rendered with two decimals.
    dil = '{:.2f}'.format(np.round(100*float(tf_value), 4)/100)
    print(dil)
    caller_vaf_columns = [dil+'_'+method+'_vaf' for method in config.methods]
    print(caller_vaf_columns)
    calltablesseries[dil+'_vaf'] = calltablesseries[caller_vaf_columns].median(skipna=True, axis=1)

In [None]:
# Store, for each row, the values of the `colvaf` columns as one Python list.
# NOTE(review): `colvaf` is assigned in a later cell of the notebook - this cell relies on that cell having been run first.
calltablesseries['vaf'] = calltablesseries[colvaf].values.tolist()

In [None]:
# Classify how the median VAF of each call evolves along the dilution series
# (first 2000 rows only) and plot one line per call, coloured by class.

# Median-VAF column names of all dilutions except the first entry of dilutionseries['tf'].
colvaf = ['{:.2f}'.format(np.round(100*float(dil), 4)/100)+'_vaf' for dil in dilutionseries['tf'].values][1:]
# Explicit copy: the 'direction' column is added to this frame only and no
# chained-assignment warning is triggered.
aux = calltablesseries.iloc[:2000][colvaf].copy()

# Compare consecutive dilutions over the first six columns (five steps).
# A comparison involving NaN is False, so such a step counts neither as up nor down;
# an unchanged value counts as both.
any_down = pd.Series(False, index=aux.index)
any_up = pd.Series(False, index=aux.index)
for prev_col, next_col in zip(colvaf[:5], colvaf[1:6]):
    any_down |= aux[next_col] <= aux[prev_col]
    any_up |= aux[next_col] >= aux[prev_col]

# Boolean masks must go through .loc (.at is a scalar accessor).
# Fully monotone rows get no label of their own: every row with only
# non-increasing steps has at least one "down" step and is therefore covered
# by the classes below (and likewise for non-decreasing rows).
aux['direction'] = 'other'                                    # no comparable step at all
aux.loc[any_down & ~any_up, 'direction'] = 'decrease'         # every comparable step goes strictly down
aux.loc[any_up & ~any_down, 'direction'] = 'increase'         # every comparable step goes strictly up
aux.loc[any_up & any_down, 'direction'] = 'up and down'       # mixed steps, or at least one flat step
print(aux['direction'].value_counts())

fig, ax = plt.subplots(figsize=(10,10))
for label, colour in (('other', 'k'), ('increase', 'r'), ('up and down', 'b'), ('decrease', 'g')):
    subset = aux.loc[aux['direction'] == label, colvaf]
    if subset.empty:
        # DataFrame.plot raises when there is nothing to draw.
        continue
    # Transposed so that dilutions are on the x axis and each call is one line.
    subset.T.plot(kind='line', color=colour, ax=ax, legend=False)

plt.xlabel('sample TF')
plt.ylabel('median VAF found')

In [None]:
# TODO: unfinished statement - the right-hand side of this assignment was never
# written, so running the cell raises a SyntaxError. Left commented out until
# the intended 'sampleTF' values are defined.
# calltablesseries['sampleTF'] =


In [None]:
# Expand the list-like entries of the 'vaf' column into one row per element and display the result.
# NOTE(review): the `aux` built earlier from the `colvaf` columns has no 'vaf' column (that column is added to `calltablesseries`) - confirm which frame was intended.
aux.explode('vaf')

In [None]:
# For each fixed variable, load the AUPRC result table of one hard-coded
# mixture (SNVs only) and plot it with plot_metricsseries.
for fixedvar in fixedvars:
    # The x axis shows the variable that is not held fixed.
    if fixedvar == 'coverage':
        xaxis = 'tumor burden'
    elif fixedvar == 'ctdna':
        xaxis = 'coverage'

    # Mutation type and metric are pinned in this cell.
    mt = 'snv'
    metric = 'auprc'

    # Default reference name for SNVs; printed, then replaced below for the tissue-based mixture.
    gtm = 4
    refname = 'inundilutedsamplebyatleast'+str(gtm)+'callers'
    print(refname)

    # This mixture uses the tissue-sample reference with 3 callers.
    mixtureid = 'CRC-986_100215-CW-T_CRC-986_300316-CW-T'
    gtm = 3
    refname = 'intissuesamplebyatleast'+str(gtm)+'callers'

    # Plasma sample name = first two '_'-separated fields of the mixture id.
    plasmasample = '_'.join(mixtureid.split('_')[:2])
    print(mixtureid, plasmasample)
    # File names abbreviate 'tumor burden' to 'tb'.
    xa = 'tb' if xaxis == 'tumor burden' else xaxis
    print(xa)

    # Load the result table; restables is rebuilt from scratch on every iteration.
    results_file = mixtureid+'_'+mt+'_'+metric+'_'+refname+'_fixed'+fixedvar+'_'+ xa +'.csv'
    restable = pd.read_csv(os.path.join(*config.mixturefolder, 'mixtures_allchr', 'results', results_file), index_col=0)
    restable['plasma sample'] = plasmasample
    restables = {'snv': [], 'indel': []}
    restables[mt] = pd.concat([restable])

    res1 = plot_metricsseries(
        config, restables, mixtureids, 'all',
        metric=metric, muttype=mt, ground_truth_method='mixture',
        fixedvar=fixedvar, refname=refname,
        allpatients=True, logscale=False, save=False,
    )
    plt.ylim([0, .8])

In [None]:
# Load the AUPRC result tables of all mixtures (SNVs, fixed coverage),
# concatenate them and plot them together with plot_metricsseries.

fixedvar = 'coverage'
# The x axis shows the variable that is not held fixed.
if fixedvar == 'coverage':
    xaxis = 'tumor burden'
elif fixedvar == 'ctdna':
    xaxis = 'coverage'

mt = 'snv'
# Minimum number of agreeing callers used in the default reference name: 4 for SNVs, 2 otherwise.
gtm = 4 if mt == 'snv' else 2
refname = 'inundilutedsamplebyatleast'+str(gtm)+'callers'
print(refname)
metric = 'auprc'

restables = {'snv': [], 'indel': []}
for mixtureid in mixtureids:
    # One mixture uses the tissue-sample reference with 3 callers; the others
    # use the undiluted-sample reference with 4 callers.
    uses_tissue_reference = mixtureid == 'CRC-986_100215-CW-T_CRC-986_300316-CW-T'
    gtm = 3 if uses_tissue_reference else 4
    refprefix = 'intissuesamplebyatleast' if uses_tissue_reference else 'inundilutedsamplebyatleast'
    refname = refprefix+str(gtm)+'callers'
    # Plasma sample name = first two '_'-separated fields of the mixture id.
    plasmasample = '_'.join(mixtureid.split('_')[:2])
    print(mixtureid, plasmasample)
    # File names abbreviate 'tumor burden' to 'tb'.
    xa = 'tb' if xaxis == 'tumor burden' else xaxis
    print(xa)
    results_file = mixtureid+'_'+mt+'_'+metric+'_'+refname+'_fixed'+fixedvar+'_'+ xa +'.csv'
    restable = pd.read_csv(os.path.join(*config.mixturefolder, 'mixtures_allchr', 'results', results_file), index_col=0)
    restable['plasma sample'] = plasmasample
    restables[mt].append(restable)
restables[mt] = pd.concat(restables[mt])

# NOTE(review): `refname` passed here is the one of the last mixture in `mixtureids` - confirm this is intended.
res1 = plot_metricsseries(
    config, restables, mixtureids, 'all',
    metric=metric, muttype=mt, ground_truth_method='mixture',
    fixedvar=fixedvar, refname=refname,
    allpatients=True, logscale=False, save=False,
)

In [None]:
# Turn the x / y series returned in `res1` into arrays and, per caller, compute
# the mean and standard deviation of its three y series.
# Result layout: res[method] = [mean of y, std of y, mean of x over all series].
resx = np.array([rx.values for rx in res1['x']])
resy = np.array([ry.values for ry in res1['y']])
print(resy)
n_methods = len(config.methods)
res = {}
for mi, m in enumerate(config.methods):
    # Rows mi, mi + n_methods and mi + 2*n_methods of resy are averaged together
    # (presumably the same caller for three different patients - TODO confirm).
    caller_rows = resy[[mi, mi + n_methods, mi + 2*n_methods]]
    res[m] = [caller_rows.mean(axis=0), caller_rows.std(axis=0), resx.mean(axis=0)]

In [None]:
# Plot, for every caller, the mean metric curve stored in `res`
# (res[m] = [mean y, std y, mean x]) with error bars in both directions.
color_dict = {config.methods[i]: config.colors[i] for i in range(len(config.methods))}
plt.figure(figsize=(15, 10))
# Horizontal error bars are the same for every caller: std of x over all series.
x_std = resx.std(axis=0)
for m in config.methods:
    plt.errorbar(res[m][2], res[m][0], xerr=x_std, yerr=res[m][1], marker=config.markers[0],  c=color_dict[m], label=m,  markersize=15, lw=2, fmt='-o')
ax = plt.gca()
if fixedvar == 'coverage':
    # Reverse the x axis when coverage is the fixed variable.
    ax.invert_xaxis()
    xlab='tumor burden'
else:
    xlab = 'coverage or added noise'
hand, labl = ax.get_legend_handles_labels()
ax.legend(hand, labl, bbox_to_anchor=(1, 1), loc="upper left")
plt.xlabel(xlab)
plt.ylabel(metric.upper()+' score')
# Switch the grid on explicitly, once: a bare grid() call toggles visibility,
# so calling it twice switches the grid on and then off again.
ax.grid(True)
plt.title(metric.upper() + " score for {} calling in chr{} with ref {}".format(mt.upper(), chrom, refname))
plt.ylim([0,1])

In [None]:
# Reverse the x, mean and std series of every caller in place.
# Running this cell a second time restores the previous order.
for m in config.methods:
    entry = res[m]
    for position in (2, 0, 1):
        entry[position] = entry[position][::-1]

In [None]:
# Rank callers by the area under their mean metric curve.
# For each caller the area is accumulated segment by segment, starting from
# the point (0, 0): rectangle under the new point minus the triangle between
# the previous and the new y value. Results are collected in `rankres`, in the
# order of config.methods.
rankres = []
for m in config.methods:
    xs, ys = res[m][2], res[m][0]
    prev_x, prev_y = 0, 0
    segment_areas = []
    for cur_x, cur_y in zip(xs, ys):
        width = cur_x - prev_x
        segment_areas.append((width * cur_y) - (width * (cur_y - prev_y)/2))
        prev_x, prev_y = cur_x, cur_y
    total_area = np.sum(segment_areas)
    print(m, total_area)
    rankres.append(total_area)

In [None]:
# Bar chart of the callers sorted by decreasing area (`rankres`).
import matplotlib.patches as mpatches
# Indices of config.methods from largest to smallest area.
rankres_sort = np.argsort(rankres)[::-1]
plt.figure(figsize=(14, 5))
method_order = [config.methods[i] for i in rankres_sort]
rank_order = [rankres[i] for i in rankres_sort]
# NOTE(review): colours are assigned by bar position after sorting (first five
# blue, next two red), not by caller - confirm the legend below matches the
# callers actually shown in each colour.
bar_colours = ['b'] * 5 + ['r'] * 2
plt.bar(method_order, rank_order, color=bar_colours)
legend_handles = [
    mpatches.Patch(color='b', label='DNA methods'),
    mpatches.Patch(color='r', label='cfDNA methods'),
]
blue_patch, red_patch = legend_handles
plt.legend(handles=legend_handles)
plt.ylabel('area under AUPRC curve')
plt.title('Caller ranking mixtures with decreasing tumor burden')