# Series analysis


Analysis updates:
- retrieve calls rejected only because VAF<0.01 (bcbio lower acceptable threshold) for Mutect2, Strelka2 and Vardict
- correct for germline mutations using GATK haplotype calls
- get PR curves for sinvict by encoding the 6 files as thresholds. Assumption: linear filters
- plotting: PR curves stop at 10e-2 on the left
- ground truths:

    1) Consensus: build using
        majority of 5/8 callers for SNV and 3/5 callers for INDELS


    2) Ranked mutations: metascore built using
        weighted sum of normalised scores between 0 and 1 for each caller
        with weights = inversely proportional to the number of calls made by the caller (if a caller calls few mutations: higher weight; if it makes many calls: lower weight)
        threshold = 1/ncallers
        interpretation: if 1 caller is sure (score = 1) of calling this position, add it to GT
        interpretation: if 2 callers are quite sure of calling this position (score > 0.5 each), add it to GT
        
- integrate VAF approx

    1) mixture with VAF instead of tumor burden
    
    2) correct for mutations not present in diluted samples (vaf = 0)
    
    3) pool patients together using VAF

In [None]:
# Imports

%load_ext autoreload
%autoreload 2

import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
import warnings
from sklearn.metrics import precision_recall_curve, f1_score, average_precision_score
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix

# Run from a directory whose path ends with 'cfdna_snv_benchmark':
# if the notebook was started elsewhere, step up one directory level.
in_project_root = os.getcwd().endswith('cfdna_snv_benchmark')
if not in_project_root:
    os.chdir('../')
print('Current working directory: {}'.format(os.getcwd()))

from utils.config import Config
from utils.viz import *
from utils.table import *
from utils.metrics import *
from utils.calltable import *
from utils.calltableseries import *
from utils.groundtruth import *
from utils.metricsseries import *
from utils.venn import venn6, get_labels

In [None]:
# Config and display parameters

# Build the configuration object from the 'config/' folder and 'config_viz.yaml'.
config = Config("config/", "config_viz.yaml")
# Apply display settings driven by the configuration
# (helper presumably provided by one of the utils star-imports).
set_display_params(config)
# Show which methods (callers) the configuration declares.
print(config.methods)

In [None]:
# Analysis parameters shared by the cells below

# Chromosome selection and mixture series ('<plasma sample>_<healthy sample>')
chrom = 'all'
mixtureid = 'CRC-986_100215-CW-T_CRC-986_300316-CW-T'
filterparam = 'all'

# Flags forwarded to the loading / plotting helpers
reload, save = False, False

# Dimensions iterated over by the analysis
fixedvars = ['coverage', 'ctdna']
muttypes = ['snv', 'indel']
metrics = ['auprc', 'precision', 'recall']

# Plot styling: one colour per configured method
markers = ['o', '^', 'X']
linestyles = ['-'] * 3
color_dict = {method: config.colors[idx] for idx, method in enumerate(config.methods)}

# Part I: (1) Load/Generate call tables, (2) Generate Ground truths and (3) Compute/Save metrics per patient

In [None]:
# Build the SNV call table and the consensus ground truth for one mixture
# series, then compute max-recall-at-minimum-precision metrics along the
# dilution series.
# Relies on config, mixtureid, filterparam, reload and save set by earlier cells.
fixedvar = 'coverage'
#for fixedvar in fixedvars:
# seriesorder holds (s[0], s[1]) pairs that are turned further down into sample
# ids of the form '<plasma>_<s[0]>x_<healthy>_<s[1]>x'.
if fixedvar == 'coverage':
    seriesorder = [(1000, 0), (1000, 1000), (750, 1250), (500, 1500), (400, 1600), (200, 1800), (100, 1900)] 
    xaxis = 'tumor burden'
elif fixedvar == 'ctdna':
    seriesorder = [(1000, 0), (1000, 1000), (1000, 1900)]
    xaxis = 'coverage'
#for mixtureid in mixtureids:
print('############# {} ############'.format(mixtureid))
# Container for the call tables; only 'sampleid', 'tf' and 'snv' are filled below.
calltables = {'sampleid':[], 'tf':[], 'cov':[], 'snv':[], 'indel':[], 'snp':[]}
# get_calltableseries returns the call table and an auxiliary table (aux)
# whose 'tf' column is used throughout this notebook.
calltable_snv, aux = get_calltableseries(config, mixtureid, 'all', muttype='snv', filterparam=filterparam, reload=reload, save=save, diltype='mixture_wes', concat='tf', bcbiovaf=0.01)
#calltable_indel, aux = get_calltableseries(config, mixtureid, 'all', muttype='indel', filterparam=filterparam, reload=reload, save=save, diltype='mixture_wes', bcbiovaf=0.01)
#calltable_snp, aux = get_calltableseries(config, mixtureid, 'all', muttype='snp', filterparam=filterparam, reload=reload, save=save, diltype='mixture_wes', bcbiovaf=0.01)
#print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
print(aux)
# Keep chromosomes '1'..'22' only. The two explicit X/Y filters are already
# implied by the isin() filter that follows.
calltable_snv = calltable_snv[calltable_snv['chrom'] != 'X']
calltable_snv = calltable_snv[calltable_snv['chrom'] != 'Y']
calltable_snv = calltable_snv[calltable_snv['chrom'].isin([str(i) for i in range(1, 23)])]
#calltable_indel = calltable_indel[calltable_indel['chrom'] != 'X']
#calltable_indel = calltable_indel[calltable_indel['chrom'] != 'Y']
#calltable_indel = calltable_indel[calltable_indel['chrom'].isin([str(i) for i in range(1, 23)])]
#calltable_snp = calltable_snp[calltable_snp['chrom'] != 'X']
#calltable_snp = calltable_snp[calltable_snp['chrom'] != 'Y']
#calltable_snp = calltable_snp[calltable_snp['chrom'].isin([str(i) for i in range(1, 23)])]
# Only the labels '1'..'22' remain, so the cast to int cannot fail on 'X'/'Y'.
calltable_snv['chrom'] = calltable_snv['chrom'].astype(int)
#calltable_indel['chrom'] = calltable_indel['chrom'].astype(int)
#calltable_snp['chrom'] = calltable_snp['chrom'].astype(int)
# mixtureid is '<plasma sample>_<healthy sample>'; the plasma sample is made of
# the first two '_'-separated fields, the healthy sample of the remaining ones.
plasmasample = '_'.join(mixtureid.split('_')[:2])
print(plasmasample)
healthysample = '_'.join(mixtureid.split('_')[2:])
print(healthysample)
#calltables['snv'].append(calltable_snv)
#calltables['indel'].append(calltable_indel)
#calltables['snp'].append(calltable_snp)
calltables['sampleid'] = mixtureid 
# Column-name prefixes (text before the first '_') converted to floats.
# NOTE(review): [:-5] drops the last five sorted unique prefixes, presumably the
# non-numeric column names -- fragile if the table gains or loses columns; confirm.
calltables['tf'] = np.unique([cn.split('_')[0] for cn in list(calltable_snv.columns)])[:-5].astype(float)
#calltables['snv'] = pd.concat(calltables['snv'])
#calltables['indel'] = pd.concat(calltables['indel'])
#calltables['snp'] = pd.concat(calltables['snp'])
calltables['snv'] = calltable_snv.copy() ## as only one mixtureid
# Select the aux rows of this series, in seriesorder order, by rebuilding
# their sample ids.
dilutionseries = aux.T[['mixture_chrall_' + '_'.join(mixtureid.split('_')[:2]) + '_' + str(s[0]) + 'x_' + '_'.join(mixtureid.split('_')[2:4]) + '_' + str(s[1]) + 'x' for s in seriesorder]].T
#for muttype in muttypes:
muttype = 'snv'
# gtm is passed as ground_truth_method below: 5 for SNVs, 3 otherwise.
if muttype == 'snv':
    gtm = 5
else:  # elif muttype == 'indel':
    gtm = 3
print(max(aux['tf']))
refsample = 'undiluted'
#if mixtureid ==  'CRC-986_100215-CW-T_CRC-986_300316-CW-T':
#    gtm = 3
#    refsample = 'tissue'
    #calltablesseries  = generate_groundtruth(config, calltables[muttype], aux['tf'], ground_truth_method='tissue', muttype=muttype,
    #                                    matchedtissuepath=os.path.join('data', 'matchedtissue', 'NCC_CRC-986_100215-T1W', 'calls', 'NCC_CRC-986_100215-T1W_snv_calls_PASS_exome.csv'))
#    calltablesseries  = generate_groundtruth(config, calltables[muttype], aux['tf'], ground_truth_method='tissue', muttype=muttype,
#                                        matchedtissuepath=os.path.join('data', 'matchedtissue_ultradeep', '986_100215_T1-E', 'calls', '986_100215_T1-E_snv_calls_all.csv'))#
#else:
# Build the ground truth on the filtered SNV call table; later cells read the
# result's 'truth' column.
calltablesseries = generate_groundtruth(config, calltable_snv, aux['tf'], ground_truth_method=gtm, muttype=muttype,
                                        matchedtissuepath=None, methods=['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'varnet', 'abemus', 'sinvict'])
#calltablesseries = generate_groundtruth(config, calltables[muttype], aux['tf'], ground_truth_method=gtm, muttype=muttype,
#                                        matchedtissuepath=None, methods=['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'varnet', 'abemus', 'sinvict'])
print(calltablesseries)
#results_auprc_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='auprc', ground_truth_method=gtm,
#                                 refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')
#results_recall_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='recall', ground_truth_method=gtm,
#                                       refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')
#results_precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='precision', ground_truth_method=gtm,
#                                      refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')
#figure_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, xy='pr', ground_truth_method=gtm,
#                    refsample=refsample, muttype=muttype.upper(), methods=None, fixedvar=fixedvar, save=save, diltype='mixture_wes')
# Metrics named 'maxrecallatleast0_05precision' / 'maxrecallatleast0_10precision'.
# NOTE: the result variable names spell "atlast" instead of "atleast"; they are
# kept as-is because other code may refer to them.
results_maxrecallatlast0_05precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxrecallatleast0_05precision', ground_truth_method=gtm,
                                      refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')
results_maxrecallatlast0_10precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxrecallatleast0_10precision', ground_truth_method=gtm,
                                      refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')

In [None]:
# Display the current auxiliary table (its 'tf' column is read by several cells).
aux

In [None]:
# Reload the saved SNV call table and tf/coverage table of the CRC-986 mixture
# series, rebuild the ground truth and compute one max-recall metric.
# Requires config, mixtureid, gtm, muttype and refsample from earlier cells.

# Load the tf/coverage table FIRST: generate_groundtruth below reads aux['tf'],
# so it must not depend on whatever `aux` a previously run cell left behind.
aux = pd.read_csv(os.path.join('data', 'mixtures_ultradeep', 'mixtures_chrall', 'mixtures_chrall_CRC-986_100215-CW-T_CRC-986_300316-CW-T', 'calls', 'CRC-986_100215-CW-T_CRC-986_300316-CW-T_tf_cov.csv'), index_col=0)
calltablesseries = pd.read_csv(os.path.join('data', 'mixtures_ultradeep', 'mixtures_chrall', 'mixtures_chrall_CRC-986_100215-CW-T_CRC-986_300316-CW-T', 'calls', 'CRC-986_100215-CW-T_CRC-986_300316-CW-T_snv_calls_all.csv'), index_col=0)
calltablesseries = generate_groundtruth(config, calltablesseries, aux['tf'], ground_truth_method=gtm, muttype=muttype,
                                        matchedtissuepath=None, methods=['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'varnet', 'abemus', 'sinvict'])
print(calltablesseries)
# Only consumed by the (currently commented-out) figure_curve_allchr call.
save=True
#seriesorder = [(1000, 0), (1000, 1000), (1000, 1900)]
#fixedvar = 'ctdna'
#xaxis = 'coverage'
seriesorder = [(1000, 0), (1000, 1000), (750, 1250), (500, 1500), (400, 1600), (200, 1800), (100, 1900)]
fixedvar = 'coverage'
xaxis = 'tumor burden'
# Select the aux rows of this series, in seriesorder order, by rebuilding
# their sample ids.
dilutionseries = aux.T[['mixture_chrall_' + '_'.join(mixtureid.split('_')[:2]) + '_' + str(s[0]) + 'x_' + '_'.join(mixtureid.split('_')[2:4]) + '_' + str(s[1]) + 'x' for s in seriesorder]].T
#results_auprc_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='auprc', ground_truth_method=gtm,
#                                 refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')
#results_recall_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='recall', ground_truth_method=gtm,
#                                       refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')
#results_precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='precision', ground_truth_method=gtm,
#                                      refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')
#figure_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, xy='pr', ground_truth_method=gtm,
#                    refsample=refsample, muttype=muttype.upper(), methods=None, fixedvar=fixedvar, save=save, diltype='mixture_wes')
#results_maxrecallatlast0_05precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxrecallatleast0_01precision', ground_truth_method=gtm,
#                                      refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')
# NOTE: the variable name says "0_10" but the metric computed here is
# 'maxrecallatleast0_025precision'; the name is kept so existing references still work.
results_maxrecallatlast0_10precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxrecallatleast0_025precision', ground_truth_method=gtm,
                                      refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')

In [None]:
# Compute the AUPRC metric for the SNV calls of the current mixture.
# Uses the calltablesseries already in memory (its reload is commented out) and
# re-reads only the tf/coverage table from disk.
#calltablesseries = pd.read_csv(os.path.join('data', 'mixtures_ultradeep', 'mixtures_chrall', 'mixtures_chrall_CRC-986_100215-CW-T_CRC-986_300316-CW-T' , 'calls', 'CRC-986_100215-CW-T_CRC-986_300316-CW-T_snv_calls_all.csv'), index_col=0)
aux = pd.read_csv(os.path.join('data', 'mixtures_ultradeep', 'mixtures_chrall', 'mixtures_chrall_CRC-986_100215-CW-T_CRC-986_300316-CW-T' , 'calls', 'CRC-986_100215-CW-T_CRC-986_300316-CW-T_tf_cov.csv'),index_col=0)
gtm = 5
muttype = 'snv'
refsample = 'undiluted'
fixedvar = 'coverage'
xaxis = 'tumor burden'
#calltablesseries = generate_groundtruth(config, calltablesseries, aux['tf'], ground_truth_method=gtm, muttype=muttype,
#                                        matchedtissuepath=None, methods=['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'varnet', 'abemus', 'sinvict'])
# NOTE: unlike the other metric calls in this notebook, the whole aux table is
# passed here instead of the seriesorder-selected dilutionseries.
results_auprc_df = metric_curve_allchr(config, calltablesseries, aux, mixtureid, metric='auprc', ground_truth_method=gtm,
                                 refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')
#results_recall_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='recall', ground_truth_method=gtm,
#                                       refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')
#results_precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='precision', ground_truth_method=gtm,
#                                      refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture_wes')
#figure_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, xy='pr', ground_truth_method=gtm,
#                    refsample=refsample, muttype=muttype.upper(), methods=None, fixedvar=fixedvar, save=save, diltype='mixture_wes')

In [None]:
# Histogram of the median VAF (across callers) of the ground-truth mutations,
# measured in the sample with the highest 'tf' value.

# .copy() makes gtdf an independent frame: adding a column to a boolean-filtered
# slice otherwise triggers pandas' SettingWithCopyWarning and may not behave as
# intended.
gtdf = calltablesseries[calltablesseries['truth'] == True].copy()
# VAF columns are named '<tf with 2 decimals>_<method>_vaf'; keep those that exist.
max_tf_vaf_cols = ['{:.2f}_{}_vaf'.format(aux['tf'].max(), m) for m in config.methods]
max_tf_vaf_cols = [c for c in max_tf_vaf_cols if c in gtdf.columns]
gtdf['median_vaf'] = gtdf[max_tf_vaf_cols].median(axis=1, skipna=True)
sns.histplot(data=gtdf, x="median_vaf", bins=30, stat='frequency')

In [None]:
# Save the per-mutation median VAF of the ground-truth set
# (the file name tags it as the 2000x exome, >=5-caller consensus).
gtdf['median_vaf'].to_csv('figures/figure2b/gt_986_exome_2000x_atleast5callersinundilutedsample_snv.csv')

In [None]:
# Hard-coded list of mutation ids ('chrom_pos_ref_alt').
# NOTE: a later cell rebinds Awithoutab to a DataFrame (A.loc[...]), and the
# cells that consume it use Awithoutab.index as labels, so this literal looks
# like a pasted snapshot rather than live input.
Awithoutab = ['12_40882016_G_T',
 '18_9122611_T_C',
 '19_9015323_C_A',
 '11_1085845_C_T',
 '15_23104346_C_G',
 '6_32712967_C_G',
 '7_100681363_C_A',
 '19_9018508_T_G',
 '7_152522195_C_T',
 '18_47364009_G_A',
 '19_9015324_A_G',
 '11_124253181_T_C',
 '4_103832610_C_T',
 '19_9012501_A_G',
 '11_71847122_C_T',
 '7_72412591_G_A',
 '11_46369127_G_A',
 '16_71163579_C_T',
 '4_190883030_G_T',
 '12_52431751_C_G',
 '7_112092224_G_T',
 '19_6222411_T_G',
 '12_11286164_G_A',
 '15_22471481_G_T',
 '17_21114408_T_C',
 '12_11244356_A_T',
 '18_47363917_A_G',
 '14_105939672_C_G',
 '19_41354606_G_T',
 '19_41354661_T_C',
 '18_13761856_G_C',
 '4_103832597_A_G',
 '12_11286149_G_A',
 '4_103832665_T_C',
 '11_1017806_G_T',
 '14_105410352_T_C',
 '12_11244378_C_G',
 '19_9018476_G_A',
 '17_39383069_A_G',
 '7_157959929_G_A',
 '4_103832643_A_C',
 '19_14877799_G_C',
 '15_22471522_C_T',
 '11_67759316_C_T',
 '15_50785016_A_G',
 '5_69372353_T_C',
 '17_5036748_G_T',
 '3_195506271_A_T',
 '19_9021141_G_A',
 '12_94543436_G_A',
 '3_195509006_C_G',
 '18_9122591_T_C',
 '12_11546742_T_C',
 '15_22471542_G_A',
 '15_23686113_C_G',
 '12_11546686_C_T',
 '16_88788786_G_A',
 '13_45968389_G_C',
 '6_31323233_A_G',
 '19_55795866_A_C',
 '12_52699033_G_A',
 '14_105173873_A_C',
 '18_31325884_T_C',
 '5_256514_G_C',
 '6_32712961_G_A',
 '16_88909572_G_A',
 '19_53116939_C_G',
 '19_1110748_G_C',
 '12_40880634_T_G',
 '17_5036740_C_G',
 '4_103832611_G_A',
 '7_143453661_T_G',
 '19_9012515_G_A',
 '11_1085849_A_C',
 '3_125297866_G_A',
 '4_103832656_C_T',
 '7_151896464_A_G',
 '19_9008248_G_C',
 '4_103832620_A_T',
 '7_151932923_G_A',
 '18_74208553_C_G',
 '19_1881355_T_C',
 '14_105173879_A_C',
 '17_63200307_G_A',
 '19_9008264_C_T',
 '9_117800_G_T',
 '15_32738694_G_A',
 '10_50534954_T_C',
 '19_9009652_C_T',
 '11_4608262_A_C',
 '17_74676686_C_A',
 '15_79057989_C_T',
 '11_89531533_G_T',
 '5_145613200_T_C',
 '15_22471573_C_T',
 '11_47647265_A_G',
 '3_195505897_G_A',
 '12_11546749_T_C',
 '12_40877466_T_C',
 '9_151046_A_G',
 '19_9018479_G_T',
 '12_11244390_T_C']

In [None]:
# Compare the ground-truth sets saved for the 150x and 2000x exome analyses.
A = pd.read_csv('figures/figure2b/gt_986_exome_150x_atleast5callersinundilutedsample_snv.csv', index_col=0)
B = pd.read_csv('figures/figure2b/gt_986_exome_2000x_atleast5callersinundilutedsample_snv.csv', index_col=0)
print(A.shape[0], B.shape[0])
# Mutation ids present in both ground truths.
ab = list(set(A.index) & set(B.index))
# Printed explicitly: a bare `len(ab)` in the middle of a cell is never displayed.
print(len(ab))
# Rows of A that are not shared with B, kept in A's original row order
# (a set difference would return them in arbitrary order).
Awithoutab = A.loc[~A.index.isin(ab)]

In [None]:
# Look up the 150x-only ground-truth positions in the current call table and
# summarise how they were called in the sample with the highest 'tf'.

# Column prefix of that sample is its tf formatted with two decimals.
top_tf = aux['tf'].max()
# Positions of Awithoutab that exist in the current call table.
lookup_ids = [pos for pos in list(Awithoutab.index) if pos in calltablesseries.index]
# Methods (smurf excluded) that have a call column for that sample.
callers = [m for m in config.methods
           if ('{:.2f}_{}'.format(top_tf, m) in calltablesseries.columns) and (m != 'smurf')]
subset = calltablesseries.loc[lookup_ids]

# How many callers flagged each position, tabulated.
ncallers = subset[['{:.2f}_{}'.format(top_tf, m) for m in callers]].sum(axis=1)
print(ncallers.value_counts())

plt.figure(figsize=(20,5))
# Median VAF across those callers, per position.
ac = subset[['{:.2f}_{}_vaf'.format(top_tf, m) for m in callers]].median(skipna=True, axis=1)
print(ac.shape)
sns.histplot(x=ac, binwidth=0.01, binrange=[0,1], alpha=0.5, label='150x only looked up in 2000x')
plt.legend()

In [None]:
# Display the comparison table.
# NOTE: `comp` is only assigned in the cell that follows this one in the file,
# so this cell works only after that cell has been executed.
comp

In [None]:
# 2D histogram comparing, per position, the median VAF stored in the 150x
# ground truth (Awithoutab) with the median VAF found in the current call table.

# Compute `ac` BEFORE it is used, so this cell does not depend on a value left
# behind by a previously executed cell.
top_tf = aux['tf'].max()
lookup_ids = [a for a in list(Awithoutab.index) if a in calltablesseries.index]
vaf_cols = ['{:.2f}_{}_vaf'.format(top_tf, m) for m in config.methods
            if ('{:.2f}_{}'.format(top_tf, m) in calltablesseries.columns) and (m != 'smurf')]
ac = calltablesseries.loc[lookup_ids][vaf_cols].median(skipna=True, axis=1)
print(ac.shape)

# Side-by-side table aligned on the mutation id index.
comp = pd.concat([Awithoutab, ac], axis=1)
comp.columns = ['vaf 150x', 'vaf 2000x']
plt.figure(figsize=(10,10))
sns.histplot(x='vaf 150x', y='vaf 2000x', data=comp, binwidth=0.01, binrange=[0,0.5], alpha=0.5)

In [None]:
# Load the saved SNV call table of the CRC-986 mixture series into `test`
# (first CSV column used as index) and preview its first rows.
test = pd.read_csv(os.path.join('data', 'mixtures_ultradeep', 'mixtures_chrall', 'mixtures_chrall_CRC-986_100215-CW-T_CRC-986_300316-CW-T' , 'calls', 'CRC-986_100215-CW-T_CRC-986_300316-CW-T_snv_calls_all.csv'), index_col=0)
test.head()

In [None]:
# Compare the shape of the in-memory call table with the one reloaded from disk.
calltables[muttype].shape, test.shape

In [None]:
# Run generate_groundtruth on the reloaded table, using the current gtm,
# muttype and aux['tf'] from earlier cells; the result replaces `test`.
test = generate_groundtruth(config,test, aux['tf'], ground_truth_method=gtm, muttype=muttype,
                                        matchedtissuepath=None, methods=['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'varnet', 'abemus', 'sinvict'])

In [None]:
# Rows outside chrX/chrY for which the call columns of the highest-tf sample
# ('<tf>_<method>', one per configured method) sum to at least 5.
# NOTE: the comparison is against the strings 'X'/'Y'; raises KeyError if any
# configured method has no such column.
test[(test['chrom'] != 'X') & (test['chrom'] != 'Y') & (test[['{:.2f}_{}'.format(max(aux['tf']), m) for m in config.methods]].sum(axis=1) >= 5)]

In [None]:
# Sum of the '41.90_freebayes' column (column name is hard-coded).
calltablesseries['41.90_freebayes'].sum()

In [None]:
# Highest value of the 'tf' column of the auxiliary table.
aux['tf'].max()

In [None]:
# Call columns ('<tf>_<method>') of the highest-tf sample, one per configured method.
calltablesseries[['{:.2f}_{}'.format(max(aux['tf']), m) for m in config.methods]]#.sum(axis=0)

In [None]:
# Quick inspection of the current state. Only the last expression of a cell is
# displayed automatically, so the first two values are printed explicitly.
print(max(aux['tf']))
print(calltables[muttype].columns)
dilutionseries

In [None]:
# Hand-typed auxiliary values for seven samples: 'tf' per sample, 'cov' of
# 1000 for the first sample and 2000 for the other six; remaining fields are NaN.
# This version has no 'sampleid' key (the next cell adds it).
aux_dict = {'tf': [41.9000, 20.9496, 15.7102, 10.4725, 8.3783, 4.18723, 2.0935], 'vaf': [np.nan] * 7, 'cov': [1000.0] + 6*[2000.0], 'ichorcna': [np.nan] * 7, 'samplename': [np.nan] * 7}
print(aux_dict)

In [None]:
# Build the tf/coverage table of the CRC-986 dilution series from hand-typed
# values and write it next to the call tables.
sampleids = [
    'mixture_chrall_CRC-986_100215-CW-T_1000x_CRC-986_300316-CW-T_0x',
    'mixture_chrall_CRC-986_100215-CW-T_1000x_CRC-986_300316-CW-T_1000x',
    'mixture_chrall_CRC-986_100215-CW-T_750x_CRC-986_300316-CW-T_1250x',
    'mixture_chrall_CRC-986_100215-CW-T_500x_CRC-986_300316-CW-T_1500x',
    'mixture_chrall_CRC-986_100215-CW-T_400x_CRC-986_300316-CW-T_1600x',
    'mixture_chrall_CRC-986_100215-CW-T_200x_CRC-986_300316-CW-T_1800x',
    'mixture_chrall_CRC-986_100215-CW-T_100x_CRC-986_300316-CW-T_1900x',
]
nsamples = len(sampleids)
# Key order matters: it becomes the column order of the saved table.
aux_dict = {
    'tf': [41.9000, 20.9496, 15.7102, 10.4725, 8.3783, 4.18723, 2.0935],
    'vaf': [np.nan] * nsamples,
    # first sample at 1000, every other sample at 2000
    'cov': [1000.0] + [2000.0] * (nsamples - 1),
    'ichorcna': [np.nan] * nsamples,
    'samplename': [np.nan] * nsamples,
    'sampleid': sampleids,
}
print(aux_dict)
# Index the table by sample id, as the cells that select rows via aux.T[...] expect.
aux = pd.DataFrame.from_dict(aux_dict).set_index('sampleid')
aux.to_csv('data/mixtures_ultradeep/mixtures_chrall/mixtures_chrall_CRC-986_100215-CW-T_CRC-986_300316-CW-T/calls/CRC-986_100215-CW-T_CRC-986_300316-CW-T_tf_cov.csv')

In [None]:
# Number of rows flagged as ground truth (sum of the 'truth' column).
calltablesseries['truth'].sum()

In [None]:
# Ids of non-X/Y rows of `test` whose highest-tf call columns sum to at least 5,
# followed by their count.
A = list(test[(test['chrom'] != 'X') & (test['chrom'] != 'Y') & (test[['{:.2f}_{}'.format(max(aux['tf']), m) for m in config.methods]].sum(axis=1) >= 5)].index)
len(A)

In [None]:
# Ids of the rows flagged as ground truth. The score-column selection does not
# change the index that is extracted; it only requires those columns to exist.
A = list(calltablesseries[calltablesseries['truth'] == True][['{:.2f}_{}_score'.format(aux['tf'].max(), m) for m in config.methods]].index)
A

In [None]:
# Hard-coded list of mutation ids ('chrom_pos_ref_alt') that later cells
# compare against A.
B = ['4_66217311_G_A',
 '4_103832597_A_G',
 '4_103832610_C_T',
 '4_103832611_G_A',
 '4_103832620_A_T',
 '4_103832643_A_C',
 '4_103832656_C_T',
 '4_103832665_T_C',
 '4_144922436_T_G',
 '4_190883030_G_T',
 '5_256514_G_C',
 '5_15937234_G_A',
 '5_31297469_C_T',
 '5_69372353_T_C',
 '5_89949151_G_A',
 '5_112128143_C_T',
 '5_140502997_A_C',
 '5_140503002_C_A',
 '5_140736267_T_C',
 '5_145613200_T_C',
 '5_175387011_G_A',
 '5_177733904_G_A',
 '6_26205178_C_T',
 '6_26368486_C_A',
 '6_29407980_G_A',
 '6_31323233_A_G',
 '6_32712961_G_A',
 '6_32712967_C_G',
 '6_116783166_T_C',
 '7_5352438_G_A',
 '7_44840986_A_C',
 '7_72412591_G_A',
 '7_72418618_A_T',
 '7_89790175_C_G',
 '7_100681211_G_C',
 '7_100681266_C_A',
 '7_100681293_G_A',
 '7_100681345_T_G',
 '7_100681363_C_A',
 '7_112092224_G_T',
 '7_117188736_C_A',
 '7_117188797_A_G',
 '7_143094423_C_T',
 '7_143140675_G_A',
 '7_143453661_T_G',
 '7_151896464_A_G',
 '7_151932923_G_A',
 '7_151945101_G_C',
 '7_152522195_C_T',
 '7_153750014_G_A',
 '7_157959929_G_A',
 '9_117800_G_T',
 '9_151046_A_G',
 '9_151071_T_C',
 '9_38571752_G_A',
 '9_38571802_A_G',
 '9_73233803_G_A',
 '9_111625513_C_T',
 '9_130507136_G_A',
 '9_140772453_G_C',
 '10_19856538_G_A',
 '10_22023011_C_T',
 '10_29779847_A_C',
 '10_50534954_T_C',
 '10_79569328_C_T',
 '10_103587994_C_T',
 '10_117278812_C_T',
 '10_124583002_C_T',
 '11_320606_G_T',
 '11_320649_G_A',
 '11_1017806_G_T',
 '11_1085845_C_T',
 '11_1085849_A_C',
 '11_4608262_A_C',
 '11_46369127_G_A',
 '11_47647265_A_G',
 '11_48347424_T_A',
 '11_64545497_C_T',
 '11_67759316_C_T',
 '11_71847122_C_T',
 '11_89531533_G_T',
 '11_93826745_G_A',
 '11_124253181_T_C',
 '11_132016219_C_T',
 '12_7945629_T_A',
 '12_11244356_A_T',
 '12_11244378_C_G',
 '12_11244390_T_C',
 '12_11244432_A_G',
 '12_11244646_T_C',
 '12_11244687_G_C',
 '12_11286149_G_A',
 '12_11286164_G_A',
 '12_11546686_C_T',
 '12_11546742_T_C',
 '12_11546749_T_C',
 '12_40877466_T_C',
 '12_40880634_T_G',
 '12_40882016_G_T',
 '12_40882735_A_G',
 '12_52431751_C_G',
 '12_52699033_G_A',
 '12_94543436_G_A',
 '13_20066994_T_C',
 '13_20067011_A_C',
 '13_45968389_G_C',
 '13_47470350_C_T',
 '13_73357855_C_T',
 '13_73636650_C_T',
 '13_88330033_A_C',
 '14_58063540_C_T',
 '14_77229439_C_T',
 '14_105173873_A_C',
 '14_105173879_A_C',
 '14_105410352_T_C',
 '14_105939672_C_G',
 '15_22471481_G_T',
 '15_22471522_C_T',
 '15_22471542_G_A',
 '15_22471573_C_T',
 '15_23104346_C_G',
 '15_23104657_G_C',
 '15_23686113_C_G',
 '15_28950629_C_T',
 '15_32738694_G_A',
 '15_41192469_C_T',
 '15_50785016_A_G',
 '15_51634218_A_C',
 '15_74425533_A_G',
 '15_79057989_C_T',
 '15_79067099_G_C',
 '15_84488603_G_T',
 '15_90213399_C_A',
 '16_2522557_C_T',
 '16_30718920_G_A',
 '16_67183982_C_T',
 '16_71163579_C_T',
 '16_88788786_G_A',
 '16_88909572_G_A',
 '17_1394895_G_C',
 '17_5036740_C_G',
 '17_5036748_G_T',
 '17_7386217_T_C',
 '17_7386234_G_A',
 '17_7578394_T_C',
 '17_11772531_C_T',
 '17_21114408_T_C',
 '17_28887667_T_C',
 '17_39383069_A_G',
 '17_56569055_A_G',
 '17_58234877_G_A',
 '17_63200307_G_A',
 '17_74676686_C_A',
 '18_335173_G_T',
 '18_9122591_T_C',
 '18_9122611_T_C',
 '18_9122638_G_C',
 '18_13761856_G_C',
 '18_31325884_T_C',
 '18_47363917_A_G',
 '18_47364009_G_A',
 '18_74208553_C_G',
 '19_1110748_G_C',
 '19_1440068_G_C',
 '19_1881355_T_C',
 '19_1881408_A_G',
 '19_6222411_T_G',
 '19_7677678_C_T',
 '19_9008248_G_C',
 '19_9008264_C_T',
 '19_9009652_C_T',
 '19_9012501_A_G',
 '19_9012515_G_A',
 '19_9015323_C_A',
 '19_9015324_A_G',
 '19_9018476_G_A',
 '19_9018479_G_T',
 '19_9018508_T_G',
 '19_9021141_G_A',
 '19_14877799_G_C',
 '19_15540781_C_A',
 '19_15918177_G_T',
 '19_33698018_C_T',
 '19_41354606_G_T',
 '19_41354661_T_C',
 '19_53116939_C_G',
 '19_55795866_A_C']

In [None]:
# Number of ids starting with '9_' (chromosome 9) in A and in B.
# Both counts are printed: as bare expressions only the second one was
# displayed and the first was silently discarded.
print(len([a for a in A if a.startswith('9_')]),
      len([b for b in B if b.startswith('9_')]))

In [None]:
print(len(A), len(B))
A = ['_'.join(a.split('_')[:2]) for a in A]
B = ['_'.join(b.split('_')[:2]) for b in B]
print(len(set(A) & set(B)))

In [None]:
# Inspect the mutect2 calls of the sample whose columns start with '20'
# (the '20.95_*' columns) that are NOT in the ground truth.
# The subset gets its own name: binding it to `aux` would overwrite the
# tf/coverage table that other cells read through aux['tf'].
calls_tf20 = calltablesseries[[c for c in list(calltablesseries.columns) if c.startswith('20') or c=='truth']]
#calls_tf20[calls_tf20['truth'] == True][['20.95_mutect2','20.95_mutect2_score', '20.95_freebayes_score', '20.95_strelka2_score', '20.95_vardict_score', '20.95_varscan_score', '20.95_abemus_score', '20.95_smurf_score']].head(50)

# Rows called by mutect2 but not flagged as truth: count, then first 50 scores.
not_truth_mutect2 = (calls_tf20['truth'] != True) & (calls_tf20['20.95_mutect2'] == True)
print(calls_tf20[not_truth_mutect2].shape)
calls_tf20[not_truth_mutect2]['20.95_mutect2_score'].head(50)#.isna().sum()

# Part II: (4) Load back metric results and plot combined metric plots

In [None]:

# Load a saved metric table for the current mixture and plot it with
# plot_metricsseries. Uses mixtureid and config from earlier cells.
mixtureids =  ['CRC-1014_180816-CW-T_CRC-1014_090516-CW-T', 'CRC-986_100215-CW-T_CRC-986_300316-CW-T', 'CRC-123_310715-CW-T_CRC-123_121115-CW-T']


#for fixedvar in fixedvars:
fixedvar = 'coverage'
if fixedvar == 'coverage':
    xaxis = 'tumor burden'
elif fixedvar == 'ctdna':
    xaxis = 'coverage'
#for mt in muttypes:
mt = 'snv'
# refname is part of the results file name. For SNVs both branches use gtm = 3;
# only the reference label (tissue vs undiluted sample) differs.
if mt == 'snv':
    if mixtureid == 'CRC-986_100215-CW-T_CRC-986_300316-CW-T':
        gtm = 3
        refname = 'intissuesamplebyatleast'+str(gtm)+'callers'
    else:
        gtm = 3
        refname = 'inundilutedsamplebyatleast'+str(gtm)+'callers'
else:  # elif mt == 'indel':
    gtm = 2
    refname = 'inundilutedsamplebyatleast'+str(gtm)+'callers'
print(refname)
# for metric in metrics:
metric = 'auprc'
# load results tables
restables = {'snv': [], 'indel': []}
# for mixtureid in mixtureids:
plasmasample = '_'.join(mixtureid.split('_')[:2])
print(mixtureid, plasmasample)
# File names abbreviate 'tumor burden' as 'tb'.
xa = xaxis if xaxis != 'tumor burden' else 'tb'
print(xa)
restable = pd.read_csv(os.path.join(*config.mixturefolderultradeep, 'mixtures_allchr', 'results', mixtureid+'_'+mt+'_'+metric+'_'+refname+'_fixed'+fixedvar+'_'+ xa +'.csv'), index_col=0)
restable['plasma sample'] = plasmasample
restables[mt].append(restable)
# Collapse the list of per-mixture tables (a single one here) into one DataFrame.
restables[mt] = pd.concat(restables[mt])
# res1 is consumed by later cells through res1['x'] and res1['y'].
res1 = plot_metricsseries(config, restables, mixtureids, 'all', metric=metric, muttype=mt,
                   ground_truth_method='mixture', fixedvar=fixedvar, refname=refname, allpatients=True, logscale=False, save=False)
#res2 = plot_metricsseries(config, restables, mixtureids, 'all', metric=metric, muttype=mt,
#                   ground_truth_method='mixture', fixedvar=fixedvar, refname=refname, allpatients=True, logscale=True, save=True)
#res3 = plot_metricsseries(config, restables, mixtureids, 'all', metric=metric, muttype=mt,
#                   ground_truth_method='mixture', fixedvar=fixedvar, refname=refname, allpatients=False, logscale=False, save=True)
#res4 = plot_metricsseries(config, restables, mixtureids, 'all', metric=metric, muttype=mt,
#                       ground_truth_method='mixture', fixedvar=fixedvar, refname=refname, allpatients=False, logscale=True, save=True)
plt.grid(linewidth=1)
plt.ylim([0, 0.5])
# Limits given as [22, 0]: the x axis runs from high to low values.
plt.xlim([22, 0])

In [None]:

# Same loading/plotting steps as the previous cell, with a different fixedvar
# and without axis limits. Uses mixtureid and config from earlier cells.
mixtureids =  ['CRC-1014_180816-CW-T_CRC-1014_090516-CW-T', 'CRC-986_100215-CW-T_CRC-986_300316-CW-T', 'CRC-123_310715-CW-T_CRC-123_121115-CW-T']


#for fixedvar in fixedvars:
# NOTE(review): 'tumor burden' matches neither branch below, so xaxis keeps
# whatever value an earlier cell left, and the results file name contains
# '_fixedtumor burden_'. Possibly 'ctdna' was intended -- confirm.
fixedvar = 'tumor burden'
if fixedvar == 'coverage':
    xaxis = 'tumor burden'
elif fixedvar == 'ctdna':
    xaxis = 'coverage'
#for mt in muttypes:
mt = 'snv'
# refname is part of the results file name. For SNVs both branches use gtm = 3;
# only the reference label (tissue vs undiluted sample) differs.
if mt == 'snv':
    if mixtureid == 'CRC-986_100215-CW-T_CRC-986_300316-CW-T':
        gtm = 3
        refname = 'intissuesamplebyatleast'+str(gtm)+'callers'
    else:
        gtm = 3
        refname = 'inundilutedsamplebyatleast'+str(gtm)+'callers'
else:  # elif mt == 'indel':
    gtm = 2
    refname = 'inundilutedsamplebyatleast'+str(gtm)+'callers'
print(refname)
# for metric in metrics:
metric = 'auprc'
# load results tables
restables = {'snv': [], 'indel': []}
# for mixtureid in mixtureids:
plasmasample = '_'.join(mixtureid.split('_')[:2])
print(mixtureid, plasmasample)
# File names abbreviate 'tumor burden' as 'tb'.
xa = xaxis if xaxis != 'tumor burden' else 'tb'
print(xa)
restable = pd.read_csv(os.path.join(*config.mixturefolderultradeep, 'mixtures_allchr', 'results', mixtureid+'_'+mt+'_'+metric+'_'+refname+'_fixed'+fixedvar+'_'+ xa +'.csv'), index_col=0)
restable['plasma sample'] = plasmasample
restables[mt].append(restable)
# Collapse the list of per-mixture tables (a single one here) into one DataFrame.
restables[mt] = pd.concat(restables[mt])
# res1 is consumed by later cells through res1['x'] and res1['y'].
res1 = plot_metricsseries(config, restables, mixtureids, 'all', metric=metric, muttype=mt,
                   ground_truth_method='mixture', fixedvar=fixedvar, refname=refname, allpatients=True, logscale=False, save=False)
#res2 = plot_metricsseries(config, restables, mixtureids, 'all', metric=metric, muttype=mt,
#                   ground_truth_method='mixture', fixedvar=fixedvar, refname=refname, allpatients=True, logscale=True, save=True)
#res3 = plot_metricsseries(config, restables, mixtureids, 'all', metric=metric, muttype=mt,
#                   ground_truth_method='mixture', fixedvar=fixedvar, refname=refname, allpatients=False, logscale=False, save=True)
#res4 = plot_metricsseries(config, restables, mixtureids, 'all', metric=metric, muttype=mt,
#                       ground_truth_method='mixture', fixedvar=fixedvar, refname=refname, allpatients=False, logscale=True, save=True)
plt.grid(linewidth=1)
#plt.ylim([0, 0.5])
#plt.xlim([22, 0])

In [None]:
# Turn the curves returned by plot_metricsseries (res1) into, for each method,
# a triple res[m] = [mean y, std y, x].
resx = np.array([rx.values for rx in res1['x']])
resy = np.array([ry.values for ry in res1['y']])
res = {m: [] for m in config.methods}
for mi, m in enumerate(config.methods):
    if len(resy.shape) > 2:
        # NOTE(review): the offsets 5 and 10 hard-code three groups of five
        # curves; confirm they match the layout of res1['y'] when the number
        # of configured methods is not 5.
        grouped = [resy[mi], resy[mi+5], resy[mi+10]]
        resmean = np.mean(grouped, axis=0)
        resstd = np.std(grouped, axis=0)
        #reslabel = m
    else:
        resmean = list(resy[mi])
        # One curve per method: zero spread, sized from the curve itself
        # rather than from a hard-coded series length of 7.
        resstd = [0] * len(resmean)
    if len(resx.shape) == 2:
        # All curves are taken to share the x values of the first one.
        res[m] = [resmean, resstd, resx[0]]
    else:
        res[m] = [resmean, resstd, resx.mean(axis=0)]

In [None]:
color_dict = {config.methods[i]: config.colors[i] for i in range(len(config.methods))}
plt.figure(figsize=(15, 10))
plt.grid(linewidth=0)
for m in config.methods:
    #plt.plot(res['freebayes'][2], res['freebayes'][0], c=color_dict['freebayes'], markersize=15, lw=2)
    plt.plot(res[m][2], res[m][0], marker=config.markers[0], label=m,  markersize=15, lw=2)
    plt.gca().invert_xaxis()
ax = plt.gca()
xlab = 'coverage or added noise'
xlab='tumor burden'
hand, labl = ax.get_legend_handles_labels()
ax.legend(hand, labl, bbox_to_anchor=(1, 1), loc="upper left")
plt.xlabel(xlab)
plt.ylabel(metric.upper()+' score')
plt.title(metric.upper() + " score for {} calling in chr{} with ref {}".format(mt.upper(), chrom, refname))

In [None]:
# Reverse the x values (slot 2), the means (slot 0) and the stds (slot 1) of
# every method so the series run in the opposite order.
# NOTE: not idempotent -- running this cell twice restores the original order.
for m in config.methods:
    for slot in (2, 0, 1):
        res[m][slot] = res[m][slot][::-1]

In [None]:
# Rank score per method: area under the (x, mean metric) curve, accumulated
# segment by segment starting from the origin (0, 0).
# The per-method totals are collected in rankres, in config.methods order.
rankres = []
for m in config.methods:
    xs, ys = res[m][2], res[m][0]
    prev_x = 0
    prev_y = 0
    areas = []
    for x, y in zip(xs, ys):
        width = x - prev_x
        # Rectangle of height y minus the triangle above the segment joining
        # the previous point to the current one, i.e. the trapezoid area.
        areas.append((width * y) - (width * (y - prev_y)/2))
        prev_x, prev_y = x, y
    total = np.sum(areas)
    print(m, total)
    rankres.append(total)

In [None]:
import matplotlib.patches as mpatches

# Bar chart of the per-method rank scores (rankres), best method first.
rankres_sort = np.argsort(rankres)[::-1]
plt.figure(figsize=(14, 5))
method_order = [config.methods[i] for i in rankres_sort]
rank_order = [rankres[i] for i in rankres_sort]
# Colour each bar by the type of ITS method. A fixed positional colour list is
# applied after sorting, so it colours bars by rank rather than by method and
# the legend can end up wrong.
# NOTE(review): the set of cfDNA-specific callers is assumed to be
# {'abemus', 'sinvict'} -- confirm against the project configuration.
cfdna_methods = {'abemus', 'sinvict'}
bar_colors = ['r' if m in cfdna_methods else 'b' for m in method_order]
plt.bar(method_order, rank_order, color=bar_colors)
blue_patch = mpatches.Patch(color='b', label='DNA methods')
red_patch = mpatches.Patch(color='r', label='cfDNA methods')
plt.legend(handles=[blue_patch, red_patch])
#plt.legend(['DNA methods', 'cfDNA methods'])
plt.ylabel('area under AUPRC curve')
plt.title('Caller ranking mixtures with decreasing tumor burden')