# Series analysis


Analysis updates:
- retrieve calls rejected only because VAF<0.01 (bcbio lower acceptable threshold) for Mutect2, Strelka2 and Vardict
- correct for germline mutations using GATK haplotype calls
- get PR curves for sinvict by encoding the 6 files as thresholds. Assumption: linear filters
- plotting: PR curves stop at 10e-2 on the left
- ground truths:

    1) Consensus: build using
        majority of 5/8 callers for SNV and 3/5 callers for INDELS


    2) Ranked mutations: metascore built using
        weighted sum of normalised scores between 0 and 1 for each caller
        with weights = inversely proportional to the number of calls made by the caller (if a caller calls few mutations: higher weight; if it makes many calls: lower weight)
        threshold = 1/ncallers
        interpretation: if 1 caller is sure (score = 1) of calling this position, add it to GT
        interpretation: if 2 callers are quite sure of calling this position (score > 0.5 each), add it to GT
        
- integrate VAF approx

    1) mixture with VAF instead of tumor burden
    
    2) correct for mutations not present in diluted samples (vaf = 0)
    
    3) pool patients together using VAF

In [None]:
# Imports

%load_ext autoreload
%autoreload 2

import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
import warnings
from sklearn.metrics import precision_recall_curve, f1_score, average_precision_score
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix

# Step up one directory unless the current path already ends with
# 'cfdna_snv_benchmark' (presumably the project root expected by the
# relative 'config/' and 'data/' paths used below).
current_dir = os.getcwd()
if not current_dir.endswith('cfdna_snv_benchmark'):
    os.chdir('../')
print(f'Current working directory: {os.getcwd()}')

from utils.config import Config
from utils.viz import *
from utils.table import *
from utils.metrics import *
from utils.calltable import *
from utils.calltableseries import *
from utils.groundtruth import *
from utils.metricsseries import *
from utils.venn import venn6, get_labels

In [None]:
# Config and display parameters

# Build the configuration object from the "config/" folder and "config_viz.yaml".
config = Config("config/", "config_viz.yaml")
# Apply the display settings (helper presumably provided by the utils star-imports).
set_display_params(config)
# Show the list of methods defined by the configuration.
print(config.methods)

In [None]:
# Chromosome / mixture selection and shared analysis parameters

mixtureids = [
    'CRC-1014_180816-CW-T_CRC-1014_090516-CW-T',
    'CRC-123_310715-CW-T_CRC-123_121115-CW-T',
    'CRC-986_100215-CW-T_CRC-986_300316-CW-T',
]
# Mixture analysed by the cells below. Alternatives that were toggled by hand:
#   'CRC-986_100215-CW-T_CRC-986_300316-CW-T'
#   'CRC-1014_180816-CW-T_CRC-1014_090516-CW-T'
#   'BRA-412_240820-CW-T_BRA-412_060220-CW-T'
mixtureid = 'CRC-123_310715-CW-T_CRC-123_121115-CW-T'

# Flags and filter forwarded to get_calltableseries / plotting helpers.
reload = False
save = False
fixedvars = ['coverage', 'ctdna']
filterparam = 'all'

# Plot styling: one colour per method, paired by position in the config lists.
markers = ['o', '^', 'X']
linestyles = ['-'] * 3
color_dict = {method: config.colors[idx] for idx, method in enumerate(config.methods)}

muttypes = ['snv', 'indel']
metrics = ['auprc', 'precision', 'recall']

chrom = 'all'

# INDELS: patient 986 = 96, patient 123 = 357, patient 1014 = 277 (≥ 4/6 callers without SMURF)

# Part I: (1) Load/Generate call tables, (2) Generate Ground truths and (3) Compute/Save metrics per patient

In [None]:
# For every fixed variable: load the call tables of the selected mixture,
# build the ground truth and compute the metric curves across the series.
fixedvar = 'coverage'
for fixedvar in fixedvars:
    # Each pair supplies the two "<n>x" numbers found in the mixture names.
    if fixedvar == 'coverage':
        seriesorder = [(70, 0), (70, 80), (50, 100), (30, 120), (20, 130), (10, 140), (5, 145)]
        xaxis = 'tumor burden'
    elif fixedvar == 'ctdna':
        seriesorder = [(70, 0), (70, 80), (70, 180)]
        xaxis = 'coverage'
    print('############# {} ############'.format(mixtureid))

    # Chromosomes 1-22 minus a per-mixture exclusion list.
    excluded_chroms = {
        'CRC-1014_180816-CW-T_CRC-1014_090516-CW-T': {2, 6, 17, 19, 20, 21},
        'CRC-986_100215-CW-T_CRC-986_300316-CW-T': {1, 2, 8, 20, 21, 22},
    }.get(mixtureid, {6, 19, 20})
    chroms = [str(c) for c in range(1, 23) if c not in excluded_chroms]

    # Load one call table per mutation type; `aux` is kept from the last call.
    aux_all = []
    loader_args = dict(filterparam=filterparam, reload=reload, save=save)
    calltable_snv, aux = get_calltableseries(config, mixtureid, chroms, muttype='snv', **loader_args)
    calltable_indel, aux = get_calltableseries(config, mixtureid, chroms, muttype='indel', **loader_args)
    calltable_snp, aux = get_calltableseries(config, mixtureid, chroms, muttype='snp', **loader_args)
    print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
    print(aux)

    # The mixture id is "<plasma>_<plasma>_<healthy>_<healthy>" split on '_'.
    id_parts = mixtureid.split('_')
    plasmasample = '_'.join(id_parts[:2])
    print(plasmasample)
    healthysample = '_'.join(id_parts[2:])
    print(healthysample)

    calltables = {
        'sampleid': mixtureid,
        # Unique column prefixes minus the last five, cast to float
        # (presumably the trailing five are non-numeric prefixes; TODO confirm).
        'tf': np.unique([cn.split('_')[0] for cn in calltable_snv.columns])[:-5].astype(float),
        'cov': [],
        'snv': pd.concat([calltable_snv]),
        'indel': pd.concat([calltable_indel]),
        'snp': pd.concat([calltable_snp]),
    }

    # Rows of `aux` for the mixtures of this series, in series order.
    dilution_names = [
        'mixture_{}_{}x_{}_{}x'.format(plasmasample, first_x, '_'.join(id_parts[2:4]), second_x)
        for first_x, second_x in seriesorder
    ]
    dilutionseries = aux.T[dilution_names].T

    # Only SNVs are processed here (the loop over muttypes is disabled).
    muttype = 'snv'
    refsample = 'undiluted'
    gtm = 5 if muttype == 'snv' else 4
    print(max(aux['tf']))

    # Ground truth from the callers listed below. A tissue-based variant
    # (ground_truth_method='tissue' with a matched tissue call file) and a
    # variant without 'varnet' were previously toggled here by hand.
    calltablesseries = generate_groundtruth(
        config, calltables[muttype], aux['tf'], ground_truth_method=gtm, muttype=muttype,
        matchedtissuepath=None,
        methods=['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'varnet', 'abemus', 'sinvict'])

    # Metric curves. Other metric names previously run from this cell:
    # 'recall', 'precision', 'maxf1', 'maxrecallatleast0_04precision',
    # 'maxf1recall', 'maxf1precision'; figure_curve_allchr(..., xy='pr') was
    # also called here to draw the PR curves.
    curve_args = dict(ground_truth_method=gtm, refsample=refsample, muttype=muttype,
                      methods=config.methods, fixedvar=fixedvar, xaxis=xaxis)
    results_auprc_df = metric_curve_allchr(
        config, calltablesseries, dilutionseries, mixtureid, metric='auprc', **curve_args)
    results_maxrecallatlast0_03precision_df = metric_curve_allchr(
        config, calltablesseries, dilutionseries, mixtureid,
        metric='maxrecallatleast0_03precision', **curve_args)

In [None]:
# Display `aux`, the second value returned by get_calltableseries (its 'tf' column is used for the ground truth).
aux

In [None]:
# Same ground-truth + metric analysis for the BRA-412 mixture series, using
# the pre-computed call table and tf/coverage table stored on disk.
#
# The mixture id is set here explicitly: the dilution names and the metric
# outputs below are derived from `mixtureid`, so leaving the id of a
# previously analysed patient in place would look up the wrong rows in `aux`
# and label the results with the wrong patient.
mixtureid = 'BRA-412_240820-CW-T_BRA-412_060220-CW-T'
allchr_dir = os.path.join('data', 'mixtures', 'mixtures_allchr')
calltablesseries = pd.read_csv(os.path.join(allchr_dir, mixtureid + '_snv_calls_all.csv'), index_col=0)
aux = pd.read_csv(os.path.join(allchr_dir, mixtureid + '_tf_cov.csv'), index_col=0)
gtm = 5
muttype = 'snv'
refsample = 'undiluted'
fixedvar = 'coverage'
# Each pair supplies the two "<n>x" numbers found in the mixture names.
if fixedvar == 'coverage':
    seriesorder = [(100, 0), (70, 30), (50, 50), (30, 70), (20, 80), (10, 90)]
    xaxis = 'tumor burden'
elif fixedvar == 'ctdna':
    seriesorder = [(100, 0), (100, 50), (100, 100)]
    xaxis = 'coverage'
# Rows of `aux` for the mixtures of this series, in series order.
dilutionseries = aux.T[['mixture_' + '_'.join(mixtureid.split('_')[:2]) + '_' + str(s[0]) + 'x_' + '_'.join(mixtureid.split('_')[2:4]) + '_' + str(s[1]) + 'x' for s in seriesorder]].T
calltablesseries = generate_groundtruth(config, calltablesseries, aux['tf'], ground_truth_method=gtm, muttype=muttype,
                                        matchedtissuepath=None, methods=['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'varnet', 'abemus', 'sinvict'])
results_auprc_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='auprc', ground_truth_method=gtm,
                                 refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture')
results_maxrecallatlast0_03precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxrecallatleast0_03precision', ground_truth_method=gtm,
                                      refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis, diltype='mixture')

In [None]:
# Display the current `aux` table (index: mixture names, includes a 'tf' column).
aux

In [None]:
# Recompute the 'maxrecallatleast0_03precision' curve on the current call table.
# NOTE(review): the variable name says 0_05 while the metric requested is
# 0_03 -- confirm which precision threshold is intended.
results_maxrecallatlast0_05precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxrecallatleast0_03precision', ground_truth_method=gtm,
                                      refsample=refsample, muttype=muttype, methods=config.methods, fixedvar=fixedvar, xaxis=xaxis)

In [None]:
# Count how many rows of the call table are flagged as ground truth vs not.
print(calltablesseries['truth'].value_counts())

In [None]:
# Sanity check of the constant used below: 10e6 is 1e7, so 3*10e6 is 3e7 (30,000,000), not 3e6.
3*10e6 == 30000000

In [None]:
# Ground-truth prevalence, relative to the number of rows in the call table
# and relative to a fixed count of 3*10e6 (= 3e7) positions
# (presumably the size of the sequenced region; TODO confirm).
n_rows = calltablesseries.shape[0]
n_true = int((calltablesseries['truth'] == True).sum())
n_positions = 3*10e6

print(n_rows)
print(calltablesseries['truth'].value_counts())
print(n_true / n_rows)
print(n_true / n_positions)
print(72815 * n_true / n_positions)
print(0.03 / (n_true / n_positions))
print(0.03 / (n_true / n_rows))

print(100000 * (n_true / n_positions))
print(500 * (n_true / n_rows))

In [None]:
# Display the configured methods as a plain list.
list(config.methods)

# GT analysis

In [None]:
# Config and display parameters

# Re-create the configuration object from the "config/" folder and "config_viz.yaml".
config = Config("config/", "config_viz.yaml")
# Apply the display settings (helper presumably provided by the utils star-imports).
set_display_params(config)
# Show the list of methods defined by the configuration.
print(config.methods)

In [None]:
# Load the call tables of one mixture and build its SNV ground truth
# (no metric computation in this cell).
mixtureid = 'CRC-986_100215-CW-T_CRC-986_300316-CW-T'
# Alternatives that were toggled by hand:
#   'CRC-1014_180816-CW-T_CRC-1014_090516-CW-T'
#   'CRC-123_310715-CW-T_CRC-123_121115-CW-T'

fixedvar = 'coverage'
# Each pair supplies the two "<n>x" numbers found in the mixture names.
if fixedvar == 'coverage':
    seriesorder = [(70, 0), (70, 80), (50, 100), (30, 120), (20, 130), (10, 140), (5, 145)]
    xaxis = 'tumor burden'
elif fixedvar == 'ctdna':
    seriesorder = [(70, 0), (70, 80), (70, 180)]
    xaxis = 'coverage'
print('############# {} ############'.format(mixtureid))

# Chromosomes 1-22 minus a per-mixture exclusion list.
excluded_chroms = {
    'CRC-1014_180816-CW-T_CRC-1014_090516-CW-T': {2, 6, 17, 19, 20, 21},
    'CRC-986_100215-CW-T_CRC-986_300316-CW-T': {1, 2, 8, 20, 21, 22},
}.get(mixtureid, {6, 19, 20})
chroms = [str(c) for c in range(1, 23) if c not in excluded_chroms]

# Load one call table per mutation type; `aux` is kept from the last call.
aux_all = []
loader_args = dict(filterparam=filterparam, reload=reload, save=save)
calltable_snv, aux = get_calltableseries(config, mixtureid, chroms, muttype='snv', **loader_args)
calltable_indel, aux = get_calltableseries(config, mixtureid, chroms, muttype='indel', **loader_args)
calltable_snp, aux = get_calltableseries(config, mixtureid, chroms, muttype='snp', **loader_args)
print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
print(aux)

# The mixture id is "<plasma>_<plasma>_<healthy>_<healthy>" split on '_'.
id_parts = mixtureid.split('_')
plasmasample = '_'.join(id_parts[:2])
print(plasmasample)
healthysample = '_'.join(id_parts[2:])
print(healthysample)

calltables = {
    'sampleid': mixtureid,
    # Unique column prefixes minus the last five, cast to float
    # (presumably the trailing five are non-numeric prefixes; TODO confirm).
    'tf': np.unique([cn.split('_')[0] for cn in calltable_snv.columns])[:-5].astype(float),
    'cov': [],
    'snv': pd.concat([calltable_snv]),
    'indel': pd.concat([calltable_indel]),
    'snp': pd.concat([calltable_snp]),
}

# Rows of `aux` for the mixtures of this series, in series order.
dilution_names = [
    'mixture_{}_{}x_{}_{}x'.format(plasmasample, first_x, '_'.join(id_parts[2:4]), second_x)
    for first_x, second_x in seriesorder
]
dilutionseries = aux.T[dilution_names].T

# Only SNVs are processed here (the loop over muttypes is disabled).
muttype = 'snv'
refsample = 'undiluted'
gtm = 5 if muttype == 'snv' else 3
print(max(aux['tf']))

# 'varnet' is only part of the caller set for the CRC-986 mixture.
# A tissue-based ground truth (ground_truth_method='tissue' with a matched
# tissue call file) was previously toggled here by hand for that mixture.
if mixtureid == 'CRC-986_100215-CW-T_CRC-986_300316-CW-T':
    gt_methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'varnet', 'abemus', 'sinvict']
else:
    gt_methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'abemus', 'sinvict']
calltablesseries = generate_groundtruth(
    config, calltables[muttype], aux['tf'], ground_truth_method=gtm, muttype=muttype,
    matchedtissuepath=None, methods=gt_methods)

In [None]:
# Histogram of the median VAF (across callers) of the ground-truth mutations,
# taken from the columns of the sample with the highest tf.
# .copy() makes gtdf an independent frame, so adding a column below does not
# write to a slice of calltablesseries (pandas' SettingWithCopy situation).
gtdf = calltablesseries[calltablesseries['truth'] == True].copy()
top_tf = aux['tf'].max()
# Keep only the per-method VAF columns that actually exist in the table.
vaf_cols = ['{:.2f}_{}_vaf'.format(top_tf, m) for m in config.methods]
vaf_cols = [col_name for col_name in vaf_cols if col_name in gtdf.columns]
gtdf['median_vaf'] = gtdf[vaf_cols].median(axis=1, skipna=True)
sns.histplot(data=gtdf, x="median_vaf", bins=30, stat='frequency')

In [None]:
# Save the ground-truth median VAFs (indexed like gtdf) for the 150x vs 2000x comparison.
# NOTE(review): the file name hard-codes patient 986 / 150x / >=5 callers -- confirm it matches the current mixtureid and gtm.
gtdf['median_vaf'].to_csv('figures/figure2b/gt_986_exome_150x_atleast5callersinundilutedsample_snv.csv')

In [None]:
# Show the call-table rows whose index label is duplicated in `ac`.
# NOTE: `ac` is assigned in a later cell of this notebook, so that cell must have been run first.
calltablesseries.loc[ac.loc[ac.index.duplicated()].index]

In [None]:
# Show the rows of `Bwithoutab` whose index label is duplicated.
# NOTE: `Bwithoutab` is assigned in a later cell of this notebook, so that cell must have been run first.
Bwithoutab.loc[Bwithoutab.loc[Bwithoutab.index.duplicated()].index]

In [None]:
# 2D histogram comparing the median VAF of ground-truth SNVs between the
# 150x (A) and 2000x (B) ground truths: mutations found in both, only at
# 2000x, and only at 150x are drawn in three different colours.
A = pd.read_csv('figures/figure2b/gt_986_exome_150x_atleast5callersinundilutedsample_snv.csv', index_col=0)
B = pd.read_csv('figures/figure2b/gt_986_exome_2000x_atleast5callersinundilutedsample_snv.csv', index_col=0)
print(A.shape[0], B.shape[0])
# Mutations present in both ground truths, and those specific to each one.
ab = list(set(A.index) & set(B.index))
Awithoutab = A.loc[list(set(A.index) - set(ab))]
Bwithoutab = B.loc[list(set(B.index) - set(ab))]

plt.figure(figsize=(10,10))

#calltablesseries = pd.read_csv(os.path.join('data', 'mixtures', 'mixtures_allchr', 'CRC-986_100215-CW-T_CRC-986_300316-CW-T_snv_calls_all.csv'), index_col=0, memory_map=True)
# NOTE(review): `aux` is replaced by the ultradeep tf/coverage table here, and
# its max tf is then used to pick columns of `calltablesseries` -- confirm
# both tables use the same tf prefix.
aux = pd.read_csv(os.path.join('data', 'mixtures_ultradeep', 'mixtures_chrall', 'mixtures_chrall_CRC-986_100215-CW-T_CRC-986_300316-CW-T' , 'calls', 'CRC-986_100215-CW-T_CRC-986_300316-CW-T_tf_cov.csv'), index_col=0)

# --- 2000x only: x = median VAF read from calltablesseries, y = VAF stored in B.
ac = pd.DataFrame(calltablesseries.loc[[a for a in list(Bwithoutab.index) if a in calltablesseries.index]][['{:.2f}_{}_vaf'.format(aux['tf'].max(), m) for m in config.methods if ('{:.2f}_{}'.format(aux['tf'].max(), m) in calltablesseries.columns) and (m != 'smurf')]].median(skipna=True, axis=1))
# Duplicate index labels would break the column-wise concat below.
ac = ac[~ac.index.duplicated()]
Bwithoutab = Bwithoutab[~Bwithoutab.index.duplicated()]
comp = pd.concat([ac, Bwithoutab], axis=1)
comp.columns = ['vaf 150x', 'vaf 2000x']
# Mutations without a value on one side are plotted at VAF 0.
comp.fillna(0, inplace=True)
list_only2000x = list(comp.index)
print(Bwithoutab.shape[0], ac.shape[0])
sns.histplot(x='vaf 150x', y='vaf 2000x', data=comp, binwidth=0.01, binrange=[0,.5], alpha=1, color='tab:blue', label='2000x only')

# --- both: x = VAF stored in A, y = VAF stored in B.
comp = pd.concat([A.loc[ab], B.loc[ab]], axis=1)
comp.columns = ['vaf 150x', 'vaf 2000x']
comp.fillna(0, inplace=True)
list_both = list(comp.index)
sns.histplot(x='vaf 150x', y='vaf 2000x', data=comp, binwidth=0.01, binrange=[0,.5], alpha=1, color='tab:red', label='both')

# NOTE(review): `table_ultradeep` and `aux_ultradeep` are not created in this
# cell (the loading code below is disabled); they must already exist from
# another cell.
#table_ultradeep = pd.read_csv(os.path.join('data', 'mixtures_ultradeep', 'mixtures_chrall', 'mixtures_chrall_CRC-986_100215-CW-T_CRC-986_300316-CW-T' , 'calls', 'CRC-986_100215-CW-T_CRC-986_300316-CW-T_snv_calls_all.csv'), index_col=0, memory_map=True)
#aux_ultradeep = pd.read_csv(os.path.join('data', 'mixtures_ultradeep', 'mixtures_chrall', 'mixtures_chrall_CRC-986_100215-CW-T_CRC-986_300316-CW-T' , 'calls', 'CRC-986_100215-CW-T_CRC-986_300316-CW-T_tf_cov.csv'), index_col=0)
#table_ultradeep = generate_groundtruth(config, table_ultradeep, aux_ultradeep['tf'], ground_truth_method=5, muttype='snv', matchedtissuepath=None, methods=['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'varnet', 'abemus', 'sinvict'])

# --- 150x only: x = VAF stored in A, y = median VAF for the same mutations.
# NOTE(review): rows are read from `calltablesseries` although the row filter
# and the column names come from the ultradeep table -- confirm whether
# `table_ultradeep.loc[...]` was intended.
ac = calltablesseries.loc[[a for a in list(Awithoutab.index) if a in table_ultradeep.index]][['{:.2f}_{}_vaf'.format(aux_ultradeep['tf'].max(), m) for m in config.methods if ('{:.2f}_{}'.format(aux_ultradeep['tf'].max(), m) in table_ultradeep.columns) and (m != 'smurf')]].median(skipna=True, axis=1)
comp = pd.concat([Awithoutab, ac], axis=1)
comp.columns = ['vaf 150x', 'vaf 2000x']
comp.fillna(0, inplace=True)
list_only150x = list(comp.index)
print(Awithoutab.shape[0], ac.shape[0])
sns.histplot(x='vaf 150x', y='vaf 2000x', data=comp, binwidth=0.01, binrange=[0,.5], alpha=1, color='tab:olive', label='150x only')

# Proxy handles for the legend, one per histogram colour used above.
from matplotlib.lines import Line2D
a = Line2D([0], [0], color='tab:red', lw=4)    # 'both' is drawn in tab:red
b = Line2D([0], [0], color='tab:blue', lw=4)   # '2000x only' is drawn in tab:blue
c = Line2D([0], [0], color='tab:olive', lw=4)  # '150x only' is drawn in tab:olive

# Handles are listed in the same order as their labels so each colour gets
# the label of the histogram drawn with it.
plt.legend([a, b, c], ['both', '2000x only', '150x only'], bbox_to_anchor=(1,1), loc="upper left")
plt.show()

In [None]:
# Variant of the 150x vs 2000x median-VAF comparison with a [0, 1] bin range;
# this version also loads the ultradeep call table and builds its ground truth.
A = pd.read_csv('figures/figure2b/gt_986_exome_150x_atleast5callersinundilutedsample_snv.csv', index_col=0)
B = pd.read_csv('figures/figure2b/gt_986_exome_2000x_atleast5callersinundilutedsample_snv.csv', index_col=0)
print(A.shape[0], B.shape[0])
# Mutations present in both ground truths, and those specific to each one.
ab = list(set(set(list(A.index)) & set(list(B.index))))
len(ab)
Awithoutab = A.loc[list(set(A.index) - set(ab))]
Bwithoutab = B.loc[list(set(B.index) - set(ab))]

plt.figure(figsize=(10,10))
# 2000x only: x = median VAF read from calltablesseries, y = VAF stored in B.
ac = pd.DataFrame(calltablesseries.loc[[a for a in list(Bwithoutab.index) if a in calltablesseries.index]][['{:.2f}_{}_vaf'.format(aux['tf'].max(), m) for m in config.methods if ('{:.2f}_{}'.format(aux['tf'].max(), m) in calltablesseries.columns) and (m != 'smurf')]].median(skipna=True, axis=1))
# Drop duplicated index labels before the column-wise concat.
ac = ac[~ac.index.duplicated()]
Bwithoutab = Bwithoutab[~Bwithoutab.index.duplicated()]
comp = pd.concat([ac, Bwithoutab], axis=1)
# Mutations without a value on one side are plotted at VAF 0.
comp.fillna(0, inplace=True)
comp.columns = ['vaf 150x', 'vaf 2000x']
print(Bwithoutab.shape[0], ac.shape[0])
sns.histplot(x='vaf 150x', y='vaf 2000x', data=comp, binwidth=0.01, binrange=[0,1], alpha=1, color='g', label='2000x only')

# Both: x = VAF stored in A, y = VAF stored in B (no explicit colour is passed here).
comp = pd.concat([A.loc[ab], B.loc[ab]], axis=1)
comp.columns = ['vaf 150x', 'vaf 2000x']
sns.histplot(x='vaf 150x', y='vaf 2000x', data=comp, binwidth=0.01, binrange=[0,1], alpha=1, label='both')

# Ultradeep call table and tf/coverage table, with a ground truth built with method 5.
table_ultradeep = pd.read_csv(os.path.join('data', 'mixtures_ultradeep', 'mixtures_chrall', 'mixtures_chrall_CRC-986_100215-CW-T_CRC-986_300316-CW-T' , 'calls', 'CRC-986_100215-CW-T_CRC-986_300316-CW-T_snv_calls_all.csv'), index_col=0, memory_map=True)
aux_ultradeep = pd.read_csv(os.path.join('data', 'mixtures_ultradeep', 'mixtures_chrall', 'mixtures_chrall_CRC-986_100215-CW-T_CRC-986_300316-CW-T' , 'calls', 'CRC-986_100215-CW-T_CRC-986_300316-CW-T_tf_cov.csv'), index_col=0)
table_ultradeep = generate_groundtruth(config, table_ultradeep, aux_ultradeep['tf'], ground_truth_method=5, muttype='snv', matchedtissuepath=None, methods=['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'varnet', 'abemus', 'sinvict'])

# 150x only: x = VAF stored in A, y = median VAF for the same mutations.
# NOTE(review): rows are read from `calltablesseries` although the row filter
# and the column names come from the ultradeep table -- confirm whether
# `table_ultradeep.loc[...]` was intended.
ac = calltablesseries.loc[[a for a in list(Awithoutab.index) if a in table_ultradeep.index]][['{:.2f}_{}_vaf'.format(aux_ultradeep['tf'].max(), m) for m in config.methods if ('{:.2f}_{}'.format(aux_ultradeep['tf'].max(), m) in table_ultradeep.columns) and (m != 'smurf')]].median(skipna=True, axis=1)
comp = pd.concat([Awithoutab, ac], axis=1)
comp.fillna(0, inplace=True)
comp.columns = ['vaf 150x', 'vaf 2000x']
print(Awithoutab.shape[0], ac.shape[0])
sns.histplot(x='vaf 150x', y='vaf 2000x', data=comp, binwidth=0.01, binrange=[0,1], alpha=1, color='r', label='150x only')

# Proxy handles for the legend: a = green ('2000x only'), c = red ('150x only').
# b = blue is paired with 'both', whose histogram uses the default colour
# (presumably a blue; confirm against the rendered figure).
from matplotlib.lines import Line2D
a = Line2D([0], [0], color='g', lw=4)
b = Line2D([0], [0], color='b', lw=4)
c = Line2D([0], [0], color='r', lw=4)

plt.legend([b, a,  c], [ 'both', '2000x only', '150x only'], bbox_to_anchor=(1,1), loc="upper left")
plt.show()

In [None]:
from scipy.stats import fisher_exact
from statsmodels.sandbox.stats.multicomp import multipletests

# Contingency table and Fisher exact test on the overlap between the 150x (A)
# and 2000x (B) ground truths.

print(len(list(A.index)), len(list(B.index)))
# Keep one row per mutation so that every mutation is counted once.
A = A[~A.index.duplicated()]
B = B[~B.index.duplicated()]
print(len(list(A.index)), len(list(B.index)))

# Presence/absence is taken from index membership rather than from the stored
# VAF values: a mutation whose median VAF is 0.0 or NaN is still part of the
# ground truth and must not be counted as absent.
all_calls = A.index.union(B.index)
res = pd.DataFrame({'150x': all_calls.isin(A.index),
                    '2000x': all_calls.isin(B.index)},
                   index=all_calls)

testres = {}
a, b = '150x', '2000x'
print("############")
print(a, 'VS', b)
print("############")
in_a, in_b = res[a], res[b]
# NOTE(review): 20000 is used as the total number of candidate positions when
# counting mutations found in neither ground truth -- confirm this constant.
n_total = 20000
# Rows: present / absent in b; columns: present / absent in a.
ctable = [[int((in_a & in_b).sum()), int((~in_a & in_b).sum())],
          [int((in_a & ~in_b).sum()), int(n_total - (in_a | in_b).sum())]]

#p_adjusted = multipletests(Column6, method='bonferroni')
odd_ratio, p_value = fisher_exact(ctable, alternative='two-sided')
print('Contingency table:')
print(pd.DataFrame(ctable, columns=[a, '!'+a], index=[b, '!'+b]))
print('Fisher exact test (two-sided): odd_ratio = {}, p_value = {}'.format(odd_ratio, p_value))
testres[a+' VS '+b] = [p_value]

In [None]:
 # Number of rows of `res` flagged True in at least one of the two columns named by `a` and `b`.
 int(res[(res[a]==True) | (res[b]==True)].shape[0])

In [None]:
# List the index labels of the ground-truth rows (after selecting the per-method score columns of the highest-tf sample).
list(calltablesseries[calltablesseries['truth'] == True][['{:.2f}_{}_score'.format(aux['tf'].max(), m) for m in config.methods]].index)

In [None]:
import matplotlib.transforms as transforms

# Per-method bar plot on a log y-axis: calls that are part of the ground truth
# (method colour) with the calls outside the ground truth stacked on top (grey).
patient = mixtureid.split('-')[1].split('_')[0]
print(patient)

# Per-method columns of the sample with the highest tf.
call_cols = ['{:.2f}_{}'.format(aux['tf'].max(), m) for m in config.methods]
gtanalysis = calltablesseries[calltablesseries['truth'] == True][call_cols]
initialanalysis = calltablesseries[calltablesseries['truth'] == False][call_cols]
ngt = gtanalysis.shape[0]
print(ngt)

# Collapse each table to a single row of per-method totals, named by method.
gtanalysis = gtanalysis.sum().to_frame().T
gtanalysis.columns = [name.split('_')[1] for name in gtanalysis.columns]
gtanalysis.index = ['calls in GT']
initialanalysis = initialanalysis.sum().to_frame().T
initialanalysis.columns = [name.split('_')[1] for name in initialanalysis.columns]
initialanalysis.index = ['calls not in GT']

fig, ax = plt.subplots(figsize=(10,5))
bar_colors = [config.colors[config.methods.index(m)] for m in gtanalysis.columns]
# bottom=1 keeps the bars drawable on the log axis.
plt.bar(gtanalysis.columns, gtanalysis.values.flatten(), bottom=1, color=bar_colors, width=1)

# Horizontal line at the ground-truth size, labelled on the y-axis side.
plt.axhline(y=ngt, c='blue', lw='3')
trans = transforms.blended_transform_factory(
    ax.get_yticklabels()[0].get_transform(), ax.transData)
ax.text(0, ngt, "{:.0f}".format(ngt), color="blue", transform=trans, ha="right", va="center")

for col in initialanalysis.columns:
    n_not_gt = initialanalysis[col].values[0]
    n_gt = gtanalysis[col].values[0]
    print(col, n_not_gt, n_gt)
    plt.bar(col, n_not_gt, bottom=n_gt, label=col, color='grey', alpha=0.5, width=1)

# Annotate every bar with its height: the first half of the patches (ground
# truth bars) at a fixed y of 20, the second half just above each bar's base.
n_patches = len(ax.patches)
for pi, p in enumerate(ax.patches):
    print(pi, p)
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    label_y = 20 if pi < (n_patches / 2) else y + height / 10
    ax.text(x + width / 2, label_y, '{:.0f}'.format(height), horizontalalignment='center')

ax.set_yscale('log')
plt.ylabel('# SNV')
ax.grid(axis='y')
plt.ylim([1, 5e5])
ax = plt.gca()
ax.set_xticklabels(labels=gtanalysis.columns, rotation=90)
figure_name = 'gtanalysis_barplot_' + patient + '_SNV.svg'
plt.savefig(os.path.join(*config.outputpath, 'figure2a', figure_name), bbox_inches='tight')

In [None]:
import matplotlib.transforms as transforms

# Seaborn variant of the per-method bar plot of calls inside the ground truth.
# Per-method columns of the sample with the highest tf, split by ground-truth flag.
gtanalysis = calltablesseries[calltablesseries['truth'] == True][['{:.2f}_{}'.format(aux['tf'].max(), m) for m in config.methods]]
initialanalysis = calltablesseries[calltablesseries['truth'] == False][['{:.2f}_{}'.format(aux['tf'].max(), m) for m in config.methods]]
ngt = gtanalysis.shape[0]
print(ngt)
# Collapse each table to a single row of per-method totals, named by method.
gtanalysis = pd.DataFrame(gtanalysis.sum()).T
gtanalysis.columns = [l.split('_')[1] for l in list(gtanalysis.columns)]
gtanalysis.index = ['calls in GT']
initialanalysis = pd.DataFrame(initialanalysis.sum()).T
initialanalysis.columns = [l.split('_')[1] for l in list(initialanalysis.columns)]
initialanalysis.index = ['calls not in GT']
# Two-row table: 'calls in GT' and 'calls not in GT', one column per method.
gtanalysis = pd.concat([gtanalysis, initialanalysis])
#ax = gtanalysis.T.plot(kind='bar', stacked=True, figsize=(12, 5), rot=0, color=[config.colors[config.methods.index(m)] for m in gtanalysis.columns])
# Only the 'calls in GT' row is plotted here.
ax = sns.barplot(x=gtanalysis.columns, y='calls in GT', data=gtanalysis.T) #, figsize=(12, 5), rot=0) #, color=[config.colors[config.methods.index(m)] for m in gtanalysis.columns])
ax.set_yscale('log')
# Label the bars: centred for the first container, at the edge for any other.
for ci, c in enumerate(ax.containers):
    if ci == 0:
        ax.bar_label(c, label_type='center')
    else:
        ax.bar_label(c, label_type='edge')
ax.grid(axis='y')
plt.ylim([1, 5e5])
ax = plt.gca()
ax.set_xticklabels(labels=gtanalysis.columns,rotation=90)
hand, labl = ax.get_legend_handles_labels()
ax.legend(hand, labl, bbox_to_anchor=(1, 1), loc="upper left")
# The string literal below holds a disabled matplotlib version of this plot;
# it is not executed as code.
"""
fig, ax = plt.subplots(figsize=(15,5))
plt.bar(gtanalysis.columns, gtanalysis.values.flatten(), color=[config.colors[config.methods.index(m)] for m in gtanalysis.columns], width=.9)
plt.axhline(y=ngt, c='blue', lw='3')
trans = transforms.blended_transform_factory(
    ax.get_yticklabels()[0].get_transform(), ax.transData)
ax.text(0, ngt, "{:.0f}".format(ngt), color="blue", transform=trans, ha="right", va="center")
for col in initialanalysis.columns:
    print(col, initialanalysis[col].values[0], gtanalysis[col].values[0], bottom=0)
    plt.bar(col, initialanalysis[col].values[0], bottom=gtanalysis[col].values[0], label=col, color='grey', alpha=0.5, width=.9)
ax.set_yscale('log')
plt.ylabel('# SNV')
ax.grid(axis='y')
plt.ylim([1, 5e5])
#plt.savefig(os.path.join(*config.outputpath, 'figure2a', 'gtanalysis_barplot_986_SNV.svg'), bbox_inches='tight')
"""

In [None]:

# Compute the AUPRC, recall, precision and max-F1 curves for the current
# call table, then draw the precision-recall figure.
curve_args = dict(ground_truth_method=gtm, refsample=refsample, muttype=muttype,
                  methods=config.methods, fixedvar=fixedvar, xaxis=xaxis)
results_auprc_df = metric_curve_allchr(
    config, calltablesseries, dilutionseries, mixtureid, metric='auprc', **curve_args)
results_recall_df = metric_curve_allchr(
    config, calltablesseries, dilutionseries, mixtureid, metric='recall', **curve_args)
results_precision_df = metric_curve_allchr(
    config, calltablesseries, dilutionseries, mixtureid, metric='precision', **curve_args)
results_maxf1_df = metric_curve_allchr(
    config, calltablesseries, dilutionseries, mixtureid, metric='maxf1', **curve_args)
# The figure helper receives the upper-cased mutation type and no explicit method list.
figure_curve_allchr(
    config, calltablesseries, dilutionseries, mixtureid, xy='pr', ground_truth_method=gtm,
    refsample=refsample, muttype=muttype.upper(), methods=None, fixedvar=fixedvar, save=save)

# Part II: (4) Load back metric results and plot combined metric plots

In [None]:
# For each fixed variable, reload the saved metric table of the CRC-986
# mixture and plot it with plot_metricsseries.
for fixedvar in fixedvars:
    if fixedvar == 'coverage':
        xaxis = 'tumor burden'
    elif fixedvar == 'ctdna':
        xaxis = 'coverage'

    # Only indels are processed here (the loop over muttypes is disabled).
    mt = 'indel'
    gtm = 4 if mt == 'snv' else 2
    refname = 'inundilutedsamplebyatleast' + str(gtm) + 'callers'
    print(refname)

    # Only the AUPRC metric is loaded (the loop over metrics is disabled).
    metric = 'auprc'
    restables = {'snv': [], 'indel': []}

    # The mixture is fixed to CRC-986, whose results use a tissue-based
    # reference name; this overrides the gtm/refname printed above.
    mixtureid = 'CRC-986_100215-CW-T_CRC-986_300316-CW-T'
    uses_tissue_ref = mixtureid == 'CRC-986_100215-CW-T_CRC-986_300316-CW-T'
    gtm = 3 if uses_tissue_ref else 4
    ref_prefix = 'intissuesamplebyatleast' if uses_tissue_ref else 'inundilutedsamplebyatleast'
    refname = ref_prefix + str(gtm) + 'callers'

    plasmasample = '_'.join(mixtureid.split('_')[:2])
    print(mixtureid, plasmasample)
    # File names abbreviate the 'tumor burden' axis as 'tb'.
    xa = 'tb' if xaxis == 'tumor burden' else xaxis
    print(xa)

    results_file = '{}_{}_{}_{}_fixed{}_{}.csv'.format(mixtureid, mt, metric, refname, fixedvar, xa)
    restable = pd.read_csv(
        os.path.join(*config.mixturefolder, 'mixtures_allchr', 'results', results_file), index_col=0)
    # An older layout stored results under
    # 'mixtures_chr22_wgs/mixtures_chr22_<mixtureid>/results' without the x-axis suffix.
    restable['plasma sample'] = plasmasample
    restables[mt] = pd.concat([restable])

    # Other combinations of allpatients / logscale / save were previously
    # toggled here by hand.
    res1 = plot_metricsseries(
        config, restables, mixtureids, 'all', metric=metric, muttype=mt,
        ground_truth_method='mixture', fixedvar=fixedvar, refname=refname,
        allpatients=True, logscale=False, save=False)
    plt.ylim([0, .8])

In [None]:
# Single-mixture view: load one pre-computed results table and plot the metric
# series for that mixture only (allpatients=False), saving the figure.
# NOTE(review): `mixtureid` is not assigned in this cell; it is whatever value
# an earlier cell left behind -- confirm it is the intended mixture.

# Settings that used to be loop variables (fixedvars / muttypes / metrics).
fixedvar = 'coverage'
xaxis_by_fixedvar = {'coverage': 'tumor burden', 'ctdna': 'coverage'}
if fixedvar in xaxis_by_fixedvar:
    xaxis = xaxis_by_fixedvar[fixedvar]

mt = 'snv'
# Minimum number of agreeing callers used to name the reference call set.
gtm = 5 if mt == 'snv' else 4
refname = 'inundilutedsamplebyatleast' + str(gtm) + 'callers'
print(refname)

metric = 'auprc'  # alternative: 'maxrecallatleast0_03precision'

# Load the results table.
restables = {'snv': [], 'indel': []}
print(mixtureid)
plasmasample = '_'.join(mixtureid.split('_')[:2])
print(mixtureid, plasmasample)
xa = 'tb' if xaxis == 'tumor burden' else xaxis
print(xa)
resultsfile = '_'.join([mixtureid, mt, metric, refname, 'fixed' + fixedvar, xa]) + '.csv'
restable = pd.read_csv(
    os.path.join(*config.mixturefolder, 'mixtures_allchr', 'results', resultsfile),
    index_col=0,
)
restable['plasma sample'] = plasmasample
restables[mt].append(restable)
restables[mt] = pd.concat(restables[mt])

# Other variants of this call pass `mixtureids` instead of `[mixtureid]` and
# toggle allpatients / logscale / save.
res1 = plot_metricsseries(
    config, restables, [mixtureid], 'all',
    metric=metric, muttype=mt, ground_truth_method='mixture',
    fixedvar=fixedvar, refname=refname,
    allpatients=False, logscale=False, save=True,
)

In [None]:
# Display the current contents of `resx`.
# NOTE(review): `resx` is not assigned in the visible cells; presumably it was
# built from res1['x'] (see the commented-out assignment further down) -- confirm.
resx

In [None]:
# Per-mixture pair of values combined below as a weighted average.
# NOTE(review): presumably [tumour fraction of the first sample, tumour
# fraction of the second sample] in percent -- confirm the units.
tfdict = {
    'CRC-1014_180816-CW-T_CRC-1014_090516-CW-T': [40.3, 3.2],
    'CRC-123_310715-CW-T_CRC-123_121115-CW-T': [62.1, 2.2],
    'CRC-986_100215-CW-T_CRC-986_300316-CW-T': [32.6, 0],
}

nmethods = len(config.methods)
for mi, mixtureid in enumerate(mixtureids):
    print(mixtureid)
    # (weight of first value, weight of second value) for each series step.
    seriesorder = [(70, 0), (70, 80), (50, 100), (30, 120), (20, 130), (10, 140), (5, 145)]
    print(seriesorder)
    tfhigh, tflow = tfdict[mixtureid]
    # The first step (70, 0) is skipped; every other step gets the weighted
    # average of the two values, rounded to two decimals.
    esttflist = [
        round((tfhigh*sh + tflow*sl)/(sl+sh), 2)
        for sh, sl in seriesorder[1:]
    ]
    for esttf in esttflist:
        print(esttf)
    # Same x values for every method of this mixture.
    esttflist = [esttflist] * nmethods
    print(esttflist)
    # Overwrite this mixture's block of rows in resx (one row per method).
    resx[mi*nmethods:(mi+1)*nmethods, :] = esttflist
    

In [None]:
# Display `resx` again, e.g. to inspect it after the estimated tumour
# fractions have been written into it by the cell that fills it per mixture.
resx

In [None]:
# Aggregate the curves returned by plot_metricsseries into, for every method,
# [mean of y, std of y, mean of x] across mixtures.
#resx = np.array([rx.values for rx in res1['x']])
resy = np.array([ry.values for ry in res1['y']])
print(resy)

nmethods = len(config.methods)
# Rows are expected mixture after mixture, with one row per method inside each
# mixture (the layout used when resx is filled), so the number of mixtures is
# inferred from the row count instead of being hard-coded to three.
nmixtures = resy.shape[0] // nmethods
if nmixtures < 1:
    raise ValueError(
        'resy has {} rows but {} methods are configured'.format(resy.shape[0], nmethods)
    )

res = {m: [] for m in config.methods}
for mi, m in enumerate(config.methods):
    # Every nmethods-th row starting at mi: the rows of method m, one per mixture.
    methodrows = resy[mi:nmixtures * nmethods:nmethods]
    resmean = np.mean(methodrows, axis=0)
    resstd = np.std(methodrows, axis=0)
    res[m] = [resmean, resstd, resx.mean(axis=0)]

In [None]:
# Display the [mean, std, x] entry of one method. `m` is whatever a previous
# loop left it bound to (the last element of config.methods if the
# aggregation cell ran just before this one).
res[m]

In [None]:
# Plot, for every method, the mean metric across mixtures with error bars
# showing the standard error of the mean in both directions.
color_dict = dict(zip(config.methods, config.colors))
plt.figure(figsize=(15, 10))

# Number of mixtures: resx holds one row per (mixture, method) pair.
nmixtures = resx.shape[0] // len(config.methods)
# Standard error of the mean = std / sqrt(n). The y error previously divided
# by n instead of sqrt(n), which was inconsistent with the x error.
xerr = resx.std(axis=0) / np.sqrt(nmixtures)
for m in config.methods:
    plt.errorbar(res[m][2], res[m][0], xerr=xerr, yerr=res[m][1] / np.sqrt(nmixtures),
                 marker='s', c=color_dict[m], label=m, markersize=15, lw=2, fmt='-o')

ax = plt.gca()
if fixedvar == 'coverage':
    # Show the x axis in decreasing order.
    ax.invert_xaxis()
    xlab = 'tumor burden'
else:
    xlab = 'coverage or added noise'
hand, labl = ax.get_legend_handles_labels()
ax.legend(hand, labl, bbox_to_anchor=(1, 1), loc="upper left")
plt.xlabel(xlab)
plt.ylabel(metric.upper()+' score')
# Turn the grid on explicitly, once: plt.grid() without arguments toggles the
# grid, so calling it twice in this cell switched it back off.
plt.grid(True)
plt.title(metric.upper() + " score for {} calling in chr{} with ref {}".format(mt.upper(), chrom, refname))
plt.ylim([0,0.5])

In [None]:
# Reverse the order of the x values (index 2), the means (index 0) and the
# standard deviations (index 1) of every method.
# NOTE: running this cell a second time restores the previous order.
for m in config.methods:
    for idx in (2, 0, 1):
        res[m][idx] = res[m][idx][::-1]

In [None]:
# Area under each method's mean curve, used to rank the methods.
# The area is accumulated segment by segment as rectangle minus triangle
# (i.e. a trapezoid), with the first segment starting at the point (0, 0).
rankres = []
for m in config.methods:
    xstart = 0
    ystart = 0
    sum_all = []
    for xcur, ycur in zip(res[m][2], res[m][0]):
        width = xcur - xstart
        sum_all.append((width * ycur) - (width * (ycur - ystart)/2))
        xstart = xcur
        ystart = ycur
    area = np.sum(sum_all)
    print(m, area)
    rankres.append(area)

In [None]:
import matplotlib.patches as mpatches

# Bar chart of the per-method areas, largest first.
rankres_sort = np.argsort(rankres)[::-1]
method_order = [config.methods[i] for i in rankres_sort]
rank_order = [rankres[i] for i in rankres_sort]

plt.figure(figsize=(14, 5))
# NOTE(review): the colours are assigned by bar position after sorting (first
# five bars blue, last two red), not by method, so they only match the legend
# if the two cfDNA methods happen to rank last -- verify.
plt.bar(method_order, rank_order, color=['b'] * 5 + ['r'] * 2)
blue_patch = mpatches.Patch(color='b', label='DNA methods')
red_patch = mpatches.Patch(color='r', label='cfDNA methods')
plt.legend(handles=[blue_patch, red_patch])
plt.ylabel('area under AUPRC curve')
plt.title('Caller ranking mixtures with decreasing tumor burden')