# Ground truth analysis

In [2]:
# Imports

%load_ext autoreload
%autoreload 2

import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
import warnings
from tqdm.notebook import tqdm
from sklearn.metrics import precision_recall_curve, f1_score, average_precision_score
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix

# set working directory
if not os.getcwd().endswith('cfdna_snv_benchmark'):
    os.chdir('../')
print('Current working directory: {}'.format(os.getcwd()))

from utils.config import Config
from utils.viz import *
from utils.table import *
from utils.metrics import *
from utils.calltable import *
from utils.calltableseries import *
from utils.groundtruth import *
from utils.metricsseries import *
from utils.venn import *

Current working directory: /Users/hanae/Repositories/cfdna_snv_benchmark


In [3]:
# Config and Display paramaters

config = Config("config/", "config_viz.yaml")
set_display_params(config)
print(config.methods)

paper
['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']


# Get call sets for cfDNA, T1W, M1W on exome calling

In [6]:
T1W_WES_snv_table, _, _ = get_calltable('data/matchedtissue_ultradeep/986_100215_T1-E', config.methods_tissue, save=True, filter='all')
M1W_WES_snv_table, _, _ = get_calltable('data/matchedtissue_ultradeep/986_100215_M1-E', config.methods_tissue, save=True, filter='all')
M2W_WES_snv_table, _, _ = get_calltable('data/matchedtissue_ultradeep/986_100215_M2-E', config.methods_tissue, save=True, filter='all')

986_100215_T1-E
freebayes
mutect2
retrieving mutect2 104 calls with MinAF tags out of 126
strelka2
retrieving strelka2 345 calls with MinAF tags out of 119
vardict
retrieving vardict 0 calls with f0.01;REJECT;REJECT tags out of 197
varscan
# calls before using germline calls from GATK Haplotype: 674 SNV, 121 INDEL, 0 SNP
# calls after using germline calls from GATK Haplotype: 671 SNV, 119 INDEL, 5 SNP
final shape SNV: (671, 30)
final shape INDEL: (119, 30)
final shape SNP: (5, 30)
986_100215_M1-E
freebayes
mutect2
retrieving mutect2 52 calls with MinAF tags out of 174
strelka2
retrieving strelka2 180 calls with MinAF tags out of 176
vardict
retrieving vardict 0 calls with f0.01;REJECT;REJECT tags out of 241
varscan
# calls before using germline calls from GATK Haplotype: 617 SNV, 104 INDEL, 0 SNP
# calls after using germline calls from GATK Haplotype: 612 SNV, 99 INDEL, 10 SNP
final shape SNV: (612, 30)
final shape INDEL: (99, 30)
final shape SNP: (10, 30)
986_100215_M2-E
freebayes
mut

In [None]:
#T1W_WGS_snv_table, _, _ = get_calltable('data/matchedtissue/NCC_CRC-986_100215-T1W', config.methods_tissue, save=True, filter='PASS')
#M1W_WGS_snv_table, _, _ = get_calltable('data/matchedtissue/NCC_CRC-986_100215-M1W', config.methods_tissue, save=True, filter='PASS')
M2W_WGS_snv_table, _, _ = get_calltable('data/matchedtissue/NCC_CRC-986_100215-M2W', ['mutect2', 'strelka2'], save=True, filter='PASS')

NCC_CRC-986_100215-M2W
mutect2


In [3]:
M2W_WGS_snv_table = pd.read_csv('data/matchedtissue/NCC_CRC-986_100215-M2W/calls/NCC_CRC-986_100215-M2W_snv_calls_PASS.csv', index_col=0)
M2W_WGS_snv_table

Unnamed: 0_level_0,chrom,pos,ref,alt,type,freebayes,freebayes_score,vardict,vardict_score,varscan,varscan_score,freebayes_altcov,freebayes_totcov,freebayes_vaf,vardict_altcov,vardict_totcov,vardict_vaf,varscan_altcov,varscan_totcov,varscan_vaf
chrom_pos_ref_alt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
10_10004273_C_A,10,10004273,C,A,SNV,False,,True,0.78177,True,0.826180,,,,0.0,47.0,0.0,0.0,45.0,0.0
10_100081798_A_G,10,100081798,A,G,SNV,True,0.954714,True,0.99998,False,,0.0,49.0,0.0,0.0,45.0,0.0,,,
10_100099019_C_T,10,100099019,C,T,SNV,True,0.960858,False,,True,0.999039,0.0,59.0,0.0,,,,0.0,54.0,0.0
10_100111597_A_C,10,100111597,A,C,SNV,False,,False,,True,0.585060,,,,,,,0.0,30.0,0.0
10_100153430_C_A,10,100153430,C,A,SNV,True,0.895333,True,0.89262,True,0.887120,0.0,41.0,0.0,0.0,35.0,0.0,0.0,36.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Y_9999388_G_T,Y,9999388,G,T,SNV,True,0.964881,False,,False,,0.0,6.0,0.0,,,,,,
Y_9999399_C_G,Y,9999399,C,G,SNV,True,0.964729,False,,False,,0.0,6.0,0.0,,,,,,
Y_9999421_G_A,Y,9999421,G,A,SNV,True,0.963905,False,,False,,0.0,6.0,0.0,,,,,,
Y_9999808_G_T,Y,9999808,G,T,SNV,True,0.908102,False,,False,,0.0,5.0,0.0,,,,,,


In [4]:
# load exome regions
exomebed = pd.read_csv('data/extdata/xgen-exome-research-panel-targets_nochr.bed', sep='\t', header=None)
exomebed.columns = ['chrom', 'startpos', 'endpos', 'a', 'b', 'c']
exomebed = exomebed[['chrom', 'startpos', 'endpos']]
exomebed['chrom'] = exomebed['chrom'].astype(str)

# exome filtering
noncodingregions = []
for pos in tqdm(list(M2W_WGS_snv_table.index)):
    chrom, startpos, ref, alt = pos.split('_')
    if str(chrom) not in exomebed['chrom'].unique().astype(str):
        noncodingregions.append(pos)
    else:
        exomechrom = exomebed[exomebed['chrom'] == str(chrom)]
        count = False
        for ind in list(exomechrom.index):
            # print(exomechrom['endpos'][ind], exomechrom['startpos'][ind])
            if exomechrom['endpos'][ind] >= int(startpos):
                if exomechrom['startpos'][ind] <= int(startpos):
                    count = True
        if not count:
            noncodingregions.append(pos)
        #print(pos, count)

HBox(children=(FloatProgress(value=0.0, max=169581.0), HTML(value='')))




In [8]:
print(len(noncodingregions), M2W_WGS_snv_table.shape[0])

#T1W_exome_snv_table = T1W_WGS_snv_table.drop(noncodingregions)
M2W_exome_snv_table = M2W_WGS_snv_table.drop(noncodingregions)

168158 169581


In [9]:
#T1W_exome_snv_table.to_csv(os.path.join('data/matchedtissue/NCC_CRC-986_100215-T1W', 'calls', 'NCC_CRC-986_100215-T1W_snv_calls_PASS_exome.csv'))
#M1W_exome_snv_table.to_csv(os.path.join('data/matchedtissue/NCC_CRC-986_100215-M1W', 'calls', 'NCC_CRC-986_100215-M1W_snv_calls_PASS_exome.csv'))
M2W_exome_snv_table.to_csv(os.path.join('data/matchedtissue/NCC_CRC-986_100215-M2W', 'calls', 'NCC_CRC-986_100215-M2W_snv_calls_PASS_exome.csv'))

In [None]:
M1W_exome_snv_table = pd.read_csv(os.path.join('data', 'matchedtissue', 'NCC_CRC-986_100215-M1W', 'calls', 'NCC_CRC-986_100215-M1W_snv_calls_PASS_exome.csv'), index_col=0)
M1W_exome_snv_table = M1W_exome_snv_table[M1W_exome_snv_table['chrom'] != 'X']


In [12]:
M2W_exome_snv_table = pd.read_csv(os.path.join('data', 'matchedtissue', 'NCC_CRC-986_100215-M2W', 'calls', 'NCC_CRC-986_100215-M2W_snv_calls_PASS_exome.csv'), index_col=0)
M2W_exome_snv_table = M2W_exome_snv_table[M2W_exome_snv_table['chrom'] != 'X']

In [None]:
T1W_exome_snv_table = pd.read_csv(os.path.join('data', 'matchedtissue', 'NCC_CRC-986_100215-T1W', 'calls', 'NCC_CRC-986_100215-T1W_snv_calls_PASS_exome.csv'), index_col=0)
T1W_exome_snv_table = T1W_exome_snv_table[T1W_exome_snv_table['chrom'] != 'X']
T1W_exome_snv_table

In [24]:
cfdna1_snv_table_list = []
for i in range(1,23):
    cfdna1_chri_snv_table = pd.read_csv(os.path.join('data', 'mixtures', 'mixtures_chr'+str(i), 'mixtures_chr'+str(i)+'_CRC-986_100215-CW-T_CRC-986_300316-CW-T',
                                                'mixture_chr'+str(i)+'_CRC-986_100215-CW-T_70x_CRC-986_300316-CW-T_0x', 'calls',
                                                'mixture_chr'+str(i)+'_CRC-986_100215-CW-T_70x_CRC-986_300316-CW-T_0x_snv_calls_all.csv'), index_col=0)
    cfdna1_snv_table_list.append(cfdna1_chri_snv_table)
cfdna1_snv_table = pd.concat(cfdna1_snv_table_list)
cfdna1_snv_table

Unnamed: 0_level_0,chrom,pos,ref,alt,type,freebayes,freebayes_score,mutect2,mutect2_score,strelka2,...,vardict_vaf,varscan_altcov,varscan_totcov,varscan_vaf,abemus_altcov,abemus_totcov,abemus_vaf,sinvict_altcov,sinvict_totcov,sinvict_vaf
chrom_pos_ref_alt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1_100436166_C_A,1,100436166,C,A,SNV,False,,False,,False,...,0.0,,,,,,,,,
1_100684255_T_A,1,100684255,T,A,SNV,False,,False,,False,...,0.0,,,,,,,,,
1_1007218_G_A,1,1007218,G,A,SNV,False,,False,,False,...,,,,,2.0,91.0,0.021978,,,
1_1007226_A_G,1,1007226,A,G,SNV,False,,False,,False,...,,,,,4.0,73.0,0.054795,,,
1_10166570_G_T,1,10166570,G,T,SNV,False,,False,,False,...,,,,,2.0,76.0,0.026316,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22_51159340_A_C,22,51159340,A,C,SNV,False,,False,,False,...,,,,,2.0,52.0,0.038462,,,
22_51159637_C_T,22,51159637,C,T,SNV,False,,False,,False,...,,,,,2.0,100.0,0.020000,,,
22_51159801_C_T,22,51159801,C,T,SNV,False,,False,,False,...,,,,,2.0,61.0,0.032787,,,
22_51159957_G_A,22,51159957,G,A,SNV,False,,False,,False,...,,,,,2.0,79.0,0.025316,,,


In [25]:
calltabledict = {
    'cfdna1': cfdna1_snv_table,
    'M2W': M2W_exome_snv_table,
}

In [None]:
calltabledict = {
    'cfdna1': cfdna1_snv_table,
    'T1W': T1W_exome_snv_table,
    'M1W': M1W_exome_snv_table,
    'M2W': M2W_exome_snv_table,
}

In [26]:
res =  compare_groundtruth(calltabledict)

1
cfdna1
['abemus' 'freebayes' 'mutect2' 'sinvict' 'strelka2' 'vardict' 'varscan']
27151
1 27151 27151
2 1533 1533
3 375 375
4 159 159
5 92 92
6 40 40
7 15 15
M2W
['freebayes' 'vardict' 'varscan']
1394
1 1394 229
2 318 85
3 71 21
2
cfdna1
['abemus' 'freebayes' 'mutect2' 'sinvict' 'strelka2' 'vardict' 'varscan']
27151
1 27151 1533
2 1533 1533
3 375 375
4 159 159
5 92 92
6 40 40
7 15 15
M2W
['freebayes' 'vardict' 'varscan']
1394
1 1394 105
2 318 63
3 71 19
3
cfdna1
['abemus' 'freebayes' 'mutect2' 'sinvict' 'strelka2' 'vardict' 'varscan']
27151
1 27151 375
2 1533 375
3 375 375
4 159 159
5 92 92
6 40 40
7 15 15
M2W
['freebayes' 'vardict' 'varscan']
1394
1 1394 91
2 318 61
3 71 17
4
cfdna1
['abemus' 'freebayes' 'mutect2' 'sinvict' 'strelka2' 'vardict' 'varscan']
27151
1 27151 159
2 1533 159
3 375 159
4 159 159
5 92 92
6 40 40
7 15 15
M2W
['freebayes' 'vardict' 'varscan']
1394
1 1394 82
2 318 56
3 71 15
5
cfdna1
['abemus' 'freebayes' 'mutect2' 'sinvict' 'strelka2' 'vardict' 'varscan']
27151


In [28]:
calls_cfdna1 = res['cfdna1_4_4']
calls_M2W = res['M2W_1_2']
print(len(calls_M2W))
labels = get_labels([calls_cfdna1, calls_M2W])
print(labels)

318
{'01': '262', '10': '103', '11': '56'}


In [30]:
np.array(calls_M2W).tofile('data/matchedtissue/venndiagram/calls_M2W_atleast2callers.csv', sep = ',')

In [None]:
calls_cfdna1 = res['cfdna1_4_4']
print(len(calls_cfdna1))
calls_T1W = res['T1W_1_3']
print(len(calls_T1W))
calls_M1W = res['M1W_1_3']
print(len(calls_M1W))
labels = get_labels([calls_cfdna1, calls_T1W, calls_M1W])
print(labels)

In [None]:
np.array(calls_cfdna1).tofile('data/matchedtissue/venndiagram/calls_cfdna1_atleast4callers.csv', sep = ',')
np.array(calls_T1W).tofile('data/matchedtissue/venndiagram/calls_T1W_atleast3callers.csv', sep = ',')
np.array(calls_M1W).tofile('data/matchedtissue/venndiagram/calls_M1W_atleast3callers.csv', sep = ',')

In [3]:
calls_cfdna1 = [str(i)[1:-1] for i in list(pd.read_csv("data/matchedtissue/venndiagram/calls_cfdna1_atleast4callers.csv", header=None).values)[0]]
calls_T1W =  [str(i)[1:-1] for i in list(pd.read_csv("data/matchedtissue/venndiagram/calls_T1W_atleast3callers.csv", header=None).values)[0]]
calls_M1W = [str(i)[1:-1] for i in list(pd.read_csv("data/matchedtissue/venndiagram/calls_M1W_atleast3callers.csv", header=None).values)[0]]

# Table to compare call sets

In [23]:
cfdna_df = pd.DataFrame(index=calls_cfdna1)
cfdna_df['cfdna'] = True
T1W_df = pd.DataFrame(index=calls_T1W)
T1W_df['T1W'] = True
M1W_df = pd.DataFrame(index=calls_M1W)
M1W_df['M1W'] = True
res = pd.concat([cfdna_df, T1W_df, M1W_df], axis=1)
res.fillna(False, inplace=True)
res

Unnamed: 0,cfdna,T1W,M1W
1_10687378_T_G,True,False,False
1_11898693_G_A,True,False,False
1_147380601_C_T,True,True,True
1_173878724_G_T,True,True,True
1_183086809_G_A,True,False,False
...,...,...,...
9_114393727_G_A,False,False,True
9_13221458_A_G,False,False,True
9_140009153_G_A,False,False,True
9_43915893_G_C,False,False,True


# Jaccard similarity score of call set pairs

In [30]:
from sklearn.metrics import jaccard_score
print("{:.2f},{:.2f},{:.2f}".format(jaccard_score(res['cfdna'], res['T1W']), jaccard_score(res['cfdna'], res['M1W']), jaccard_score(res['T1W'], res['M1W'])))
print("{:.2f},{:.2f},{:.2f}".format(jaccard_score(res['T1W'], res['cfdna']), jaccard_score(res['M1W'], res['cfdna']), jaccard_score(res['M1W'], res['T1W'])))

0.29,0.30,0.35
0.29,0.30,0.35


# Fisher exact test on call sets pairs

## NB: would need to evaluate number of base pairs with sufficient coverage evaluated. Here, took N = 20K (approx number of protein-coding genes with decent coverage)

In [87]:

from scipy.stats import fisher_exact
from statsmodels.sandbox.stats.multicomp import multipletests

# contingency table and Fisher exact test

testres = {}
for pairs in [('cfdna', 'T1W'), ('cfdna', 'M1W'), ('T1W', 'M1W')]:
    a, b =  pairs
    print("############")
    print(a, 'VS', b)
    print("############")
    ctable = [[res[(res[a]==True) & (res[b]==True)].shape[0], res[(res[a]==False) & (res[b]==True)].shape[0]],
              [res[(res[a]==True) & (res[b]==False)].shape[0], int(20000 - res[(res[a]==True) | (res[b]==True)].shape[0])]]
    
    #p_adjusted = multipletests(Column6, method='bonferroni')
    odd_ratio, p_value = fisher_exact(ctable, alternative='two-sided')
    print('Contingency table:')
    print(pd.DataFrame(ctable, columns=[a+' & '+b, '!'+a+' & '+b], index=[a+' & !'+b, '!'+a+' & !'+b]))
    print('Fisher exact test (two-sided): odd_ratio = {}, p_value = {}'.format(odd_ratio, p_value))
    testres[a+' VS '+b] = [p_value]

testres = pd.DataFrame.from_dict(testres).T
testres.columns = ['p_value']
testres['p_value_adjusted'] = multipletests(testres['p_value'].values, method='bonferroni')[1]
testres[['p_value_adjusted']]

############
cfdna VS T1W
############
Contingency table:
               cfdna & T1W  !cfdna & T1W
cfdna & !T1W            68            73
!cfdna & !T1W           91         19768
Fisher exact test (two-sided): odd_ratio = 202.35194942044257, p_value = 1.0190037260737785e-109
############
cfdna VS M1W
############
Contingency table:
               cfdna & M1W  !cfdna & M1W
cfdna & !M1W            75            95
!cfdna & !M1W           84         19746
Fisher exact test (two-sided): odd_ratio = 185.58270676691728, p_value = 5.148787917418059e-118
############
T1W VS M1W
############
Contingency table:
             T1W & M1W  !T1W & M1W
T1W & !M1W          81          89
!T1W & !M1W         60       19770
Fisher exact test (two-sided): odd_ratio = 299.8820224719101, p_value = 6.450909216876732e-138


Unnamed: 0,p_value_adjusted
cfdna VS T1W,3.057011e-109
cfdna VS M1W,1.544636e-117
T1W VS M1W,1.935273e-137
