# Ground truth analysis

In [None]:
# Imports

%load_ext autoreload
%autoreload 2

import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
import warnings
from tqdm.notebook import tqdm
from sklearn.metrics import precision_recall_curve, f1_score, average_precision_score
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix

# Move one directory up unless the current directory path already ends with 'cfdna_snv_benchmark'
cwd = os.getcwd()
if not cwd.endswith('cfdna_snv_benchmark'):
    os.chdir('../')
print('Current working directory: {}'.format(os.getcwd()))

from utils.config import Config
from utils.viz import *
from utils.table import *
from utils.metrics import *
from utils.calltable import *
from utils.calltableseries import *
from utils.groundtruth import *
from utils.metricsseries import *
from utils.venn import *

In [None]:
# Config and display parameters

config = Config("config/", "config_viz.yaml")
set_display_params(config)

# Show the caller lists that the rest of the notebook selects columns with
for method_list in (config.methods, config.methods_tissue):
    print(method_list)

In [None]:
# Call table for the M1W matched-tissue sample, requested with filter='PASS';
# get_calltable returns three values and only the first one is kept.
M1W_snv_table, _, _ = get_calltable(
    'data/matchedtissue/NCC_CRC-986_100215-M1W',
    config.methods_tissue,
    filter='PASS',
    save=True,
)

In [None]:
# gt1: T1W matched-tissue SNV calls (PASS, exome file); gt2: T1-E ultra-deep SNV calls (all calls)
gt1_path = os.path.join('data', 'matchedtissue', 'NCC_CRC-986_100215-T1W', 'calls',
                        'NCC_CRC-986_100215-T1W_snv_calls_PASS_exome.csv')
gt2_path = os.path.join('data', 'matchedtissue_ultradeep', '986_100215_T1-E', 'calls',
                        '986_100215_T1-E_snv_calls_all.csv')
gt1 = pd.read_csv(gt1_path, index_col=0)
gt2 = pd.read_csv(gt2_path, index_col=0)
print(len(gt1), len(gt2))

In [None]:
# Number of calls present in both tables
gt_shared_calls = set(gt1.index) & set(gt2.index)
len(gt_shared_calls)

In [None]:
# Keep calls whose tissue-caller flag columns sum to at least 3
tissue_caller_cols = ['{}'.format(m) for m in config.methods_tissue]
gt1_support = gt1[tissue_caller_cols].sum(axis=1)
gt2_support = gt2[tissue_caller_cols].sum(axis=1)
gt1filtered = list(gt1.loc[gt1_support >= 3].index)
gt2filtered = list(gt2.loc[gt2_support >= 3].index)
print(len(gt1filtered), len(gt2filtered))

In [None]:
# Number of filtered calls shared by both tables
gt_filtered_shared = set(gt1filtered) & set(gt2filtered)
len(gt_filtered_shared)

# Get call sets for cfDNA, T1W, M1W on exome calling

In [None]:
# T1-E ultra-deep SNV calls (all calls), indexed by the first CSV column
t1_exome_snv_path = os.path.join('data', 'matchedtissue_ultradeep', '986_100215_T1-E', 'calls',
                                 '986_100215_T1-E_snv_calls_all.csv')
T1W_exome_snv_table = pd.read_csv(t1_exome_snv_path, index_col=0)
T1W_exome_snv_table

In [None]:
# T1-E ultra-deep indel calls (all calls), indexed by the first CSV column
t1_exome_indel_path = os.path.join('data', 'matchedtissue_ultradeep', '986_100215_T1-E', 'calls',
                                   '986_100215_T1-E_indel_calls_all.csv')
T1W_exome_indel_table = pd.read_csv(t1_exome_indel_path, index_col=0)
T1W_exome_indel_table

In [None]:
# M1-E ultra-deep SNV calls (all calls), indexed by the first CSV column
m1_exome_snv_path = os.path.join('data', 'matchedtissue_ultradeep', '986_100215_M1-E', 'calls',
                                 '986_100215_M1-E_snv_calls_all.csv')
M1W_exome_snv_table = pd.read_csv(m1_exome_snv_path, index_col=0)
M1W_exome_snv_table

In [None]:
# M1-E ultra-deep indel calls (all calls), indexed by the first CSV column
m1_exome_indel_path = os.path.join('data', 'matchedtissue_ultradeep', '986_100215_M1-E', 'calls',
                                   '986_100215_M1-E_indel_calls_all.csv')
M1W_exome_indel_table = pd.read_csv(m1_exome_indel_path, index_col=0)
M1W_exome_indel_table

In [None]:
# Variants of interest, keyed by a display label and identified by the
# "<chrom>_<pos>_<ref>_<alt>" string used as index of the call tables.
# NOTE(review): 'DPYP' may be a typo for the gene symbol DPYD -- confirm before renaming the label.
mutations = {'APC': '5_112128143_C_T',
            'EGFR:V441G': '7_55227855_T_G',
            'EGFR:S492R': '7_55228007_A_C',
            'PIK3CA': '3_178936092_A_G',
            'SOX9': '17_70119758_C_CGA',
            'TP53': '17_7578394_T_C',
            'DPYP': '1_97915727_G_T',
            'NF1': '17_29663722_C_T',
            'POLD1': '19_50905525_G_A',
            'EPHB2': '1_23222037_T_G'}

# Caller flag columns; their sum is the number of callers reporting the variant
callers = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']

for mutname, mutcoords in mutations.items():
    print("###### "+ mutname + ': ' + mutcoords + ' #######')
    # SOX9 is the only insertion in the list: it is looked up in the indel tables and its
    # alt coverage is reported; every other entry is looked up in the SNV tables (total coverage).
    if mutname != 'SOX9':
        sample_tables = [('T1', T1W_exome_snv_table), ('M1', M1W_exome_snv_table)]
        cov_suffix = '_totcov'
    else:
        # Fix: the M1 lookup previously reused the T1 indel table, so the M1 result mirrored T1
        sample_tables = [('T1', T1W_exome_indel_table), ('M1', M1W_exome_indel_table)]
        cov_suffix = '_altcov'
    for sample, table in sample_tables:
        if mutcoords not in table.index:
            print('Mutation not present in {}'.format(sample))
            continue
        print('Mutation called by {} callers in {}'.format(table[callers].loc[mutcoords].sum(), sample))
        # Per-caller coverage is only reported for T1
        if sample == 'T1':
            print(table[[caller + cov_suffix for caller in callers]].loc[mutcoords])

In [None]:
# Index entries of T1 indel calls located at position 70119758 (any chromosome)
at_pos_70119758 = T1W_exome_indel_table['pos'] == 70119758
T1W_exome_indel_table.loc[at_pos_70119758].index

In [None]:
# PASS SNV calls of the M1W matched-tissue sample, indexed by the first CSV column
m1w_wgs_calls_path = 'data/matchedtissue/NCC_CRC-986_100215-M1W/calls/NCC_CRC-986_100215-M1W_snv_calls_PASS.csv'
M1W_WGS_snv_table = pd.read_csv(m1w_wgs_calls_path, index_col=0)
M1W_WGS_snv_table

In [None]:
# load exome regions (tab-separated, no header; only the first three columns are used)
exomebed = pd.read_csv('data/extdata/xgen-exome-research-panel-targets_nochr.bed', sep='\t', header=None)
exomebed.columns = ['chrom', 'startpos', 'endpos', 'a', 'b', 'c']
exomebed = exomebed[['chrom', 'startpos', 'endpos']].copy()
exomebed['chrom'] = exomebed['chrom'].astype(str)

# Interval bounds per chromosome as NumPy arrays, built once instead of being
# re-derived (and scanned row by row in Python) for every variant.
intervals_by_chrom = {
    chrom: (regions['startpos'].to_numpy(), regions['endpos'].to_numpy())
    for chrom, regions in exomebed.groupby('chrom')
}

# exome filtering: collect the calls that fall in none of the target intervals
# NOTE(review): both interval ends are treated as inclusive, as before. If the BED file uses
# 0-based half-open coordinates and the call positions are 1-based, the lower bound should
# be strict -- confirm.
noncodingregions = []
for pos in tqdm(list(M1W_WGS_snv_table.index)):
    chrom, startpos, ref, alt = pos.split('_')
    position = int(startpos)
    bounds = intervals_by_chrom.get(str(chrom))
    if bounds is None:
        # chromosome absent from the target file
        noncodingregions.append(pos)
        continue
    region_starts, region_ends = bounds
    if not ((region_starts <= position) & (region_ends >= position)).any():
        noncodingregions.append(pos)

In [None]:
# Remove the calls flagged as outside the exome targets
M1W_exome_snv_table = M1W_WGS_snv_table.drop(index=noncodingregions)
#T1W_exome_snv_table.to_csv(os.path.join('data/matchedtissue/NCC_CRC-986_100215-T1W', 'calls', 'NCC_CRC-986_100215-T1W_snv_calls_all_exome.csv'))

In [None]:
# M1W_exome_snv_table.to_csv(os.path.join('data/matchedtissue/NCC_CRC-986_100215-M1W', 'calls', 'NCC_CRC-986_100215-M1W_snv_calls_PASS_exome.csv'))

In [None]:
# Shape of the T1 table once rows with chrom == 'X' are left out
not_chrom_x = T1W_exome_snv_table['chrom'] != 'X'
T1W_exome_snv_table.loc[not_chrom_x].shape

In [None]:
# Column-wise sum of the tissue-caller columns
t1_caller_flags = T1W_exome_snv_table[config.methods_tissue]
t1_caller_flags.sum()

In [None]:
# Reload the saved exome-restricted PASS calls for M1W and leave out rows with chrom == 'X'
m1w_exome_path = os.path.join('data', 'matchedtissue', 'NCC_CRC-986_100215-M1W', 'calls',
                              'NCC_CRC-986_100215-M1W_snv_calls_PASS_exome.csv')
M1W_exome_snv_table = (
    pd.read_csv(m1w_exome_path, index_col=0)
    .loc[lambda table: table['chrom'] != 'X']
)

In [None]:
# Reload the saved exome-restricted PASS calls for T1W and leave out rows with chrom == 'X'
t1w_exome_path = os.path.join('data', 'matchedtissue', 'NCC_CRC-986_100215-T1W', 'calls',
                              'NCC_CRC-986_100215-T1W_snv_calls_PASS_exome.csv')
T1W_exome_snv_table = (
    pd.read_csv(t1w_exome_path, index_col=0)
    .loc[lambda table: table['chrom'] != 'X']
)
T1W_exome_snv_table

In [None]:
# Size of the union of the three call sets
# NOTE(review): cfdna1_snv_table is only assigned in a later cell of this notebook, so this
# cell fails on a fresh top-to-bottom run.
all_calls = set(M1W_exome_snv_table.index).union(T1W_exome_snv_table.index, cfdna1_snv_table.index)
len(all_calls)

In [None]:
# Number of distinct index entries in the cfDNA table
distinct_cfdna1_calls = set(cfdna1_snv_table.index)
len(distinct_cfdna1_calls)

In [None]:
# Keep rows whose five caller flag columns sum to more than zero
caller_flags = cfdna1_snv_table[['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']]
called_by_any = caller_flags.sum(axis=1) > 0
cfdna1_snv_table = cfdna1_snv_table.loc[called_by_any]

In [None]:
# Sum of the 'vardict' column
vardict_flags = cfdna1_snv_table.loc[:, 'vardict']
vardict_flags.sum()

In [None]:
# Load the per-chromosome SNV call tables of the 70x / 0x mixture and stack them.
# Chromosomes 1, 2, 8, 20, 21 and 22 are skipped.
skipped_chroms = {1, 2, 8, 20, 21, 22}
cfdna1_snv_table_list = []
for i in range(1, 23):
    if i in skipped_chroms:
        continue
    chrom_tag = 'chr' + str(i)
    mixture_name = 'mixture_{}_CRC-986_100215-CW-T_70x_CRC-986_300316-CW-T_0x'.format(chrom_tag)
    calls_path = os.path.join('data', 'mixtures', 'mixtures_' + chrom_tag,
                              'mixtures_' + chrom_tag + '_CRC-986_100215-CW-T_CRC-986_300316-CW-T',
                              mixture_name, 'calls', mixture_name + '_snv_calls_all.csv')
    cfdna1_chri_snv_table = pd.read_csv(calls_path, index_col=0)
    cfdna1_snv_table_list.append(cfdna1_chri_snv_table)
cfdna1_snv_table = pd.concat(cfdna1_snv_table_list)
#cfdna1_snv_table.drop(['varnet', 'varnet_score', 'varnet_vaf', 'varnet_altcov', 'varnet_totcov'], axis=1, inplace=True)
#cfdna1_snv_table.drop(['varnetbis', 'varnetbis_score', 'varnetbis_vaf', 'varnetbis_altcov', 'varnetbis_totcov'], axis=1, inplace=True)
# The smurf columns are removed from the stacked table
cfdna1_snv_table = cfdna1_snv_table.drop(columns=['smurf', 'smurf_score', 'smurf_vaf', 'smurf_altcov', 'smurf_totcov'])
cfdna1_snv_table

In [None]:
# Drop chromosomes 1, 2, 8, 20, 21 and 22 from both tissue tables
# (the same chromosomes are skipped when the cfDNA call tables are loaded).
excluded_chroms = [str(chrom) for chrom in [1, 2, 8, 20, 21, 22]]
# Compare on the string form of 'chrom': read_csv can parse the column as integers (or as a mix
# of int and str), in which case comparing against '1', '2', ... would silently keep every row.
T1W_exome_snv_table = T1W_exome_snv_table[~T1W_exome_snv_table['chrom'].astype(str).isin(excluded_chroms)]
M1W_exome_snv_table = M1W_exome_snv_table[~M1W_exome_snv_table['chrom'].astype(str).isin(excluded_chroms)]

In [None]:
# Call tables to compare, keyed by sample label
calltabledict = dict(
    cfdna1=cfdna1_snv_table,
    T1W=T1W_exome_snv_table,
    M1W=M1W_exome_snv_table,
    # M2W=M2W_exome_snv_table,
)

In [None]:
# Alternative mapping of sample label -> call table.
# NOTE(review): T1W_WES_snv_table and M1W_WES_snv_table are not assigned in any cell visible in
# this notebook, and this cell replaces the calltabledict built from the *_exome_snv_table
# tables -- confirm which definition is intended.
calltabledict = {
    'cfdna1': cfdna1_snv_table,
    'T1W': T1W_WES_snv_table,
    'M1W': M1W_WES_snv_table,
}

In [None]:
# Inspect the index of the M1W entry (swap in .columns or the 'cfdna1' entry as needed)
m1w_calltable = calltabledict['M1W']
m1w_calltable.index

In [None]:
# Run the project's compare_groundtruth helper on the call tables; the result is indexed
# with keys such as 'cfdna1_5_5', 'T1W_1_3' and 'M1W_1_3' in the next cell.
res =  compare_groundtruth(calltabledict)

In [None]:
# Pick one call set per sample from the comparison result, then report their sizes
calls_cfdna1, calls_T1W, calls_M1W = (res[key] for key in ('cfdna1_5_5', 'T1W_1_3', 'M1W_1_3'))
for callset in (calls_cfdna1, calls_T1W, calls_M1W):
    print(len(callset))
labels = get_labels([calls_cfdna1, calls_T1W, calls_M1W])
print(labels)

In [None]:
# Three-set Venn diagram of the call sets, drawn by the project's venn3 helper
venn_set_names = ['cfdna', 'T1', 'M1']
venn3(labels, venn_set_names)

In [None]:
# One single-column frame per call set: indexed by its calls, with the column set to True
cfdna_df = pd.DataFrame(index=calls_cfdna1).assign(cfdna=True)
T1W_df = pd.DataFrame(index=calls_T1W).assign(T1W=True)
M1W_df = pd.DataFrame(index=calls_M1W).assign(M1W=True)

# Align the three frames on their index; a call missing from a set gets False
res1 = pd.concat([cfdna_df, T1W_df, M1W_df], axis=1).fillna(False)

res1

In [None]:

from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

# Contingency table and Fisher exact test for every pair of call sets in res1.
# N_UNIVERSE is the assumed total number of candidate sites; it only enters through the
# "in neither set" cell of each table.
N_UNIVERSE = 20000

testres = {}
for pairs in [('cfdna', 'T1W'), ('cfdna', 'M1W'), ('T1W', 'M1W')]:
    a, b =  pairs
    print("############")
    print(a, 'VS', b)
    print("############")
    in_a = res1[a] == True
    in_b = res1[b] == True
    # Rows: membership in b (yes / no); columns: membership in a (yes / no)
    ctable = [[int((in_a & in_b).sum()), int((~in_a & in_b).sum())],
              [int((in_a & ~in_b).sum()), int(N_UNIVERSE - (in_a | in_b).sum())]]

    odd_ratio, p_value = fisher_exact(ctable, alternative='two-sided')
    print('Contingency table:')
    # Fix: the labels now match the cell layout above (they used to describe individual cells
    # and were attached to the wrong row)
    print(pd.DataFrame(ctable, columns=[a, '!'+a], index=[b, '!'+b]))
    print('Fisher exact test (two-sided): odd_ratio = {}, p_value = {}'.format(odd_ratio, p_value))
    testres[a+' VS '+b] = [p_value]

# Bonferroni correction across the three pairwise tests
testres = pd.DataFrame.from_dict(testres).T
testres.columns = ['p_value']
testres['p_value_adjusted'] = multipletests(testres['p_value'].values, method='bonferroni')[1]
testres[['p_value_adjusted']]

In [None]:
# Calls found in cfDNA but in neither tissue call set
cfdna_not_in_t1w = set(calls_cfdna1) - set(calls_T1W)
cfdnaonly = list(cfdna_not_in_t1w - set(calls_M1W))
len(cfdnaonly)

In [None]:
# List the columns available in the cfDNA call table
cfdna1_snv_table.columns

In [None]:
# Column-wise sum of the caller columns over the cfDNA-only calls.
# Only callers with a column in the table are selected: the 'smurf' columns are dropped when the
# table is loaded, so a config.methods list that still names such a caller would raise a KeyError.
available_methods = [m for m in config.methods if m in cfdna1_snv_table.columns]
cfdna1_snv_table.loc[cfdnaonly, available_methods].sum()

In [None]:
# Caller columns for the cfDNA-only calls (append .sum(axis=1) for the per-call total).
# Only callers with a column in the table are selected: the 'smurf' columns are dropped when the
# table is loaded, so a config.methods list that still names such a caller would raise a KeyError.
available_methods = [m for m in config.methods if m in cfdna1_snv_table.columns]
cfdna1_snv_table.loc[cfdnaonly, available_methods]

In [None]:
#np.array(calls_cfdna1).tofile('data/matchedtissue/venndiagram/calls_cfdna1_atleast5callers_withvarnet.csv', sep = ',')
# Write the tissue call sets as comma-separated text files
for callset, outfile in [
    (calls_T1W, 'data/matchedtissue/venndiagram/calls_T1W_atleast3callers_withvarnet.csv'),
    (calls_M1W, 'data/matchedtissue/venndiagram/calls_M1W_atleast3callers_withvarnet.csv'),
]:
    np.array(callset).tofile(outfile, sep=',')

In [None]:
# Reload exported call sets: the first row of each header-less CSV is read and the first and last
# character of every field are stripped (presumably surrounding quote characters -- confirm).
venn_dir = "data/matchedtissue/venndiagram/"
cfdna1_fields = pd.read_csv(venn_dir + "calls_cfdna1_atleast4callers.csv", header=None).values[0]
T1W_fields = pd.read_csv(venn_dir + "calls_T1W_atleast3callers.csv", header=None).values[0]
M1W_fields = pd.read_csv(venn_dir + "calls_M1W_atleast3callers.csv", header=None).values[0]
calls_cfdna1 = [str(field)[1:-1] for field in cfdna1_fields]
calls_T1W = [str(field)[1:-1] for field in T1W_fields]
calls_M1W = [str(field)[1:-1] for field in M1W_fields]

# Table to compare call sets

In [None]:
# Membership table: one row per call, one column per call set
cfdna_df = pd.DataFrame(index=calls_cfdna1).assign(cfdna=True)
T1W_df = pd.DataFrame(index=calls_T1W).assign(T1W=True)
M1W_df = pd.DataFrame(index=calls_M1W).assign(M1W=True)
# Calls absent from a set come out of the concat as NaN and are set to False
res = pd.concat([cfdna_df, T1W_df, M1W_df], axis=1).fillna(False)
res

# Jaccard similarity score of call set pairs

In [None]:
from sklearn.metrics import jaccard_score

# Jaccard score of each pair of membership columns, printed in both argument orders
callset_pairs = [('cfdna', 'T1W'), ('cfdna', 'M1W'), ('T1W', 'M1W')]
print(",".join("{:.2f}".format(jaccard_score(res[first], res[second])) for first, second in callset_pairs))
print(",".join("{:.2f}".format(jaccard_score(res[second], res[first])) for first, second in callset_pairs))

# Fisher exact test on pairs of call sets

## NB: the total N should ideally be the number of base pairs evaluated with sufficient coverage. Here N = 20K is used instead (approximate number of protein-coding genes with decent coverage)

In [None]:

from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

# Contingency table and Fisher exact test for every pair of call sets in res.
# N_UNIVERSE is the assumed total number of candidate sites (N = 20K, see the note above this
# cell); it only enters through the "in neither set" cell of each table.
N_UNIVERSE = 20000

testres = {}
for pairs in [('cfdna', 'T1W'), ('cfdna', 'M1W'), ('T1W', 'M1W')]:
    a, b =  pairs
    print("############")
    print(a, 'VS', b)
    print("############")
    in_a = res[a] == True
    in_b = res[b] == True
    # Rows: membership in b (yes / no); columns: membership in a (yes / no)
    ctable = [[int((in_a & in_b).sum()), int((~in_a & in_b).sum())],
              [int((in_a & ~in_b).sum()), int(N_UNIVERSE - (in_a | in_b).sum())]]

    odd_ratio, p_value = fisher_exact(ctable, alternative='two-sided')
    print('Contingency table:')
    # Fix: the labels now match the cell layout above (they used to describe individual cells
    # and were attached to the wrong row)
    print(pd.DataFrame(ctable, columns=[a, '!'+a], index=[b, '!'+b]))
    print('Fisher exact test (two-sided): odd_ratio = {}, p_value = {}'.format(odd_ratio, p_value))
    testres[a+' VS '+b] = [p_value]

# Bonferroni correction across the three pairwise tests
testres = pd.DataFrame.from_dict(testres).T
testres.columns = ['p_value']
testres['p_value_adjusted'] = multipletests(testres['p_value'].values, method='bonferroni')[1]
testres[['p_value_adjusted']]

# 150x and 2000x ground truths

In [None]:
# A: 150x ground-truth file, B: 2000x ground-truth file (both indexed by the first CSV column)
gt_150x_path = 'figures/figure2b/gt_986_exome_150x_atleast5callersinundilutedsample_snv.csv'
gt_2000x_path = 'figures/figure2b/gt_986_exome_2000x_atleast5callersinundilutedsample_snv.csv'
A = pd.read_csv(gt_150x_path, index_col=0)
B = pd.read_csv(gt_2000x_path, index_col=0)
print(len(A), len(B))

In [None]:
# Calls present in both A and B
shared_calls = set(A.index) & set(B.index)
ab = list(set(shared_calls))
len(ab)

In [None]:
# Side-by-side values of the shared calls in A and B, shown as a 2D histogram
shared_in_A, shared_in_B = A.loc[ab], B.loc[ab]
comp = pd.concat([shared_in_A, shared_in_B], axis=1)
comp.columns = ['vaf 150x', 'vaf 2000x']
plt.figure(figsize=(10, 10))
sns.histplot(data=comp, x='vaf 150x', y='vaf 2000x', binwidth=0.01, binrange=[0, 0.5], alpha=1)
#comp.plot(x='A', y='B', kind='scatter', color='b')

In [None]:
# Calls of A that are not shared with B (defined here so the cell does not depend on a later cell)
Awithoutab = A.loc[list(set(A.index) - set(ab))]

# NOTE(review): `calltablesseries` and `aux` are not assigned in any cell visible in this
# notebook; they must exist in the kernel for this cell to run -- confirm where they come from.
max_tf = aux['tf'].max()
# VAF columns at the highest 'tf' value, for every method except smurf that has a
# '<tf>_<method>' column in calltablesseries
vaf_columns = ['{:.2f}_{}_vaf'.format(max_tf, m) for m in config.methods
               if ('{:.2f}_{}'.format(max_tf, m) in calltablesseries.columns) and (m != 'smurf')]
rows_in_series = [a for a in list(Awithoutab.index) if a in calltablesseries.index]
# Row-wise median over those columns
ac = calltablesseries.loc[rows_in_series][vaf_columns].median(skipna=True, axis=1)
print(ac.shape)

# Fix: `ac` is now computed before it is concatenated (it used to be referenced before assignment)
comp = pd.concat([Awithoutab, ac], axis=1)


In [None]:
# Histogram of median_vaf for the calls of A that are not shared with B
a_only_calls = set(A.index) - set(ab)
Awithoutab = A.loc[list(a_only_calls)]
sns.histplot(data=Awithoutab, x='median_vaf')

In [None]:
# Overlaid median_vaf histograms: calls only in B, calls shared by A and B, calls only in A
plt.figure(figsize=(20,5))
Bwithoutab = B.loc[list(set(B.index) - set(ab))]
sns.histplot(x='median_vaf', data=Bwithoutab, binwidth=0.01, binrange=[0,1], alpha=0.5, label='2000x only')
# Fix: the shared calls are plotted straight from B. The former x='B' is not a column of `comp`
# (its columns are renamed to 'vaf 150x' / 'vaf 2000x'), so seaborn could not resolve it.
sns.histplot(x='median_vaf', data=B.loc[ab], color='red', binwidth=0.01, binrange=[0,1], alpha=0.5, label='both')
Awithoutab = A.loc[list(set(A.index) - set(ab))]
sns.histplot(x='median_vaf', data=Awithoutab, color='green', binwidth=0.01, binrange=[0,1], alpha=0.5, label='150x only')
plt.legend()

In [None]:
# Index entries of the calls found only in A
Awithoutab.index.tolist()