# cfDNA mixture series on a single chromosome

In [None]:
# Imports

%load_ext autoreload
%autoreload 2

import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
import warnings
from sklearn.metrics import precision_recall_curve, f1_score, average_precision_score
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix

# set working directory: step up one level unless the current directory
# is already the 'cfdna_snv_benchmark' folder
already_at_root = os.getcwd().endswith('cfdna_snv_benchmark')
if not already_at_root:
    os.chdir('../')
print('Current working directory: {}'.format(os.getcwd()))

from utils.config import Config
from utils.viz import *
from utils.table import *
from utils.metrics import *
from utils.calltable import *
from utils.venn import venn6, get_labels

In [None]:
# Config and display parameters

# Build the project configuration from the "config/" folder and "config_viz.yaml"
# (argument semantics are defined by utils.config.Config)
config = Config("config/", "config_viz.yaml")
# Apply display settings; helper comes from one of the utils star-imports (presumably utils.viz — confirm)
set_display_params(config)
# Show the methods declared in the configuration
print(config.methods)

In [None]:
# Chromosome, mutation type and mixture under study

chrom = '22'
muttype = 'snv'
mixtureid = 'CRC-1014_180816-CW-T_CRC-1014_090516-CW-T'
# Alternative mixtures:
#mixtureid = 'CRC-986_100215-CW-T_CRC-986_300316-CW-T'
#mixtureid = 'CRC-123_310715-CW-T_CRC-123_121115-CW-T'

# The mixture id is a '_'-separated string: its first two fields name the
# plasma sample, the remaining fields name the healthy sample.
_idfields = mixtureid.split('_')
plasmasample = '_'.join(_idfields[:2])
print(plasmasample)
healthysample = '_'.join(_idfields[2:])
print(healthysample)

In [None]:
# Run parameters used by the cells below
reload = False  # when True, call tables are rebuilt even if their CSV files already exist
save = True  # forwarded as `save=` to metric_curve / figure_curve
fixedvar = 'coverage'  # selects the dilution series: 'coverage' or 'ctdna'
filterparam = 'all'  # part of the call-table file names; also passed as `filter=` when tables are built

# Load call table

In [None]:
# Save the per-sample call tables if they do not exist yet, then load them.
# `calltables` collects, for every sample directory of the mixture series:
#   sampleid : directory name
#   tf       : value read from estimated_tf_chr<chrom>*.txt, multiplied by 100 and rounded to 4 decimals
#   cov      : value read from coverage_chr<chrom>*.txt, rounded to 4 decimals
#   snv / indel / snp : the corresponding call table (DataFrame)

calltables = {'sampleid': [], 'tf': [], 'cov': [], 'snv': [], 'indel': [], 'snp': []}
mixturefolder = os.path.join(*config.mixturefolder, 'mixtures_chr'+chrom, 'mixtures_chr'+chrom+'_'+mixtureid)
# Only directory entries whose name ends in 'x' or 'T' are treated as samples
for mixturepath in [d for d in os.listdir(mixturefolder) if d.endswith(('x', 'T'))]:
    print(mixturepath)
    sampledir = os.path.join(mixturefolder, mixturepath)
    callfiles = {
        mt: os.path.join(sampledir, 'calls', mixturepath+'_'+mt+'_calls_'+filterparam+'.csv')
        for mt in ['snv', 'indel', 'snp']
    }
    if not os.path.exists(callfiles['snv']) or reload:
        # NOTE(review): the original line had no callee (a syntax error).
        # `get_calltable` is assumed to be the helper exported by utils.calltable — confirm the name.
        calltable_snv, calltable_indel, calltable_snp = get_calltable(sampledir, config.methods, save=True, filter=filterparam)
    # Part of the directory name after the 'mixture_chr<chrom>' prefix; reused in the tf / coverage file names
    suffix = mixturepath[len('mixture_chr'+chrom):]
    calltables['sampleid'].append(mixturepath)
    # Both text files are parsed with read_csv and the value is taken from the first header field
    tffile = os.path.join(sampledir, 'estimated_tf_chr'+chrom+suffix+'.txt')
    calltables['tf'].append(np.round(100*float(pd.read_csv(tffile).columns[0]), 4))
    covfile = os.path.join(sampledir, 'coverage_chr'+chrom+suffix+'.txt')
    calltables['cov'].append(np.round(float(pd.read_csv(covfile).columns[0]), 4))
    # Always reload from disk so that freshly built and cached tables go through the same path
    calltable_snv = pd.read_csv(callfiles['snv'], index_col=0)
    calltable_indel = pd.read_csv(callfiles['indel'], index_col=0)
    calltable_snp = pd.read_csv(callfiles['snp'], index_col=0)
    calltables['snv'].append(calltable_snv)
    calltables['indel'].append(calltable_indel)
    calltables['snp'].append(calltable_snp)
calltables.keys()

In [None]:
# Merge the per-sample call tables into one table per mutation type (one block of
# columns per sample, prefixed with the sample's tf) and cache it as CSV.
calldir = os.path.join(mixturefolder, 'calls')
# Create the output folder once; exist_ok makes re-running the cell harmless
os.makedirs(calldir, exist_ok=True)
keycols = ['chrom', 'pos', 'ref', 'alt', 'type']
for mt in ['snv', 'indel', 'snp']:
    seriespath = os.path.join(calldir, mixtureid+'_'+mt+'_calls_'+filterparam+'.csv')
    if not os.path.exists(seriespath) or reload:
        # Rename the columns of every sample table in place: <tf>_<method>[_suffix]
        for ci, csnv in enumerate(calltables[mt]):
            tfprefix = '{:.2f}'.format(calltables['tf'][ci])
            cols = list(keycols)
            for m in config.methods:
                cols.append('{}_{}'.format(tfprefix, m))
                cols.append('{}_{}_score'.format(tfprefix, m))
            for m in config.methods:
                cols.append('{}_{}_altcov'.format(tfprefix, m))
                cols.append('{}_{}_totcov'.format(tfprefix, m))
                cols.append('{}_{}_vaf'.format(tfprefix, m))
            csnv.columns = cols
        # ensure no duplicated index (checked on the first table only; expected output is 0)
        firsttable = calltables[mt][0]
        print(firsttable.loc[firsttable.index[firsttable.index.duplicated(keep=False)]].shape[0])
        # get call series: align all samples on the mutation key columns
        calltablesseries = pd.concat([ct.set_index(keycols) for ct in calltables[mt]], axis=1)
        calltablesseries.reset_index(inplace=True)
        # Row label of the merged table: '<chrom>_<pos>_<ref>_<alt>'
        calltablesseries['chrom_pos_ref_alt'] = calltablesseries['chrom'].astype('str').str.cat(calltablesseries['pos'].astype('str'), sep="_").str.cat(calltablesseries['ref'].astype('str'), sep='_').str.cat(calltablesseries['alt'].astype('str'), sep='_')
        calltablesseries.set_index('chrom_pos_ref_alt', inplace=True)
        print(calltablesseries.shape)
        calltablesseries.to_csv(seriespath)

# Load the cached series for the mutation type under study
calltablesseries = pd.read_csv(os.path.join(calldir, mixtureid+'_'+muttype+'_calls_'+filterparam+'.csv'), index_col=0)
calltablesseries.head()

# Generate ground truth

## Approach 1: optimal threshold for each method + consensus

In [None]:
# Pseudo ground truth by consensus: a mutation is true when at least `nref`
# reference methods call it in the sample with the highest tf.
calltablesseries['truth'] = False
refmethods = list(np.copy(config.methods))
print(refmethods)

# Column names of the highest-tf sample (used as the undiluted reference)
tfmax = max(calltables['tf'])
callcols = ['{:.2f}_{}'.format(tfmax, m) for m in refmethods]
scorecols = ['{:.2f}_{}_score'.format(tfmax, m) for m in refmethods]

ncallsinundiluted = calltablesseries[callcols].sum(axis=0)
callsinundiluted = calltablesseries[scorecols]
print(ncallsinundiluted)
# Reshape to long format: one row per (mutation, method) with columns 'method' and 'score'
callsinundiluted.columns = refmethods
callsinundiluted = callsinundiluted.stack().reset_index(level=0, drop=False).reset_index()
callsinundiluted.set_index('chrom_pos_ref_alt', inplace=True)
callsinundiluted.columns = ['method', 'score']
# One score histogram per method
for m in refmethods:
    plt.figure()
    sns.histplot(callsinundiluted[callsinundiluted['method'] == m], x='score', stat="probability", color=config.colors[config.methods.index(m)], binwidth=0.01)
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.title(m)

# pseudo ground truth = mutations found by at least k callers
nref_by_muttype = {'snv': 5, 'indel': 3}
if muttype not in nref_by_muttype:
    # Previously `nref` was simply left undefined here, failing later with a NameError
    raise ValueError("no consensus threshold defined for muttype '{}'".format(muttype))
nref = nref_by_muttype[muttype]
truthpos = list(calltablesseries[calltablesseries[callcols].sum(axis=1) >= nref].index)
calltablesseries.loc[truthpos, 'truth'] = True
calltablesseries['truth'].value_counts()

## Approach 2: rank mutations

In [None]:
# Pseudo ground truth by ranking: build a per-mutation meta score from the
# method scores in the highest-tf sample, weighted by how many calls each method makes.

# Reference methods: all configured methods, minus abemus and cfsnv for indels
refmethods = list(np.copy(config.methods))
if muttype == 'indel':
    refmethods.remove('abemus')
    refmethods.remove('cfsnv')
#elif muttype == 'snv': refmethods.remove('cfsnv')  # alternative kept from the original notebook

# Column names of the highest-tf sample (used as the undiluted reference)
tfmax = max(calltables['tf'])
callcols = ['{:.2f}_{}'.format(tfmax, m) for m in refmethods]
scorecols = ['{:.2f}_{}_score'.format(tfmax, m) for m in refmethods]

# pseudo ground truth = best K mutations found by each caller
# number mutations found by each method
ncallsinundiluted = calltablesseries[callcols].sum(axis=0)

# --- Score distribution per method, shifted so that each method's mean score is 0.5
callsinundiluted = calltablesseries[scorecols]
callsinundiluted = (callsinundiluted - callsinundiluted.mean()) + 0.5
# Alternative normalisations tried:
#callsinundiluted = ((callsinundiluted - callsinundiluted.mean()) / callsinundiluted.std())
#callsinundiluted = (callsinundiluted - callsinundiluted.min()) / (callsinundiluted.max() - callsinundiluted.min())
print(ncallsinundiluted)
# Reshape to long format: one row per (mutation, method) with columns 'method' and 'score'
callsinundiluted.columns = refmethods
callsinundiluted = callsinundiluted.stack().reset_index(level=0, drop=False).reset_index()
callsinundiluted.set_index('chrom_pos_ref_alt', inplace=True)
callsinundiluted.columns = ['method', 'score']
for m in refmethods:
    plt.figure()
    sns.histplot(callsinundiluted[callsinundiluted['method'] == m], x='score', stat="probability", color=config.colors[config.methods.index(m)])
    plt.title(m)

# --- Number of calls per method, on a linear and on a log scale
print(ncallsinundiluted)
plt.figure(figsize=(15,10))
plt.bar(refmethods, ncallsinundiluted)

plt.figure(figsize=(15,10))
plt.bar(refmethods, np.log(ncallsinundiluted.values.astype(float)))

# --- Method weights: max(count)/count rescaled by its maximum, i.e. min(count)/count,
# so the method with the fewest calls gets weight 1
ncallsinundiluted = ncallsinundiluted.max()/ncallsinundiluted
ncallsinundiluted = ncallsinundiluted/ncallsinundiluted.max()
print(ncallsinundiluted.shape)
print(ncallsinundiluted)

# --- Meta score per mutation
# Explicit copy: the columns are modified below and must not alias calltablesseries
callsinundiluted = calltablesseries[scorecols].copy()
for scorecol, callcol in zip(scorecols, callcols):
    # NOTE(review): `*=` multiplies the score by itself, so this is weight * score**2.
    # Kept as in the original; confirm that squaring is intended.
    callsinundiluted[scorecol] *= callsinundiluted[scorecol] * ncallsinundiluted.loc[callcol]
callsinundiluted = callsinundiluted.fillna(0)
callsinundiluted['score'] = callsinundiluted[scorecols].sum(axis=1) / len(refmethods)
print(callsinundiluted['score'].describe())

# Mutations whose meta score exceeds 1/len(refmethods)
abovethreshold = callsinundiluted[callsinundiluted['score'] > 1/(len(refmethods))]
print(abovethreshold.shape)

plt.figure()
sns.histplot(abovethreshold, x='score', stat='probability')
plt.title('meta score')

print(abovethreshold.describe())

callsinundiluted.head()

In [None]:
# Label as true every mutation whose meta score exceeds 1/len(config.methods).
# NOTE(review): the score itself is averaged over len(refmethods); the two lengths
# differ when methods were removed from refmethods — confirm which one is intended.
calltablesseries['truth'] = False
isabovethreshold = callsinundiluted['score'] > 1/(len(config.methods))
truthpos = list(callsinundiluted[isabovethreshold].index)
calltablesseries.loc[truthpos, 'truth'] = True

# For the true mutations: distribution of the number of methods calling them in the highest-tf sample
undilutedcallcols = ['{:.2f}_{}'.format(max(calltables['tf']), m) for m in config.methods]
truecalls = calltablesseries[calltablesseries['truth'] == True][undilutedcallcols]
print(truecalls.sum(axis=1).value_counts())

calltablesseries['truth'].value_counts()

# Plots

In [None]:
# Dilution series, given as a list of 2-tuples; the list depends on which variable is held fixed
if fixedvar == 'coverage':
    dilutionseries = [(70,0), (70, 80), (50, 100), (30, 120), (20, 130), (10, 140), (5, 145)]
elif fixedvar == 'ctdna':
    dilutionseries = [(70,0), (70, 30), (70, 80), (70, 130), (70, 180)]

# Keyword arguments shared by the three metric_curve calls; only `metric` differs
curvekwargs = dict(
    ground_truth_method='ranked',
    refsample='undilutedranked',
    muttype=muttype,
    chrom=chrom,
    methods=config.methods,
    fixedvar=fixedvar,
    save=save,
)
curveargs = (config, calltablesseries, plasmasample, healthysample, dilutionseries)

results_auprc_df = metric_curve(*curveargs, metric='auprc', **curvekwargs)
results_recall_df = metric_curve(*curveargs, metric='recall', **curvekwargs)
results_precision_df = metric_curve(*curveargs, metric='precision', **curvekwargs)


In [None]:
# Precision-recall figure for the dilution series, with the ranked pseudo ground truth.
# `chrom` is taken from the notebook parameter instead of a hard-coded '22', so the
# figure always matches the chromosome used for the tables and the metric curves.
figure_curve(config, calltablesseries, plasmasample, healthysample, dilutionseries, xy='pr', ground_truth_method='ranked',
             refsample='undilutedranked', muttype=muttype.upper(), chrom=chrom, methods=None, fixedvar=fixedvar, save=save)

# Confusion matrix

# Call set similarity: Jaccard Index

In [None]:

# TODO: compute the Jaccard index between call sets.
# The original stub was not valid Python (a `for` statement without target or iterable)
# and referenced names that are never defined in this notebook (`sklearn`, `y_true`, `y_pred`).
# `jaccard_similarity_score` has also been removed from scikit-learn; `jaccard_score` replaces it.
# Intended call, kept for reference until y_true / y_pred are defined:
#   from sklearn.metrics import jaccard_score
#   jaccard_score(y_true, y_pred)

# Change in prediction with dilution

# True Positive, False Negative and False Positive Mutations

# Stacked TP, FN, FP, TN

# Number of mutations

# Ratio performance attenuation