# cfDNA mixture series across all chromosomes

In [3]:
# Imports

%load_ext autoreload
%autoreload 2

import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
import warnings
from sklearn.metrics import precision_recall_curve, f1_score, average_precision_score
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix

# Step up one directory unless the current path already ends with the repository name
if not os.getcwd().endswith('cfdna_snv_benchmark'):
    os.chdir('../')
print(f'Current working directory: {os.getcwd()}')

from utils.config import Config
from utils.viz import *
from utils.table import *
from utils.metrics import *
from utils.calltable import *
from utils.calltableseries import *
from utils.venn import venn6, get_labels

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Current working directory: /Users/hanae/Repositories/cfdna_snv_benchmark


In [4]:
# Config and display parameters

# Project configuration object built from "config/" and "config_viz.yaml"
# (presumably a directory and a YAML file name — confirm against utils.config.Config)
config = Config("config/", "config_viz.yaml")
# Hand the configuration to the display helper that comes from the utils star-imports
set_display_params(config)
# Show the list of methods stored in the configuration
print(config.methods)

paper
['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'abemus', 'sinvict']


In [5]:
# Mixture and chromosome selection

muttype = 'snv'
# Other mixtures that can be swapped in:
#mixtureid = 'CRC-1014_180816-CW-T_CRC-1014_090516-CW-T'
#mixtureid = 'CRC-123_310715-CW-T_CRC-123_121115-CW-T'
mixtureid = 'CRC-986_100215-CW-T_CRC-986_300316-CW-T'

# The mixture id is made of '_'-separated fields: the first two form one sample
# name, the remaining ones form the other sample name.
idfields = mixtureid.split('_')
plasmasample = '_'.join(idfields[:2])
print(plasmasample)
healthysample = '_'.join(idfields[2:])
print(healthysample)

# Chromosomes 8 and 17 are left out for the CRC-1014 mixture; every other
# mixture uses the 'all' setting.
if mixtureid == 'CRC-1014_180816-CW-T_CRC-1014_090516-CW-T':
    chrom = [str(c) for c in range(1, 23) if c not in (8, 17)]
else:
    chrom = 'all'

CRC-986_100215-CW-T
CRC-986_300316-CW-T


In [6]:
# Run options shared by the cells below.
# reload / save are forwarded to the loading helper (save also to the plotting helpers).
reload, save = False, True
# fixedvar selects which dilution series is analysed; filterparam is forwarded to the loader.
fixedvar, filterparam = 'coverage', 'all'

# Load call table

In [7]:
# Container with one call table per mutation type plus sample-level fields.
# 'cov' is never filled in by this cell and stays an empty list.
calltables = {'sampleid': [], 'tf': [], 'cov': [], 'snv': [], 'indel': [], 'snp': []}

# Load the three call tables with identical options. `aux` is overwritten on
# every iteration, so it ends up holding the value returned for 'snp'.
for mt in ('snv', 'indel', 'snp'):
    calltables[mt], aux = get_calltableseries(
        config, mixtureid, chrom, muttype=mt, filterparam=filterparam, reload=reload, save=save)
calltable_snv, calltable_indel, calltable_snp = calltables['snv'], calltables['indel'], calltables['snp']
print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)

calltables['sampleid'] = mixtureid
# Unique first '_'-separated field of the SNV column names, minus the last five
# entries, read as floats (presumably the last five are the non-numeric
# prefixes — TODO confirm against the call table layout).
columnprefixes = np.unique([name.split('_')[0] for name in calltable_snv.columns])
calltables['tf'] = columnprefixes[:-5].astype(float)
print(aux)
calltables.keys()

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']


KeyboardInterrupt: 

In [None]:
# Choice of the dilution series to analyse

# Each pair gives the two numbers that appear (followed by 'x') after the first
# and the second sample name in the mixture labels built below.
if fixedvar == 'coverage':
    seriesorder = [(70, 0), (70, 80), (50, 100), (30, 120), (20, 130), (10, 140), (5, 145)]
elif fixedvar == 'ctdna':
    seriesorder = [(70, 0), (70, 30), (70, 80), (70, 130), (70, 180)]
else:
    # Fail loudly rather than hit a NameError or silently reuse a `seriesorder`
    # left in the kernel by an earlier run.
    raise ValueError("fixedvar must be 'coverage' or 'ctdna', got {!r}".format(fixedvar))

calltablesseries = calltables[muttype]

# Labels follow 'mixture_<sample1>_<n1>x_<sample2>_<n2>x', where each sample
# name is made of two '_'-separated fields of the mixture id.
idfields = mixtureid.split('_')
firstsample = '_'.join(idfields[:2])
secondsample = '_'.join(idfields[2:4])
mixturenames = ['mixture_{}_{}x_{}_{}x'.format(firstsample, n1, secondsample, n2) for n1, n2 in seriesorder]

# Keep only the selected mixtures, in series order (selection is done on the transposed table)
dilutionseries = aux.T[mixturenames].T
dilutionseries

# Generate ground truth

## Approach 1: consensus 

In [None]:
# Pseudo ground truth by consensus: a mutation is labelled true when at least
# `nref` reference callers report it in the sample with the largest 'tf' value
# of the dilution series.

# Minimum number of supporting callers per mutation type. Checked first so that
# an unsupported type fails before anything is modified, instead of raising a
# NameError (or reusing a stale `nref`) at the end of the cell.
minsupport = {'snv': 4, 'indel': 3}
if muttype not in minsupport:
    raise ValueError("no consensus threshold defined for muttype {!r}".format(muttype))
nref = minsupport[muttype]

calltablesseries['truth'] = False
refmethods = list(config.methods)
print(refmethods)

# Column names of the per-caller calls and scores in that sample
undilutedtf = max(dilutionseries['tf'])
callcols = ['{:.2f}_{}'.format(undilutedtf, m) for m in refmethods]
scorecols = ['{}_score'.format(c) for c in callcols]

# Number of calls per caller
ncallsinundiluted = calltablesseries[callcols].sum(axis=0)
print(ncallsinundiluted)

# Long table with one row per (mutation, caller) score, indexed by mutation
callsinundiluted = calltablesseries[scorecols]
callsinundiluted.columns = refmethods
callsinundiluted = callsinundiluted.stack().reset_index(level=0, drop=False).reset_index()
callsinundiluted.set_index('chrom_pos_ref_alt', inplace=True)
callsinundiluted.columns = ['method', 'score']

# Score distribution of each caller
for m in refmethods:
    plt.figure()
    sns.histplot(callsinundiluted[callsinundiluted['method'] == m], x='score', stat="probability",
                 color=config.colors[config.methods.index(m)], binwidth=0.01)
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.title(m)

# pseudo ground truth = mutations found by at least nref callers
truthpos = list(calltablesseries[calltablesseries[callcols].sum(axis=1) >= nref].index)
calltablesseries.loc[truthpos, 'truth'] = True
calltablesseries['truth'].value_counts()

## Approach 2: rank mutations

In [None]:
# Pseudo ground truth by ranking: every caller's score is weighted by how
# selective the caller is (fewer calls -> larger weight) and the weighted scores
# are averaged over the reference callers.

if muttype == 'snv':
    refmethods = list(config.methods)
elif muttype == 'indel':
    # Callers left out for indels. Filtering (rather than list.remove) does not
    # raise when one of them is absent from config.methods.
    refmethods = [m for m in config.methods if m not in ('abemus', 'cfsnv')]
else:
    # Avoid a NameError or a stale `refmethods` from an earlier run
    raise ValueError("no reference callers defined for muttype {!r}".format(muttype))

# Column names of the per-caller calls and scores in the sample with the largest 'tf'
undilutedtf = max(calltables['tf'])
callcols = ['{:.2f}_{}'.format(undilutedtf, m) for m in refmethods]
scorecols = ['{}_score'.format(c) for c in callcols]

# Number of calls per caller
ncalls = calltablesseries[callcols].sum(axis=0)
print(ncalls)

# Score distribution of each caller (long table: one row per mutation and caller)
scoreslong = calltablesseries[scorecols].copy()
scoreslong.columns = refmethods
scoreslong = scoreslong.stack().reset_index(level=0, drop=False).reset_index()
scoreslong.set_index('chrom_pos_ref_alt', inplace=True)
scoreslong.columns = ['method', 'score']
for m in refmethods:
    plt.figure()
    sns.histplot(scoreslong[scoreslong['method'] == m], x='score', stat="probability",
                 color=config.colors[config.methods.index(m)], binwidth=0.01)
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.title(m)

# Caller weights: inverse call counts rescaled so that the largest weight is 1
# NOTE(review): a caller with zero calls makes these ratios inf/NaN — confirm this cannot happen
ncallsinundiluted = ncalls.max() / ncalls
ncallsinundiluted = ncallsinundiluted / ncallsinundiluted.max()
print(ncallsinundiluted.shape)
print(ncallsinundiluted)

# Work on an explicit copy so the weighting never touches calltablesseries
callsinundiluted = calltablesseries[scorecols].copy()
for scorecol, callcol in zip(scorecols, callcols):
    # Multiply by the weight only: an in-place `*=` with the column itself on the
    # right-hand side would square the score before weighting it.
    callsinundiluted[scorecol] *= ncallsinundiluted.loc[callcol]
callsinundiluted = callsinundiluted.fillna(0)

# Aggregated score = sum of the weighted scores divided by the number of reference callers
callsinundiluted['score'] = callsinundiluted[scorecols].sum(axis=1) / len(refmethods)
print(callsinundiluted['score'].describe())

# Mutations whose aggregated score exceeds 1/len(refmethods)
selectedcalls = callsinundiluted[callsinundiluted['score'] > 1 / len(refmethods)]
print(selectedcalls.shape)

plt.figure()
sns.histplot(selectedcalls, x='score', stat='probability', bins=10)
plt.xlim([0, 1])
plt.title('score')

callsinundiluted.head()

In [None]:
# Label as true the mutations whose aggregated score exceeds the threshold.
# The threshold uses len(refmethods), the same count that normalises the score,
# so both stay consistent when refmethods is a subset of config.methods.
calltablesseries['truth'] = False
truthpos = list(callsinundiluted[callsinundiluted['score'] > 1 / len(refmethods)].index)
calltablesseries.loc[truthpos, 'truth'] = True

# For the mutations labelled true: distribution of the number of configured callers reporting them
supportcols = ['{:.2f}_{}'.format(max(calltables['tf']), m) for m in config.methods]
print(calltablesseries[calltablesseries['truth']][supportcols].sum(axis=1).value_counts())

calltablesseries['truth'].value_counts()

# Plots

In [None]:
# One result table per metric, all computed with the same inputs and options.
# NOTE(review): ground_truth_method reuses `nref`, which is only assigned in the
# consensus cell — confirm it holds the intended value when the ranking approach was run.
curveargs = (config, calltablesseries, dilutionseries, mixtureid)
curveopts = dict(ground_truth_method=nref, refsample='undiluted', muttype=muttype,
                 methods=config.methods, fixedvar=fixedvar, save=save)

results_auprc_df = metric_curve_allchr(*curveargs, metric='auprc', **curveopts)
results_recall_df = metric_curve_allchr(*curveargs, metric='recall', **curveopts)
results_precision_df = metric_curve_allchr(*curveargs, metric='precision', **curveopts)


In [None]:
# Curve figure for the same series (xy='pr' presumably selects precision/recall axes — confirm).
# Unlike the metric tables, this call passes the upper-cased mutation type and methods=None.
figure_curve_allchr(
    config,
    calltablesseries,
    dilutionseries,
    mixtureid,
    xy='pr',
    ground_truth_method=nref,
    refsample='undiluted',
    muttype=muttype.upper(),
    chrom=chrom,
    methods=None,
    fixedvar=fixedvar,
    save=save,
)

# Confusion matrix

# Call set similarity: Jaccard Index

In [None]:

# TODO: Jaccard index between call sets is not implemented yet.
# Once the two call vectors to compare (y_true, y_pred) are defined, use
# sklearn.metrics.jaccard_score(y_true, y_pred); the former
# jaccard_similarity_score function has been removed from scikit-learn.

# Change in prediction with dilution

# True Positive, False Negative and False Positive Mutations

# Stacked TP, FN, FP, TN

# Number of mutations

# Ratio performance attenuation