# SEQC2

In [None]:
# Imports

%load_ext autoreload
%autoreload 2

import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
import warnings
from tqdm.notebook import tqdm
from sklearn.metrics import precision_recall_curve, f1_score, average_precision_score
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix

# Run from the repository root: step one level up unless the current
# directory already ends with 'cfdna_snv_benchmark'.
current_dir = os.getcwd()
if not current_dir.endswith('cfdna_snv_benchmark'):
    os.chdir('../')
print('Current working directory: {}'.format(os.getcwd()))

from utils.config import Config
from utils.viz import *
from utils.table import *
from utils.metrics import *
from utils.calltable import *
from utils.calltableseries import *
from utils.groundtruth import *
from utils.metricsseries import *
from utils.util_lift_over import *
from utils.venn import venn6, get_labels

In [None]:
# Config and display parameters

# Build the project Config from the 'config/' folder and 'config_viz.yaml',
# then hand it to set_display_params (helper from the utils star-imports).
config = Config("config/", "config_viz.yaml")
set_display_params(config)
# Show the methods declared in the configuration.
print(config.methods)

In [None]:
# Run parameters

# Mixture identifiers, one per panel
mixtureids = [
    'BRP2_ST26_25ng_LIB1-P',
    'IDT2_ST05_25ng_LIB1-P',
    'ILM2_ST29_25ng_LIB1-P',
    'ROC2_ST21_25ng_LIB1-PH',
]
mixtureid = 'BRP2_ST26_25ng_LIB1-P'
reload = False
save = False
fixedvars = ['coverage', 'ctdna']
filterparam = 'all'

# Plot styling; one colour per configured method, paired by position
markers = ['o', '^', 'X']
linestyles = ['-', '-', '-']
color_dict = {method: config.colors[idx] for idx, method in enumerate(config.methods)}

muttypes = ['snv', 'indel']
metrics = ['auprc', 'precision', 'recall']

# Bed files intersection

In [None]:

# Load the 'intersect_BRP_IDT_ILM_ROC' hg19 bed file and strip the 'chr'
# prefix from the chromosome names.
bedfile_path = os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'intersect_BRP_IDT_ILM_ROC_hg19.bed')
bedfile = pd.read_csv(bedfile_path, sep='\t', header=None)
bedfile.columns = ['chrom', 'startpos', 'endpos', 'gene']
bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')

In [None]:
# Number of intervals in the bed file.
# (The cell used to end with `germlinetruthslistall`, which is only defined
# by a later cell and therefore raised NameError on a fresh kernel.)
bedfile.shape[0]

In [None]:
# Load the 'intersect_BRP_IDT_ILM_ROC' hg19 bed file and strip the 'chr' prefix
bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'intersect_BRP_IDT_ILM_ROC_hg19.bed'), sep='\t', header=None)
bedfile.columns = ['chrom', 'startpos', 'endpos', 'gene']
bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')

# Load ground truths (certified mutations, either somatic or diluted germline, in sample A)
groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_hg19.vcf'))
groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
groundtruths['chrom_pos'] = groundtruths['CHROM'].astype(str).str.cat(groundtruths['POS'].astype('str'), sep='_')
groundtruths['chrom_pos_ref_alt'] = groundtruths['CHROM'].astype(str).str.cat(groundtruths['POS'].astype('str'), sep='_').str.cat(groundtruths['REF'].astype('str'), sep='_').str.cat(groundtruths['ALT'].astype('str'), sep='_')
groundtruths.set_index('chrom_pos_ref_alt', inplace=True)
print(groundtruths.shape[0])

# Expand every bed interval into one 'chrom_pos' key per position in [startpos, endpos)
bedfilelistall = [
    chrom + '_' + str(pos)
    for chrom, start, end in zip(bedfile['chrom'], bedfile['startpos'], bedfile['endpos'])
    for pos in range(int(start), int(end))
]
print(len(bedfilelistall))

# One row per bed position; 'truth' is True where a ground-truth variant sits.
# A boolean mask replaces `.loc[<set>, 'truth'] = True`: recent pandas versions
# reject a set as an indexer.
evaluation_bed = pd.DataFrame(index=bedfilelistall)
evaluation_bed['truth'] = evaluation_bed.index.isin(groundtruths['chrom_pos'])
evaluation_bed.value_counts()

In [None]:
# load ground truths (certified mutations, either somatic or diluted germline, in sample A)
groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_hg19.vcf'))
groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
groundtruths['chrom_pos_ref_alt'] = groundtruths['CHROM'].astype(str).str.cat(groundtruths['POS'].astype('str'), sep='_').str.cat(groundtruths['REF'].astype('str'), sep='_').str.cat(groundtruths['ALT'].astype('str'), sep='_')
groundtruths.set_index('chrom_pos_ref_alt', inplace=True)

# load known negatives (no mutation in sample A and sample B)
germlinetruths = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownNegatives_hg19.bed'), sep='\t', header=None)
germlinetruths.columns = ['chrom', 'startpos', 'endpos']
germlinetruths['chrom'] = germlinetruths['chrom'].str.replace('chr', '')
germlinetruths['chrom_pos'] = germlinetruths['chrom'].astype(str).str.cat(germlinetruths['startpos'].astype('str'), sep='_')
germlinetruths.set_index('chrom_pos', inplace=True)

# load the 'intersect_BRP_IDT_ILM_ROC' hg19 bed file
bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'intersect_BRP_IDT_ILM_ROC_hg19.bed'), sep='\t', header=None)
bedfile.columns = ['chrom', 'startpos', 'endpos', 'gene']
bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')

# keep only ground truths in bed: a variant is kept when its position falls inside
# a [startpos, endpos] interval (both ends inclusive) of the same chromosome.
# Intervals are grouped per chromosome once and tested with a vectorised
# comparison instead of a Python loop over every bed row.
bed_by_chrom = dict(tuple(bedfile.groupby('chrom')))
groundtruth_outside_bed = []
for idx, row in tqdm(groundtruths.iterrows(), total=groundtruths.shape[0]):
    pos = int(row['POS'])
    intervals = bed_by_chrom.get(str(row['CHROM']))
    if intervals is None or not ((intervals['startpos'] <= pos) & (intervals['endpos'] >= pos)).any():
        # idx is the 'chrom_pos_ref_alt' label of this row
        groundtruth_outside_bed.append(idx)
groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
print(groundtruths_in_bed.shape[0])

# Expand bed intervals and known-negative intervals into one 'chrom_pos' key per
# position in [startpos, endpos). The key is built from the running position:
# the previous code appended the interval start on every iteration, so only
# interval starts could ever match in the intersection below.
bedfilelistall = [
    chrom + '_' + str(pos)
    for chrom, start, end in zip(bedfile['chrom'], bedfile['startpos'], bedfile['endpos'])
    for pos in range(int(start), int(end))
]
print(len(bedfilelistall))
germlinetruthslistall = []
for chrom, start, end in tqdm(zip(germlinetruths['chrom'], germlinetruths['startpos'], germlinetruths['endpos']), total=germlinetruths.shape[0]):
    germlinetruthslistall.extend(chrom + '_' + str(pos) for pos in range(int(start), int(end)))
# known-negative positions that are also covered by the bed file
germlinetruths_in_bed = list(set(bedfilelistall) & set(germlinetruthslistall))
print(len(germlinetruths_in_bed))

In [None]:
# 'chrom_pos' keys of the in-bed ground truths (first two '_' fields of the index)
groundtruth_positions = list(groundtruths_in_bed.index.str.split('_').str[:2].str.join('_'))

# Evaluation positions = in-bed known negatives plus in-bed ground-truth positions
evaluation_bed = list(set(set(germlinetruths_in_bed) | set(groundtruth_positions)))
print(len(evaluation_bed))

# Label known negatives False first, then ground truths True, so a position
# present in both lists ends up True.
evaluation_bed_df = pd.DataFrame(index=evaluation_bed, columns=['truth'])
evaluation_bed_df.loc[germlinetruths_in_bed] = False
evaluation_bed_df.loc[groundtruth_positions] = True
print(evaluation_bed_df.isna().sum())
print(evaluation_bed_df.value_counts())
#np.save(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'evaluationbed_hg19.npy'), evaluation_bed)

In [None]:
# Cache the hg19 evaluation table as CSV in the SEQC2 'bedfiles' folder.
evaluation_bed_df.to_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'evaluationbed_hg19.csv'))

In [None]:
# Reload the cached hg19 evaluation table (first CSV column is the 'chrom_pos' index)
#evaluation_bed = np.load(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'evaluationbed_hg19.npy'))
evaluation_bed_csv = os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'evaluationbed_hg19.csv')
evaluation_bed_df = pd.read_csv(evaluation_bed_csv, index_col=0)
evaluation_bed_df

In [None]:
# Lift every hg19 evaluation position over to hg38.
# Positions are taken from evaluation_bed_df.index rather than the in-memory
# `evaluation_bed` list: the index holds the same keys in the same order, is
# available when the table was reloaded from its CSV cache, and is what the
# hg38 table is aligned against when its index is replaced.
chain_file = os.path.join('data', 'extdata', 'hg19ToHg38.over.chain.gz')
evaluation_bed_hg38 = [liftover('chr' + eb, chain_file) for eb in evaluation_bed_df.index]
# preview only; the full list has one entry per evaluation position
evaluation_bed_hg38[:10]

In [None]:
# Annotated ground truths: how many carry an ID containing 'rs' (count, total, percentage)
annotated_vcf = os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_hg19_nochr_annotated.vcf.gz')
groundtruths_annotated = read_vcf(annotated_vcf)
n_with_rs = groundtruths_annotated['ID'].str.contains('rs').sum()
n_total = groundtruths_annotated.shape[0]
print(n_with_rs, n_total, round(100*n_with_rs/n_total, 2))

In [None]:
# Display the annotated ground-truth table.
groundtruths_annotated

In [None]:
# Restrict the annotated ground truths to the evaluation positions and count
# how many of those carry an ID containing 'rs'.
groundtruths_annotated['chrom_pos'] = groundtruths_annotated['CHROM'].astype(str) + '_' + groundtruths_annotated['POS'].astype(str)
in_evaluation_bed = groundtruths_annotated['chrom_pos'].isin(evaluation_bed_df.index)
groundtruths_annotated_intersect = groundtruths_annotated[in_evaluation_bed]
print(groundtruths_annotated_intersect.shape[0])
print(groundtruths_annotated_intersect['ID'].str.contains('rs').sum())

In [None]:
# For every evaluation position, print the key followed by the annotated
# ground-truth rows located at that position (an empty frame when none).
for eb in evaluation_bed_df.index:
    print(eb)
    rows_at_position = groundtruths_annotated.loc[groundtruths_annotated['chrom_pos'] == eb]
    print(rows_at_position)

In [None]:
print(len(evaluation_bed_hg38))
# Number of NaN entries in the lifted-over list (presumably positions the
# liftover could not map — confirm against utils.util_lift_over).
# `list.count(np.nan)` only matches the very same nan object and its result was
# never displayed, so the count is computed with pandas and printed.
print(pd.Series(evaluation_bed_hg38, dtype=object).isna().sum())
#np.save(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'evaluationbed_hg38.npy'), evaluation_bed_hg38)

# Same truth labels as the hg19 table, re-indexed by the lifted-over positions.
# This relies on evaluation_bed_hg38 being in the same order as evaluation_bed_df.
evaluation_bed_hg38_df = evaluation_bed_df.copy()
evaluation_bed_hg38_df.index = evaluation_bed_hg38
evaluation_bed_hg38_df.to_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'evaluationbed_hg38.csv'))

In [None]:
# Reload the cached hg38 evaluation table and show the positions labelled True
evaluation_bed_hg38_csv = os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'evaluationbed_hg38.csv')
evaluation_bed_hg38_df = pd.read_csv(evaluation_bed_hg38_csv, index_col=0)
evaluation_bed_hg38_df.loc[evaluation_bed_hg38_df['truth'] == True]

In [None]:
# Load the SEQC2 call-table series (SNV, indel, SNP) of one mixture
mixtureid = 'BRP2_ST26_25ng_LIB1-P'
seriesorder = ['SampleDf', 'SampleEf']
print('############# {} ############'.format(mixtureid))

calltables = {key: [] for key in ('sampleid', 'tf', 'cov', 'snv', 'indel', 'snp')}

# Arguments shared by the three get_calltableseries calls
series_kwargs = dict(chrom='all', filterparam=filterparam, reload=reload, save=save,
                     diltype='SEQC2', concat='tf', bcbiovaf=0.01)
calltable_snv, aux = get_calltableseries(config, mixtureid, muttype='snv', **series_kwargs)
calltable_indel, aux = get_calltableseries(config, mixtureid, muttype='indel', **series_kwargs)
calltable_snp, aux = get_calltableseries(config, mixtureid, muttype='snp', **series_kwargs)
print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
print(aux)

calltables['sampleid'] = mixtureid
#calltables['tf'] = np.unique([cn.split('_')[0] for cn in list(calltable_snv.columns)])[:-5].astype(float)
for table_key, table in (('snv', calltable_snv), ('indel', calltable_indel), ('snp', calltable_snp)):
    calltables[table_key].append(table)
    calltables[table_key] = pd.concat(calltables[table_key])

# dilutionseries is the second value returned by the last call (the 'snp' one)
dilutionseries = aux
#dilutionseries = aux.T[['mixture_' + '_'.join(mixtureid.split('_')[:2]) + '_' + str(s[0]) + 'x_' + '_'.join(mixtureid.split('_')[2:4]) + '_' + str(s[1]) + 'x' for s in seriesorder]].T
muttype = 'snv'
refsample = 'SEQC2'

# Part 0: Reproduce paper figure

In [None]:
# Known-negative intervals (hg19), indexed by '<chrom>_<startpos>' with the 'chr' prefix removed
known_negatives_bed = os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownNegatives_hg19.bed')
germlinetruths = pd.read_csv(known_negatives_bed, sep='\t', header=None)
germlinetruths.columns = ['chrom', 'startpos', 'endpos']
germlinetruths['chrom'] = germlinetruths['chrom'].str.replace('chr', '')
germlinetruths['chrom_pos'] = germlinetruths['chrom'].astype(str) + '_' + germlinetruths['startpos'].astype('str')
germlinetruths.set_index('chrom_pos', inplace=True)
germlinetruths

In [None]:
# Ground truths (certified mutations, either somatic or diluted germline, in sample A),
# indexed by '<chrom>_<pos>' with the 'chr' prefix removed
known_positives_vcf = os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_hg19.vcf')
groundtruths = read_vcf(known_positives_vcf)
groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
groundtruths['chrom_pos'] = groundtruths['CHROM'].astype(str) + '_' + groundtruths['POS'].astype('str')
groundtruths.set_index('chrom_pos', inplace=True)
groundtruths

In [None]:
# load ground truths (certified mutations, either somatic or diluted germline, in sample A)
groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_hg19.vcf'))
groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
groundtruths['chrom_pos_ref_alt'] = groundtruths['CHROM'].astype(str).str.cat(groundtruths['POS'].astype('str'), sep='_').str.cat(groundtruths['REF'].astype('str'), sep='_').str.cat(groundtruths['ALT'].astype('str'), sep='_')
groundtruths.set_index('chrom_pos_ref_alt', inplace=True)

# load known negatives (no mutation in sample A and sample B)
germlinetruths = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownNegatives_hg19.bed'), sep='\t', header=None)
germlinetruths.columns = ['chrom', 'startpos', 'endpos']
germlinetruths['chrom'] = germlinetruths['chrom'].str.replace('chr', '')
germlinetruths['chrom_pos'] = germlinetruths['chrom'].astype(str).str.cat(germlinetruths['startpos'].astype('str'), sep='_')
germlinetruths.set_index('chrom_pos', inplace=True)

# load bedfile for method BRP
bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'LBx_BRP_hg19.bed'), sep='\t', header=None)
bedfile.columns = ['chrom', 'startpos', 'endpos', 'gene']
bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')

# keep only ground truths in bed: a variant is kept when its position falls inside
# a [startpos, endpos] interval (both ends inclusive) of the same chromosome
bed_by_chrom = dict(tuple(bedfile.groupby('chrom')))
groundtruth_outside_bed = []
for idx, row in tqdm(groundtruths.iterrows(), total=groundtruths.shape[0]):
    pos = int(row['POS'])
    intervals = bed_by_chrom.get(str(row['CHROM']))
    if intervals is None or not ((intervals['startpos'] <= pos) & (intervals['endpos'] >= pos)).any():
        # idx is the 'chrom_pos_ref_alt' label of this row
        groundtruth_outside_bed.append(idx)
print(len(groundtruth_outside_bed))
groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
print(groundtruths_in_bed.shape[0])

# Expand bed and known-negative intervals into one 'chrom_pos' key per position
# in [startpos, endpos). The key uses the running position: the previous code
# appended the interval start on every iteration, so only interval starts could
# match in the intersection.
bedfilelistall = [
    chrom + '_' + str(pos)
    for chrom, start, end in zip(bedfile['chrom'], bedfile['startpos'], bedfile['endpos'])
    for pos in range(int(start), int(end))
]
print(len(bedfilelistall))
germlinetruthslistall = []
for chrom, start, end in tqdm(zip(germlinetruths['chrom'], germlinetruths['startpos'], germlinetruths['endpos']), total=germlinetruths.shape[0]):
    germlinetruthslistall.extend(chrom + '_' + str(pos) for pos in range(int(start), int(end)))
print(len(germlinetruthslistall))
germlinetruths_in_bed = list(set(bedfilelistall) & set(germlinetruthslistall))
print(len(germlinetruths_in_bed))

# load table (PASS calls of one BRP SampleEf library)
calltable_snv, calltable_indel, calltable_snp = get_calltable(os.path.join(*config.mixturefolderSEQC2, 'SEQC2s_chrall', 'SEQC2s_chrall_BRP2_ST26_25ng_LIB1-P', 'SampleEf_BRP2_ST26_25ng_LIB1-P'), ['varnet', 'BRP'], save=False, filter='PASS')
calltable = pd.concat([calltable_snv, calltable_indel, calltable_snp])
calltable['chrom_pos'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_')

# build lists method VS ground truths
calltable['truth'] = False
print(calltable.shape[0])
# Re-index on in-bed ground truths plus in-bed known negatives.
# (This line previously had unbalanced brackets and did not parse.)
calltable = calltable.reindex(list(set(groundtruths_in_bed.index) | set(germlinetruths_in_bed)))
print(calltable.shape[0])
# .copy() so that the assignments below act on an independent frame
calltable = calltable[~calltable['chrom_pos'].isin(germlinetruths_in_bed)].copy()
print(calltable.shape[0])
calltable.loc[calltable.index.isin(list(groundtruths_in_bed.index)), 'truth'] = True
# NOTE(review): rows created by the reindex for known-negative keys have NaN
# 'truth' and are therefore set to True here — confirm this is intended.
calltable['truth'] = calltable['truth'].fillna(True)
calltable['BRP'] = calltable['BRP'].fillna(False)
print(calltable['truth'].value_counts())
print(calltable.shape[0])
print(calltable['BRP'].value_counts())


print("germline")
print(germlinetruths.shape[0])
print(calltable[(calltable['truth'] == False) & (calltable['BRP_vaf'] > 0)].shape[0])
print(list(set(list(calltable['chrom_pos'].values)) & set(list(germlinetruths.index))))


# Score = BRP VAF floored to the lower edge of its bin: below the first edge -> 0,
# at or above the last edge -> last edge, missing -> 0. Built on explicit copies
# with mask assignment instead of chained `aux[col][mask] = ...`, which does not
# reliably write back.
aux = calltable[['truth', 'BRP_vaf', 'BRP_score']].copy()
vaf = aux['BRP_vaf']
vaf_bin_edges = [0.001, 0.0025, 0.003, 0.004, 0.005, 0.01, 0.015, 0.02, 0.025]
score = vaf.copy()
score[vaf < vaf_bin_edges[0]] = 0
for lower, upper in zip(vaf_bin_edges[:-1], vaf_bin_edges[1:]):
    score[(vaf >= lower) & (vaf < upper)] = lower
score[vaf >= vaf_bin_edges[-1]] = vaf_bin_edges[-1]
aux['BRP_score'] = score.fillna(0)
aux['BRP_vaf'] = vaf.fillna(0)
#aux.dropna(inplace=True)
precision, recall, thresholds = precision_recall_curve(aux['truth'], aux['BRP_score'])
# indices of the thresholds closest to 0.1% and 2.5% VAF
idxa = min(range(len(thresholds)), key=lambda i: abs(thresholds[i]-0.001))
idxb = min(range(len(thresholds)), key=lambda i: abs(thresholds[i]-0.025))

# VAF cutoffs 0.1–0.5%, 0.5–2.5% and >2.5%
plot_pr_curve(precision[idxa:idxb], recall[idxa:idxb], estimator_name='BRP', f1_score=None, figax=None, kwargs={'color': 'darkgreen'})
plt.ylim([0, 1])

In [None]:
# Display the current call table.
calltable

In [None]:
# Inspect the thresholds of the most recent precision_recall_curve call and the
# two slice indices (idxa, idxb) used to crop the plotted curve.
print(len(thresholds))
print(idxa, idxb)
print(thresholds)

In [None]:
# Counts: rows with a BRP VAF, rows labelled True, rows labelled False
n_with_brp_vaf = calltable['BRP_vaf'].notna().sum()
n_truth_true = calltable['truth'].sum()
n_truth_false = (calltable['truth'] == False).sum()
print(n_with_brp_vaf)
print(n_truth_true)
print(n_truth_false)

In [None]:
# Display the precision and recall arrays of the most recent precision_recall_curve call.
precision, recall

In [None]:
# Display the current call table.
calltable

In [None]:
# load ground truths (known positives)
groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_hg19.vcf'))
groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
groundtruths['chrom_pos_ref_alt'] = groundtruths['CHROM'].astype(str).str.cat(groundtruths['POS'].astype('str'), sep='_').str.cat(groundtruths['REF'].astype('str'), sep='_').str.cat(groundtruths['ALT'].astype('str'), sep='_')
groundtruths.set_index('chrom_pos_ref_alt', inplace=True)

# load known negatives
germlinetruths = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownNegatives_hg19.bed'), sep='\t', header=None)
germlinetruths.columns = ['chrom', 'startpos', 'endpos']
germlinetruths['chrom'] = germlinetruths['chrom'].str.replace('chr', '')
germlinetruths['chrom_pos'] = germlinetruths['chrom'].astype(str).str.cat(germlinetruths['startpos'].astype('str'), sep='_')
germlinetruths.set_index('chrom_pos', inplace=True)

# load bedfile for method BRP
bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'LBx_BRP_hg19.bed'), sep='\t', header=None)
bedfile.columns = ['chrom', 'startpos', 'endpos', 'gene']
bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')

# keep only ground truths in bed: a variant is kept when its position falls inside
# a [startpos, endpos] interval (both ends inclusive) of the same chromosome
bed_by_chrom = dict(tuple(bedfile.groupby('chrom')))
groundtruth_outside_bed = []
for idx, row in tqdm(groundtruths.iterrows(), total=groundtruths.shape[0]):
    pos = int(row['POS'])
    intervals = bed_by_chrom.get(str(row['CHROM']))
    if intervals is None or not ((intervals['startpos'] <= pos) & (intervals['endpos'] >= pos)).any():
        # idx is the 'chrom_pos_ref_alt' label of this row
        groundtruth_outside_bed.append(idx)
print(len(groundtruth_outside_bed))
groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
print(groundtruths_in_bed.shape[0])

# load the PASS calls of the eight BRP SampleEf libraries (ST25/ST26 x LIB1-4);
# the per-library tables are collected in a list and concatenated once
sample_tables = []
for st in ['25', '26']:
    for lib in ['1', '2', '3', '4']:
        sample = 'BRP2_ST'+st+'_25ng_LIB'+lib+'-P'
        calltable_snv, calltable_indel, calltable_snp = get_calltable(os.path.join(*config.mixturefolderSEQC2, 'SEQC2s_chrall', 'SEQC2s_chrall_'+sample, 'SampleEf_'+sample), ['varnet', 'BRP'], save=False, filter='PASS')
        sample_tables.append(pd.concat([calltable_snv, calltable_indel, calltable_snp]))
calltable = pd.concat(sample_tables)

# build lists method VS ground truths
calltable['truth'] = False
print(calltable.shape[0])
calltable.loc[calltable.index.isin(list(groundtruths_in_bed.index)), 'truth'] = True
# assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection, which does not reliably modify the frame
calltable['truth'] = calltable['truth'].fillna(True)
calltable['BRP'] = calltable['BRP'].fillna(False)
print(calltable['truth'].value_counts())
print(calltable.shape[0])
print(calltable['BRP'].value_counts())


print("germline")
print(germlinetruths.shape[0])
print(calltable[(calltable['truth'] == False) & (calltable['BRP_vaf'] > 0)].shape[0])
calltable['chrom_pos'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_')
print(list(set(list(calltable['chrom_pos'].values)) & set(list(germlinetruths.index))))


# The BRP VAF is used as the ranking score; missing values count as 0.
# Work on an explicit copy so the assignments do not target a slice of calltable.
aux = calltable[['truth', 'BRP_vaf', 'BRP_score']].copy()
aux['BRP_score'] = aux['BRP_vaf'].fillna(0)
aux['BRP_vaf'] = aux['BRP_vaf'].fillna(0)
#aux.dropna(inplace=True)
precision, recall, thresholds = precision_recall_curve(aux['truth'], aux['BRP_score'])
# indices of the thresholds closest to 0.1% and 2.5% VAF
idxa = min(range(len(thresholds)), key=lambda i: abs(thresholds[i]-0.001))
idxb = min(range(len(thresholds)), key=lambda i: abs(thresholds[i]-0.025))

# VAF cutoffs 0.1–0.5%, 0.5–2.5% and >2.5%
plot_pr_curve(precision[idxa:idxb], recall[idxa:idxb], estimator_name='BRP', f1_score=None, figax=None, kwargs={'color': 'darkgreen'})
plt.ylim([0, 1])

In [None]:
# Uses `groundtruths` and `groundtruths_in_bed` as currently defined in the kernel.

# Ground-truth VAF parsed from the first ';'-separated INFO entry ('VAF=<value>').
# It does not depend on the library, so it is parsed once outside the loop.
truth_vaf = groundtruths['INFO'].str.split(';').str[0].str.split('VAF=').str[1].astype(float)

# load table
# NOTE(review): `calltable` is overwritten on every iteration, so the PR curve
# below only uses the last library (ST26, LIB4) — confirm this is intended
# rather than a concatenation of all eight libraries.
for st in ['25', '26']:
    for lib in ['1', '2', '3', '4']:
        sample = 'BRP2_ST'+st+'_25ng_LIB'+lib+'-P'
        calltable_snv, calltable_indel, calltable_snp = get_calltable(os.path.join(*config.mixturefolderSEQC2, 'SEQC2s_chrall', 'SEQC2s_chrall_'+sample, 'SampleEf_'+sample), ['varnet', 'BRP'], save=False, filter='PASS')
        calltable = pd.concat([calltable_snv, calltable_indel, calltable_snp])
        # build lists method VS ground truths
        calltable['truth'] = False
        print(calltable.shape[0])
        calltable.loc[calltable.index.isin(list(groundtruths_in_bed.index)), 'truth'] = True
        calltable['truth'] = calltable['truth'].fillna(True)
        # aligned on the index; calls that are not ground truths get 0
        calltable['truth_vaf'] = truth_vaf
        calltable['truth_vaf'] = calltable['truth_vaf'].fillna(0)
        calltable['BRP'] = calltable['BRP'].fillna(False)
        print(calltable['truth'].value_counts())
        print(calltable.shape[0])
        print(calltable['BRP'].value_counts())
        calltable['chrom_pos'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_')

# The BRP VAF is used as the ranking score; missing values count as 0.
# Work on an explicit copy so the assignments do not target a slice of calltable.
aux = calltable[['truth', 'truth_vaf', 'BRP_vaf', 'BRP_score']].copy()
aux['BRP_score'] = aux['BRP_vaf'].fillna(0)
aux['BRP_vaf'] = aux['BRP_vaf'].fillna(0)
#aux.dropna(inplace=True)
precision, recall, thresholds = precision_recall_curve(aux['truth'], aux['BRP_score'])
# indices of the thresholds closest to 0.1% and 2.5% VAF
idxa = min(range(len(thresholds)), key=lambda i: abs(thresholds[i]-0.001))
idxb = min(range(len(thresholds)), key=lambda i: abs(thresholds[i]-0.025))

# VAF cutoffs 0.1–0.5%, 0.5–2.5% and >2.5%
plot_pr_curve(precision[idxa:idxb], recall[idxa:idxb], estimator_name='BRP', f1_score=None, figax=None, kwargs={'color': 'darkgreen'})
plt.ylim([0, 1])

In [None]:

fig, ax = plt.subplots(figsize=(10, 10))

# One SampleEf library per panel. (A 'SampleDf' variant of this mapping used to
# be defined first and was immediately overwritten.)
method_sampleid_dict = {
    'BRP': 'SampleEf_BRP2_ST26_25ng_LIB1-P',
    'IDT': 'SampleEfIS_IDT2_ST05_25ng_LIB1-P',
    'ILM': 'SampleEf_ILM2_ST29_25ng_LIB1-P',
    'ROC': 'SampleEf_ROC2_ST21_25ng_LIB1-P',
    'TFS': 'SampleEf_TFS2_ST24_25ng_LIB1-P',
}


for method in ['ROC', 'BRP']: #, 'IDT', 'ILM']: #'ROC', 'TFS']:

    print('######## '+ method + ' ########')

    # ROC files are read in hg38, everything else in hg19;
    # the 'chr' prefix is stripped for BRP only
    refgenome = 'hg38' if method == 'ROC' else 'hg19'

    # ground truths (known positives), indexed by chrom_pos_ref_alt
    groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_'+refgenome+'.vcf'))
    if method == 'BRP':
        groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
    groundtruths['chrom_pos_ref_alt'] = groundtruths['CHROM'].astype(str).str.cat(groundtruths['POS'].astype('str'), sep='_').str.cat(groundtruths['REF'].astype('str'), sep='_').str.cat(groundtruths['ALT'].astype('str'), sep='_')
    groundtruths.set_index('chrom_pos_ref_alt', inplace=True)

    # known negatives, indexed by chrom_startpos
    germlinetruths = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownNegatives_'+refgenome+'.bed'), sep='\t', header=None)
    germlinetruths.columns = ['chrom', 'startpos', 'endpos']
    if method == 'BRP':
        germlinetruths['chrom'] = germlinetruths['chrom'].str.replace('chr', '')
    germlinetruths['chrom_pos'] = germlinetruths['chrom'].astype(str).str.cat(germlinetruths['startpos'].astype('str'), sep='_')
    germlinetruths.set_index('chrom_pos', inplace=True)

    # bed file of the panel (first three columns only)
    bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'LBx_'+method+'_'+refgenome+'.bed'), sep='\t', header=None)
    bedfile = bedfile.iloc[:,:3]
    bedfile.columns = ['chrom', 'startpos', 'endpos']
    if method == 'BRP':
        bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')
    bedfile['diff'] = bedfile['endpos'] - bedfile['startpos'] + 1
    N = bedfile['diff'].sum()
    print('reportable region in bp')
    print(N)

    # Diagnostic: known-negative interval starts that fall on a bed position
    # (bed intervals expanded with both ends included)
    germlinetruths['chrom_pos'] = germlinetruths['chrom']+'_'+germlinetruths['startpos'].astype(str)
    germlinetruths_all = list(germlinetruths['chrom_pos'].values)
    bed_all = []
    for ind in tqdm(list(bedfile.index)):
        for i in range(int(bedfile['startpos'][ind]), int(bedfile['endpos'][ind])+1):
            bed_all.append(bedfile['chrom'][ind]+'_'+str(i))
    print(len(germlinetruths_all))
    print(len(bed_all))
    germlinetruths_in_bed = list(set(germlinetruths_all) & set(bed_all))
    print(len(germlinetruths_in_bed))

    # keep only ground truths in bed: a variant is kept when its position falls inside
    # a [startpos, endpos] interval (both ends inclusive) of the same chromosome
    bed_by_chrom = dict(tuple(bedfile.groupby('chrom')))
    groundtruth_outside_bed = []
    for idx, row in tqdm(groundtruths.iterrows(), total=groundtruths.shape[0]):
        pos = int(row['POS'])
        intervals = bed_by_chrom.get(str(row['CHROM']))
        if intervals is None or not ((intervals['startpos'] <= pos) & (intervals['endpos'] >= pos)).any():
            # idx is the 'chrom_pos_ref_alt' label of this row
            groundtruth_outside_bed.append(idx)
    print('groundtruth')
    print(len(groundtruth_outside_bed))
    groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
    print(groundtruths_in_bed.shape[0])

    # Known-negative positions covered by the bed: both interval sets are expanded
    # into one 'chrom_pos' key per position in [startpos, endpos). The key uses the
    # running position: the previous code appended the interval start on every
    # iteration, so only interval starts could match. This overrides the
    # diagnostic intersection computed above.
    bedfilelistall = [
        chrom + '_' + str(pos)
        for chrom, start, end in zip(bedfile['chrom'], bedfile['startpos'], bedfile['endpos'])
        for pos in range(int(start), int(end))
    ]
    print(len(bedfilelistall))
    germlinetruthslistall = []
    for chrom, start, end in tqdm(zip(germlinetruths['chrom'], germlinetruths['startpos'], germlinetruths['endpos']), total=germlinetruths.shape[0]):
        germlinetruthslistall.extend(chrom + '_' + str(pos) for pos in range(int(start), int(end)))
    print(len(germlinetruthslistall))
    germlinetruths_in_bed = list(set(bedfilelistall) & set(germlinetruthslistall))
    print(len(germlinetruths_in_bed))

    # calls of the five callers plus the panel's own calls (no filter)
    calltable_snv, calltable_indel, calltable_snp = get_calltable(os.path.join(*config.mixturefolderSEQC2, 'SEQC2s_chrall', 'SEQC2s_chrall_'+'_'.join(method_sampleid_dict[method].split('_')[1:]), method_sampleid_dict[method]), ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', method], save=False, filter='all')
    print(calltable_snv.shape[0], calltable_indel.shape[0], calltable_snp.shape[0])
    calltable = pd.concat([calltable_snv, calltable_indel, calltable_snp])
    calltable['chrom_pos'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_')

    calltable['truth'] = False
    print(calltable.shape[0])
    # TP/FP are counted on the called variants, i.e. before the reindex adds the
    # uncalled ground truths (counted afterwards, "TP" always equalled the number
    # of in-bed ground truths).
    called_keys = set(calltable.index)
    truth_keys = set(groundtruths_in_bed.index)
    calltable = calltable.reindex(list(called_keys | truth_keys))
    print('TP: {}'.format(len(called_keys & truth_keys)))
    print('FP: {}'.format(len(called_keys - truth_keys)))
    print(calltable.shape[0])
    # .copy() so that the assignments below act on an independent frame
    calltable = calltable[~calltable['chrom_pos'].isin(germlinetruths_in_bed)].copy()
    print(calltable.shape[0])
    calltable.loc[calltable.index.isin(list(groundtruths_in_bed.index)), 'truth'] = True
    # assign the filled columns back instead of calling fillna(inplace=True) on a
    # column selection, which does not reliably modify the frame
    calltable['truth'] = calltable['truth'].fillna(True)
    m = method
    calltable[m] = calltable[m].fillna(False)
    print(calltable[m].value_counts())
    print(calltable['truth'].value_counts())
    print(calltable.shape[0])

    print("germline")
    print(germlinetruths.shape[0])
    print(calltable[(calltable['truth'] == False) & (calltable[method+'_vaf'] > 0)].shape[0])
    print('germline in calls')
    print(len(list(set(list(calltable['chrom_pos'].values)) & set(list(germlinetruths.index)))))

    # On the BRP sample, also draw the PR curve of each caller from its own score
    if method == 'BRP':
        for m in ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']:
            aux = calltable[['truth', m+'_vaf', m+'_score']].copy()
            print(aux[m+'_score'].describe())
            aux[m+'_score'] = aux[m+'_score'].fillna(0)
            aux[m+'_vaf'] = aux[m+'_vaf'].fillna(0)
            precision, recall, thresholds = precision_recall_curve(aux['truth'], aux[m+'_score'])
            Nneg = len(germlinetruths_in_bed)
            print('Nneg')
            print(Nneg)
            plot_pr_curve(precision, recall, estimator_name=m, f1_score=None, figax=(fig, ax), kwargs={'color': config.colors[config.methods.index(m)]})

    # PR curve of the panel itself, ranked by its reported VAF (missing -> 0)
    m = method
    aux = calltable[['truth', m+'_vaf', m+'_score']].copy()
    aux[m+'_score'] = aux[m+'_vaf']
    print(aux[m+'_vaf'].describe())
    aux[m+'_score'] = aux[m+'_score'].fillna(0)
    aux[m+'_vaf'] = aux[m+'_vaf'].fillna(0)
    #aux.dropna(inplace=True)
    precision, recall, thresholds = precision_recall_curve(aux['truth'], aux[m+'_vaf'])
    # alternative precision with the false positives divided by the number of
    # on-target negative positions (computed for inspection, not plotted)
    precisionbis = []
    Nneg = len(germlinetruths_in_bed)
    print('Nneg')
    print(Nneg)
    for thres in thresholds:
        y_pred = (aux[m+'_vaf'] >= thres).astype(int)
        # ravel order of a 2x2 confusion matrix: tn, fp, fn, tp
        _, fp, _, tp = confusion_matrix(aux['truth'], y_pred).ravel()
        precisionbis.append(tp/(tp+(fp/Nneg)))

    if m in ('BRP', 'IDT', 'ILM', 'ROC'):
        # indices of the thresholds closest to 0.1% and 2.5% VAF
        idxa = min(range(len(thresholds)), key=lambda i: abs(thresholds[i]-0.001))
        idxb = min(range(len(thresholds)), key=lambda i: abs(thresholds[i]-0.025))
    else:
        idxa = 0
        idxb = len(thresholds)

    # VAF cutoffs 0.1–0.5%, 0.5–2.5% and >2.5%
    plot_pr_curve(precision[idxa:idxb], recall[idxa:idxb], estimator_name=m, f1_score=None, figax=(fig, ax), kwargs={'color': config.colors[config.methods.index(m)]})
#plt.ylim([0.97, 1.0])
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")

In [None]:
# Class balance of the `calltable` currently held in the kernel: number of rows
# per value of its 'truth' column.
calltable['truth'].value_counts()

In [None]:

# Precision-recall curve of each vendor's own calls on one SEQC2 sample per vendor.
#   positives : SEQC2 'known positives' located inside the vendor's panel bed file
#   score     : the VAF column '<vendor>_vaf' of the call table
# Only the operating points whose VAF threshold lies between the thresholds closest
# to 0.001 and 0.025 are drawn.
fig, ax = plt.subplots(figsize=(10, 10))

# taking one sample of each company
method_sampleid_dict = {
    'BRP': 'SampleEf_BRP2_ST26_25ng_LIB1-P',
    'IDT': 'SampleEfIS_IDT2_ST05_25ng_LIB1-P',
    'ILM': 'SampleEf_ILM2_ST29_25ng_LIB1-P',
    'ROC': 'SampleEf_ROC2_ST21_25ng_LIB1-P',
    'TFS': 'SampleEf_TFS2_ST24_25ng_LIB1-P',
}


for method in ['ROC', 'BRP', 'IDT', 'ILM']: #, 'TFS']:

    print('######## '+ method + ' ########')

    # ROC resources are read from the hg38 files, every other panel from the hg19 files
    refgenome = 'hg38' if method == 'ROC' else 'hg19'

    # read ground truths vcf file 'known positives'
    groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_'+refgenome+'.vcf'))
    if method == 'BRP':
        # BRP is the only panel for which the 'chr' prefix is stripped
        groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
    groundtruths['chrom_pos_ref_alt'] = groundtruths['CHROM'].astype(str).str.cat(groundtruths['POS'].astype('str'), sep='_').str.cat(groundtruths['REF'].astype('str'), sep='_').str.cat(groundtruths['ALT'].astype('str'), sep='_')
    groundtruths = groundtruths.set_index('chrom_pos_ref_alt')

    # read germline truths bed file 'known negatives'
    germlinetruths = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownNegatives_'+refgenome+'.bed'), sep='\t', header=None)
    germlinetruths.columns = ['chrom', 'startpos', 'endpos']
    if method == 'BRP':
        germlinetruths['chrom'] = germlinetruths['chrom'].str.replace('chr', '')
    # 'chrom_pos' is kept both as index (for look-ups) and as column (for the bed intersection)
    germlinetruths['chrom_pos'] = germlinetruths['chrom'].astype(str).str.cat(germlinetruths['startpos'].astype('str'), sep='_')
    germlinetruths = germlinetruths.set_index('chrom_pos', drop=False)

    # get bed file corresponding to each company
    if method == 'TFS':
        bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'LBx_TFS_pre-defined_Hotspots_hg19.bed'), sep='\t', header=None)
    else:
        bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'LBx_'+method+'_'+refgenome+'.bed'), sep='\t', header=None)
    bedfile = bedfile.iloc[:,:3]
    bedfile.columns = ['chrom', 'startpos', 'endpos']
    if method == 'BRP':
        bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')
    # intervals are treated as closed on both ends throughout this cell
    bedfile['diff'] = bedfile['endpos'] - bedfile['startpos'] + 1
    N = bedfile['diff'].sum()
    print('reportable region in bp')
    print(N)

    # keep germline inside current bed: enumerate every panel position as 'chrom_pos'
    germlinetruths_all = list(germlinetruths['chrom_pos'].values)
    bed_all = set()
    for bed_chrom, bed_start, bed_end in tqdm(zip(bedfile['chrom'], bedfile['startpos'], bedfile['endpos']), total=bedfile.shape[0]):
        bed_all.update(bed_chrom + '_' + str(i) for i in range(int(bed_start), int(bed_end) + 1))
    print('germline all')
    print(len(germlinetruths_all))
    germlinetruths_in_bed = list(set(germlinetruths_all) & bed_all)
    print('germline in bed')
    print(len(germlinetruths_in_bed))

    # keep ground truths inside current bed
    bed_by_chrom = {bed_chrom: intervals for bed_chrom, intervals in bedfile.groupby('chrom')}
    groundtruth_outside_bed = []
    for label, chrom, pos in tqdm(zip(groundtruths.index, groundtruths['CHROM'], groundtruths['POS']), total=groundtruths.shape[0]):
        intervals = bed_by_chrom.get(str(chrom))
        pos = int(pos)
        if intervals is None or not ((intervals['startpos'] <= pos) & (intervals['endpos'] >= pos)).any():
            groundtruth_outside_bed.append(label)
    print('groundtruth in bed')
    groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
    print(groundtruths_in_bed.shape[0])

    # read company vcf file (snvs + indels)
    calltable_snv, calltable_indel, calltable_snp = get_calltable(os.path.join(*config.mixturefolderSEQC2, 'SEQC2s_chrall', 'SEQC2s_chrall_'+'_'.join(method_sampleid_dict[method].split('_')[1:]), method_sampleid_dict[method]), ['varnet', method], save=False, filter='PASS')
    print(calltable_snv.shape[0], calltable_indel.shape[0], calltable_snp.shape[0])
    calltable = pd.concat([calltable_snv, calltable_indel, calltable_snp])

    print('calltable shape')
    print(calltable.shape[0])
    # These counts are taken BEFORE the known positives are merged into the table;
    # taken afterwards, the first one would always equal the number of known positives.
    called_variants = set(calltable.index)
    known_positives = set(groundtruths_in_bed.index)
    print('TP: {}'.format(len(called_variants & known_positives)))
    print('FP: {}'.format(len(called_variants - known_positives)))
    # add one all-NaN row per known positive missing from the call table so that it
    # takes part in the curve with a score of 0
    calltable = calltable.reindex(list(called_variants | known_positives))
    print(calltable.shape[0])
    # plain boolean column: True for known positives inside the panel, False otherwise
    calltable['truth'] = calltable.index.isin(groundtruths_in_bed.index)
    m = method
    # explicit assignment instead of chained `fillna(inplace=True)`, which does not
    # write back to the frame when pandas Copy-on-Write is active
    calltable[m] = calltable[m].fillna(False)
    print(calltable[m].value_counts())
    print(calltable['truth'].value_counts())
    print(calltable.shape[0])

    print("germline")
    print(germlinetruths.shape[0])
    print(calltable[(~calltable['truth']) & (calltable[method+'_vaf'] > 0)].shape[0])
    calltable['chrom_pos'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_')
    print('germline in calls')
    print(len(set(calltable['chrom_pos'].values) & set(germlinetruths.index)))
    # NOTE(review): the original followed this with
    #   calltable.set_index('chrom_pos').drop(<known-negative positions>, inplace=True)
    # which dropped rows from a temporary copy only, so `calltable` was never changed.
    # The statement is left out to keep the curves identical; confirm whether calls at
    # known-negative positions are really meant to be excluded before reinstating it.
    calltable['chrom_pos_ref_alt'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_').str.cat(calltable['ref'].astype('str'), sep='_').str.cat(calltable['alt'].astype('str'), sep='_')
    calltable = calltable.set_index('chrom_pos_ref_alt')

    # independent copy: the score/NaN edits below must not leak into `calltable`
    aux = calltable[['truth', m+'_vaf', m+'_score']].copy()
    aux[m+'_score'] = aux[m+'_vaf']
    print(aux[m+'_vaf'].describe())
    # variants without a VAF get the lowest possible score
    aux = aux.fillna({m+'_score': 0, m+'_vaf': 0})
    precision, recall, thresholds = precision_recall_curve(aux['truth'], aux[m+'_vaf'])

    # precision divided by the size of on-target negative positions for each panel:
    # tp / (tp + fp / Nneg) at every threshold. Counts of scores >= threshold are read
    # off the sorted scores of each class instead of building one confusion matrix
    # per threshold.
    Nneg = len(germlinetruths_in_bed)
    print('Nneg')
    print(Nneg)
    truth_mask = aux['truth'].to_numpy(dtype=bool)
    vaf_values = aux[m+'_vaf'].to_numpy(dtype=float)
    positive_scores = np.sort(vaf_values[truth_mask])
    negative_scores = np.sort(vaf_values[~truth_mask])
    tp_counts = positive_scores.size - np.searchsorted(positive_scores, thresholds, side='left')
    fp_counts = negative_scores.size - np.searchsorted(negative_scores, thresholds, side='left')
    with np.errstate(divide='ignore', invalid='ignore'):
        precisionbis = list(tp_counts / (tp_counts + fp_counts / Nneg))

    # indices of the thresholds closest to VAF 0.1% and 2.5%; only that window is drawn
    idxa = int(np.argmin(np.abs(thresholds - 0.001)))
    idxb = int(np.argmin(np.abs(thresholds - 0.025)))

    plot_pr_curve(precision[idxa:idxb], recall[idxa:idxb], estimator_name=m, f1_score=None, figax=(fig, ax), kwargs={'color': config.colors[config.methods.index(m)]})
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")

In [None]:
# compare bed files
# For every panel, collect the labels ('chrom_pos_ref_alt') of the SEQC2 known
# positives that fall inside that panel's bed file.

groundtruths_bedfiles = {}

for method in ['ROC', 'BRP', 'IDT', 'ILM']: #, 'TFS']:

    print('######## '+ method + ' ########')

    # ROC uses the hg38 resources, all other panels the hg19 ones
    refgenome = 'hg38' if method == 'ROC' else 'hg19'
    strip_chr_prefix = method == 'BRP'

    # read ground truths vcf file 'known positives'
    groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_'+refgenome+'.vcf'))
    if strip_chr_prefix:
        groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
    groundtruths['chrom_pos_ref_alt'] = groundtruths['CHROM'].astype(str).str.cat(
        [groundtruths['POS'].astype('str'), groundtruths['REF'].astype('str'), groundtruths['ALT'].astype('str')],
        sep='_',
    )
    groundtruths.set_index('chrom_pos_ref_alt', inplace=True)

    # get bed file corresponding to each company
    bed_name = 'LBx_TFS_pre-defined_Hotspots_hg19.bed' if method == 'TFS' else 'LBx_'+method+'_'+refgenome+'.bed'
    bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', bed_name), sep='\t', header=None)
    bedfile = bedfile.iloc[:,:3]
    bedfile.columns = ['chrom', 'startpos', 'endpos']
    if strip_chr_prefix:
        bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')
    # interval length with both ends counted
    bedfile['diff'] = bedfile['endpos'] - bedfile['startpos'] + 1
    N = bedfile['diff'].sum()
    print('reportable region in bp')
    print(N)

    # keep ground truths inside current bed
    groundtruth_outside_bed = []
    for _, variant in tqdm(groundtruths.iterrows(), total=groundtruths.shape[0]):
        chrom, pos, ref, alt = variant['CHROM'], variant['POS'], variant['REF'], variant['ALT']
        same_chrom = bedfile[bedfile['chrom'] == str(chrom)]
        covered = any(
            start <= int(pos) and end >= int(pos)
            for start, end in zip(same_chrom['startpos'], same_chrom['endpos'])
        )
        if not covered:
            groundtruth_outside_bed.append('_'.join([chrom, str(pos), ref, alt]))
    print('groundtruth in bed')
    groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
    print(groundtruths_in_bed.shape[0])

    groundtruths_bedfiles[method] = list(groundtruths_in_bed.index)

In [None]:
# Panels available, number of known positives shared by BRP, IDT and ILM, and
# number of in-panel known positives per panel.
print(groundtruths_bedfiles.keys())
brp_variants = set(groundtruths_bedfiles['BRP'])
# the first three characters of the IDT / ILM labels are cut off before comparing with BRP
idt_variants = {label[3:] for label in groundtruths_bedfiles['IDT']}
ilm_variants = {label[3:] for label in groundtruths_bedfiles['ILM']}
print(len(brp_variants & idt_variants & ilm_variants))
print([len(labels) for labels in groundtruths_bedfiles.values()])

In [None]:
# Labels common to every entry of groundtruths_bedfiles except the first and the last
# one (dict insertion order; the [1:-1] slice removes both ends).
# NOTE(review): if the dict was filled in the order ROC, BRP, IDT, ILM this compares
# BRP with IDT only, and BRP labels have the 'chr' prefix removed while IDT labels may
# still carry it -- confirm the labels are comparable before trusting the result.
set.intersection(*map(set,list(groundtruths_bedfiles.values())[1:-1]))

In [None]:

# Precision-recall curve of each vendor's own calls on one SEQC2 sample per vendor.
#   positives : SEQC2 'known positives' located inside the vendor's panel bed file
#   score     : the VAF column '<vendor>_vaf' of the call table
# Only the operating points whose VAF threshold lies between the thresholds closest
# to 0.001 and 0.025 are drawn.
fig, ax = plt.subplots(figsize=(10, 10))

# taking one sample of each company
method_sampleid_dict = {
    'BRP': 'SampleEf_BRP2_ST26_25ng_LIB1-P',
    'IDT': 'SampleEfIS_IDT2_ST05_25ng_LIB1-P',
    'ILM': 'SampleEf_ILM2_ST29_25ng_LIB1-P',
    'ROC': 'SampleEf_ROC2_ST21_25ng_LIB1-P',
    'TFS': 'SampleEf_TFS2_ST24_25ng_LIB1-P',
}


for method in ['ROC', 'BRP', 'IDT', 'ILM']:

    print('######## '+ method + ' ########')
    # ROC resources are read from the hg38 files, every other panel from the hg19 files
    refgenome = 'hg38' if method == 'ROC' else 'hg19'

    # read ground truths vcf file 'known positives'
    groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_'+refgenome+'.vcf'))
    if method == 'BRP':
        # BRP is the only panel for which the 'chr' prefix is stripped
        groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
    groundtruths['chrom_pos_ref_alt'] = groundtruths['CHROM'].astype(str).str.cat(groundtruths['POS'].astype('str'), sep='_').str.cat(groundtruths['REF'].astype('str'), sep='_').str.cat(groundtruths['ALT'].astype('str'), sep='_')
    groundtruths = groundtruths.set_index('chrom_pos_ref_alt')

    # read germline truths bed file 'known negatives'
    germlinetruths = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownNegatives_'+refgenome+'.bed'), sep='\t', header=None)
    germlinetruths.columns = ['chrom', 'startpos', 'endpos']
    if method == 'BRP':
        germlinetruths['chrom'] = germlinetruths['chrom'].str.replace('chr', '')
    # 'chrom_pos' is kept both as index (for look-ups) and as column (for the bed intersection)
    germlinetruths['chrom_pos'] = germlinetruths['chrom'].astype(str).str.cat(germlinetruths['startpos'].astype('str'), sep='_')
    germlinetruths = germlinetruths.set_index('chrom_pos', drop=False)

    # get bed file corresponding to each company
    bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'LBx_'+method+'_'+refgenome+'.bed'), sep='\t', header=None)
    bedfile = bedfile.iloc[:,:3]
    bedfile.columns = ['chrom', 'startpos', 'endpos']
    if method == 'BRP':
        bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')
    # intervals are treated as closed on both ends throughout this cell
    bedfile['diff'] = bedfile['endpos'] - bedfile['startpos'] + 1
    N = bedfile['diff'].sum()
    print('reportable region in bp')
    print(N)

    # keep germline inside current bed: enumerate every panel position as 'chrom_pos'
    germlinetruths_all = list(germlinetruths['chrom_pos'].values)
    bed_all = set()
    for bed_chrom, bed_start, bed_end in tqdm(zip(bedfile['chrom'], bedfile['startpos'], bedfile['endpos']), total=bedfile.shape[0]):
        bed_all.update(bed_chrom + '_' + str(i) for i in range(int(bed_start), int(bed_end) + 1))
    print('germline all')
    print(len(germlinetruths_all))
    germlinetruths_in_bed = list(set(germlinetruths_all) & bed_all)
    print('germline in bed')
    print(len(germlinetruths_in_bed))

    # keep ground truths inside current bed
    bed_by_chrom = {bed_chrom: intervals for bed_chrom, intervals in bedfile.groupby('chrom')}
    groundtruth_outside_bed = []
    for label, chrom, pos in tqdm(zip(groundtruths.index, groundtruths['CHROM'], groundtruths['POS']), total=groundtruths.shape[0]):
        intervals = bed_by_chrom.get(str(chrom))
        pos = int(pos)
        if intervals is None or not ((intervals['startpos'] <= pos) & (intervals['endpos'] >= pos)).any():
            groundtruth_outside_bed.append(label)
    print('groundtruth in bed')
    groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
    print(groundtruths_in_bed.shape[0])

    # read company vcf file (snvs + indels)
    calltable_snv, calltable_indel, calltable_snp = get_calltable(os.path.join(*config.mixturefolderSEQC2, 'SEQC2s_chrall', 'SEQC2s_chrall_'+'_'.join(method_sampleid_dict[method].split('_')[1:]), method_sampleid_dict[method]), ['varnet', method], save=False, filter='PASS')
    print(calltable_snv.shape[0], calltable_indel.shape[0], calltable_snp.shape[0])
    calltable = pd.concat([calltable_snv, calltable_indel, calltable_snp])

    print('calltable shape')
    print(calltable.shape[0])
    # add one all-NaN row per known positive missing from the call table so that it
    # takes part in the curve with a score of 0
    calltable = calltable.reindex(list(set(calltable.index) | set(groundtruths_in_bed.index)))
    print(calltable.shape[0])
    # plain boolean column: True for known positives inside the panel, False otherwise
    calltable['truth'] = calltable.index.isin(groundtruths_in_bed.index)
    # explicit assignment instead of chained `fillna(inplace=True)`, which does not
    # write back to the frame when pandas Copy-on-Write is active
    calltable[method] = calltable[method].fillna(False)
    print(calltable[method].value_counts())
    print(calltable['truth'].value_counts())

    print("germline")
    print(germlinetruths.shape[0])
    print(calltable[(~calltable['truth']) & (calltable[method+'_vaf'] > 0)].shape[0])
    calltable['chrom_pos'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_')
    print('germline in calls')
    print(len(set(calltable['chrom_pos'].values) & set(germlinetruths.index)))
    # NOTE(review): the original followed this with
    #   calltable.set_index('chrom_pos').drop(<known-negative positions>, inplace=True)
    # which dropped rows from a temporary copy only, so `calltable` was never changed.
    # The statement is left out to keep the curves identical; confirm whether calls at
    # known-negative positions are really meant to be excluded before reinstating it.
    calltable['chrom_pos_ref_alt'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_').str.cat(calltable['ref'].astype('str'), sep='_').str.cat(calltable['alt'].astype('str'), sep='_')
    calltable = calltable.set_index('chrom_pos_ref_alt')

    # independent copy: the NaN replacement below must not leak into `calltable`
    aux = calltable[['truth', method+'_vaf', method+'_score']].copy()
    # variants without a VAF get the lowest possible score
    aux[method+'_vaf'] = aux[method+'_vaf'].fillna(0)
    precision, recall, thresholds = precision_recall_curve(aux['truth'], aux[method+'_vaf'])
    # indices of the thresholds closest to VAF 0.1% and 2.5%; only that window is drawn
    idxa = int(np.argmin(np.abs(thresholds - 0.001)))
    idxb = int(np.argmin(np.abs(thresholds - 0.025)))
    plot_pr_curve(precision[idxa:idxb], recall[idxa:idxb], estimator_name=method, f1_score=None, figax=(fig, ax), kwargs={'color': config.colors[config.methods.index(method)]})

plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")

In [None]:
# Select the ground-truth rows whose second ';'-separated INFO field is exactly
# 'TYPE=INDEL' and count how often each distinct row occurs.
groundtruths[groundtruths['INFO'].str.split(';').str[1] == 'TYPE=INDEL'].value_counts() 

In [None]:
# Rows of the `calltable` currently in the kernel whose 'truth' flag is set.
calltable[calltable['truth']]

In [None]:
# Rows of the `calltable` currently in the kernel whose 'truth' flag is set.
calltable[calltable['truth']]

In [None]:
# Call-table rows whose index label is also an index label of the in-panel ground truths.
calltable.loc[list(set(calltable.index) & set(groundtruths_in_bed.index))]

# Bed file study

In [None]:
# Load the four panel bed files (headerless, tab separated) and name their columns.
bedfiles_dir = os.path.join(*config.mixturefolderSEQC2, 'bedfiles')

# ROC: three columns, hg38 file
bedfile_ROC = pd.read_csv(os.path.join(bedfiles_dir, 'LBx_ROC_hg38.bed'), sep='\t', header=None).set_axis(
    ['chrom', 'startpos', 'endpos'], axis=1)

# BRP: fourth column holds the gene
bedfile_BRP = pd.read_csv(os.path.join(bedfiles_dir, 'LBx_BRP_hg19.bed'), sep='\t', header=None).set_axis(
    ['chrom', 'startpos', 'endpos', 'gene'], axis=1)

# ILM: three columns
bedfile_ILM = pd.read_csv(os.path.join(bedfiles_dir, 'LBx_ILM_hg19.bed'), sep='\t', header=None).set_axis(
    ['chrom', 'startpos', 'endpos'], axis=1)

# IDT: three extra columns given the placeholder names A, B, C
bedfile_IDT = pd.read_csv(os.path.join(bedfiles_dir, 'LBx_IDT_hg19.bed'), sep='\t', header=None).set_axis(
    ['chrom', 'startpos', 'endpos', 'A', 'B', 'C'], axis=1)



In [None]:
# ROC panel only: load the SNV / indel / SNP call-table series, label the SNV table
# with the SEQC2 known positives that fall inside the ROC hg38 bed file, use the ROC
# VAF as ROC score and draw the PR curves of every caller.
#for mixtureid in mixtureids:
mixtureid = 'ROC2_ST21_25ng_LIB1-PH'
seriesorder = ['SampleDf', 'SampleEf']
print('############# {} ############'.format(mixtureid))

# arguments shared by the three get_calltableseries calls
series_kwargs = dict(chrom='all', filterparam=filterparam, reload=reload, save=save, diltype='SEQC2', concat='tf')
calltable_snv, aux = get_calltableseries(config, mixtureid, muttype='snv', **series_kwargs)
calltable_indel, aux = get_calltableseries(config, mixtureid, muttype='indel', **series_kwargs)
calltable_snp, aux = get_calltableseries(config, mixtureid, muttype='snp', **series_kwargs)
print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
print(aux)

# each table goes through a one-element pd.concat, so the dict holds new DataFrame
# objects rather than the frames returned above
calltables = {
    'sampleid': mixtureid,
    'tf': [],
    'cov': [],
    'snv': pd.concat([calltable_snv]),
    'indel': pd.concat([calltable_indel]),
    'snp': pd.concat([calltable_snp]),
}

# dilution series description returned by the last get_calltableseries call
dilutionseries = aux
muttype = 'snv'
refsample = 'SEQC2'

# ground truth
groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_hg38.vcf'))
groundtruths['chrom_pos_ref_alt'] = groundtruths['CHROM'].astype(str).str.cat(
    [groundtruths['POS'].astype('str'), groundtruths['REF'].astype('str'), groundtruths['ALT'].astype('str')],
    sep='_',
)
groundtruths.set_index('chrom_pos_ref_alt', inplace=True)

bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'LBx_ROC_hg38.bed'), sep='\t', header=None)
bedfile.columns = ['chrom', 'startpos', 'endpos']

# known positives not covered by any bed interval (both interval ends included)
groundtruth_outside_bed = []
for _, variant in tqdm(groundtruths.iterrows(), total=groundtruths.shape[0]):
    chrom, pos, ref, alt = variant['CHROM'], variant['POS'], variant['REF'], variant['ALT']
    same_chrom = bedfile[bedfile['chrom'] == str(chrom)]
    covered = any(
        start <= int(pos) and end >= int(pos)
        for start, end in zip(same_chrom['startpos'], same_chrom['endpos'])
    )
    if not covered:
        groundtruth_outside_bed.append('_'.join([chrom, str(pos), ref, alt]))
print(len(groundtruth_outside_bed))
groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
print(groundtruths_in_bed.shape[0])

# label the SNV series: True for in-panel known positives, False for everything else
calltablesseries = calltables[muttype]
calltablesseries['truth'] = False
is_known_positive = calltablesseries.index.isin(list(groundtruths_in_bed.index))
calltablesseries.loc[is_known_positive, 'truth'] = True
print(calltablesseries['truth'].value_counts())
# the ROC caller is scored by its VAF in both samples
for sample in ('SampleDf', 'SampleEf'):
    calltablesseries[sample+'_ROC_score'] = calltablesseries[sample+'_ROC_vaf']

# metrics
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'ROC']
figure_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, xy='pr', ground_truth_method='SEQC2',
                                 refsample='SEQC2', muttype=muttype, methods=methods, splitby='dilution')

In [None]:
# For every panel: load the SNV / indel / SNP call-table series, re-index the SNV
# table on the positions of the panel's evaluation bed and attach that bed's 'truth'
# column. The labelled SNV tables are collected in calltablesseries_df[panel].
method_sampleid_dict = {
    'BRP': 'BRP2_ST26_25ng_LIB1-P',
    'IDT': 'IDT2_ST05_25ng_LIB1-P',
    'ILM': 'ILM2_ST29_25ng_LIB1-P',
    'ROC': 'ROC2_ST21_25ng_LIB1-PH',
    'TFS': 'TFS2_ST24_25ng_LIB1-P',
}
seriesorder = ['SampleDf', 'SampleEf']

calltablesseries_df = {}

for method in ['ROC', 'BRP', 'IDT', 'ILM']:
    print('######## '+ method + ' ########')
    # ROC uses the hg38 resources, all other panels the hg19 ones
    refgenome = 'hg38' if method == 'ROC' else 'hg19'
    mixtureid = method_sampleid_dict[method]
    print('############# {} ############'.format(mixtureid))

    # arguments shared by the three get_calltableseries calls
    series_kwargs = dict(chrom='all', filterparam=filterparam, reload=reload, save=save,
                         diltype='SEQC2', concat='tf', bcbiovaf=0.01)
    calltable_snv, aux = get_calltableseries(config, mixtureid, muttype='snv', **series_kwargs)
    calltable_indel, aux = get_calltableseries(config, mixtureid, muttype='indel', **series_kwargs)
    calltable_snp, aux = get_calltableseries(config, mixtureid, muttype='snp', **series_kwargs)
    print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
    print(aux)

    # each table goes through a one-element pd.concat, so the dict holds new
    # DataFrame objects rather than the frames returned above
    calltables = {
        'sampleid': mixtureid,
        'tf': [],
        'cov': [],
        'snv': pd.concat([calltable_snv]),
        'indel': pd.concat([calltable_indel]),
        'snp': pd.concat([calltable_snp]),
    }

    # dilution series description returned by the last get_calltableseries call
    dilutionseries = aux
    muttype = 'snv'
    refsample = 'SEQC2'

    # evaluation positions and their truth label for this reference genome
    evaluation_bed_df = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'evaluationbed_'+refgenome+'.csv'), index_col=0)
    if refgenome == 'hg38':
        # hg38 labels get a 'chr' prefix before being matched against the call table
        evaluation_bed_df.index = ['chr' + label for label in evaluation_bed_df.index]

    # index the SNV series by 'chrom_pos' (in place, so calltables['snv'] is indexed too)
    snv_series = calltables[muttype]
    snv_series['chrom_pos'] = snv_series['chrom'].astype(str).str.cat(snv_series['pos'].astype(str), sep='_')
    snv_series.set_index('chrom_pos', inplace=True)
    # keep the first row per position (### TODO in the original), then align on the evaluation bed
    first_per_position = snv_series[~snv_series.index.duplicated()]
    calltablesseries = first_per_position.reindex(evaluation_bed_df.index)
    calltablesseries['truth'] = evaluation_bed_df['truth']
    print(calltablesseries.shape[0])
    print(calltablesseries.head())
    print(calltablesseries['truth'].value_counts())

    # metrics
    methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', method]
    calltablesseries_df[method] = calltablesseries

In [None]:
import matplotlib.transforms as transforms

# One grouped bar chart per (metric, panel): number of evaluation positions called by
# each method in SampleDf (lighter bars) and SampleEf (solid bars), restricted to
# positions with truth == True for 'TP' and truth == False for 'FP'. The red line
# marks the number of positions in that subset.
for met in ['TP', 'FP']:
    metbool = met != 'FP'
    for m, calltablesseries in calltablesseries_df.items():
        print(m)
        # IDT sample columns are named 'SampleDflS_*' / 'SampleEflS_*'
        suffix = 'lS' if m == 'IDT' else ''
        print(suffix)
        subset = calltablesseries[calltablesseries['truth'] == metbool]
        ngt = subset.shape[0]
        print(ngt)
        # `caller` (not `m`) as comprehension variable so the panel name is never shadowed
        A = subset[['SampleDf'+suffix+'_'+caller for caller in config.methods]]
        calls_df = A.sum()
        print(calls_df)
        B = subset[['SampleEf'+suffix+'_'+caller for caller in config.methods]]
        calls_ef = B.sum()
        print(calls_ef)

        positions = np.arange(len(config.methods))
        bar_colors = [config.colors[config.methods.index(caller)] for caller in config.methods]
        fig, ax = plt.subplots(figsize=(10,5))
        ax.bar(x=positions-0.2, height=calls_df.values, width=0.4, color=bar_colors, label=calls_df.index[0].split('_')[0], alpha=0.6)
        ax.bar(x=positions+0.2, height=calls_ef.values, width=0.4, color=bar_colors, label=calls_ef.index[0].split('_')[0], alpha=1)
        ax.set_xticks(positions)
        ax.set_xticklabels([column.split('_')[1] for column in calls_df.index], rotation=90)
        ax.axhline(y=ngt, c='red', lw=3)
        # x in tick-label coordinates, y in data coordinates: writes the subset size
        # next to the y axis at the height of the red line
        trans = transforms.blended_transform_factory(ax.get_yticklabels()[0].get_transform(), ax.transData)
        ax.text(0, ngt, "{:.0f}".format(ngt), color="red", transform=trans, ha="right", va="center")
        ax.legend(bbox_to_anchor=(1,1))
        # name the panel in the title so the figures can be told apart
        ax.set_title('{} - {}'.format(met, m))
        plt.show()

In [None]:
import matplotlib.transforms as transforms

# methodcalls_dict[caller][sample] is a list with one entry per panel; each entry is
# the list of truth == True positions that `caller` called in that sample. Positions
# of the ROC panel are passed through liftover with the hg38ToHg19 chain file.
methodcalls_dict = {me: {'SampleDf': [], 'SampleEf': []} for me in config.methods}

met = 'TP'
#for met in ['TP', 'FP']:
metbool = met != 'FP'
chain_file = os.path.join('data', 'extdata', 'hg38ToHg19.over.chain.gz')
for m, calltablesseries in calltablesseries_df.items():
    print(m)
    # IDT sample columns are named 'SampleDflS_*' / 'SampleEflS_*'
    suffix = 'lS' if m == 'IDT' else ''
    selected = calltablesseries[calltablesseries['truth'] == metbool]
    ngt = selected.shape[0]
    print(ngt)
    A = selected[['SampleDf'+suffix+'_'+caller for caller in config.methods]]
    B = selected[['SampleEf'+suffix+'_'+caller for caller in config.methods]]
    for me in config.methods:
        for sample, calls in (('SampleDf', A), ('SampleEf', B)):
            column = sample+suffix+'_'+me
            called_positions = list(calls[calls[column] == True].index)
            if m == 'ROC':
                lifted_positions = []
                for eb in called_positions:
                    print(eb)
                    lifted_positions.append(liftover(eb, chain_file))
                called_positions = lifted_positions
            methodcalls_dict[me][sample].append(called_positions)

methodcalls_dict

In [None]:
def _recurrence_histogram(call_lists):
    """Return, for each occurrence count, how many labels occur that many times.

    ``call_lists`` is a list of label lists. The lists are flattened, the occurrences of
    every label are counted, and those counts are counted again.
    """
    flattened = pd.Series([item for sublist in call_lists for item in sublist])
    return flattened.value_counts().value_counts()


res = {}
res2 = {}
for m in config.methods:
    res[m] = _recurrence_histogram(methodcalls_dict[m]['SampleDf'])
    res2[m] = _recurrence_histogram(methodcalls_dict[m]['SampleEf'])
print(res)

# one column per method, one row per occurrence count; absent counts become 0 and the
# row order is reversed
res = pd.DataFrame(res).fillna(0).astype(int)
res = res.reindex(index=res.index[::-1])
res2 = pd.DataFrame(res2).fillna(0).astype(int)
res2 = res2.reindex(index=res2.index[::-1])

# stack both samples (0 = SampleDf, 1 = SampleEf) and keep the occurrence count as a column
res['sample'] = 0
res2['sample'] = 1
res = pd.concat([res, res2])
res['count'] = res.index
res = res.reset_index(drop=True)
print(res.columns)

# long format: one row per (sample, count, method)
res3 = res.set_index(['sample', 'count']).stack().reset_index(level=2, drop=False).reset_index()
res3.columns = ['sample', 'count', 'method', 'value']
res3 = res3.sort_values(['sample', 'count', 'method'], ascending=[True, False, True])
res3

In [None]:
# NOTE(review): in the visible part of the notebook `df_pivot` is first assigned in the
# following cell, so on a fresh kernel this cell presumably raises NameError -- confirm,
# then move it below that cell or drop it.
df_pivot

In [None]:
#sns.barplot(x='method', y='value', hue='sample', data=res3)

# One stacked bar chart per sample code found in res3['sample'] (0 was built from the
# 'SampleDf' call lists, 1 from the 'SampleEf' ones). Bars are indexed by method (in
# config.methods order) and stacked by the 'count' column, with the columns in reversed order.
# Each figure is titled with its sample name so the two charts can be told apart.
for sample_code, sample_name in [(0, 'SampleDf'), (1, 'SampleEf')]:
    df_pivot = pd.pivot_table(res3[res3['sample'] == sample_code], index='method', columns='count', values='value')
    df_pivot = df_pivot.reindex(config.methods)
    df_pivot = df_pivot[df_pivot.columns[::-1]]
    pivot_ax = df_pivot.plot.bar(stacked=True)
    pivot_ax.set_title(sample_name)
    pivot_ax.set_xlabel('method')
    pivot_ax.set_ylabel('number of calls')
plt.show()

#res3.set_index('method').plot(kind='bar', hue='sample', stacked=True)

In [None]:
# FP default threshold
# Column sums of the per-caller call columns over the truth == False rows, for SampleDf (A)
# and SampleEf (B).
# NOTE(review): `method` and `calltablesseries` are whatever earlier cells left behind.
suffix = 'lS' if method == 'IDT' else ''
print(suffix)
false_rows = calltablesseries[calltablesseries['truth'] == False]
print(false_rows.shape[0])
A = false_rows[['SampleDf'+suffix+'_'+caller for caller in config.methods]]
print(A.sum())
B = false_rows[['SampleEf'+suffix+'_'+caller for caller in config.methods]]
print(B.sum())

In [None]:
# FP counts after binarising the *_score columns at 0.8.
# Disabled: the code below is wrapped in a string literal, so it is not executed.
"""
print(calltablesseries[calltablesseries['truth'] == False].shape[0])
A = calltablesseries[calltablesseries['truth'] == False][['SampleDf'+suffix+'_'+m+'_score'  for m in config.methods]]
A[A < 0.8] = 0
A[A >= 0.8] = 1
print(A.sum())
B = calltablesseries[calltablesseries['truth'] == False][['SampleEf'+suffix+'_'+m+'_score'  for m in config.methods]]
B[B < 0.8] = 0
B[B >= 0.8] = 1
print(B.sum())
"""

In [None]:
import matplotlib.transforms as transforms

# FP bar chart for every entry of calltablesseries_df: per-caller column sums of the call
# columns over the truth == False rows, SampleDf (alpha 0.6) next to SampleEf (alpha 1).
# The red line and its label give the number of truth == False rows.
for m, calltablesseries in calltablesseries_df.items():
    print(m)
    # the 'IDT' entry uses sample column names carrying the 'lS' suffix
    suffix = 'lS' if m == 'IDT' else ''
    print(suffix)
    negatives = calltablesseries[calltablesseries['truth'] == False]
    ngt = negatives.shape[0]
    print(ngt)
    A = negatives[['SampleDf'+suffix+'_'+caller for caller in config.methods]]
    sums_df = A.sum()
    print(sums_df)
    B = negatives[['SampleEf'+suffix+'_'+caller for caller in config.methods]]
    sums_ef = B.sum()
    print(sums_ef)

    positions = np.arange(len(config.methods))
    palette = [config.colors[config.methods.index(caller)] for caller in config.methods]
    fig, ax = plt.subplots(figsize=(10,5))
    for offset, sums, opacity in ((-0.2, sums_df, 0.6), (0.2, sums_ef, 1)):
        # legend label is the sample prefix of the first column name
        plt.bar(x=positions+offset, height=sums.values, width=0.4, color=palette,
                label=sums.index[0].split('_')[0], alpha=opacity)
    plt.xticks(positions, [col.split('_')[1] for col in sums_df.index])
    plt.axhline(y=ngt, c='red', lw=3)
    # x follows the y-tick-label transform, y is in data units, so the text sits at y = ngt
    trans = transforms.blended_transform_factory(ax.get_yticklabels()[0].get_transform(), ax.transData)
    ax.text(0, ngt, "{:.0f}".format(ngt), color="red", transform=trans, ha="right", va="center")
    plt.xticks(rotation=90)
    plt.legend(bbox_to_anchor=(1,1))
    plt.title('FP')
    plt.show()

In [None]:
# TP default threshold
# Column sums of the per-caller call columns over the truth == True rows, for SampleDf (A)
# and SampleEf (B).
# NOTE(review): `suffix` and `calltablesseries` are whatever earlier cells left behind.
true_rows = calltablesseries[calltablesseries['truth'] == True]
print(true_rows.shape[0])
A = true_rows[['SampleDf'+suffix+'_'+caller for caller in config.methods]]
print(A.sum())
B = true_rows[['SampleEf'+suffix+'_'+caller for caller in config.methods]]
print(B.sum())

In [None]:
# TP counts after binarising the *_score columns at `threshold` (0.8).
# Disabled: the code below is wrapped in a string literal, so it is not executed.
"""
threshold = 0.8
print(calltablesseries[calltablesseries['truth'] == True].shape[0])
A = calltablesseries[calltablesseries['truth'] == True][['SampleDf'+suffix+'_'+m+'_score'  for m in config.methods]]
A[A < threshold] = 0
A[A >= threshold] = 1
print(A.sum())
B = calltablesseries[calltablesseries['truth'] == True][['SampleEf'+suffix+'_'+m+'_score'  for m in config.methods]]
B[B < threshold] = 0
B[B >= threshold] = 1
print(B.sum())
"""

In [None]:
# imported in this cell so it does not rely on an earlier plotting cell having been run
import matplotlib.transforms as transforms

# TP bar chart for the current `calltablesseries`: per-caller column sums held in A (drawn at
# alpha 0.6) and B (alpha 1), which must already contain the truth == True call columns.
# The red line and its label give the number of truth == True rows.
fig, ax = plt.subplots(figsize=(10,5))
ngt = calltablesseries[calltablesseries['truth'] == True].shape[0]
bar_centers = np.arange(len(config.methods))
bar_colors = [config.colors[config.methods.index(caller)] for caller in config.methods]
plt.bar(x=bar_centers-0.2, height=A.sum().values, width=0.4, color=bar_colors, label=A.sum().index[0].split('_')[0], alpha=0.6)
plt.bar(x=bar_centers+0.2, height=B.sum().values, width=0.4, color=bar_colors, label=B.sum().index[0].split('_')[0], alpha=1)
plt.xticks(bar_centers, [col.split('_')[1] for col in A.sum().index], rotation=90)
plt.legend(bbox_to_anchor=(1,1))
# single reference line at the number of truth == True rows
plt.axhline(y=ngt, c='red', lw=3)
# x follows the y-tick-label transform, y is in data units, so the text sits at y = ngt
trans = transforms.blended_transform_factory(ax.get_yticklabels()[0].get_transform(), ax.transData)
ax.text(0, ngt, "{:.0f}".format(ngt), color="red", transform=trans, ha="right", va="center")
plt.title('TP')

In [None]:
# Inspect the first 50 rows of the current call table.
calltablesseries.head(50)

In [None]:
# FP default threshold
# SampleDf score columns of every caller for the truth == False rows. The column names include
# `suffix` (as in the neighbouring cells) so the lookup also works for the 'lS'-suffixed series.
calltablesseries[calltablesseries['truth'] == False][['SampleDf'+suffix+'_'+m+'_score' for m in config.methods]]

In [None]:
# Score-versus-VAF scatter, one figure per caller: SampleDf as circles and SampleEf as
# squares, coloured by the 'truth' column.
# `suffix` depends only on `method`, so it is computed once outside the loop.
suffix = 'lS' if method == 'IDT' else ''
print(suffix)
for m in config.methods:
    plt.figure()
    for sample, marker in (('SampleDf', 'o'), ('SampleEf', 's')):
        vaf_col = sample+suffix+'_'+m+'_vaf'
        score_col = sample+suffix+'_'+m+'_score'
        # only plot a sample when its VAF column is not entirely missing
        if calltablesseries[vaf_col].isna().sum() < calltablesseries.shape[0]:
            sns.scatterplot(x=vaf_col, y=score_col, hue='truth', marker=marker, data=calltablesseries)
            #plt.xlim([-.01, .21])
    # axis limits, title and display apply to every figure, whichever samples were drawn
    plt.ylim([-.01, 1.02])
    plt.title(m)
    plt.show()

In [None]:
# Display the evaluation table.
# NOTE(review): in the visible part of the notebook `evaluation_bed_df` is only assigned in
# later cells -- confirm this cell runs on a fresh kernel.
evaluation_bed_df

In [None]:
# Show every row whose index label occurs more than once, limited to the variant columns and
# the SampleDf call column of each caller.
# NOTE(review): the column names are built without `suffix`; presumably this fails for the
# 'lS'-suffixed series -- confirm.
#calltablesseries.set_index('chrom_pos', inplace=True)
calltablesseries.loc[calltablesseries[calltablesseries.index.duplicated()].index][['chrom', 'pos', 'ref', 'alt'] + ['SampleDf_'+m for m in config.methods]]

In [None]:
# Rows of the call table whose 'chrom_pos' value is one of the index labels of evaluation_bed_df.
calltablesseries[calltablesseries['chrom_pos'].isin(list(evaluation_bed_df.index))]

In [None]:
# Number of rows of the call table with a missing 'pos' value.
calltablesseries['pos'].isna().sum()

In [None]:
# Inspect the last 50 rows of the current call table.
calltablesseries.tail(50)

In [None]:
# Display the current call table.
calltablesseries

In [None]:
# Second row of `dilutionseries` (position 1), rebuilt as a DataFrame and transposed.
# Presumably this yields a one-row frame, assuming `dilutionseries` is a DataFrame -- confirm.
pd.DataFrame(dilutionseries.iloc[1]).T

In [None]:
# Precision-recall figure (xy='pr') for the current call table, passing only the second row of
# `dilutionseries` (position 1) instead of the whole table.
figure_curve_allchr(config, calltablesseries, pd.DataFrame(dilutionseries.iloc[1]).T, mixtureid, xy='pr', ground_truth_method='SEQC2',
                                 refsample='SEQC2', muttype=muttype, methods=methods)

In [None]:
# Keep the call-table rows whose position is listed in evaluation_bed_hg38 and work out which
# listed positions have no row at all.
# 'chrom_pos' is the chromosome with its first three characters removed, joined to the
# position with '_'.
calltablesseries['chrom_pos'] = calltablesseries['chrom'].str[3:].str.cat(calltablesseries['pos'].astype(str), sep='_')
print(calltablesseries.shape[0])
A = calltablesseries[calltablesseries['chrom_pos'].isin(evaluation_bed_hg38)]
# Compare against the chrom_pos values themselves. Using the boolean isin() mask here would
# subtract {True, False} and report every evaluation position as missed.
missed = set(evaluation_bed_hg38) - set(A['chrom_pos'])
print(len(missed))
# sorted so the derived labels come out in a reproducible order
missed_index = ['chr'+i for i in sorted(missed)]
#calltablesseries = calltablesseries.reindex(set(set(calltablesseries.index) | set(missed)))
print(A.shape[0])
#evaluation_bed

In [None]:
# Add one empty (all-NaN) row for each label of missed_index that A does not have yet.
# The target is an ordered list: A's existing rows keep their order and the new labels follow,
# whereas reindexing by a set would return the rows in arbitrary order.
existing_labels = set(A.index)
A = A.reindex(list(A.index) + [label for label in missed_index if label not in existing_labels])
A.head(5)

In [None]:
# Rebuild the position columns from the index labels: the first '_'-separated field becomes
# 'chrom', the second becomes 'pos' (as a string), and 'chrom_pos' joins the chromosome
# without its first three characters to the position.
index_fields = A.index.str.split('_')
A['chrom'] = index_fields.str[0]
A['pos'] = index_fields.str[1]
A['chrom_pos'] = A['chrom'].str[3:] + '_' + A['pos']
A

In [None]:
# For every chrom_pos that has at least one row with a missing 'truth', look up the first value
# of its row in evaluation_bed_hg38_df and write it to all rows of A sharing that chrom_pos.
truth_missing = A['truth'].isna()
for site in A.loc[truth_missing, 'chrom_pos'].unique():
    #print(evaluation_bed_hg38_df.loc[site].values[0])
    same_site = A['chrom_pos'] == site
    A.loc[same_site, 'truth'] = evaluation_bed_hg38_df.loc[site].values[0]
A

In [None]:
# Compute each metric curve on the completed table A, then draw the precision-recall figure.
# All calls share the same ground-truth, reference-sample, mutation-type and method settings.
shared_kwargs = dict(ground_truth_method='SEQC2', refsample='SEQC2', muttype=muttype, methods=methods)
results_auprc_df = metric_curve_allchr(config, A, dilutionseries, mixtureid, metric='auprc', **shared_kwargs)
results_precision_df = metric_curve_allchr(config, A, dilutionseries, mixtureid, metric='precision', **shared_kwargs)
results_recall_df = metric_curve_allchr(config, A, dilutionseries, mixtureid, metric='recall', **shared_kwargs)
results_maxf1precision_df = metric_curve_allchr(config, A, dilutionseries, mixtureid, metric='maxf1precision', **shared_kwargs)
results_maxf1recall_df = metric_curve_allchr(config, A, dilutionseries, mixtureid, metric='maxf1recall', **shared_kwargs)
figure_curve_allchr(config, A, dilutionseries, mixtureid, xy='pr', **shared_kwargs)

# Part I: (1) Load/Generate call tables, (2) Generate Ground truths and (3) Compute/Save metrics per patient

In [None]:
# Single-series run for 'ROC2_ST21_25ng_LIB1-PH': load the call tables, mark the known
# positives that fall inside the LBx_ROC_hg38 regions as truth, then compute the metric
# curves and the precision-recall figure.
#for mixtureid in mixtureids:
mixtureid = 'ROC2_ST21_25ng_LIB1-PH'
seriesorder = ['SampleDf', 'SampleEf']
print('############# {} ############'.format(mixtureid))

# (1) call tables, one per mutation type; every call also returns `aux`
load_kwargs = dict(chrom='all', filterparam=filterparam, reload=reload, save=save, diltype='SEQC2', concat='tf')
calltable_snv, aux = get_calltableseries(config, mixtureid, muttype='snv', **load_kwargs)
calltable_indel, aux = get_calltableseries(config, mixtureid, muttype='indel', **load_kwargs)
calltable_snp, aux = get_calltableseries(config, mixtureid, muttype='snp', **load_kwargs)
print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
print(aux)
calltables = {
    'sampleid': mixtureid,
    'tf': [],
    'cov': [],
    'snv': pd.concat([calltable_snv]),
    'indel': pd.concat([calltable_indel]),
    'snp': pd.concat([calltable_snp]),
}
# the `aux` returned by the last loader call is used as the dilution series table
dilutionseries = aux
muttype = 'snv'
refsample = 'SEQC2'

# (2) ground truth, indexed by CHROM_POS_REF_ALT (the chromosome names are used as read)
groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_hg38.vcf'))
groundtruths['chrom_pos_ref_alt'] = groundtruths['CHROM'].astype(str).str.cat(
    [groundtruths['POS'].astype('str'), groundtruths['REF'].astype('str'), groundtruths['ALT'].astype('str')], sep='_')
groundtruths = groundtruths.set_index('chrom_pos_ref_alt')

bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'LBx_ROC_hg38.bed'), sep='\t', header=None)
bedfile.columns = ['chrom', 'startpos', 'endpos']

# collect the ground-truth variants that no interval of the same chromosome covers
groundtruth_outside_bed = []
for idx, row in tqdm(groundtruths.iterrows(), total=groundtruths.shape[0]):
    chrom, pos, ref, alt = row['CHROM'], row['POS'], row['REF'], row['ALT']
    aux = bedfile[bedfile['chrom'] == str(chrom)]
    # covered when startpos <= pos <= endpos for some interval (both ends inclusive)
    count = any(aux['startpos'][ind] <= int(pos) <= aux['endpos'][ind] for ind in aux.index)
    if not count:
        groundtruth_outside_bed.append(chrom + '_'+ str(pos) + '_'+ ref + '_'+ alt)
print(len(groundtruth_outside_bed))
groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
print(groundtruths_in_bed.shape[0])

# (3) label the calls: truth is True where the index label is a covered ground-truth variant
calltablesseries = calltables[muttype]
calltablesseries['truth'] = calltablesseries.index.isin(list(groundtruths_in_bed.index))
print(calltablesseries['truth'].value_counts())
# the ROC score columns are copies of the ROC VAF columns
for sample in seriesorder:
    calltablesseries[sample+'_ROC_score'] = calltablesseries[sample+'_ROC_vaf']

# (4) metrics and precision-recall figure
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'ROC']
metric_kwargs = dict(ground_truth_method='SEQC2', refsample='SEQC2', muttype=muttype, methods=methods)
results_auprc_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='auprc', **metric_kwargs)
results_precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='precision', **metric_kwargs)
results_recall_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='recall', **metric_kwargs)
results_maxf1precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxf1precision', **metric_kwargs)
results_maxf1recall_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxf1recall', **metric_kwargs)
figure_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, xy='pr', **metric_kwargs)
                #    ground_truth_method=gtm,
                #    refsample=refsample, muttype=muttype.upper(), methods=None, fixedvar=fixedvar, save=save)

In [None]:
# Display the current call table, including the 'truth' and ROC score columns when the cell
# that adds them has been run.
calltablesseries

In [None]:
# Single-series run for 'BRP2_ST26_25ng_LIB1-P': load the call tables, mark the known
# positives that fall inside the LBx_BRP_hg19 regions as truth, then compute the metric
# curves and the precision-recall figure.
#for mixtureid in mixtureids:
mixtureid = 'BRP2_ST26_25ng_LIB1-P'
seriesorder = ['SampleDf', 'SampleEf']
print('############# {} ############'.format(mixtureid))

# (1) call tables, one per mutation type; every call also returns `aux`
loader_args = dict(chrom='all', filterparam=filterparam, reload=reload, save=save, diltype='SEQC2', concat='tf')
calltable_snv, aux = get_calltableseries(config, mixtureid, muttype='snv', **loader_args)
calltable_indel, aux = get_calltableseries(config, mixtureid, muttype='indel', **loader_args)
calltable_snp, aux = get_calltableseries(config, mixtureid, muttype='snp', **loader_args)
print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
print(aux)
calltables = {
    'sampleid': mixtureid,
    'tf': [],
    'cov': [],
    'snv': pd.concat([calltable_snv]),
    'indel': pd.concat([calltable_indel]),
    'snp': pd.concat([calltable_snp]),
}
# the `aux` returned by the last loader call is used as the dilution series table
dilutionseries = aux
muttype = 'snv'
refsample = 'SEQC2'

# (2) ground truth with 'chr' removed from CHROM, indexed by CHROM_POS_REF_ALT
groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_hg19.vcf'))
groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
groundtruths['chrom_pos_ref_alt'] = groundtruths['CHROM'].astype(str).str.cat(
    [groundtruths['POS'].astype('str'), groundtruths['REF'].astype('str'), groundtruths['ALT'].astype('str')], sep='_')
groundtruths = groundtruths.set_index('chrom_pos_ref_alt')

# target regions, with 'chr' removed from the chromosome names as well
bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'LBx_BRP_hg19.bed'), sep='\t', header=None)
bedfile.columns = ['chrom', 'startpos', 'endpos', 'gene']
bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')

# collect the ground-truth variants that no interval of the same chromosome covers
groundtruth_outside_bed = []
for idx, row in tqdm(groundtruths.iterrows(), total=groundtruths.shape[0]):
    chrom, pos, ref, alt = row['CHROM'], row['POS'], row['REF'], row['ALT']
    aux = bedfile[bedfile['chrom'] == str(chrom)]
    # covered when startpos <= pos <= endpos for some interval (both ends inclusive)
    count = any(aux['startpos'][ind] <= int(pos) <= aux['endpos'][ind] for ind in aux.index)
    if not count:
        groundtruth_outside_bed.append(chrom + '_'+ str(pos) + '_'+ ref + '_'+ alt)
print(len(groundtruth_outside_bed))
groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
print(groundtruths_in_bed.shape[0])

# (3) label the calls: truth is True where the index label is a covered ground-truth variant
calltablesseries = calltables[muttype]
calltablesseries['truth'] = calltablesseries.index.isin(list(groundtruths_in_bed.index))
print(calltablesseries['truth'].value_counts())
# the BRP score columns are copies of the BRP VAF columns
for sample in seriesorder:
    calltablesseries[sample+'_BRP_score'] = calltablesseries[sample+'_BRP_vaf']

# (4) metrics and precision-recall figure
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'BRP']
curve_args = dict(ground_truth_method='SEQC2', refsample='SEQC2', muttype=muttype, methods=methods)
results_auprc_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='auprc', **curve_args)
results_precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='precision', **curve_args)
results_recall_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='recall', **curve_args)
results_maxf1precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxf1precision', **curve_args)
results_maxf1recall_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxf1recall', **curve_args)
figure_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, xy='pr', **curve_args)
                #    ground_truth_method=gtm,
                #    refsample=refsample, muttype=muttype.upper(), methods=None, fixedvar=fixedvar, save=save)

In [None]:
# Display the list of mixture series ids.
mixtureids

In [None]:
# Display the evaluation table.
# NOTE(review): in the visible part of the notebook `evaluation_bed_df` is only assigned in
# later cells -- confirm this cell runs on a fresh kernel.
evaluation_bed_df

In [None]:
# Display the current call table, including the 'truth' and BRP score columns when the cell
# that adds them has been run.
calltablesseries

In [None]:
#fig, ax = plt.subplots(figsize=(10, 10))
# Single-series precision-recall run: the call table is re-indexed onto the rows of the
# precomputed evaluation table (evaluationbed_hg38.csv / evaluationbed_hg19.csv), so every
# row of that table is scored, including rows that no caller reported.
res = {}
mixtureid = 'ROC2_ST21_25ng_LIB1-PH'
#for mixtureid in mixtureids:
#mixtureid = 'BRP2_ST26_25ng_LIB1-P'
method = mixtureid.split('_')[0][:-1]  # first '_' field without its last character, e.g. 'ROC2' -> 'ROC'
suffix = 'lS' if method == 'IDT' else ''
seriesorder = ['SampleDf'+suffix, 'SampleEf'+suffix]
print('############# {} ############'.format(mixtureid))

# (1) call tables, one per mutation type; every call also returns `aux`
calltables = {'sampleid':[], 'tf':[], 'cov':[], 'snv':[], 'indel':[], 'snp':[]}
load_kwargs = dict(chrom='all', filterparam=filterparam, reload=reload, save=save, diltype='SEQC2', concat='tf', bcbiovaf=0.01, gatkcorr=False)
calltable_snv, aux = get_calltableseries(config, mixtureid, muttype='snv', **load_kwargs)
calltable_indel, aux = get_calltableseries(config, mixtureid, muttype='indel', **load_kwargs)
calltable_snp, aux = get_calltableseries(config, mixtureid, muttype='snp', **load_kwargs)
print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
print(aux)
calltables['snv'].append(calltable_snv)
calltables['indel'].append(calltable_indel)
calltables['snp'].append(calltable_snp)
calltables['sampleid'] = mixtureid
#calltables['tf'] = np.unique([cn.split('_')[0] for cn in list(calltable_snv.columns)])[:-5].astype(float)
calltables['snv'] = pd.concat(calltables['snv'])
calltables['indel'] = pd.concat(calltables['indel'])
calltables['snp'] = pd.concat(calltables['snp'])
# the `aux` returned by the last loader call is used as the dilution series table
dilutionseries = aux
#dilutionseries = aux.T[['mixture_' + '_'.join(mixtureid.split('_')[:2]) + '_' + str(s[0]) + 'x_' + '_'.join(mixtureid.split('_')[2:4]) + '_' + str(s[1]) + 'x' for s in seriesorder]].T
muttype = 'snv'
refsample = 'SEQC2'

# (2) evaluation table with a 'truth' column; the hg38 labels get a 'chr' prefix
if mixtureid == 'ROC2_ST21_25ng_LIB1-PH':
    evaluation_bed_df = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'evaluationbed_hg38.csv'), index_col=0)
    evaluation_bed_df.index = ['chr'+i for i in list(evaluation_bed_df.index)]
else:
    evaluation_bed_df = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'evaluationbed_hg19.csv'), index_col=0)
print(evaluation_bed_df.shape[0])

# (3) index the calls by chrom_pos, keep the first row per label, align on the evaluation table
calltablesseries = calltables[muttype]
calltablesseries['chrom_pos'] = calltablesseries['chrom'].astype('str').str.cat(calltablesseries['pos'].astype('str'), sep="_")
print(calltablesseries.shape[0])

calltablesseries.reset_index(inplace=True)
calltablesseries.set_index('chrom_pos', inplace=True)
calltablesseries = calltablesseries.loc[~calltablesseries.index.duplicated()] ### TODO improve, merge pred for instance for '9_21971120'
calltablesseries = calltablesseries.reindex(evaluation_bed_df.index)
print(calltablesseries.shape[0])
calltablesseries['truth'] = evaluation_bed_df['truth']
print(calltablesseries['truth'].value_counts())
# the score of the series' own method is its call column, with missing values set to 0
calltablesseries['SampleDf'+suffix+'_'+method+'_score'] = calltablesseries['SampleDf'+suffix+'_'+method].fillna(0).astype(int)
calltablesseries['SampleEf'+suffix+'_'+method+'_score'] = calltablesseries['SampleEf'+suffix+'_'+method].fillna(0).astype(int)

# (4) metrics
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', method]
for m in methods:
    for sample in seriesorder:
        score_col = sample+'_'+m+'_score'
        # Assign the filled column back: an inplace fillna on a column selection is a chained
        # operation and does not update the frame under pandas Copy-on-Write.
        calltablesseries[score_col] = calltablesseries[score_col].fillna(0)
#results_auprc_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='auprc', ground_truth_method='SEQC2',
#                                 refsample='SEQC2', muttype=muttype, methods=methods)
#results_precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='precision',  ground_truth_method='SEQC2',
#                                 refsample='SEQC2', muttype=muttype, methods=methods)
#results_recall_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='recall',  ground_truth_method='SEQC2',
#                                 refsample='SEQC2', muttype=muttype, methods=methods)
#results_maxf1_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxf1',  ground_truth_method='SEQC2',
#                                 refsample='SEQC2', muttype=muttype, methods=methods)
#results_maxf1recall_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxf1recall',  ground_truth_method='SEQC2',
#                                 refsample='SEQC2', muttype=muttype, methods=methods)
pr = figure_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, xy='pr', ground_truth_method='SEQC2',
                                 refsample='SEQC2', muttype=muttype, methods=methods, splitby='dilution', plot='partial', figax=None)
res[mixtureid] = pr
                #    ground_truth_method=gtm,
                #    refsample=refsample, muttype=muttype.upper(), methods=None, fixedvar=fixedvar, save=save)

In [None]:
# Display the list of mixture series ids iterated over by the following cell.
mixtureids

In [None]:
#fig, ax = plt.subplots(figsize=(10, 10))
# For every mixture series: load the call tables, expand the intersect_BRP_IDT_ILM_ROC BED
# file into one row per position, mark the known-positive positions as truth, align the calls
# on those rows, and collect AUPRC, max-recall-at-precision metrics and the PR curve.
# Results are stored per mixture id in the dictionaries below.
res = {}
res_auprc = {}
res_maxrecallatleast010precision = {}
res_maxrecallatleast050precision = {}
res_maxrecallatleast090precision = {}


for mixtureid in mixtureids:
    reload = False
    save = False
    #mixtureid = 'BRP2_ST26_25ng_LIB1-P'
    method = mixtureid.split('_')[0][:-1]  # first '_' field without its last character, e.g. 'BRP2' -> 'BRP'
    suffix = 'lS' if method == 'IDT' else ''
    seriesorder = ['SampleDf'+suffix, 'SampleEf'+suffix]
    print('############# {} ############'.format(mixtureid))

    # (1) call tables, one per mutation type; every call also returns `aux`.
    # `save` is passed as save= (it has the same value as `reload` in this cell).
    calltables = {'sampleid':[], 'tf':[], 'cov':[], 'snv':[], 'indel':[], 'snp':[]}
    load_kwargs = dict(chrom='all', filterparam='PASSREJECT', reload=reload, save=save, diltype='SEQC2', concat='tf', bcbiovaf=0.01, gatkcorr=False)
    calltable_snv, aux = get_calltableseries(config, mixtureid, muttype='snv', **load_kwargs)
    calltable_indel, aux = get_calltableseries(config, mixtureid, muttype='indel', **load_kwargs)
    calltable_snp, aux = get_calltableseries(config, mixtureid, muttype='snp', **load_kwargs)
    print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
    print(aux)
    calltables['snv'].append(calltable_snv)
    calltables['indel'].append(calltable_indel)
    calltables['snp'].append(calltable_snp)
    calltables['sampleid'] = mixtureid
    #calltables['tf'] = np.unique([cn.split('_')[0] for cn in list(calltable_snv.columns)])[:-5].astype(float)
    calltables['snv'] = pd.concat(calltables['snv'])
    calltables['indel'] = pd.concat(calltables['indel'])
    calltables['snp'] = pd.concat(calltables['snp'])
    # the `aux` returned by the last loader call is used as the dilution series table
    dilutionseries = aux
    #dilutionseries = aux.T[['mixture_' + '_'.join(mixtureid.split('_')[:2]) + '_' + str(s[0]) + 'x_' + '_'.join(mixtureid.split('_')[2:4]) + '_' + str(s[1]) + 'x' for s in seriesorder]].T
    muttype = 'snv'
    refsample = 'SEQC2'

    # (2) target regions and ground truth; only the ROC series uses the hg38 files, and the
    # 'chr' prefix is removed for hg19 only
    genomeref = 'hg19' if mixtureid != 'ROC2_ST21_25ng_LIB1-PH' else 'hg38'
    print(genomeref)
    bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'intersect_BRP_IDT_ILM_ROC_'+genomeref+'.bed'), sep='\t', header=None)
    bedfile.columns = ['chrom', 'startpos', 'endpos', 'gene']
    if genomeref == 'hg19':
        bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')
    # load ground truths (certified mutations either somatic or diluted germline in the sample A)
    groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_'+genomeref+'.vcf'))
    if genomeref == 'hg19':
        groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
    groundtruths['chrom_pos'] = groundtruths['CHROM'].astype(str).str.cat(groundtruths['POS'].astype('str'), sep='_')
    groundtruths['chrom_pos_ref_alt'] = groundtruths['CHROM'].astype(str).str.cat(groundtruths['POS'].astype('str'), sep='_').str.cat(groundtruths['REF'].astype('str'), sep='_').str.cat(groundtruths['ALT'].astype('str'), sep='_')
    groundtruths.set_index('chrom_pos_ref_alt', inplace=True)
    print(groundtruths.shape[0])

    # one 'chrom_pos' label per position of every interval: startpos included, endpos excluded
    # NOTE(review): other cells test startpos <= pos <= endpos (endpos included) -- confirm
    # which convention the BED coordinates follow.
    bedfilelistall = [
        chrom + '_' + str(i)
        for chrom, startpos, endpos in zip(bedfile['chrom'], bedfile['startpos'], bedfile['endpos'])
        for i in np.arange(startpos, endpos)
    ]
    print(len(bedfilelistall))
    evaluation_bed_df = pd.DataFrame(index=bedfilelistall)
    gtchrompos = set(set(groundtruths['chrom_pos'].values) & set(evaluation_bed_df.index))
    # membership test instead of .loc[set, ...]: pandas >= 2.0 rejects sets as indexers
    evaluation_bed_df['truth'] = evaluation_bed_df.index.isin(gtchrompos)
    print(evaluation_bed_df.value_counts())

    # (3) index the calls by chrom_pos, keep the first row per label, align on the evaluation table
    calltablesseries = calltables[muttype]
    calltablesseries['chrom_pos'] = calltablesseries['chrom'].astype('str').str.cat(calltablesseries['pos'].astype('str'), sep="_")
    print(calltablesseries.shape[0])

    calltablesseries.reset_index(inplace=True)
    calltablesseries.set_index('chrom_pos', inplace=True)
    calltablesseries = calltablesseries.loc[~calltablesseries.index.duplicated()] ### TODO improve, merge pred for instance for '9_21971120'
    calltablesseries = calltablesseries.reindex(evaluation_bed_df.index)
    print(calltablesseries.shape[0])
    calltablesseries['truth'] = evaluation_bed_df['truth']
    print(calltablesseries['truth'].value_counts())
    # the score of the series' own method is its call column, with missing values set to 0
    calltablesseries['SampleDf'+suffix+'_'+method+'_score'] = calltablesseries['SampleDf'+suffix+'_'+method].fillna(0).astype(int)
    calltablesseries['SampleEf'+suffix+'_'+method+'_score'] = calltablesseries['SampleEf'+suffix+'_'+method].fillna(0).astype(int)

    # (4) metrics
    methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', method]
    for m in methods:
        for sample in seriesorder:
            score_col = sample+'_'+m+'_score'
            # Assign the filled column back: an inplace fillna on a column selection is a
            # chained operation and does not update the frame under pandas Copy-on-Write.
            calltablesseries[score_col] = calltablesseries[score_col].fillna(0)
    results_auprc_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='auprc', ground_truth_method='SEQC2',
                                     refsample='SEQC2', muttype=muttype, methods=methods)
    results_maxrecallatlast0_10precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxrecallatleast0_10precision', ground_truth_method='SEQC2',
                                      refsample='SEQC2', muttype=muttype, methods=methods)
    results_maxrecallatlast0_50precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxrecallatleast0_50precision', ground_truth_method='SEQC2',
                                      refsample='SEQC2', muttype=muttype, methods=methods)
    results_maxrecallatlast0_90precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxrecallatleast0_90precision', ground_truth_method='SEQC2',
                                      refsample='SEQC2', muttype=muttype, methods=methods)
    #results_precision_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='precision',  ground_truth_method='SEQC2',
    #                                 refsample='SEQC2', muttype=muttype, methods=methods)
    #results_recall_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='recall',  ground_truth_method='SEQC2',
    #                                 refsample='SEQC2', muttype=muttype, methods=methods)
    #results_maxf1_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxf1',  ground_truth_method='SEQC2',
    #                                 refsample='SEQC2', muttype=muttype, methods=methods)
    #results_maxf1recall_df = metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric='maxf1recall',  ground_truth_method='SEQC2',
    #                                 refsample='SEQC2', muttype=muttype, methods=methods)
    pr = figure_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, xy='pr', ground_truth_method='SEQC2',
                                     refsample='SEQC2', muttype=muttype, methods=methods, splitby='dilution', plot='all', figax=None)
    res[mixtureid] = pr
    res_auprc[mixtureid] = results_auprc_df
    res_maxrecallatleast010precision[mixtureid] = results_maxrecallatlast0_10precision_df
    res_maxrecallatleast050precision[mixtureid] = results_maxrecallatlast0_50precision_df
    res_maxrecallatleast090precision[mixtureid] = results_maxrecallatlast0_90precision_df

                    #    ground_truth_method=gtm,
                    #    refsample=refsample, muttype=muttype.upper(), methods=None, fixedvar=fixedvar, save=save)

In [None]:
# Precision-recall evaluation of one SEQC2 mixture, restricted to the BED intersection
# shared by the BRP/IDT/ILM/ROC assays. Every position of the intersection is a candidate:
# positions listed in the known-positives VCF are labelled True, all others False.
res = {}
mixtureid = 'ROC2_ST21_25ng_LIB1-PH'
reload = True
save = True
filterparam = 'PASSREJECT'
# Assay code = first field of the mixture id minus its last character ('ROC2' -> 'ROC').
method = mixtureid.split('_')[0][:-1]
suffix = 'lS' if method == 'IDT' else ''
seriesorder = ['SampleDf'+suffix, 'SampleEf'+suffix]
print('############# {} ############'.format(mixtureid))

# NOTE(review): `save=reload` (not `save=save`) is what the original cell passed; kept as is -- confirm intent.
calltable_kwargs = dict(chrom='all', filterparam=filterparam, reload=reload, save=reload,
                        diltype='SEQC2', concat='tf', bcbiovaf=0.01, gatkcorr=False)
calltable_snv, aux = get_calltableseries(config, mixtureid, muttype='snv', **calltable_kwargs)
calltable_indel, aux = get_calltableseries(config, mixtureid, muttype='indel', **calltable_kwargs)
calltable_snp, aux = get_calltableseries(config, mixtureid, muttype='snp', **calltable_kwargs)
print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
print(aux)
calltables = {
    'sampleid': mixtureid,
    'tf': [],
    'cov': [],
    'snv': pd.concat([calltable_snv]),
    'indel': pd.concat([calltable_indel]),
    'snp': pd.concat([calltable_snp]),
}
# The second value returned by get_calltableseries (from the last call) is used as the dilution series.
dilutionseries = aux
muttype = 'snv'
refsample = 'SEQC2'

# The ROC mixture is evaluated with the hg38 files, every other mixture with hg19.
genomeref = 'hg19' if mixtureid != 'ROC2_ST21_25ng_LIB1-PH' else 'hg38'
print(genomeref)
bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'intersect_BRP_IDT_ILM_ROC_'+genomeref+'.bed'), sep='\t', header=None)
bedfile.columns = ['chrom', 'startpos', 'endpos', 'gene']
if genomeref == 'hg19':
    bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')

# Load ground truths (certified mutations, either somatic or diluted germline, in sample A).
groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_'+genomeref+'.vcf'))
if genomeref == 'hg19':
    groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
groundtruths['chrom_pos'] = groundtruths['CHROM'].astype(str).str.cat(groundtruths['POS'].astype('str'), sep='_')
groundtruths['chrom_pos_ref_alt'] = (
    groundtruths['chrom_pos']
    .str.cat(groundtruths['REF'].astype('str'), sep='_')
    .str.cat(groundtruths['ALT'].astype('str'), sep='_')
)
groundtruths.set_index('chrom_pos_ref_alt', inplace=True)
print(groundtruths.shape[0])

# Expand every BED interval into one '<chrom>_<pos>' label per position in [startpos, endpos).
# NOTE(review): BED starts are conventionally 0-based while VCF POS is 1-based; these labels use the raw
# BED coordinates, which may shift the evaluated window by one base -- confirm against the BED files.
bedfilelistall = [
    chrom + '_' + str(pos)
    for chrom, start, end in zip(bedfile['chrom'], bedfile['startpos'], bedfile['endpos'])
    for pos in range(start, end)
]
print(len(bedfilelistall))
evaluation_bed_df = pd.DataFrame(index=bedfilelistall)
evaluation_bed_df['truth'] = False
gtchrompos = set(groundtruths['chrom_pos'].values) & set(evaluation_bed_df.index)
# Boolean mask instead of `.loc[<set>]`: recent pandas rejects a set as an indexer, and the mask
# also copes with positions listed twice when BED intervals overlap.
evaluation_bed_df.loc[evaluation_bed_df.index.isin(gtchrompos), 'truth'] = True
print(evaluation_bed_df.value_counts())

# Align the call table on the evaluation positions (one row per BED position, NaN where nothing was called).
calltablesseries = calltables[muttype]
calltablesseries['chrom_pos'] = calltablesseries['chrom'].astype('str').str.cat(calltablesseries['pos'].astype('str'), sep="_")
print(calltablesseries.shape[0])

calltablesseries.reset_index(inplace=True)
calltablesseries.set_index('chrom_pos', inplace=True)
calltablesseries = calltablesseries.loc[~calltablesseries.index.duplicated()] ### TODO improve, merge pred for instance for '9_21971120'
calltablesseries = calltablesseries.reindex(evaluation_bed_df.index)
print(calltablesseries.shape[0])
calltablesseries['truth'] = evaluation_bed_df['truth']
print(calltablesseries['truth'].value_counts())

# The assay's own score is its call column cast to 0/1 (missing = not called).
for sample in seriesorder:
    calltablesseries[sample+'_'+method+'_score'] = calltablesseries[sample+'_'+method].fillna(0).astype(int)

# metrics
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', method]
for m in methods:
    for sample in seriesorder:
        scorecol = sample+'_'+m+'_score'
        # Assign the result back: `fillna(..., inplace=True)` on a column selection does not
        # update the frame when pandas Copy-on-Write is active.
        calltablesseries[scorecol] = calltablesseries[scorecol].fillna(0)

# Metric tables (auprc, precision, recall, maxf1, ...) can be produced with metric_curve_allchr
# using the same arguments; only the precision-recall curves are computed here.
pr = figure_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, xy='pr', ground_truth_method='SEQC2',
                                 refsample='SEQC2', muttype=muttype, methods=methods, splitby='dilution', plot='all', figax=None)
res[mixtureid] = pr
                #    ground_truth_method=gtm,
                #    refsample=refsample, muttype=muttype.upper(), methods=None, fixedvar=fixedvar, save=save)

In [None]:
def function_to_split_ls(hand,labl,dividor):
    """Split combined legend entries into a colour group and a linestyle group.

    Each label is expected to be ``"<left><dividor><right>"``. For every distinct
    left part a proxy line carrying the handle's colour and line width is created;
    for every distinct right part a black proxy line carrying the handle's
    linestyle is created. Only the first handle seen for a given part is used.

    Parameters
    ----------
    hand : sequence of handles exposing ``get_color``, ``get_linestyle`` and ``get_linewidth``
    labl : sequence of str, same length as ``hand``
    dividor : str, separator that must occur exactly once in every label

    Returns
    -------
    (handles, labels) : colour entries first, then linestyle entries. If any label
    does not split into exactly two parts, a message is printed and the inputs
    ``hand, labl`` are returned unchanged.
    """
    Hand_L, Labl_L = [], []
    Hand_M, Labl_M = [], []

    for h, l in zip(hand, labl):
        LABS = l.split(dividor)

        # Validate before touching the handle so a malformed label never depends on the handle type.
        if len(LABS) != 2:
            print('Split Legends Error: Only exactly 1 Dividor is accepted.')
            print('                     Currently ' + str(len(LABS)-1) + ' dividors were given')
            return hand,labl

        if LABS[0] not in Labl_L:
            # Colour proxy: solid line, no marker, in the handle's colour and width.
            Hand_L.append(plt.Line2D((0,1),(0,0), color=h.get_color(), marker='',
                                     linestyle='solid', linewidth=h.get_linewidth()))
            Labl_L.append(LABS[0])

        if LABS[1] not in Labl_M:
            # Linestyle proxy: black line in the handle's linestyle (default width).
            Hand_M.append(plt.Line2D((0,1),(0,0), color='k', linestyle=h.get_linestyle()))
            Labl_M.append(LABS[1])

    return Hand_L+Hand_M,Labl_L+Labl_M

In [None]:
# One figure per sample (Df, Ef): curves from `res`, colour = caller, linestyle = mixture.
ls = ['solid', 'dashdot', (0, (5, 1)), (0, (1,1))]
for pref in ['Df', 'Ef']: # sample Df and sample Ef
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.grid()
    for si, s in enumerate(list(res.keys())):
        # First key of this mixture's result whose name contains the sample prefix.
        samplekey = [key for key in res[s].keys() if pref in key][0]
        print(s, samplekey)
        aux = res[s][samplekey]
        for k, v in aux.items():
            recall, precision = v[0], v[1]
            kwargs = {}
            kwargs["label"] = k + ' * ' + s
            print(kwargs["label"])
            kwargs["color"] = config.colors[config.methods.index(k)]
            # Cycle through the linestyles so more than len(ls) mixtures do not raise IndexError.
            kwargs["linestyle"] = ls[si % len(ls)]
            kwargs["lw"] = 3
            if (len(recall) == 2) and (recall[0] == 1) and (recall[1] != 0):
                # Single operating point: a lone point drawn with a linestyle but no marker is
                # invisible, so give it an explicit marker.
                kwargs["marker"] = 'o'
                kwargs["markersize"]=15
                ax.plot(recall[1], precision[1],  **kwargs)
            else:
                kwargs["drawstyle"] = "steps-post"
                kwargs["markersize"]=10
                ax.plot(recall, precision, **kwargs)
    ax.set_xscale('log')
    ax.set_xlim([0.01, 1.01])
    # Rebuild the legend with one entry per caller (colour) and one per mixture (linestyle).
    hand, labl = ax.get_legend_handles_labels()
    hand, labl = function_to_split_ls(hand, labl, '*')
    ax.legend(hand, labl, bbox_to_anchor=(1, 1), loc="upper left")
    ax.set_title('Sample'+pref)

In [None]:
def _plot_metric_by_sample(results_by_mixture, score_column):
    """Concatenate per-mixture metric tables and draw one point plot of the metric.

    Parameters
    ----------
    results_by_mixture : dict mapping mixture id -> metric DataFrame (needs the
        'samplename', 'caller' and `score_column` columns). Each frame is tagged
        in place with a 'sampleid' column holding its mixture id.
    score_column : str, name of the metric column plotted on the y axis.

    Returns
    -------
    pandas.DataFrame : all tables stacked, with the 'lS' suffix removed from 'samplename'.
    """
    for sampleid, frame in results_by_mixture.items():
        frame['sampleid'] = sampleid
    combined = pd.concat(results_by_mixture.values())
    combined['samplename'] = combined['samplename'].str.replace('lS', '')
    # Palette follows the order in which callers first appear, i.e. the hue order.
    callers = list(combined['caller'].unique())
    sns.catplot(x='samplename', y=score_column, hue="caller",
                capsize=.1, height=6, aspect=1.5, kind="point",
                order=sorted(combined['samplename'].unique()),
                palette=[config.colors[config.methods.index(c)] for c in callers], data=combined)
    return combined


res_auprc_tot = _plot_metric_by_sample(res_auprc, "AUPRC score")
res_maxrecallatleast010precision_tot = _plot_metric_by_sample(res_maxrecallatleast010precision, "MAXRECALLATLEAST0_10PRECISION score")
res_maxrecallatleast050precision_tot = _plot_metric_by_sample(res_maxrecallatleast050precision, "MAXRECALLATLEAST0_50PRECISION score")
res_maxrecallatleast090precision_tot = _plot_metric_by_sample(res_maxrecallatleast090precision, "MAXRECALLATLEAST0_90PRECISION score")

In [None]:
# Sanity check: number of True / False ground-truth labels in the current evaluation table.
calltablesseries['truth'].value_counts()

In [None]:
# One figure per sample (Df, Ef): curves from `res`, colour = caller, marker = mixture.
markers = ['D', 's', 'o', '^']
for pref in ['Df', 'Ef']: # sample Df and sample Ef
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.grid()
    for si, s in enumerate(list(res.keys())):
        # First key of this mixture's result whose name contains the sample prefix.
        samplekey = [key for key in res[s].keys() if pref in key][0]
        print(s, samplekey)
        aux = res[s][samplekey]
        for k, v in aux.items():
            recall, precision = v[0], v[1]
            kwargs = {}
            kwargs["label"] = k + ' * ' + s
            print(kwargs["label"])
            kwargs["color"] = config.colors[config.methods.index(k)]
            # Cycle through the markers so more than len(markers) mixtures do not raise IndexError.
            kwargs["marker"] = markers[si % len(markers)]
            kwargs["lw"] = 3
            if (len(recall) == 2) and (recall[0] == 1) and (recall[1] != 0):
                # Single operating point: draw it as one large marker.
                kwargs["markersize"]=15
                ax.plot(recall[1], precision[1],  **kwargs)
            else:
                kwargs["drawstyle"] = "steps-post"
                kwargs["markersize"]=10
                ax.plot(recall, precision, **kwargs)
    # NOTE(review): `function_to_split` is not defined in the cells shown here; presumably it is
    # defined in an earlier cell -- confirm it exists before running.
    hand, labl = ax.get_legend_handles_labels()
    hand, labl = function_to_split(hand, labl, '*')
    ax.legend(hand, labl, bbox_to_anchor=(1, 1), loc="upper left")
    ax.set_title('Sample'+pref)

In [None]:
# Debug leftover: displays whatever `k` was last bound to by a previously executed loop,
# so the value depends on which cell ran last.
k

In [None]:
# Inspect the last 25 rows: SampleDf score column of every method in config.methods next to the truth label.
calltablesseries[['SampleDf_'+m+'_score' for m in config.methods] + ['truth']].tail(25)

In [None]:
# Show the rows whose index label is a repeat of an earlier one (all rows carrying such a label are returned).
#calltablesseries.reset_index(inplace=True)
#calltablesseries.set_index('chrom_pos', inplace=True)
calltablesseries.loc[calltablesseries.index[calltablesseries.index.duplicated()]]
#calltablesseries.reindex(evaluation_bed_df.index)
#calltablesseries

In [None]:
# Metric tables for the current call table: AUPRC plus precision and recall at the max-F1 point.
# The three calls differ only in `metric`; they run in the same order as the names on the left.
results_auprc_df, results_maxf1precision_df, results_maxf1recall_df = [
    metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric=metric_name,
                        ground_truth_method='SEQC2', refsample='SEQC2', muttype=muttype, methods=methods)
    for metric_name in ('auprc', 'maxf1precision', 'maxf1recall')
]

In [None]:
# ILM assay: evaluate the callers against the SEQC2 known positives that fall inside the ILM panel BED.
# `filterparam`, `reload` and `save` are taken from whichever earlier cell set them last.
mixtureid = 'ILM2_ST29_25ng_LIB1-P'
seriesorder = ['SampleDf', 'SampleEf']
print('############# {} ############'.format(mixtureid))

load_kwargs = dict(chrom='all', filterparam=filterparam, reload=reload, save=save, diltype='SEQC2', concat='tf')
calltable_snv, aux = get_calltableseries(config, mixtureid, muttype='snv', **load_kwargs)
calltable_indel, aux = get_calltableseries(config, mixtureid, muttype='indel', **load_kwargs)
calltable_snp, aux = get_calltableseries(config, mixtureid, muttype='snp', **load_kwargs)
print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
print(aux)
calltables = {
    'sampleid': mixtureid,
    'tf': [],
    'cov': [],
    'snv': pd.concat([calltable_snv]),
    'indel': pd.concat([calltable_indel]),
    'snp': pd.concat([calltable_snp]),
}
# The second value returned by the last get_calltableseries call is used as the dilution series.
dilutionseries = aux
muttype = 'snv'
refsample = 'SEQC2'

# Ground truth, indexed by '<chrom>_<pos>_<ref>_<alt>' with the 'chr' prefix removed.
groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_hg19.vcf'))
groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
variantkey = groundtruths['CHROM'].astype(str)
for field in ('POS', 'REF', 'ALT'):
    variantkey = variantkey.str.cat(groundtruths[field].astype('str'), sep='_')
groundtruths['chrom_pos_ref_alt'] = variantkey
groundtruths.set_index('chrom_pos_ref_alt', inplace=True)

bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'LBx_ILM_hg19.bed'), sep='\t', header=None)
bedfile.columns = ['chrom', 'startpos', 'endpos']
bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')

# Collect the ground-truth variants that no BED interval contains (startpos <= pos <= endpos).
# NOTE(review): both interval ends are treated as inclusive here -- confirm against the BED coordinate convention.
groundtruth_outside_bed = []
for idx, row in tqdm(groundtruths.iterrows(), total=groundtruths.shape[0]):
    chrom, pos, ref, alt = row['CHROM'], row['POS'], row['REF'], row['ALT']
    aux = bedfile[bedfile['chrom'] == str(chrom)]
    count = bool(((aux['endpos'] >= int(pos)) & (aux['startpos'] <= int(pos))).any())
    if not count:
        groundtruth_outside_bed.append('_'.join([chrom, str(pos), ref, alt]))
print(len(groundtruth_outside_bed))
groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
print(groundtruths_in_bed.shape[0])

# Label each call: True when its index label is a ground-truth variant inside the BED.
calltablesseries = calltables[muttype]
calltablesseries['truth'] = calltablesseries.index.isin(list(groundtruths_in_bed.index))
print(calltablesseries['truth'].value_counts())
# The assay's own score is its reported VAF.
for sample in seriesorder:
    calltablesseries[sample+'_ILM_score'] = calltablesseries[sample+'_ILM_vaf']

# metrics
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'ILM']
metric_kwargs = dict(ground_truth_method='SEQC2', refsample='SEQC2', muttype=muttype, methods=methods)
(results_auprc_df, results_precision_df, results_recall_df,
 results_maxf1precision_df, results_maxf1recall_df) = [
    metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric=metric_name, **metric_kwargs)
    for metric_name in ('auprc', 'precision', 'recall', 'maxf1precision', 'maxf1recall')
]
figure_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, xy='pr', **metric_kwargs)
                #    ground_truth_method=gtm,
                #    refsample=refsample, muttype=muttype.upper(), methods=None, fixedvar=fixedvar, save=save)

In [None]:
# IDT assay: evaluate the callers against the SEQC2 known positives that fall inside the IDT panel BED.
# `filterparam`, `reload` and `save` are taken from whichever earlier cell set them last.
mixtureid = 'IDT2_ST05_25ng_LIB1-P'
seriesorder = ['SampleDflS', 'SampleEflS']
print('############# {} ############'.format(mixtureid))

load_kwargs = dict(chrom='all', filterparam=filterparam, reload=reload, save=save, diltype='SEQC2', concat='tf')
calltable_snv, aux = get_calltableseries(config, mixtureid, muttype='snv', **load_kwargs)
calltable_indel, aux = get_calltableseries(config, mixtureid, muttype='indel', **load_kwargs)
calltable_snp, aux = get_calltableseries(config, mixtureid, muttype='snp', **load_kwargs)
print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
print(aux)
calltables = {
    'sampleid': mixtureid,
    'tf': [],
    'cov': [],
    'snv': pd.concat([calltable_snv]),
    'indel': pd.concat([calltable_indel]),
    'snp': pd.concat([calltable_snp]),
}
# The second value returned by the last get_calltableseries call is used as the dilution series.
dilutionseries = aux
muttype = 'snv'
refsample = 'SEQC2'

# Ground truth, indexed by '<chrom>_<pos>_<ref>_<alt>' with the 'chr' prefix removed.
groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_hg19.vcf'))
groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
variantkey = groundtruths['CHROM'].astype(str)
for field in ('POS', 'REF', 'ALT'):
    variantkey = variantkey.str.cat(groundtruths[field].astype('str'), sep='_')
groundtruths['chrom_pos_ref_alt'] = variantkey
groundtruths.set_index('chrom_pos_ref_alt', inplace=True)

bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'LBx_IDT_hg19.bed'), sep='\t', header=None)
bedfile.columns = ['chrom', 'startpos', 'endpos', 'gene', 'a','b']
bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')

# Collect the ground-truth variants that no BED interval contains (startpos <= pos <= endpos).
# NOTE(review): both interval ends are treated as inclusive here -- confirm against the BED coordinate convention.
groundtruth_outside_bed = []
for idx, row in tqdm(groundtruths.iterrows(), total=groundtruths.shape[0]):
    chrom, pos, ref, alt = row['CHROM'], row['POS'], row['REF'], row['ALT']
    aux = bedfile[bedfile['chrom'] == str(chrom)]
    count = bool(((aux['endpos'] >= int(pos)) & (aux['startpos'] <= int(pos))).any())
    if not count:
        groundtruth_outside_bed.append('_'.join([chrom, str(pos), ref, alt]))
print(len(groundtruth_outside_bed))
groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
print(groundtruths_in_bed.shape[0])

# Label each call: True when its index label is a ground-truth variant inside the BED.
calltablesseries = calltables[muttype]
calltablesseries['truth'] = calltablesseries.index.isin(list(groundtruths_in_bed.index))
print(calltablesseries['truth'].value_counts())
# The assay's own score is its reported VAF.
for sample in seriesorder:
    calltablesseries[sample+'_IDT_score'] = calltablesseries[sample+'_IDT_vaf']

# metrics
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'IDT']
metric_kwargs = dict(ground_truth_method='SEQC2', refsample='SEQC2', muttype=muttype, methods=methods)
(results_auprc_df, results_precision_df, results_recall_df,
 results_maxf1precision_df, results_maxf1recall_df) = [
    metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric=metric_name, **metric_kwargs)
    for metric_name in ('auprc', 'precision', 'recall', 'maxf1precision', 'maxf1recall')
]
figure_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, xy='pr', **metric_kwargs)
                #    ground_truth_method=gtm,
                #    refsample=refsample, muttype=muttype.upper(), methods=None, fixedvar=fixedvar, save=save)

In [None]:
# TFS assay: evaluate the callers against the SEQC2 known positives that fall inside the TFS hotspot BED.
# `filterparam`, `reload` and `save` are taken from whichever earlier cell set them last.
mixtureid = 'TFS2_ST24_25ng_LIB1-P'
seriesorder = ['SampleDf', 'SampleEf']
print('############# {} ############'.format(mixtureid))

load_kwargs = dict(chrom='all', filterparam=filterparam, reload=reload, save=save, diltype='SEQC2', concat='tf')
calltable_snv, aux = get_calltableseries(config, mixtureid, muttype='snv', **load_kwargs)
calltable_indel, aux = get_calltableseries(config, mixtureid, muttype='indel', **load_kwargs)
calltable_snp, aux = get_calltableseries(config, mixtureid, muttype='snp', **load_kwargs)
print(calltable_snv.shape, calltable_indel.shape, calltable_snp.shape)
print(aux)
calltables = {
    'sampleid': mixtureid,
    'tf': [],
    'cov': [],
    'snv': pd.concat([calltable_snv]),
    'indel': pd.concat([calltable_indel]),
    'snp': pd.concat([calltable_snp]),
}
# The second value returned by the last get_calltableseries call is used as the dilution series.
dilutionseries = aux
muttype = 'snv'
refsample = 'SEQC2'

# Ground truth, indexed by '<chrom>_<pos>_<ref>_<alt>' with the 'chr' prefix removed.
groundtruths = read_vcf(os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_hg19.vcf'))
groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
variantkey = groundtruths['CHROM'].astype(str)
for field in ('POS', 'REF', 'ALT'):
    variantkey = variantkey.str.cat(groundtruths[field].astype('str'), sep='_')
groundtruths['chrom_pos_ref_alt'] = variantkey
groundtruths.set_index('chrom_pos_ref_alt', inplace=True)

bedfile = pd.read_csv(os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'LBx_TFS_pre-defined_Hotspots_hg19.bed'), sep='\t', header=None)
bedfile.columns = ['chrom', 'startpos', 'endpos', 'gene', 'a','b', 'c', 'd']
bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')

# Collect the ground-truth variants that no BED interval contains (startpos <= pos <= endpos).
# NOTE(review): both interval ends are treated as inclusive here -- confirm against the BED coordinate convention.
groundtruth_outside_bed = []
for idx, row in tqdm(groundtruths.iterrows(), total=groundtruths.shape[0]):
    chrom, pos, ref, alt = row['CHROM'], row['POS'], row['REF'], row['ALT']
    aux = bedfile[bedfile['chrom'] == str(chrom)]
    count = bool(((aux['endpos'] >= int(pos)) & (aux['startpos'] <= int(pos))).any())
    if not count:
        groundtruth_outside_bed.append('_'.join([chrom, str(pos), ref, alt]))
print(len(groundtruth_outside_bed))
groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
print(groundtruths_in_bed.shape[0])

# Label each call: True when its index label is a ground-truth variant inside the BED.
calltablesseries = calltables[muttype]
calltablesseries['truth'] = calltablesseries.index.isin(list(groundtruths_in_bed.index))
print(calltablesseries['truth'].value_counts())
# The assay's own score is its reported VAF.
for sample in seriesorder:
    calltablesseries[sample+'_TFS_score'] = calltablesseries[sample+'_TFS_vaf']

# metrics
methods = ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan', 'TFS']
metric_kwargs = dict(ground_truth_method='SEQC2', refsample='SEQC2', muttype=muttype, methods=methods)
(results_auprc_df, results_precision_df, results_recall_df,
 results_maxf1precision_df, results_maxf1recall_df) = [
    metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric=metric_name, **metric_kwargs)
    for metric_name in ('auprc', 'precision', 'recall', 'maxf1precision', 'maxf1recall')
]
figure_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, xy='pr', **metric_kwargs)
                #    ground_truth_method=gtm,
                #    refsample=refsample, muttype=muttype.upper(), methods=None, fixedvar=fixedvar, save=save)

In [None]:
# Rows worth inspecting for SampleDf: ground-truth positives, positions scored by at least one of the
# five callers, or positions with a positive TFS score.
caller_score_cols = ['SampleDf_'+m+'_score' for m in ['freebayes', 'mutect2', 'strelka2', 'vardict', 'varscan']]
# Count missing scores per ROW (axis=1). Without the axis the sum runs per column and the resulting
# mask is indexed by column names, so it cannot be combined with the row-wise conditions.
scored_by_any_caller = calltablesseries[caller_score_cols].isna().sum(axis=1) < 5
rows_of_interest = (calltablesseries['truth']) | scored_by_any_caller | (calltablesseries['SampleDf_TFS_score'] > 0)
calltablesseries.loc[rows_of_interest, caller_score_cols + ['SampleDf_TFS_score', 'truth']]

In [None]:
# Precision and recall tables for every method listed in config.methods (not only the `methods` subset).
results_precision_df, results_recall_df = [
    metric_curve_allchr(config, calltablesseries, dilutionseries, mixtureid, metric=metric_name,
                        ground_truth_method='SEQC2', refsample='SEQC2', muttype=muttype, methods=config.methods)
    for metric_name in ('precision', 'recall')
]

In [None]:
# Truth labels of the call-table rows whose index label is one of the in-BED ground-truth variants.
calltablesseries.loc[calltablesseries.index.isin(list(groundtruths_in_bed.index)), 'truth']

# From Excel sheet 

In [None]:
# For each BRP lab (ST25, ST26) and replicate (R1-R4): join the published Excel report with the BRP
# call table and draw a precision-recall curve of the binned BRP VAF against the report's TP flag.
res = {}
m = 'BRP'

# Right-closed VAF bins and the score given to each bin: VAF <= 0.001 -> 0, (0.001, 0.0025] -> 0.0025,
# ..., (0.02, 0.025] -> 0.025, VAF > 0.025 -> 1.
VAF_BIN_EDGES = [-np.inf, 0.001, 0.0025, 0.003, 0.004, 0.005, 0.01, 0.015, 0.02, 0.025, np.inf]
VAF_BIN_SCORES = [0, 0.0025, 0.003, 0.004, 0.005, 0.01, 0.015, 0.02, 0.025, 1]

# The same sheet is used for every lab/replicate, so it is read once instead of once per iteration.
report = pd.read_excel(os.path.join(*config.mixturefolderSEQC2, 'excelreport', '41587_2021_857_MOESM6_ESM.xlsx'), sheet_name='BRP_Ef25', index_col=0)
# Drop the first three characters of every index label.
report.index = [str(a[3:]) for a in list(report.index)]

for lab in ['25', '26']:
    for rep in ['1', '2', '3', '4']:

        A = report[['On-target', 'Known position', 'TP', 'FP', 'SampleB', 'BRP2_Ef25_ST'+lab+'_R'+rep]]
        print(A.shape)

        # load table
        calltable_snv, calltable_indel, calltable_snp = get_calltable(os.path.join(*config.mixturefolderSEQC2, 'SEQC2s_chrall', 'SEQC2s_chrall_BRP2_ST'+lab+'_25ng_LIB'+rep+'-P', 'SampleEf_BRP2_ST'+lab+'_25ng_LIB'+rep+'-P'), ['varnet', 'BRP'], save=False, filter='PASS')
        calltable = pd.concat([calltable_snv, calltable_indel, calltable_snp])
        calltable['chrom_pos'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_')

        B = calltable[['BRP', 'BRP_score', 'BRP_vaf', 'BRP_altcov', 'BRP_totcov']].copy()
        B.index = B.index.astype(str)
        print(B.shape)

        # Column-wise join on the index labels; rows missing on either side are dropped below.
        C = pd.concat([A, B], axis=1)
        # Number of rows flagged FP in the report, counted before rows with missing values are dropped.
        Nneg = C[C['FP'] == 1].shape[0]
        print('Nneg')
        print(Nneg)
        C['truth'] = C["TP"]
        C.dropna(inplace=True)
        print(C.shape)
        C['BRP_vaf_range'] = pd.cut(C['BRP_vaf'], [0.001, 0.002, 0.003, 0.005, 0.025])
        # One pd.cut instead of a series of chained `C[col][mask] = value` assignments, which do not
        # reliably write back into the frame (and never do under pandas Copy-on-Write).
        C['BRP_vaf_range_num'] = pd.cut(C['BRP_vaf'], bins=VAF_BIN_EDGES, labels=VAF_BIN_SCORES).astype(float)
        print(C['BRP_vaf_range_num'].value_counts())

        fig, ax = plt.subplots(figsize=(10, 10))
        precision, recall, thresholds = precision_recall_curve(C['truth'], C['BRP_vaf_range_num'])
        print(thresholds)
        # Alternative precision with the FP count divided by Nneg, thresholding the raw VAF at each
        # curve threshold. Currently only computed: the line storing it in `res` is commented out.
        precisionbis = []
        for thres in [0] + list(thresholds):
            y_pred = (C[m+'_vaf'] >= thres).astype(float)
            _, fp, _, tp = confusion_matrix(C['truth'], y_pred).ravel()
            precisionbis.append(tp/(tp+(fp/Nneg)))

        # The first and last points of the curve are not plotted.
        plot_pr_curve(precision[1:-1], recall[1:-1], estimator_name=m, f1_score=None, figax=(fig, ax), kwargs={'color': config.colors[config.methods.index(m)]})
        plt.ylim([0.0, 1.0])
        plt.xlim([0.0, 1.0])
        plt.legend(bbox_to_anchor=(1, 1), loc="upper left")

        #res['BRP2_Ef25_ST'+lab+'_R'+rep] = (precisionbis, recall, thresholds)

In [None]:
# Precision/recall of the BRP calls (filter='all') at fixed VAF cut-offs: one figure per
# replicate, then the curve averaged over the replicates.
res = {}
m = 'BRP'
recallall, precisionall = [], []

# The sheet is the same for every replicate: read and re-index it once.
report = pd.read_excel(os.path.join(*config.mixturefolderSEQC2, 'excelreport', '41587_2021_857_MOESM6_ESM.xlsx'),  'BRP_Ef25', index_col=0)
# Drop the first three characters of every row label (presumably a 'chr' prefix - confirm).
report.index = [str(a[3:]) for a in list(report.index)]

# Cut-off label -> VAF fraction, from the strictest to the loosest cut-off.
vaf_cutoffs = {'2.5%': 0.025, '1.5%': 0.015, '1.0%': 0.01, '0.5%': 0.005, '0.4%': 0.004,
               '0.3%': 0.003, '0.25%': 0.0025, '0.2%': 0.002, '0.15%': 0.0015, '0.1%': 0.001}
thresholds = list(vaf_cutoffs)

for lab in ['25', '26']:
    for rep in ['1', '2', '3', '4']:
        A = report[['On-target', 'Known position', 'TP', 'FP', 'SampleB', 'BRP2_Ef25_ST'+lab+'_R'+rep]]
        print(A.shape)
        # load table
        calltable_snv, calltable_indel, calltable_snp = get_calltable(os.path.join(*config.mixturefolderSEQC2, 'SEQC2s_chrall', 'SEQC2s_chrall_BRP2_ST'+lab+'_25ng_LIB'+rep+'-P', 'SampleEf_BRP2_ST'+lab+'_25ng_LIB'+rep+'-P'), ['varnet', 'BRP'], save=False, filter='all')
        calltable = pd.concat([calltable_snv, calltable_indel, calltable_snp])
        calltable['chrom_pos'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_')
        B = calltable[['BRP', 'BRP_score', 'BRP_vaf', 'BRP_altcov', 'BRP_totcov']]
        B.index = B.index.astype(str)
        print(B.shape)
        C = pd.concat([A, B], axis=1)
        print(C.shape)

        # Each replicate is evaluated on its own. The former accumulation into `Call`
        # was overwritten on the next line and has been removed.
        Call = C
        for label, cutoff in vaf_cutoffs.items():
            # Rows without a VAF compare as False, i.e. they never pass a cut-off.
            Call['BRP_vaf_filter_'+label] = (Call['BRP_vaf'] > cutoff)

        fig, ax = plt.subplots(figsize=(10, 10))
        precision, recall = [], []
        Call = Call[Call['SampleB'] == 0]
        is_tp = Call['TP'] == 1
        is_fp = Call['FP'] == 1
        for th in thresholds:
            passed = Call['BRP_vaf_filter_'+th] == 1
            # TP and FN are both taken from the 'TP' column so that TP + FN is the same set of
            # positives at every cut-off. TP was previously counted from 'Known position'.
            TP = float((is_tp & passed).sum())
            FP = float((is_fp & passed).sum())
            FN = float((is_tp & ~passed).sum())
            print(TP, FP, FN)
            # Undefined ratios become NaN instead of raising ZeroDivisionError.
            p = TP/(TP + FP) if (TP + FP) > 0 else np.nan
            r = TP/(TP + FN) if (TP + FN) > 0 else np.nan
            precision.append(p)
            recall.append(r)
            print(th, p, r)
        print(thresholds)
        plt.plot(recall, precision, '.-', label=m, c=config.colors[config.methods.index(m)])
        plt.ylim([0.97, 1.0001])
        plt.xlim([0.0, 1.0])
        plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
        recallall.append(recall)
        precisionall.append(precision)

# Average curve over the replicates (rows = replicates, columns = cut-offs); NaN entries are ignored.
precisionall = np.array(precisionall)
recallall = np.array(recallall)
print(recallall.shape, precisionall.shape)
fig, ax = plt.subplots(figsize=(10, 10))
mean_recall = np.nanmean(recallall, axis=0)
mean_precision = np.nanmean(precisionall, axis=0)
print(mean_recall, mean_precision)
plt.plot(mean_recall, mean_precision, '.-', label=m, c=config.colors[config.methods.index(m)])
plt.ylim([0.97, 1.0001])
plt.xlim([0.0, 1.0])
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")

In [None]:
# Precision/recall of the BRP calls at fixed VAF cut-offs for every replicate. The curves
# are drawn on the current axes and the per-replicate values are collected in
# `recallall` / `precisionall`.
m = 'BRP'
recallall, precisionall = [], []

A = pd.read_excel(os.path.join(*config.mixturefolderSEQC2, 'excelreport', '41587_2021_857_MOESM6_ESM.xlsx'),  m+'_Ef25', index_col=0)
# Drop the first three characters of every row label (presumably a 'chr' prefix - confirm).
A.index = [str(a[3:]) for a in list(A.index)]
print(A.shape)
print(A['FP'].sum(), A['TP'].sum(), A['Known position'].sum())

# Cut-off label -> VAF fraction (the 2.5% cut-off is not evaluated in this cell).
vaf_cutoffs = {'1.5%': 0.015, '1.0%': 0.01, '0.5%': 0.005, '0.4%': 0.004, '0.3%': 0.003,
               '0.25%': 0.0025, '0.2%': 0.002, '0.15%': 0.0015, '0.1%': 0.001}
thresholds = list(vaf_cutoffs)

for lab in ['25', '26']:
    for rep in ['1', '2', '3', '4']:
        run_id = m+'2_ST'+lab+'_25ng_LIB'+rep+'-P'
        sample = m+'2_Ef25_ST'+lab+'_R'+rep  # prefix of this replicate's columns

        calltable_snv, calltable_indel, calltable_snp = get_calltable(os.path.join(*config.mixturefolderSEQC2, 'SEQC2s_chrall', 'SEQC2s_chrall_'+run_id, 'SampleEf_'+run_id), ['varnet', m], save=False, filter='all')
        calltable = pd.concat([calltable_snv, calltable_indel, calltable_snp])
        calltable.drop(['varnet', 'varnet_score', 'varnet_vaf', 'varnet_altcov', 'varnet_totcov'], axis=1, inplace=True)
        calltable['chrom_pos'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_')
        # B is the same object as calltable: the renaming below also applies to `calltable`.
        B = calltable
        B.rename(columns={m: sample+'_pass', m+'_score': sample+'_score', m+'_vaf': sample+'_vaf', m+'_altcov': sample+'_altcov',  m+'_totcov' : sample+'_totcov'}, inplace=True)
        B.index = B.index.astype(str)

        # Remark: 4_1803407_T_C missing (SNV number 230 !)

        C = pd.concat([A, B], axis=1)

        print(C[C['SampleB'] == 1][sample+'_vaf'].min())
        # Explicit copy so the columns added below are written to an independent frame.
        C = C[C['SampleB']!=1].copy()
        for label, cutoff in vaf_cutoffs.items():
            # Rows without a VAF compare as False, i.e. they never pass a cut-off.
            C[sample+'_vaf_filter_'+label] = (C[sample+'_vaf'] > cutoff)

        precision, recall = [], []
        is_tp = C['TP'] == 1
        is_fp = C['FP'] == 1
        for th in thresholds:
            passed = C[sample+'_vaf_filter_'+th] == 1
            TP = float((is_tp & passed).sum())
            FP = float((is_fp & passed).sum())
            FN = float((is_tp & ~passed).sum())
            print(TP, FP, FN)
            # Undefined ratios become NaN instead of raising ZeroDivisionError.
            p = TP/(TP + FP) if (TP + FP) > 0 else np.nan
            r = TP/(TP + FN) if (TP + FN) > 0 else np.nan
            precision.append(p)
            recall.append(r)
            print(th, round(p, 3), round(r, 3))
        plt.plot(recall, precision, '.-', label=m, c=config.colors[config.methods.index(m)])
        plt.ylim([0.97, 1.0001])
        plt.xlim([0.0, 1.0])
        plt.legend(bbox_to_anchor=(1, 1), loc="upper left")

        recallall.append(recall)
        precisionall.append(precision)

In [None]:
# Name of the column at position 12 of C (the next cell selects the same position).
C.columns[12]

In [None]:
# Side-by-side view of the report flag and the measured VAF of replicate BRP2_Ef25_ST26_R4.
# Columns are picked by position: the first four, the one at position 12 and all from position 20 on.
selected_columns = list(C.columns[:4]) + [C.columns[12]] + list(C.columns[20:])
D = C[selected_columns]

flag_is_zero = D['BRP2_Ef25_ST26_R4'] == 0
vaf_missing = D['BRP2_Ef25_ST26_R4_vaf'].isna()
# Row counts: flag == 0 and no VAF / flag == 0 / no VAF.
print(D[flag_is_zero & vaf_missing].shape[0], D[flag_is_zero].shape[0], D[vaf_missing].shape[0])
D

In [None]:
# Average PR curve over the replicates collected in `recallall` / `precisionall`.
fig, ax = plt.subplots(figsize=(10, 10))
mean_recall = np.mean(recallall, axis=0)
mean_precision = np.mean(precisionall, axis=0)
print(mean_recall, mean_precision)
# The first cut-off is left out of the drawn curve.
ax.plot(mean_recall[1:], mean_precision[1:], '.-', label=m, c=config.colors[config.methods.index(m)])
ax.set_ylim([0.97, 1.0001])
ax.set_xlim([0.0, 1.0])
ax.legend(bbox_to_anchor=(1, 1), loc="upper left")
ax.set_title(m+' PR curve average')

In [None]:
# Known positives, indexed by '<chrom>_<pos>_<ref>_<alt>' with the 'chr' prefix removed.
known_positives_path = os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownPositives_hg19.vcf')
groundtruths = read_vcf(known_positives_path)
groundtruths['CHROM'] = groundtruths['CHROM'].str.replace('chr', '')
key_parts = [groundtruths[col].astype('str') for col in ('POS', 'REF', 'ALT')]
groundtruths['chrom_pos_ref_alt'] = groundtruths['CHROM'].astype(str).str.cat(key_parts, sep='_')
groundtruths = groundtruths.set_index('chrom_pos_ref_alt')
groundtruths

In [None]:
# Bed file of method BRP: tab-separated, no header row, 'chr' prefix removed from the chromosome.
brp_bed_path = os.path.join(*config.mixturefolderSEQC2, 'bedfiles', 'LBx_BRP_hg19.bed')
bedfile = pd.read_csv(brp_bed_path, sep='\t', header=None)
bedfile.columns = ['chrom', 'startpos', 'endpos', 'gene']
bedfile['chrom'] = bedfile['chrom'].str.replace('chr', '')

In [None]:
# Keep only the ground truths whose position lies inside a bed interval of the same chromosome.
groundtruth_outside_bed = []
for idx, row in tqdm(groundtruths.iterrows(), total=groundtruths.shape[0]):
    pos = int(row['POS'])
    aux = bedfile[bedfile['chrom'] == str(row['CHROM'])]
    # Vectorized test over all intervals of the chromosome (both ends inclusive).
    # No interval for the chromosome means the position is outside the bed.
    in_bed = ((aux['startpos'] <= pos) & (aux['endpos'] >= pos)).any()
    if not in_bed:
        # The row's own index label is dropped later, so the key is not rebuilt from its parts.
        groundtruth_outside_bed.append(idx)
print(len(groundtruth_outside_bed))
groundtruths_in_bed = groundtruths.drop(groundtruth_outside_bed)
print(groundtruths_in_bed.shape[0])

In [None]:
# Overlap between the rows flagged TP in C and the ground truths inside the bed file.
tp_ids = set(C[C['TP'] == 1].index)
bed_truth_ids = set(groundtruths_in_bed.index)
print(len(tp_ids & bed_truth_ids))
# Ground truths inside the bed that are not flagged TP. They are sorted so the example
# printed below is the same on every run (set order is arbitrary).
lother = sorted(bed_truth_ids - tp_ids)
print(len(lother))
if len(lother) > 1:
    print(lother[1])
C[C['TP'] == 0].index

In [None]:
# Bed intervals of chromosome '10'.
on_chrom_10 = bedfile['chrom'] == '10'
bedfile.loc[on_chrom_10]

In [None]:
# Known negatives (germline mutations), indexed by '<chrom>_<startpos>' with the 'chr' prefix removed.
known_negatives_path = os.path.join(*config.mixturefolderSEQC2, 'ground_truths', 'KnownNegatives_hg19.bed')
germlinetruths = pd.read_csv(known_negatives_path, sep='\t', header=None)
germlinetruths.columns = ['chrom', 'startpos', 'endpos']
germlinetruths['chrom'] = germlinetruths['chrom'].str.replace('chr', '')
start_as_text = germlinetruths['startpos'].astype('str')
germlinetruths['chrom_pos'] = germlinetruths['chrom'].astype(str).str.cat(start_as_text, sep='_')
germlinetruths = germlinetruths.set_index('chrom_pos')



In [None]:
# Display the '<chrom>_<startpos>' labels of the known negatives.
germlinetruths.index

In [None]:
# 'chrom_pos' values of C that are also index labels of the known negatives.
called_positions = set(C['chrom_pos'].values)
called_positions.intersection(germlinetruths.index)

In [None]:
# Compare the rows flagged TP in C with the full ground-truth list.
tp_ids = set(C[C['TP'] == 1].index)
truth_ids = set(groundtruths.index)
print(len(tp_ids))
print(len(tp_ids & truth_ids))
# Rows that are ground truths but are not flagged TP.
non_tp_ids = set(C[C['TP'] != 1].index)
C.loc[list(non_tp_ids & truth_ids)]

In [None]:
# Precision/recall of the IDT calls at fixed VAF cut-offs for every replicate, restricted to
# calls with VAF <= 2.5%. The curves are drawn on the current axes and the per-replicate
# values are collected in `recallall` / `precisionall`.
m = 'IDT'
recallall, precisionall = [], []

A = pd.read_excel(os.path.join(*config.mixturefolderSEQC2, 'excelreport', '41587_2021_857_MOESM6_ESM.xlsx'),  m+'_Ef25', index_col=0)
# Drop the first three characters of every row label (presumably a 'chr' prefix - confirm).
A.index = [str(a[3:]) for a in list(A.index)]
print(A.shape)
print(A['FP'].sum(), A['TP'].sum(), A['Known position'].sum())

# Cut-off label -> VAF fraction (the 2.5% cut-off is not evaluated in this cell).
vaf_cutoffs = {'1.5%': 0.015, '1.0%': 0.01, '0.5%': 0.005, '0.4%': 0.004, '0.3%': 0.003,
               '0.25%': 0.0025, '0.2%': 0.002, '0.15%': 0.0015, '0.1%': 0.001}
thresholds = list(vaf_cutoffs)

for lab in ['05', '06']:
    # The call-table folders use lab '05' while the report column names use '04'. A separate
    # name is used for the report side so the loop variable is no longer modified inside the loop.
    report_lab = '04' if lab == '05' else lab
    for rep in ['1', '2', '3', '4']:
        run_id = m+'2_ST'+lab+'_25ng_LIB'+rep+'-P'
        sample = m+'2_Ef25_ST'+report_lab+'_R'+rep  # prefix of this replicate's columns

        calltable_snv, calltable_indel, calltable_snp = get_calltable(os.path.join(*config.mixturefolderSEQC2, 'SEQC2s_chrall', 'SEQC2s_chrall_'+run_id, 'SampleEflS_'+run_id), ['varnet', m], save=False, filter='all')
        calltable = pd.concat([calltable_snv, calltable_indel, calltable_snp])
        calltable.drop(['varnet', 'varnet_score', 'varnet_vaf', 'varnet_altcov', 'varnet_totcov'], axis=1, inplace=True)
        calltable['chrom_pos'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_')
        # B is the same object as calltable: the renaming below also applies to `calltable`.
        B = calltable
        B.rename(columns={m: sample+'_pass', m+'_score': sample+'_score', m+'_vaf': sample+'_vaf', m+'_altcov': sample+'_altcov',  m+'_totcov' : sample+'_totcov'}, inplace=True)
        B.index = B.index.astype(str)

        C = pd.concat([A, B], axis=1)

        print(C[C['SampleB'] == 1][sample+'_vaf'].min())
        print(C[C['SampleB']==1].shape[0])
        C = C[C['SampleB']!=1]
        # Rows without a VAF are dropped here as well (NaN <= 0.025 is False).
        # Explicit copy so the columns added below are written to an independent frame.
        C = C[C[sample+'_vaf'] <= 0.025].copy()
        print(C.shape)
        for label, cutoff in vaf_cutoffs.items():
            C[sample+'_vaf_filter_'+label] = (C[sample+'_vaf'] > cutoff)

        precision, recall = [], []
        is_tp = C['TP'] == 1
        is_fp = C['FP'] == 1
        for th in thresholds:
            passed = C[sample+'_vaf_filter_'+th] == 1
            TP = float((is_tp & passed).sum())
            FP = float((is_fp & passed).sum())
            FN = float((is_tp & ~passed).sum())
            print(TP, FP, FN)
            # Undefined ratios become NaN instead of raising ZeroDivisionError.
            p = TP/(TP + FP) if (TP + FP) > 0 else np.nan
            r = TP/(TP + FN) if (TP + FN) > 0 else np.nan
            precision.append(p)
            recall.append(r)
            print(th, round(p, 3), round(r, 3))
        plt.plot(recall, precision, '.-', label=m, c=config.colors[config.methods.index(m)])
        plt.xlim([0.0, 1.0])
        plt.legend(bbox_to_anchor=(1, 1), loc="upper left")

        recallall.append(recall)
        precisionall.append(precision)

In [None]:
# Rows whose report flag is 1 although no VAF was measured, for the replicate named by the
# current values of `m`, `lab` and `rep`.
flag_col = m+'2_Ef25_ST'+lab+'_R'+rep
vaf_col = flag_col+"_vaf"
C.loc[(C[flag_col] == 1) & C[vaf_col].isna(), [flag_col, vaf_col]]

In [None]:
# First 50 rows of C on chromosome '12'.
on_chrom_12 = C['chrom'] == '12'
C.loc[on_chrom_12].head(50)

In [None]:
# Rows that have a measured VAF but no report flag, for the replicate named by the current
# values of `m`, `lab` and `rep`.
flag_col = m+'2_Ef25_ST'+lab+'_R'+rep
vaf_col = flag_col+"_vaf"
C.loc[C[flag_col].isna() & C[vaf_col].notna(), [flag_col, vaf_col]]

In [None]:
# Average PR curve over the replicates collected in `recallall` / `precisionall`.
fig, ax = plt.subplots(figsize=(10, 10))
mean_recall = np.mean(recallall, axis=0)
mean_precision = np.mean(precisionall, axis=0)
print(mean_recall, mean_precision)
# The first cut-off is left out of the drawn curve; the y-range is left automatic.
ax.plot(mean_recall[1:], mean_precision[1:], '.-', label=m, c=config.colors[config.methods.index(m)])
ax.set_xlim([0.0, 1.0])
ax.legend(bbox_to_anchor=(1, 1), loc="upper left")
ax.set_title(m+' PR curve average')

In [None]:
# Smallest VAF among the rows with SampleB == 1, for the replicate named by `m`, `lab`, `rep`.
vaf_col = m+'2_Ef25_ST'+lab+'_R'+rep+'_vaf'
C.loc[C['SampleB'] == 1, vaf_col].min()

In [None]:
# Disagreements between the report flag and the measured VAF for BRP2_Ef25_ST25_R1.
# Only the second selection is rendered; the first one is evaluated and discarded.
flag = C['BRP2_Ef25_ST25_R1']
vaf_missing = C['BRP2_Ef25_ST25_R1_vaf'].isna()
C.loc[(flag == 0) & ~vaf_missing]  # flag 0 but a VAF is present
C.loc[(flag == 1) & vaf_missing]   # flag 1 but no VAF

In [None]:
# Precision/recall of the IDT calls (filter='all') at fixed VAF cut-offs: one figure per
# replicate, then the curve averaged over the replicates.
res = {}
m = 'IDT'
precisionall, recallall = [], []

# The sheet is the same for every replicate: read and re-index it once.
report = pd.read_excel(os.path.join(*config.mixturefolderSEQC2, 'excelreport', '41587_2021_857_MOESM6_ESM.xlsx'),  m+'_Ef25', index_col=0)
# Drop the first three characters of every row label (presumably a 'chr' prefix - confirm).
report.index = [str(a[3:]) for a in list(report.index)]

# Cut-off label -> VAF fraction, from the strictest to the loosest cut-off.
vaf_cutoffs = {'2.5%': 0.025, '1.5%': 0.015, '1.0%': 0.01, '0.5%': 0.005, '0.4%': 0.004,
               '0.3%': 0.003, '0.25%': 0.0025, '0.2%': 0.002, '0.15%': 0.0015, '0.1%': 0.001}
thresholds = list(vaf_cutoffs)

for lab in ['05', '06']:
    for rep in ['1', '2', '3', '4']:
        # The report columns use lab '04' where the call-table folders use '05'.
        laba = lab if lab == '06' else '04'
        A = report[['On-target', 'Known position', 'TP', 'FP', 'SampleB', m+'2_Ef25_ST'+laba+'_R'+rep]]
        print(A.shape)
        # load table
        calltable_snv, calltable_indel, calltable_snp = get_calltable(os.path.join(*config.mixturefolderSEQC2, 'SEQC2s_chrall', 'SEQC2s_chrall_IDT2_ST'+lab+'_25ng_LIB'+rep+'-P', 'SampleEflS_IDT2_ST'+lab+'_25ng_LIB'+rep+'-P'), ['varnet', m], save=False, filter='all')
        calltable = pd.concat([calltable_snv, calltable_indel, calltable_snp])
        calltable['chrom_pos'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_')
        B = calltable[[m, m+'_score', m+'_vaf', m+'_altcov', m+'_totcov']]
        B.index = B.index.astype(str)
        print(B.shape)
        C = pd.concat([A, B], axis=1)
        print(C.shape)

        # Each replicate is evaluated on its own. The former accumulation into `Call`
        # was overwritten on the next line and has been removed.
        Call = C
        for label, cutoff in vaf_cutoffs.items():
            # Rows without a VAF compare as False, i.e. they never pass a cut-off.
            Call[m+'_vaf_filter_'+label] = (Call[m+'_vaf'] > cutoff)

        fig, ax = plt.subplots(figsize=(10, 10))
        precision, recall = [], []
        Call = Call[Call['SampleB'] == 0]
        is_tp = Call['TP'] == 1
        is_fp = Call['FP'] == 1
        for th in thresholds:
            passed = Call[m+'_vaf_filter_'+th] == 1
            TP = float((is_tp & passed).sum())
            FP = float((is_fp & passed).sum())
            FN = float((is_tp & ~passed).sum())
            print(TP, FP, FN)
            # Undefined ratios become NaN instead of raising ZeroDivisionError.
            p = TP/(TP + FP) if (TP + FP) > 0 else np.nan
            r = TP/(TP + FN) if (TP + FN) > 0 else np.nan
            precision.append(p)
            recall.append(r)
            print(th, p, r)
        print(thresholds)
        plt.plot(recall, precision, '.-', label=m, c=config.colors[config.methods.index(m)])
        plt.ylim([0.97, 1.0001])
        plt.xlim([0.0, 1.0])
        plt.legend(bbox_to_anchor=(1, 1), loc="upper left")

        recallall.append(recall)
        precisionall.append(precision)

# Average curve over the replicates (rows = replicates, columns = cut-offs); NaN entries are ignored.
precisionall = np.array(precisionall)
recallall = np.array(recallall)
print(recallall.shape, precisionall.shape)
fig, ax = plt.subplots(figsize=(10, 10))
mean_recall = np.nanmean(recallall, axis=0)
mean_precision = np.nanmean(precisionall, axis=0)
print(mean_recall, mean_precision)
plt.plot(mean_recall, mean_precision, '.-', label=m, c=config.colors[config.methods.index(m)])
plt.ylim([0.97, 1.0001])
plt.xlim([0.0, 1.0])
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
plt.title(m+' PR curve average')

In [None]:
# First 50 rows of C that have a value in the 'Known position' column.
has_known_position = C['Known position'].notna()
C.loc[has_known_position].head(50)

In [None]:
# First 50 rows of Call with a measured IDT VAF and SampleB == 0.
has_vaf = Call['IDT_vaf'].notna()
not_in_sample_b = Call['SampleB'] == 0
Call.loc[has_vaf & not_in_sample_b].head(50)

In [None]:
# Rows of Call whose IDT VAF is at most 2.5%.
low_vaf = Call['IDT_vaf'] <= 0.025
Call.loc[low_vaf]

In [None]:
# 'truth' values of the call-table rows where 'truth' is False.
# NOTE(review): no visible cell adds a 'truth' column to `calltable` - confirm where it comes from.
truth_is_false = calltable['truth'] == False
calltable.loc[truth_is_false, 'truth']

In [None]:
# Display the cut-offs left in `thresholds` by the last cell that assigned it.
thresholds

In [None]:
# One precision-recall curve per method on a shared figure. Labels come from the 'TP' column
# of the method's Excel sheet and scores are the VAF of the PASS calls (0 when no call).
res = {}

# method -> [call-table sample folder, column of the Excel report for that sample]
# NOTE(review): other cells of this notebook load IDT from a folder spelled 'SampleEflS_...'
# and read the report column 'IDT2_Ef25_ST04_R*' for lab 05 - confirm which spelling/column is right.
method_sampleid_dict = {
    'BRP': ['SampleEf_BRP2_ST26_25ng_LIB1-P', 'BRP2_Ef25_ST26_R1'],
    'IDT': ['SampleEfIS_IDT2_ST05_25ng_LIB1-P', 'IDT2_Ef25_ST05_R1'],
    'ILM': ['SampleEf_ILM2_ST29_25ng_LIB1-P', 'ILM2_Ef25_ST29_R1'],
    'ROC': ['SampleEf_ROC2_ST21_25ng_LIB1-P','ROC2_Ef25_ST21_R1'],
    'TFS': ['SampleEf_TFS2_ST24_25ng_LIB1-P','TFS2_Ef25_ST24_R1'],
}

# Consecutive edges of the (low, high] bins used for the binned VAF column.
vaf_range_edges = [0.001, 0.002, 0.003, 0.005, 0.025]

fig, ax = plt.subplots(figsize=(10, 10))


for method in ['BRP', 'ROC', 'ILM', 'IDT']:

    print('######## '+ method + ' ########')

    sample_folder, report_column = method_sampleid_dict[method]

    # Sheet name is '<method>_<second token of the report column>', e.g. 'BRP_Ef25'.
    A = pd.read_excel(os.path.join(*config.mixturefolderSEQC2, 'excelreport', '41587_2021_857_MOESM6_ESM.xlsx'), method + '_' + report_column.split('_')[1], index_col=0)
    # Drop the first three characters of every row label (presumably a 'chr' prefix - confirm).
    A.index = [str(a[3:]) for a in list(A.index)]
    A = A[['On-target', 'Known position', 'TP', 'FP', 'SampleB', report_column]]
    print(A.shape)

    # load table
    calltable_snv, calltable_indel, calltable_snp = get_calltable(os.path.join(*config.mixturefolderSEQC2, 'SEQC2s_chrall', 'SEQC2s_chrall_'+'_'.join(sample_folder.split('_')[1:]), sample_folder), ['varnet', method], save=False, filter='PASS')
    calltable = pd.concat([calltable_snv, calltable_indel, calltable_snp])
    if method == 'ROC' or method == 'ILM':
        # For these two methods the first three characters of the call-table index are dropped as well.
        calltable.index = calltable.index.astype(str).str[3:]
    calltable['chrom_pos'] = calltable['chrom'].astype(str).str.cat(calltable['pos'].astype('str'), sep='_')

    B = calltable[[method, method+'_score', method+'_vaf', method+'_altcov', method+'_totcov']]
    B.index = B.index.astype(str)
    print(B.shape)

    # Column-wise concat aligns report rows and call rows on their index labels.
    C = pd.concat([A, B], axis=1)
    # Explicit copy: the assignments below must write to C itself, not to a view of the concat.
    C = C[C['SampleB'] != 1].copy()
    # Plain assignment instead of chained fillna(inplace=True), which does not
    # modify C under pandas Copy-on-Write.
    C['TP'] = C['TP'].fillna(0)
    C[method] = C[method].fillna(True)
    C[method+'_vaf'] = C[method+'_vaf'].fillna(0)
    C['truth'] = C["TP"]
    print(C.shape)
    C[method+'_vaf_range'] = pd.cut(C[method+'_vaf'], [0.001, 0.002, 0.003, 0.005, 0.025])

    # Binned VAF (printed only): the bin's upper edge inside (0.001, 0.025],
    # 0 for VAF <= 0.001 and for VAF > 0.025.
    vaf = C[method+'_vaf']
    C[method+'_vaf_range_num'] = 0.0
    for low, high in zip(vaf_range_edges[:-1], vaf_range_edges[1:]):
        C.loc[(vaf > low) & (vaf <= high), method+'_vaf_range_num'] = high
    print(C[method+'_vaf_range_num'].value_counts())

    precision, recall, thresholds = precision_recall_curve(C['truth'], C[method+'_vaf'])

    # Positions of the thresholds closest to VAF 0.1% and 2.5%; only that part of the curve is drawn.
    idxa = int(np.argmin(np.abs(thresholds - 0.001)))
    idxb = int(np.argmin(np.abs(thresholds - 0.025)))
    if method == 'IDT':
        # For IDT the curve is drawn over all thresholds.
        idxa, idxb = 0, len(thresholds)

    print(thresholds[idxa:idxb])

    plot_pr_curve(precision[idxa:idxb], recall[idxa:idxb], estimator_name=method, f1_score=None, figax=(fig, ax), kwargs={'color': config.colors[config.methods.index(method)]})
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")

In [None]:
# Display the call table left by the last cell that assigned `calltable`.
calltable

In [None]:
# Recall values on the plotted part of the curve (between positions idxa and idxb).
recall[idxa:idxb]

In [None]:
# Rows of C whose IDT VAF lies in (0, 0.025].
idt_vaf = C['IDT_vaf']
C.loc[(idt_vaf > 0) & (idt_vaf <= 0.025)]

In [None]:
# ILM VAF by TP flag.
# NOTE(review): needs a C that carries the 'ILM_vaf' column - confirm which cell must run before this one.
tp_and_vaf = C[['TP', 'ILM_vaf']]
sns.catplot(data=tp_and_vaf, x='TP', y='ILM_vaf')

In [None]:
# Display the current merged report/call table.
C

In [None]:
# Display the current merged report/call table (same expression as the previous cell).
C

In [None]:
# Sweep every observed BRP VAF in (0.001, 0.025] as a calling cut-off and print, for each one,
# precision, the Neg-normalised precision and recall.
# NOTE(review): precision_score / recall_score are not among the names imported explicitly at
# the top of the notebook; presumably they come from one of the `utils` star imports - confirm.

# Plain assignment instead of chained fillna(inplace=True), which does not
# modify C under pandas Copy-on-Write.
C['TP'] = C['TP'].fillna(0)
C['BRP2_Ef25_ST25_R1'] = C['BRP2_Ef25_ST25_R1'].fillna(1)
C['BRP'] = C['BRP'].fillna(True)
C['BRP_vaf'] = C['BRP_vaf'].fillna(0)

# Number of on-target rows flagged SampleB; it does not depend on the cut-off.
Neg = C[(C['On-target']==1) & (C['SampleB']==1)].shape[0]

for vaf in np.sort(C['BRP_vaf'].unique()):
    if vaf > 0.001 and vaf <= 0.025:
        # Calls below the current cut-off are treated as not called.
        C['BRP_tmp'] = C['BRP'].copy()
        C.loc[C['BRP_vaf'] < vaf, 'BRP_tmp'] = False
        P = precision_score(C['TP'], C['BRP_tmp'])
        R = recall_score(C['TP'], C['BRP_tmp'])
        called = C['BRP_tmp'] == True
        #TP
        TP = C[(C['TP'] == 1) & called].shape[0]
        #FP
        FP = C[((C['TP'] == 0) & called) | ((C['FP'] == 1) & called)].shape[0]
        # Undefined when Neg is 0 or nothing is called: report NaN instead of raising.
        Pbis = TP/(TP+(FP/Neg)) if Neg and (TP or FP) else float('nan')
        print(vaf, P, Pbis,  R)

In [None]:
# Display the TP flags next to the thresholded BRP calls left by the last sweep iteration.
C['TP'], C['BRP_tmp']

In [None]:
# Rows flagged 1 in the report for BRP2_Ef25_ST25_R1 that have no BRP VAF (noted as weird by the author).
vaf_missing = C['BRP_vaf'].isna()
flagged = C['BRP2_Ef25_ST25_R1'] == 1
C.loc[vaf_missing & flagged]

In [None]:
# Rows without a report entry count as non-TP (0) and as FP (1).
# Plain assignment instead of chained fillna(inplace=True), which does not
# modify C under pandas Copy-on-Write.
C['TP'] = C['TP'].fillna(0)
C['FP'] = C['FP'].fillna(1)

In [None]:
# Mark calls with VAF below 0.1% as not called.
# C[mask]['BRP'] = False assigned into a temporary copy and left C unchanged;
# .loc writes to C itself.
C.loc[C['BRP_vaf'] < 0.001, 'BRP'] = False

In [None]:
# Confusion counts of the BRP calls against the report flags.
called = C['BRP'] == True
not_called = C['BRP'] == False
is_tp = C['TP'] == 1
# A row counts as negative when it is not a TP or is flagged FP.
is_negative = (C['TP'] == 0) | (C['FP'] == 1)

TP = int((is_tp & called).sum())
FN = int((is_tp & not_called).sum())
FP = int((is_negative & called).sum())
TN = int((is_negative & not_called).sum())

print(TP, FN, FP, TN)

In [None]:
# Precision, recall and the Neg-normalised precision from the TP/FP/FN counts.
# Neg is the number of on-target rows flagged SampleB.
Neg = C[(C['On-target']==1) & (C['SampleB']==1)].shape[0]
# Ratios with an empty denominator are reported as NaN instead of raising ZeroDivisionError.
precisionbis = TP/(TP+(FP/Neg)) if Neg and (TP + FP) else float('nan')
recall = TP/(TP+FN) if (TP + FN) else float('nan')
precision = TP/(TP+FP) if (TP + FP) else float('nan')
print(precision, recall, precisionbis)

In [None]:
# Number of on-target rows flagged SampleB.
on_target_sample_b = (C['On-target'] == 1) & (C['SampleB'] == 1)
Neg = C[on_target_sample_b].shape[0]
print(Neg)

In [None]:
# Shapes of the merged table, the report table and the call table.
C.shape, A.shape, B.shape

In [None]:
# Sum of the report column for replicate BRP2_Ef25_ST25_R1.
C['BRP2_Ef25_ST25_R1'].sum()

In [None]:
# Rows of C where BRP is False.
not_called = C["BRP"] == False
C.loc[not_called]