In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
from collections import Counter
from matplotlib import cm
from matplotlib.colors import ListedColormap

import io
from pysam import VariantFile

from sklearn.metrics import precision_score, recall_score, average_precision_score, precision_recall_curve

In [None]:
# Build an 8-entry colormap: four colours sampled from 'tab10' at full opacity,
# followed by the same four colours with their alpha channel lowered to 0.3.
# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was deprecated
# in Matplotlib 3.7 and removed in 3.9; both resample the colormap to `lut` entries.
tab20 = plt.get_cmap('tab10', 8)  # historical name kept; this is 'tab10' resampled to 8 entries
base_colors = [tab20.colors[i] for i in range(4)]
# Drop the last (alpha) component of each colour and append the lower alpha value.
faded_colors = [np.array(list(color[:-1]) + [0.3]) for color in base_colors]
newcmap_list = base_colors + faded_colors

newcmap = ListedColormap(newcmap_list, name='newcmap')
color_list = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple']

In [None]:
def read_vcf(path):
    """Load the records of a VCF file into a pandas DataFrame.

    Lines starting with '##' are discarded; everything else (the '#CHROM'
    column-header row and the records) is parsed as tab-separated text.

    Parameters
    ----------
    path : str
        Path to a plain-text VCF file.

    Returns
    -------
    pandas.DataFrame
        One row per record, with the '#CHROM' column renamed to 'CHROM'.
        'POS' is parsed as int; the other fixed VCF columns are kept as str.
    """
    column_types = {
        '#CHROM': str, 'POS': int, 'ID': str, 'REF': str,
        'ALT': str, 'QUAL': str, 'FILTER': str, 'INFO': str,
    }
    buffer = io.StringIO()
    with open(path, 'r') as handle:
        for line in handle:
            if line.startswith('##'):
                continue  # skip '##' meta-information lines
            buffer.write(line)
    buffer.seek(0)
    table = pd.read_csv(buffer, dtype=column_types, sep='\t')
    return table.rename(columns={'#CHROM': 'CHROM'})

def _info_value(info, key):
    """Return the value of the exact `key` in a ';'-separated INFO string.

    Entries are matched on the full key name, so asking for 'AF' does not pick
    up 'POPAF=' or 'HIAF=', and 'TYPE' does not pick up 'SVTYPE='. The first
    matching entry wins. Returns np.nan when the key is absent or when `info`
    is not a string (e.g. a missing INFO column value).
    """
    if not isinstance(info, str):
        return np.nan
    prefix = key + '='
    for entry in info.split(';'):
        if entry.startswith(prefix):
            return entry[len(prefix):]
    return np.nan


def foo(x):
    """Extract the CALLERS value from an INFO string as a one-element Series (NaN if absent)."""
    return pd.Series(_info_value(x, 'CALLERS'))


def foo2(x):
    """Extract the TYPE value from an INFO string as a one-element Series (NaN if absent)."""
    return pd.Series(_info_value(x, 'TYPE'))


def foo3(x):
    """Extract the AF value from an INFO string as a one-element Series (NaN if absent).

    The value is returned as the raw string found in INFO; callers convert it to float.
    """
    return pd.Series(_info_value(x, 'AF'))

# Select plasma sample

In [None]:
# Sample ID whose identifiers are used by the cells below.
sample = '809'
#sample = '986'

# Identifiers per supported sample ID, in the order
# (plasmasample1, plasmasample2, tumorsample1).
# NOTE(review): judging by the names these are two plasma samples and one
# tumor sample of the same patient -- confirm against the data layout.
sample_identifiers = {
    '809': ('809_110914', '809_030915', '809_290714-T1W'),
    '986': ('986_100215', '986_261016', '986_100215-T1W'),
}

# Fail loudly on an unknown ID instead of silently leaving the identifiers
# undefined (or stale from a previous run of this cell).
if sample not in sample_identifiers:
    raise ValueError(
        "Unknown sample {!r}; expected one of {}".format(sample, sorted(sample_identifiers))
    )

plasmasample1, plasmasample2, tumorsample1 = sample_identifiers[sample]

# Load SNV calls for plasma sample and matching mixed samples

In [None]:
# Run-directory tags of the dilution series, in the order sample_0 .. sample_4.
DILUTION_TAGS = ['1-0', '1-05775', '075-06738', '05-07701', '025-08663']
# Caller names searched for in the INFO field (one boolean column per caller).
CALLER_NAMES = ['freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2']
# Columns kept in the position-indexed per-sample tables.
SAMPLE_COLUMNS = ['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'type', 'VAF'] + CALLER_NAMES


def load_annotated_calls(tag):
    """Read the ensemble-annotated VCF of one dilution run and add derived columns.

    Parameters
    ----------
    tag : str
        Run tag appended to "CRC-<plasmasample1>-" to form the run name.

    Returns
    -------
    pandas.DataFrame
        The VCF records plus 'callers', 'type' and 'VAF' (parsed from INFO via
        foo, foo2 and foo3) and one boolean column per entry of CALLER_NAMES.
    """
    run_name = "CRC-" + plasmasample1 + "-" + tag
    calls = read_vcf("../data/2015-07-31_" + run_name + "/" + run_name + "-ensemble-annotated.vcf")

    calls['callers'] = calls['INFO'].apply(foo)
    calls['type'] = calls['INFO'].apply(foo2)
    calls['VAF'] = calls['INFO'].apply(foo3)

    # Harmonise indel labels. .loc is used instead of chained indexing
    # (df['type'][mask] = ...), which raises SettingWithCopyWarning and does not
    # modify the frame when pandas copy-on-write is enabled.
    calls.loc[calls['type'].isin(['Deletion', 'del']), 'type'] = 'DEL'
    calls.loc[calls['type'].isin(['Insertion', 'ins']), 'type'] = 'INS'

    # Flag each caller whose name appears anywhere in the INFO string.
    for caller in CALLER_NAMES:
        calls[caller] = calls['INFO'].str.contains(caller)
    return calls


def index_by_position(calls):
    """Return the SAMPLE_COLUMNS of `calls`, indexed by a '<CHROM>_<POS>' key."""
    # Explicit copy so that adding CHROM_POS never writes into a view of `calls`.
    subset = calls[SAMPLE_COLUMNS].copy()
    subset['CHROM_POS'] = subset['CHROM'].astype('str').str.cat(subset['POS'].astype('str'), sep="_")
    return subset.set_index('CHROM_POS')


vcf_pd_0, vcf_pd_1, vcf_pd_2, vcf_pd_3, vcf_pd_4 = [
    load_annotated_calls(tag) for tag in DILUTION_TAGS
]

sample_0, sample_1, sample_2, sample_3, sample_4 = [
    index_by_position(calls) for calls in (vcf_pd_0, vcf_pd_1, vcf_pd_2, vcf_pd_3, vcf_pd_4)
]

sample_4.head()

In [None]:
# tumor fraction estimation
# For every dilution run, estimated_tf.txt is parsed with pd.read_csv and the
# first column label of the resulting frame (i.e. the first field of the file's
# first line) is converted to float.
tf_run_tags = ['1-0', '1-05775', '075-06738', '05-07701', '025-08663']

samples_tf = {}
for idx, run_tag in enumerate(tf_run_tags):
    tf_table = pd.read_csv("../data/2015-07-31_CRC-" + plasmasample1 + "-" + run_tag + "/estimated_tf.txt")
    first_label = list(tf_table.columns)[0]
    samples_tf['sample_' + str(idx)] = float(first_label)

print(samples_tf)

# Plot VAF distribution

In [None]:
# Pool the VAF values of all dilutions into one long table (one row per call,
# labelled with its sample name) and overlay the per-sample distributions.
VAF = {'VAF': [], 'sample': []}
# The loop variable is deliberately not called `sample`: that name holds the
# selected sample ID at notebook level and must not be overwritten by a DataFrame.
for i, sample_calls in enumerate([sample_0, sample_1, sample_2, sample_3, sample_4]):
    vaf_values = list(sample_calls['VAF'].astype(float).values)
    VAF['VAF'].extend(vaf_values)
    VAF['sample'].extend(['sample_' + str(i)] * len(vaf_values))
VAF_pd = pd.DataFrame.from_dict(VAF)

plt.figure(figsize=(15, 8))
# common_norm=False: each sample's histogram is normalised on its own, so bar
# heights are the fraction of that sample's calls falling in each 0.02-wide bin.
sns.histplot(data=VAF_pd, x='VAF', hue="sample", element="step",
             palette=sns.color_palette("rocket", n_colors=5),
             binwidth=0.02, stat="probability", common_norm=False)
plt.title('Dilution effect on Variant Allele Frequency (VAF) distribution')
plt.ylim([0, 1])
plt.xlim([0, 1]);

In [None]:
# One VAF histogram per dilution, stacked vertically; each legend shows the
# sample name and its estimated tumor fraction (from samples_tf) in percent.
fig, axs = plt.subplots(5,1,figsize=(8, 15))
fig.suptitle('Dilution effect on Variant Allele Frequency (VAF) distribution')

palette = sns.color_palette("rocket", n_colors=5)  # built once, one colour per sample
# The loop variable is deliberately not called `sample`: that name holds the
# selected sample ID at notebook level and must not be overwritten by a DataFrame.
for i, sample_calls in enumerate([sample_0, sample_1, sample_2, sample_3, sample_4]):
    sample_name = 'sample_'+str(i)
    sns.histplot(sample_calls['VAF'].astype(float), ax=axs[i],
            label=sample_name+', tf={0:.2f}%'.format(100*samples_tf[sample_name]),
            color=palette[i],
            binwidth=0.02)
    axs[i].legend()
    # Same axis limits on every panel so the panels are directly comparable.
    axs[i].set_ylim([0, 200])
    axs[i].set_xlim([0, 1])

In [None]:
# Scatter of VAF against the POS column for every dilution, one colour per sample.
# NOTE(review): POS is plotted as-is, so calls from different chromosomes share
# the same x axis -- confirm this is intended.
plt.figure(figsize=(15,5))
palette = sns.color_palette("rocket", n_colors=5)  # built once, one colour per sample
# The loop variable is deliberately not called `sample`: that name holds the
# selected sample ID at notebook level and must not be overwritten by a DataFrame.
for i, sample_calls in enumerate([sample_0, sample_1, sample_2, sample_3, sample_4]):
    sample_name = 'sample_'+str(i)
    plt.plot(sample_calls['POS'], sample_calls['VAF'].astype(float), '.',
             color=palette[i],
             label=sample_name+', tf={0:.2f}%'.format(100*samples_tf[sample_name]))
plt.xlabel('POS')
plt.ylabel('VAF')
plt.legend();

In [None]:
# Show the CHROM_POS-indexed call table of the first dilution as the cell output.
sample_0

In [None]:
# Compare the VAF of positions called in all five dilutions.
# Column labels are the estimated tumor fractions rounded to 3 decimals.
roundtfvalues = [round(tf, 3) for tf in samples_tf.values()]
if len(set(roundtfvalues)) != len(roundtfvalues):
    # Duplicate labels would make df[label] return several columns and
    # silently corrupt the comparison below.
    raise ValueError("Rounded tumor fractions are not unique: {}".format(roundtfvalues))

# Inner join on the CHROM_POS index keeps only positions present in every sample.
vcf_sharedpos_df = pd.concat([sample_0, sample_1, sample_2, sample_3, sample_4], axis=1, join="inner")
print(vcf_sharedpos_df.shape)
# After the concat every sample contributes a 'VAF' column, so this selects all five.
vcf_sharedpos_df = vcf_sharedpos_df[['VAF']].astype(float)
vcf_sharedpos_df.columns = roundtfvalues

# Positions whose VAF differs between the first two samples of the series.
# Note that NaN != NaN, so a position with missing VAF in both counts as differing.
vaf_differs = vcf_sharedpos_df[roundtfvalues[0]] != vcf_sharedpos_df[roundtfvalues[1]]
a = vcf_sharedpos_df.shape[0]  # number of shared positions
b = int(vaf_differs.sum())     # number of shared positions with differing VAF
# Guard against division by zero when no position is shared by all samples.
print(a, b, 100*b/a if a else float('nan'))

# Long format for plotting: one row per (tumor burden, CHROM_POS) pair.
vcf_sharedpos_diff_df = vcf_sharedpos_df[vaf_differs].T
vcf_sharedpos_diff_df.index.name = 'tumor burden'
vcf_sharedpos_diff_df = vcf_sharedpos_diff_df.reset_index()
vcf_sharedpos_diff_df = pd.melt(vcf_sharedpos_diff_df, id_vars=['tumor burden'],
                                value_vars=vcf_sharedpos_diff_df.columns[1:],
                                var_name='CHROM_POS', value_name='VAF')

# Point plot of VAF per tumor burden, ordered from highest to lowest burden.
sns.catplot(x="tumor burden", y="VAF",
            data=vcf_sharedpos_diff_df, order=sorted(roundtfvalues, reverse=True), kind="point")
vcf_sharedpos_diff_df.head(10)

- Benchmarking results for germline SNVs
- Benchmarking results for somatic SNVs on exome data.
- Results averaged over the four replicates

- TODO: add fake mutations to healthy mixtures