In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
from collections import Counter
from matplotlib import cm
from matplotlib.colors import ListedColormap

import io
from pysam import VariantFile

from sklearn.metrics import precision_score, recall_score, average_precision_score, precision_recall_curve

In [None]:
# Build an 8-entry colormap: four base colours followed by the same four
# colours with their alpha channel set to 0.3.
# `matplotlib.cm.get_cmap` was deprecated in matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` takes the same (name, lut) arguments and is still available.
# NOTE(review): despite its name, `tab20` holds the 'tab10' palette resampled
# to 8 entries, so its first four colours are not simply tab10's first four.
tab20 = plt.get_cmap('tab10', 8)
newcmap_list = [tab20.colors[i] for i in range(4)]
# Same RGB values, alpha replaced by 0.3.
newcmap_list += [np.array(list(tab20.colors[i][:-1]) + [0.3]) for i in range(4)]

newcmap = ListedColormap(newcmap_list, name='newcmap')
color_list = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple']

In [None]:
def read_vcf(path):
    """Load a plain-text VCF file into a DataFrame.

    Lines starting with '##' (meta-information) are skipped; the remaining
    lines, including the '#CHROM ...' header line, are parsed as a
    tab-separated table. The '#CHROM' column is renamed to 'CHROM'.

    Parameters
    ----------
    path : str
        Path of the VCF file, opened in text mode.

    Returns
    -------
    pandas.DataFrame
        One row per record; 'POS' is parsed as int, the other fixed VCF
        columns as str.
    """
    column_types = {
        '#CHROM': str,
        'POS': int,
        'ID': str,
        'REF': str,
        'ALT': str,
        'QUAL': str,
        'FILTER': str,
        'INFO': str,
    }
    buffer = io.StringIO()
    with open(path, 'r') as handle:
        for line in handle:
            if line.startswith('##'):
                continue
            buffer.write(line)
    buffer.seek(0)
    table = pd.read_csv(buffer, dtype=column_types, sep='\t')
    return table.rename(columns={'#CHROM': 'CHROM'})

def _info_value(info, key):
    """Return the value of the exact `key` in a ';'-separated INFO string, or NaN.

    Matching whole 'KEY=' entries avoids picking up keys that merely end with
    the same letters (e.g. 'MAF=' when looking for 'AF=', or 'SVTYPE=' when
    looking for 'TYPE=').
    """
    prefix = key + '='
    for entry in info.split(';'):
        if entry.startswith(prefix):
            return entry[len(prefix):]
    return np.nan


def foo(x):
    """Extract the CALLERS value of INFO string `x` as a one-element Series (NaN if absent)."""
    return pd.Series(_info_value(x, 'CALLERS'))


def foo2(x):
    """Extract the TYPE value of INFO string `x` as a one-element Series (NaN if absent)."""
    return pd.Series(_info_value(x, 'TYPE'))


def foo3(x):
    """Extract the AF value of INFO string `x` as a one-element Series (NaN if absent).

    The value is returned as the raw string found in INFO; callers convert it
    to float themselves.
    """
    return pd.Series(_info_value(x, 'AF'))

In [None]:
# Global plot styling for the whole notebook. The calls are applied in
# sequence, so later ones take effect on top of earlier ones.
# Seaborn theme: "darkgrid" style, "talk" context, line width 2.
sns.set(style="darkgrid", context="talk", rc={"lines.linewidth": 2})
# Then apply matplotlib's "dark_background" style sheet.
plt.style.use("dark_background")
# Thin, half-transparent grid lines.
plt.rcParams.update({"grid.linewidth":0.5, "grid.alpha":0.5})
# Use seaborn's "deep" palette as the current colour palette.
sns.set_palette("deep")

# Select plasma sample

In [None]:
# Patient to analyse; must be one of the keys of SAMPLE_IDS below.
sample = '809'
#sample = '986'

# Per-patient identifiers, in the order (plasmasample1, plasmasample2, tumorsample1).
SAMPLE_IDS = {
    '809': ('809_110914', '809_030915', '809_290714-T1W'),
    '986': ('986_100215', '986_261016', '986_100215-T1W'),
}

# Fail here rather than with a NameError in a later cell when the id is unknown.
if sample not in SAMPLE_IDS:
    raise ValueError(
        "Unknown sample {!r}; expected one of {}".format(sample, sorted(SAMPLE_IDS))
    )

plasmasample1, plasmasample2, tumorsample1 = SAMPLE_IDS[sample]

# Load SNV calls for plasma sample and matching mixed samples

In [None]:
# Dilution tags as they appear in the result directory names; index k in this
# list corresponds to vcf_pd_k / sample_k.
DILUTION_TAGS = ['1-0', '1-0.72', '0.75-0.765', '0.5-0.81', '0.25-0.855', '0.125-0.875']
# Caller names looked up (as substrings) in the INFO field.
CALLER_NAMES = ['freebayes', 'vardict', 'varscan', 'mutect2', 'strelka2']
# Columns kept in the per-sample tables (sample_0 additionally keeps 'FILTER').
SAMPLE_COLUMNS = ['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'type', 'VAF'] + CALLER_NAMES


def dilution_vcf_path(plasma_id, tag):
    """Return the ensemble-annotated VCF path for one dilution of `plasma_id`.

    The directory name uses the tag verbatim, whereas the file name uses the
    tag with '.' replaced by '_'.
    """
    return ("../data/calls_dilution/2015-07-31_CRC-" + plasma_id + "-" + tag
            + "_filtered/CRC-" + plasma_id + "-" + tag.replace('.', '_')
            + "_filtered-ensemble-annotated.vcf")


def annotate_calls(vcf):
    """Add 'callers', 'type', 'VAF' and one boolean column per caller to `vcf`.

    The frame is modified in place and also returned. 'type' is derived from
    the ALT/REF length difference ('SNV' / 'INS' / 'DEL') and is then
    overridden with 'SNP' for records whose ID contains 'rs'.
    """
    vcf['callers'] = vcf['INFO'].apply(foo)

    length_diff = vcf['ALT'].str.len() - vcf['REF'].str.len()
    # Object column so string labels can be stored without a dtype upcast.
    vcf['type'] = pd.Series(np.nan, index=vcf.index, dtype=object)
    # Single-step .loc assignment: chained `df['type'][mask] = ...` does not
    # write back to the frame when pandas Copy-on-Write is active.
    vcf.loc[length_diff == 0, 'type'] = 'SNV'
    vcf.loc[length_diff > 0, 'type'] = 'INS'
    vcf.loc[length_diff < 0, 'type'] = 'DEL'
    # na=False keeps the mask boolean even if an ID value is missing.
    vcf.loc[vcf['ID'].str.contains('rs', na=False), 'type'] = 'SNP'

    vcf['VAF'] = vcf['INFO'].apply(foo3)
    for caller in CALLER_NAMES:
        vcf[caller] = vcf['INFO'].str.contains(caller)
    return vcf


def position_indexed(vcf, columns):
    """Return a copy of `vcf[columns]` indexed by 'CHROM_POS' ('<CHROM>_<POS>')."""
    table = vcf[columns].copy()
    table['CHROM_POS'] = table['CHROM'].astype('str').str.cat(table['POS'].astype('str'), sep="_")
    return table.set_index('CHROM_POS')


vcf_pd_0, vcf_pd_1, vcf_pd_2, vcf_pd_3, vcf_pd_4, vcf_pd_5 = [
    annotate_calls(read_vcf(dilution_vcf_path(plasmasample1, tag)))
    for tag in DILUTION_TAGS
]

# sample_0 is the only table that also carries the FILTER column.
sample_0 = position_indexed(
    vcf_pd_0,
    ['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'FILTER', 'type', 'VAF'] + CALLER_NAMES,
)
sample_1, sample_2, sample_3, sample_4, sample_5 = [
    position_indexed(calls, SAMPLE_COLUMNS)
    for calls in (vcf_pd_1, vcf_pd_2, vcf_pd_3, vcf_pd_4, vcf_pd_5)
]

sample_5.head()

In [None]:
# One value per dilution, keyed 'sample_0' ... 'sample_5' (presumably the
# estimated tumour fraction, going by the file name -- confirm).
# Each estimated_tf.txt is read with pd.read_csv and the value is taken from
# the first column label, i.e. the first field of the file's first line.
samples_tf = {
    'sample_' + str(idx): float(list(pd.read_csv(
        "../data/calls_dilution/2015-07-31_CRC-" + plasmasample1 + "-" + tag + "_filtered/estimated_tf.txt"
    ).columns)[0])
    for idx, tag in enumerate(
        ('1-0', '1-0.72', '0.75-0.765', '0.5-0.81', '0.25-0.855', '0.125-0.875')
    )
}

print(samples_tf)

# Plot VAF distribution

In [None]:
# Overlay the VAF distributions of all six samples in a single histogram.
# The loop variable is deliberately not called `sample`, so the patient id
# stored in `sample` by the selection cell is not overwritten.
VAF = {'VAF': [], 'sample': []}
for i, sample_calls in enumerate([sample_0, sample_1, sample_2, sample_3, sample_4, sample_5]):
    vaf_values = list(sample_calls['VAF'].astype(float).values)
    # extend() appends in place instead of rebuilding the list every iteration.
    VAF['VAF'].extend(vaf_values)
    VAF['sample'].extend(['sample_' + str(i)] * len(vaf_values))
VAF_pd = pd.DataFrame.from_dict(VAF)

plt.figure(figsize=(15, 8))
# common_norm=False: each sample's histogram is normalised on its own.
sns.histplot(data=VAF_pd, x='VAF', hue="sample", element="step",
             palette=sns.color_palette("rocket", n_colors=6),
             binwidth=0.02, stat="probability", common_norm=False)
plt.title('Variant Allele Frequency (VAF) distribution per sample')
plt.ylim([0, 1])
plt.xlim([0, 1])

In [None]:
# One VAF histogram per sample, stacked vertically and sharing the same axis limits.
fig, axs = plt.subplots(6,1,figsize=(8, 20))
fig.suptitle('Dilution effect on Variant Allele Frequency (VAF) distribution')

# Computed once instead of once per loop iteration.
rocket_palette = sns.color_palette("rocket", n_colors=6)

# The loop variable is not called `sample`, so the patient id stored in
# `sample` by the selection cell is not overwritten.
for i, sample_calls in enumerate([sample_0, sample_1, sample_2, sample_3, sample_4, sample_5]):
    sample_key = 'sample_' + str(i)
    sns.histplot(sample_calls['VAF'].astype(float), ax=axs[i],
                 label='{0}, tf={1:.2f}%'.format(sample_key, 100*samples_tf[sample_key]),
                 color=rocket_palette[i],
                 binwidth=0.02, stat="probability")
    axs[i].legend()
    axs[i].set_ylim([0, 1])
    axs[i].set_xlim([0, 1])
    # Horizontal reference lines at probability 0.5, 0.4 and 0.2.
    axs[i].axhline(y=0.5, c='grey')
    axs[i].axhline(y=0.4, c='grey', ls='--')
    axs[i].axhline(y=0.2, c='grey', ls='-.')

In [None]:
# VAF of every call against its POS value, one colour per sample.
# NOTE(review): only sample_0..sample_4 are drawn here (palette of 5 colours),
# whereas the histogram cells use all six samples -- confirm whether sample_5
# is left out on purpose.
plt.figure(figsize=(15,5))
position_palette = sns.color_palette("rocket", n_colors=5)
# The loop variable is not called `sample`, so the patient id stored in
# `sample` by the selection cell is not overwritten.
for i, sample_calls in enumerate([sample_0, sample_1, sample_2, sample_3, sample_4]):
    sample_key = 'sample_' + str(i)
    plt.plot(sample_calls['POS'], sample_calls['VAF'].astype(float), '.',
             color=position_palette[i],
             label='{0}, tf={1:.2f}%'.format(sample_key, 100*samples_tf[sample_key]))
plt.xlabel('POS')
plt.ylabel('VAF')
plt.legend()

In [None]:
# Column-wise outer join of the first five sample tables on their CHROM_POS
# index; positions missing from a sample are filled with NaN.
vcf_sharedpos_df = pd.concat(
    (sample_0, sample_1, sample_2, sample_3, sample_4),
    join="outer",
    axis="columns",
)
vcf_sharedpos_df

In [None]:
# Tumour-fraction value of each sample rounded to 3 decimals; used as column
# labels of the wide VAF table and as x-axis categories of the plot.
roundtfvalues = [round(tf, 3) for tf in samples_tf.values()]
if len(set(roundtfvalues)) != len(roundtfvalues):
    # Duplicate labels would make the per-column selections below ambiguous.
    raise ValueError("Rounded tumour fractions are not unique: {}".format(roundtfvalues))

# Outer join of all six sample tables on CHROM_POS.
vcf_sharedpos_df = pd.concat([sample_0, sample_1, sample_2, sample_3, sample_4, sample_5], axis=1, join="outer")
print(vcf_sharedpos_df.shape)

# Keep only the six 'VAF' columns (selecting with a list returns every column
# carrying that name) and label them by rounded tumour fraction.
vcf_sharedpos_df = vcf_sharedpos_df[['VAF']].astype(float)
vcf_sharedpos_df.columns = roundtfvalues
vcf_sharedpos_df = vcf_sharedpos_df.dropna(how='all')

# a: number of positions; b: positions whose VAF differs between the first two
# samples (a NaN on either side counts as different).
a = vcf_sharedpos_df.shape[0]
b = vcf_sharedpos_df[vcf_sharedpos_df[roundtfvalues[0]] != vcf_sharedpos_df[roundtfvalues[1]]].shape[0]
print(a, b, 100*b/a if a else float('nan'))

# Positions with more than one distinct non-missing VAF across the samples.
varies_across_samples = vcf_sharedpos_df.nunique(axis=1) > 1
vcf_sharedpos_diff_df = vcf_sharedpos_df.loc[varies_across_samples].copy()
vcf_sharedpos_diff_df.index.name = 'CHROM_POS'

# 'VAF up' if the VAF increases between any of the last four adjacent sample
# pairs (comparisons involving NaN are False); 'VAF down' otherwise.
# NOTE(review): the first pair (sample_0 vs sample_1) is not compared --
# confirm this is intended now that there are six samples.
vaf_increases = pd.Series(False, index=vcf_sharedpos_diff_df.index)
for k in range(1, 5):
    vaf_increases |= (
        vcf_sharedpos_diff_df[roundtfvalues[-k]] > vcf_sharedpos_diff_df[roundtfvalues[-k - 1]]
    )
vcf_sharedpos_diff_df['group'] = 'VAF down'
# Single-step .loc assignment: the chained form `df['group'].loc[idx] = ...`
# does not write back to the frame when pandas Copy-on-Write is active.
vcf_sharedpos_diff_df.loc[vaf_increases, 'group'] = 'VAF up'

vcf_sharedpos_diff_df = vcf_sharedpos_diff_df.reset_index()
print(vcf_sharedpos_diff_df.head())
print(vcf_sharedpos_diff_df[['group', 'CHROM_POS']].groupby(['group']).count())

# Reshape to long format: one row per (tumour burden, position) with its group label.
vcf_sharedpos_diff_df = (
    vcf_sharedpos_diff_df
    .drop(columns='CHROM_POS')
    .set_index('group')
    .T
    .rename_axis('tumor burden')
    .reset_index()
)
vcf_sharedpos_diff_df = pd.melt(vcf_sharedpos_diff_df, id_vars=['tumor burden'],
                                value_vars=vcf_sharedpos_diff_df.columns[1:],
                                var_name='group', value_name='VAF')

# Point plot of VAF per group, x categories ordered from highest to lowest value.
sns.catplot(x="tumor burden", y="VAF", hue='group',
            data=vcf_sharedpos_diff_df, order=sorted(list(roundtfvalues), reverse=True), kind="point",
            palette=sns.color_palette("husl"))

In [None]:
# Display the current contents of vcf_sharedpos_diff_df.
vcf_sharedpos_diff_df

In [None]:
# Displays vcf_sharedpos_diff_df again (same expression as the preceding cell).
vcf_sharedpos_diff_df

In [None]:
# Positions whose VAF is higher in the last sample of the series than in the
# one before it.
# The wide table `vcf_sharedpos_df` (one column per rounded tumour fraction) is
# used here: by this point `vcf_sharedpos_diff_df` has been melted to the long
# columns 'tumor burden' / 'group' / 'VAF', so indexing it by a tumour-fraction
# label raises KeyError on a top-to-bottom run.
vcf_sharedpos_df[vcf_sharedpos_df[roundtfvalues[-1]] > vcf_sharedpos_df[roundtfvalues[-2]]]

In [None]:
# Line plot of VAF against tumour burden from the long-format table; no hue
# is set, so all positions are drawn as a single series.
sns.lineplot(
    data=vcf_sharedpos_diff_df,
    x="tumor burden",
    y="VAF",
)
vcf_sharedpos_diff_df

In [None]:
# Display the current contents of vcf_sharedpos_diff_df.
vcf_sharedpos_diff_df

- Benchmarking results for germline SNVs
- Benchmarking results for somatic SNVs on exome data.
- averaged over the four replicates

- add fake mutations on healthy mixtures