In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam
from collections import Counter
from matplotlib import cm
from matplotlib.colors import ListedColormap

import io
from pysam import VariantFile

from sklearn.metrics import precision_score, recall_score, average_precision_score, precision_recall_curve

In [None]:
# Build an 8-entry palette: four base colours followed by the same four
# colours with their alpha channel replaced by 0.3.
#
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` accepts the same (name, lut) arguments and works on both old
# and new releases.
#
# NOTE: despite its name, `tab20` holds the 'tab10' colormap resampled to 8
# entries. The name is kept unchanged in case other cells reference it.
# NOTE(review): since the map is resampled to 8 entries, the first four entries
# are presumably not exactly the first four 'tab10' colours -- confirm the
# palette is the intended one.
tab20 = plt.get_cmap('tab10', 8)
newcmap_list = []
for i in range(4):
    # Base colour, as stored in the resampled colormap.
    newcmap_list.append(tab20.colors[i])
for i in range(4):
    # Same colour with the last (alpha) component swapped for 0.3.
    newcmap_list.append(np.array(list(tab20.colors[i][:-1]) + [0.3]))

newcmap = ListedColormap(newcmap_list, name='newcmap')
# Named Matplotlib colours for plots that take colour names directly.
color_list = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple']

# Select plasma sample

In [None]:
# Identifier of the sample set to analyse; swap the commented line to switch.
sample = '809'
#sample = '986'

# Identifiers available for each supported `sample`, in the order
# (plasmasample1, plasmasample2, tumorsample1).
SAMPLE_IDS = {
    '809': ('809_110914', '809_030915', '809_290714-T1W'),
    '986': ('986_100215', '986_261016', '986_100215-T1W'),
}

# Fail loudly on an unsupported value: otherwise the three names below would
# stay undefined (or keep stale values from an earlier run of this cell).
if sample not in SAMPLE_IDS:
    raise ValueError(
        "Unknown sample {!r}; expected one of {}".format(sample, sorted(SAMPLE_IDS))
    )

plasmasample1, plasmasample2, tumorsample1 = SAMPLE_IDS[sample]

# Load SNV calls for plasma sample and matching mixed samples

In [None]:
def frq_path(plasma_id, filter_suffix):
    """Return the path of the ``.vcf.frq`` file for one plasma sample / filter run.

    Parameters
    ----------
    plasma_id : str
        Plasma sample identifier, e.g. the value of ``plasmasample1``.
    filter_suffix : str
        Suffix following ``-filter-`` in the run directory name (e.g. ``'1-0'``).
    """
    run = "CRC-" + plasma_id + "-filter-" + filter_suffix
    return "../data/2015-07-31_" + run + "/" + run + "-ensemble-annotated_outputvaf.vcf.frq"


def load_vcf_frq(path):
    """Load a tab-separated ``.frq`` table and derive per-site allele frequencies.

    Parameters
    ----------
    path : str
        Location of the ``.frq`` file.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``CHROM``, ``POS``, ``N_ALLELES``,
        ``N_CHR``, ``ALLELE1_FREQ``, ``ALLELE2_FREQ`` plus the derived columns
        ``CHROM_POS`` (``"<CHROM>_<POS>"``), ``main allele``,
        ``freq main allele`` (float) and ``freq second allele``
        (``1 - freq main allele``).
    """
    # The first field ends up in the index after read_csv, so reset_index() is
    # needed to recover all six columns.
    # NOTE(review): presumably the header has one name fewer than the data has
    # fields -- confirm against an actual file.
    frq = pd.read_csv(path, sep='\t').reset_index()
    frq.columns = ['CHROM', 'POS', 'N_ALLELES', 'N_CHR', 'ALLELE1_FREQ', 'ALLELE2_FREQ']
    frq['CHROM_POS'] = frq['CHROM'].astype('str').str.cat(frq['POS'].astype('str'), sep="_")

    # ALLELE1_FREQ has the form "<allele>:<frequency>". `n` must be passed by
    # keyword: pandas >= 2.0 rejects the positional form `.str.split(':', 1)`.
    allele_parts = frq['ALLELE1_FREQ'].str.split(':', n=1, expand=True)
    frq['main allele'] = allele_parts[0]
    frq['freq main allele'] = allele_parts[1].astype(float)
    frq['freq second allele'] = 1 - frq['freq main allele']
    return frq


vcf_pd_0 = load_vcf_frq(frq_path(plasmasample1, "1-0"))
vcf_pd_1 = load_vcf_frq(frq_path(plasmasample1, "1-05"))
# Further runs can be loaded the same way once their files are available, e.g.:
#vcf_pd_2 = load_vcf_frq(frq_path(plasmasample1, "1-1"))

In [None]:
# Show the allele-frequency table held in vcf_pd_0 as the cell output.
vcf_pd_0

In [None]:
# tumor fraction estimation
def read_estimated_tf(path):
    """Return the tumor fraction stored in an ``estimated_tf.txt`` file.

    The value is taken from the first field of the file's first line, which
    ``pd.read_csv`` parses as the header row.
    """
    return float(list(pd.read_csv(path).columns)[0])


# Tumor fraction per sample, keyed 'sample_<i>' to match the order in which the
# call sets are plotted.
# The path uses `plasmasample1` rather than a hard-coded '809_110914', so that
# switching `sample` cannot silently read another patient's estimate.
# NOTE(review): 0.47 is hard-coded; presumably it only applies to plasma sample
# 809_110914 -- confirm before analysing another sample.
# NOTE(review): the estimate is read from the 'filter-05-05' run while the calls
# for sample_1 come from the 'filter-1-05' run -- confirm these correspond.
samples_tf = {
    'sample_0': 0.47,
    'sample_1': read_estimated_tf("../data/2015-07-31_CRC-"+plasmasample1+"-filter-05-05/estimated_tf.txt"),
    #'sample_2': read_estimated_tf("../data/2015-07-31_CRC-"+plasmasample1+"-filter-005-095/estimated_tf.txt"),
    #'sample_3': read_estimated_tf("../data/2015-07-31_CRC-"+plasmasample1+"-filter-001-099/estimated_tf.txt"),
}

print(samples_tf)

# Plot VAF distribution

In [None]:
# One histogram per call set of the main-allele frequency, titled with the
# sample key and its tumor fraction expressed as a percentage.
calls_per_sample = [vcf_pd_0, vcf_pd_1] #, vcf_pd_2, vcf_pd_3]
for i, vcf_pd in enumerate(calls_per_sample):
    sample_key = 'sample_'+str(i)
    tf_percent = 100*samples_tf[sample_key]
    fig, ax = plt.subplots()
    ax.hist(vcf_pd['freq main allele'])
    ax.set_title(sample_key+', tf={0:.3f}%'.format(tf_percent))

In [None]:
# Scatter (1 - main-allele frequency) against POS for both call sets on one
# wide set of axes.
fig, ax = plt.subplots(figsize=(15,5))
second_freq_0 = 1 - vcf_pd_0['freq main allele']
second_freq_1 = 1 - vcf_pd_1['freq main allele']
ax.plot(vcf_pd_0['POS'], second_freq_0, '.')
ax.plot(vcf_pd_1['POS'], second_freq_1, '.')

In [None]:
# Compare the second-allele frequency of the two samples at shared positions.
#
# The tables are joined on CHROM_POS. Concatenating them with axis=1 would align
# rows on the positional RangeIndex, pairing unrelated variants whenever the two
# call sets do not list exactly the same positions in the same order.
tf_values = list(samples_tf.values())
tf_0, tf_1 = tf_values[0], tf_values[1]

vcf_sharedpos_df = pd.merge(
    vcf_pd_0[['CHROM_POS', 'freq second allele']],
    vcf_pd_1[['CHROM_POS', 'freq second allele']],
    on='CHROM_POS',
    how='inner',
    suffixes=(' sample0', ' sample1'),
)
print(vcf_sharedpos_df.shape)

# Index by position and label each column with its sample's tumor fraction.
vcf_sharedpos_df = vcf_sharedpos_df.set_index('CHROM_POS')
vcf_sharedpos_df.columns = [tf_0, tf_1]

# a: number of shared positions; b: shared positions whose frequency differs.
differs = vcf_sharedpos_df[tf_0] != vcf_sharedpos_df[tf_1]
a = vcf_sharedpos_df.shape[0]
b = int(differs.sum())
# Guard the percentage against an empty intersection.
print(a, b, 100*b/a if a else float('nan'))

# Reshape to long format: one row per (tumor burden, position) with its VAF.
vcf_sharedpos_diff_df = vcf_sharedpos_df[differs].T
vcf_sharedpos_diff_df.index.name = 'tumor burden'
vcf_sharedpos_diff_df = vcf_sharedpos_diff_df.reset_index()
vcf_sharedpos_diff_df = pd.melt(vcf_sharedpos_diff_df, id_vars=['tumor burden'],
                                value_vars=vcf_sharedpos_diff_df.columns[1:],
                                var_name='CHROM_POS', value_name='VAF')
print(vcf_sharedpos_diff_df.head())

# One line per position, tumor burden ordered from highest to lowest.
sns.catplot(x="tumor burden", y="VAF", hue="CHROM_POS",
            data=vcf_sharedpos_diff_df, order=sorted([tf_0, tf_1], reverse=True), kind="point")

vcf_sharedpos_diff_df.head(10)

- Benchmarking results for germline SNVs.
- Benchmarking results for somatic SNVs on exome data.
- Averaged over the four replicates.

- Add fake mutations on healthy mixtures.