In [None]:
import os
import io
import re
import numpy as np
import pandas as pd
import pysam
import matplotlib.pyplot as plt
import seaborn as sns
from pysam import FastaFile
from tqdm.notebook import tqdm 
from IPython.core.debugger import set_trace

from supporting_reads import list_reads_to_remove, prepare_bamsurgeon_inputs

# Read SNP databases

## dbSNP

common variants : ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/VCF/00-common_all.vcf

nb_lines = 501220

In [None]:
# NOTE: this whole cell is a triple-quoted string, so none of the code below is executed.
# It keeps the one-off conversion that reads the dbSNP VCF in batches of 1000 rows,
# keeps only the rows whose '#CHROM' is '22', and writes them to
# ../data/common_SNPs/dbsnp_df.csv.
'''
def read_vcf(path):
        with open(path, 'r') as f:
            lines = [l for l in f if not l.startswith('##')]
        res = None
        for bi in tqdm(range(int(np.ceil(len(lines)/1000)))):
            res_batch = pd.read_csv(io.StringIO(''.join([lines[0]] + lines[bi*1000 + 1:(bi+1)*1000 + 1])),
                dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
                       'QUAL': str, 'FILTER': str, 'INFO': str}, sep='\t')
            if res is None:
                if not res_batch[res_batch['#CHROM'] == '22'].empty:
                    res = res_batch[res_batch['#CHROM'] == '22']
            else:
                if not res_batch[res_batch['#CHROM'] == '22'].empty:
                    res = pd.concat([res, res_batch[res_batch['#CHROM'] == '22']])
        return res

foo_type = lambda x: pd.Series(x.split(';VC=')[1].split(';')[0])

dbsnp_df = read_vcf('../data/common_SNPs/00-common_all.vcf')
print(dbsnp_df.shape)
dbsnp_df.to_csv('../data/common_SNPs/dbsnp_df.csv', index=False)
dbsnp_df.head()
'''

In [None]:
# Load the cached dbSNP table from CSV, report its (rows, columns) and preview the first rows.
dbsnp_df = pd.read_csv(
    '../data/common_SNPs/dbsnp_df.csv',
)
print(dbsnp_df.shape)
dbsnp_df.head(n=5)

## gnomAD database

In [None]:
# Disabled one-off export of the gnomAD chr22 VCF to CSV, kept for reference.
# NOTE(review): `read_vcf` only exists inside string literals in this notebook, so it
# has to be defined for real before these lines can be re-enabled.
# The CSV is written without index=False, which is presumably where the
# 'Unnamed: 0' column seen when reading it back comes from — TODO confirm.
#genomad_df = read_vcf('../data/common_SNPs/gnomad.genomes.r2.1.1.sites.22.vcf')
#print(genomad_df.shape)
#genomad_df.head()
#genomad_df.to_csv('../data/common_SNPs/genomad_df.csv')

In [None]:
# The gnomAD table is too large to load at once, even for chr22 only, so it is
# exposed as an iterator yielding chunks of 500000 rows.
genomad_df_iterator = pd.read_csv('../data/common_SNPs/genomad_df.csv', iterator=True, chunksize=500000)

# Preview the first 100 rows through a separate, bounded read. Calling
# get_chunk(100) on the iterator itself would consume those rows, so any later
# pass over `genomad_df_iterator` would silently start at row 100.
pd.read_csv('../data/common_SNPs/genomad_df.csv', nrows=100)

# Read patient SNPs

In [None]:
# Identifier used to build the patient file paths in this notebook: the full value
# appears in the GATK VCF path, and the part before the underscore ('986') is the
# prefix of the per-patient output files. Presumably <patient id>_<date> — TODO confirm.
patient_date = '986_100215'

In [None]:
# NOTE: this whole cell is a triple-quoted string, so none of the code below is executed.
# It keeps the one-off extraction that reads the patient's GATK VCF, keeps the rows whose
# '#CHROM' is '22', parses the 'AF=' field of INFO into a 'VAF' column and writes the
# result to ../data/patient_SNPs/patient_<id>_snps.csv.
'''
def read_vcf(path):
        with open(path, 'r') as f:
            lines = [l for l in f if not l.startswith('##')]
        res = pd.read_csv(io.StringIO(''.join(lines[:])),
            dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
                   'QUAL': str, 'FILTER': str, 'INFO': str}, sep='\t')
        return res

foo_vaf = lambda x: pd.Series(x.split(';AF=')[1].split(';')[0])

# Read SNPs detected in cancer patient

patient_snps_df = read_vcf('../data/2015-07-31_NCC_CRC-'+patient_date+'-CW/NCC_CRC-'+patient_date+'-CW-gatk-haplotype-annotated.vcf')
print(patient_snps_df.shape)
patient_snps_df = patient_snps_df[patient_snps_df['#CHROM'] == '22']
print(patient_snps_df.shape)

patient_snps_df['VAF'] = patient_snps_df['INFO'].apply(foo_vaf)
patient_snps_df = patient_snps_df[['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'VAF']]

patient_snps_df.to_csv('../data/patient_SNPs/patient_'+patient_date.split('_')[0]+'_snps.csv', index=False)
# print types of VAF
# heterozygote (=0.5), homozygote (1), double hetoerozygotes (0.5,0.5)
patient_snps_df['VAF'].value_counts()
'''

In [None]:
# Reload the cached per-patient SNP table, then tabulate how often each VAF value occurs.
patient_snps_df = pd.read_csv(
    '../data/patient_SNPs/patient_{}_snps.csv'.format(patient_date.split('_')[0])
)
patient_snps_df['VAF'].value_counts()

# Find reads to remove in pooled healthy sample

Goal: remove the rare reads that support known SNPs which are not found in the cancer patient.

Important notes:
1. Indexing
    - SAM is a 1-based format
    - VCF is a 1-based format
    - pysam uses 0-based coordinates

2. Paired-End sequencing
    - BAM/SAM files store the reverse complement of reverse-strand reads as the sequence

3. Mapping issues
    - some reads are not mapped -> no CIGAR string + no read.reference_end

## dbSNP database

In [None]:
# Run the read selection on rows 1000-1299 of the dbSNP table only (a 300-row slice),
# against the pooled healthy chr22 BAM and the patient's SNP table.
# NOTE(review): the meaning of verbose=-1 is defined in supporting_reads — presumably silent.
reads2remove, log_pd = list_reads_to_remove(
    "../data/healthy_chr22_merged-ready.bam", dbsnp_df.iloc[1000:1300],
    patient_snps_df, verbose = -1)
# save list of reads to remove and log dataframe
# (writes are disabled here, so this slice run does not overwrite the files on disk)
#log_pd.to_csv('../data/prepare_pooled_healthy/log_'+patient_date.split('_')[0]+'_dbsnp.csv', index=False)
#with open('../data/prepare_pooled_healthy/readfile_'+patient_date.split('_')[0]+'_dbsnp.txt', "w") as output:
#    for r in reads2remove:
#        output.write(str(r) + "\n")

In [None]:
# load back results

# Reload the dbSNP log table and the list of reads to remove from disk; this
# replaces whatever `log_pd` / `reads2remove` currently hold in the kernel.
log_pd = pd.read_csv('../data/prepare_pooled_healthy/log_'+patient_date.split('_')[0]+'_dbsnp.csv')
# the read file has one read per line and no header; flatten to a 1-D array
reads2remove = pd.read_csv('../data/prepare_pooled_healthy/readfile_'+patient_date.split('_')[0]+'_dbsnp.txt', header=None).values.flatten()
print('# reads to remove: ', len(reads2remove))
# '{:.2f}' prints two decimals; the previous '{:2f}' only set a minimum field
# width of 2 and therefore printed the default six decimals.
# Series.sum() is the vectorised equivalent of the builtin sum() over the column.
print('% reads to remove: {:.2f}%'.format(100*log_pd['supporting_reads'].sum()/log_pd['total_reads'].sum()))

In [None]:
# visualise results

# columns of the log table that are overlaid in every histogram
AF_COLUMNS = ['vaf', 'normal af', 'noisy af']


def plot_af_histogram(log, title, legend=False):
    """Draw overlaid histograms of the allele-frequency columns of a log table.

    Opens a new 15x5 figure and plots the AF_COLUMNS of `log` with 100 bins,
    each column normalised independently (stat="probability", common_norm=False).

    Parameters
    ----------
    log : pandas.DataFrame
        Log table (or a row subset of it) containing the AF_COLUMNS.
    title : str
        Figure title.
    legend : bool, default False
        Whether seaborn draws the column legend.
    """
    plt.figure(figsize=(15,5))
    plt.title(title)
    sns.histplot(data=log[AF_COLUMNS],
                 bins=100,  stat="probability", common_norm=False, legend=legend)
    # every figure gets the x label (the 'all mutations' figure used to lack it)
    plt.xlabel('local VAF')


# Share of logged positions with VAF above 10%. Counting rows directly gives the
# same ratio as the former `.size` quotient (the column count cancelled out).
print('proportion of high VAF positions (above 10%): {}%'.format(100*(log_pd['vaf'] > 0.1).sum()/len(log_pd)))

sns.set(font_scale=2)

# one figure per mutation type, without legend
for mutation_type in ['SNV', 'DEL', 'INS']:
    plot_af_histogram(log_pd[log_pd['type'] == mutation_type], mutation_type)

# all rows together; this is the only figure that shows the legend
plot_af_histogram(log_pd, 'all mutations', legend=True)

log_pd.head()

## gnomAD database

In [None]:
# Process the gnomAD table chunk by chunk and write one log CSV and one read file
# per chunk (chunks are numbered from 1).
#
# A fresh reader is opened here instead of relying on a previously created
# iterator: a pandas chunk reader is one-shot, so rows already pulled from it
# would be skipped and re-running this cell would silently do nothing.
genomad_chunks = pd.read_csv('../data/common_SNPs/genomad_df.csv', chunksize=500000)

ci = 0  # stays 0 if the table yields no chunk
for ci, genomad_df_chunk in enumerate(genomad_chunks, start=1):
    print('chunk '+str(ci))
    # drop the saved index column if the CSV has one; tolerate its absence
    genomad_df_chunk = genomad_df_chunk.drop(columns='Unnamed: 0', errors='ignore')
    # Chunk-specific names: `log_pd` and `reads2remove` hold the dbSNP results
    # loaded earlier, and `log_pd` is later used to build the files saved with a
    # `_dbsnp` suffix, so they must not be overwritten by the last gnomAD chunk.
    genomad_reads2remove, genomad_log_pd = list_reads_to_remove(
        "../data/healthy_chr22_merged-ready.bam", genomad_df_chunk, patient_snps_df, verbose=-1)
    # save list of reads to remove and log dataframe
    genomad_log_pd.to_csv('../data/prepare_pooled_healthy/log_'+patient_date.split('_')[0]+'_genomad_'+str(ci)+'.csv', index=False)
    with open('../data/prepare_pooled_healthy/readfile_'+patient_date.split('_')[0]+'_genomad_'+str(ci)+'.txt', "w") as output:
        for r in genomad_reads2remove:
            output.write(str(r) + "\n")

# Patient's SNPs detected with GATK Haplotype

In [None]:
# IDs present in both the patient's SNP table and dbSNP. '.' is the VCF marker
# for "no ID", so it is excluded: matching on it would not be a shared variant.
common_set = (set(patient_snps_df.ID) & set(dbsnp_df.ID)) - {'.'}

# Row counts are the denominators. DataFrame.size is rows x columns, so the
# former `.size` quotients were too small by a factor of the column count.
n_dbsnp = len(dbsnp_df)
n_patient = len(patient_snps_df)
# % of dbSNP entries found in the patient, % of patient SNPs found in dbSNP, number of dbSNP entries
print(100*len(common_set)/n_dbsnp, 100*len(common_set)/n_patient, n_dbsnp)

# how many mutations are not known SNPs?
# '{:.2f}' prints two decimals ('{:2f}' only set a minimum width and printed six)
print('% of unknown SNPs: {:.2f}%'.format(100*(patient_snps_df['ID'] == '.').sum()/n_patient))

In [None]:
# Element counts (rows x columns, not row counts) of the patient and dbSNP tables, one per line.
print(patient_snps_df.size, dbsnp_df.size, sep='\n')

In [None]:
# Build the two BamSurgeon input tables (SNVs and indels) from the patient SNP table
# and the read log; the logic lives in supporting_reads.prepare_bamsurgeon_inputs.
# NOTE(review): the results are saved with a `_dbsnp` suffix, so `log_pd` is presumably
# expected to be the dbSNP log at this point — confirm with a Restart & Run All.
# NOTE(review): max_vaf=0.1 is presumably the same 10% threshold reported in the
# visualisation cell — confirm against the helper.
bamsurgeon_snv_pd, bamsurgeon_indel_pd = prepare_bamsurgeon_inputs(patient_snps_df, log_pd, max_vaf=0.1)

In [None]:
# Write both BamSurgeon tables as tab-separated .bed files, without header or index.
bamsurgeon_snv_pd.to_csv(
    '../data/prepare_pooled_healthy/varfile_snv_{}_dbsnp.bed'.format(patient_date.split('_')[0]),
    sep='\t', header=False, index=False,
)
bamsurgeon_indel_pd.to_csv(
    '../data/prepare_pooled_healthy/varfile_indel_{}_dbsnp.bed'.format(patient_date.split('_')[0]),
    sep='\t', header=False, index=False,
)

In [None]:
# Read the header-less SNV .bed file back and give its five columns explicit names.
bamsurgeon_snv_pd = pd.read_csv(
    '../data/prepare_pooled_healthy/varfile_snv_{}_dbsnp.bed'.format(patient_date.split('_')[0]),
    sep='\t', header=None,
)
bamsurgeon_snv_pd.columns = ['chrom', 'pos_start', 'pos_end', 'vaf', 'alt']

In [None]:
# (rows, columns) of the reloaded SNV table
bamsurgeon_snv_pd.shape

# Split varfiles to run BamSurgeon by chunks

In [None]:
# Reload the SNV .bed file as written (no header, so columns are numbered 0..n-1)
# and show its (rows, columns).
bamsurgeon_snv_pd = pd.read_csv(
    '../data/prepare_pooled_healthy/varfile_snv_{}_dbsnp.bed'.format(patient_date.split('_')[0]),
    sep='\t', header=None,
)
bamsurgeon_snv_pd.shape

In [None]:
chunk_size = 10000
# One .bed file per block of `chunk_size` consecutive rows; files are numbered from 0.
for chunk, first_row in enumerate(range(0, bamsurgeon_snv_pd.shape[0], chunk_size)):
    bamsurgeon_snv_pd_chunk = bamsurgeon_snv_pd.iloc[first_row:first_row + chunk_size]
    bamsurgeon_snv_pd_chunk.to_csv(
        '../data/prepare_pooled_healthy/varfile_snv_{}_dbsnp_{}.bed'.format(patient_date.split('_')[0], chunk),
        sep='\t', header=False, index=False,
    )

In [None]:
# Reload the indel .bed file as written (no header, so columns are numbered 0..n-1)
# and show its (rows, columns).
bamsurgeon_indel_pd = pd.read_csv(
    '../data/prepare_pooled_healthy/varfile_indel_{}_dbsnp.bed'.format(patient_date.split('_')[0]),
    sep='\t', header=None,
)
bamsurgeon_indel_pd.shape

In [None]:
chunk_size = 10000
# One .bed file per block of `chunk_size` consecutive rows; files are numbered from 0.
for chunk, first_row in enumerate(range(0, bamsurgeon_indel_pd.shape[0], chunk_size)):
    bamsurgeon_indel_pd_chunk = bamsurgeon_indel_pd.iloc[first_row:first_row + chunk_size]
    bamsurgeon_indel_pd_chunk.to_csv(
        '../data/prepare_pooled_healthy/varfile_indel_{}_dbsnp_{}.bed'.format(patient_date.split('_')[0], chunk),
        sep='\t', header=False, index=False,
    )

# Check SNV indel overlaps

In [None]:
# Display the indel table for manual inspection.
bamsurgeon_indel_pd

In [None]:
# Display the SNV table for manual inspection.
bamsurgeon_snv_pd