In [None]:
import os
import io
import re
import numpy as np
import pandas as pd
import pysam
import matplotlib.pyplot as plt
import seaborn as sns
from pysam import FastaFile
from tqdm.notebook import tqdm 
from IPython.core.debugger import set_trace

from supporting_reads import list_reads_to_remove, prepare_bamsurgeon_inputs

In [None]:
def read_vcf(path, chrom='22', batch_size=1000):
    """Read a VCF file in batches and keep only the records of one chromosome.

    Meta-information lines (starting with ``##``) are dropped; the first
    remaining line is used as the column header for every batch. Each batch is
    parsed separately and filtered on ``#CHROM`` so that only the matching
    rows of a large file are accumulated.

    NOTE: a later cell of this notebook defines another ``read_vcf`` that
    shadows this one once it has been executed.

    Parameters
    ----------
    path : str
        Path to an uncompressed VCF file.
    chrom : str, default '22'
        Value of the ``#CHROM`` column to keep.
    batch_size : int, default 1000
        Number of records parsed per batch.

    Returns
    -------
    pandas.DataFrame
        Records whose ``#CHROM`` equals ``chrom``. Row labels are the
        within-batch positions (they are not reset). When no record matches,
        an empty frame carrying the VCF header columns is returned; when the
        file holds no header line at all, an empty frame without columns.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    vcf_dtypes = {'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
                  'QUAL': str, 'FILTER': str, 'INFO': str}

    with open(path, 'r') as f:
        lines = [line for line in f if not line.startswith('##')]
    if not lines:
        return pd.DataFrame()
    header, records = lines[0], lines[1:]

    def parse(chunk):
        # Prepend the header so every batch is parsed with the same columns.
        return pd.read_csv(io.StringIO(''.join([header] + chunk)),
                           dtype=vcf_dtypes, sep='\t')

    kept = []
    for start in tqdm(range(0, len(records), batch_size)):
        batch = parse(records[start:start + batch_size])
        batch = batch[batch['#CHROM'] == chrom]
        if not batch.empty:
            kept.append(batch)

    if not kept:
        # Header-only parse: an empty frame with the expected columns, so
        # callers can still use .shape / .head() / .to_csv().
        return parse([])
    # Single concatenation instead of growing the frame batch after batch.
    return pd.concat(kept)

def foo_type(x):
    """Return the value of the ``VC=`` key of an INFO string as a one-element Series.

    The text between the first ``;VC=`` and the next ``;`` is extracted.
    Raises ``IndexError`` when ``;VC=`` does not occur in ``x``.
    """
    after_key = x.split(';VC=')[1]
    value = after_key.split(';')[0]
    return pd.Series(value)

# Read SNP databases

## dbSNP

common variants : ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/VCF/00-common_all.vcf

nb_lines = 500718

In [None]:
# dbsnp_df = read_vcf('../data/common_SNPs/00-common_all.vcf')
# print(dbsnp_df.shape)
# dbsnp_df.to_csv('../data/common_SNPs/dbsnp_df.csv', index=False)
# dbsnp_df.head()

In [None]:
# Load the dbSNP table from its CSV cache (the commented-out cell above shows
# how this file was produced from the original VCF with read_vcf).
# NOTE(review): no dtype is given, so pandas infers column types from the CSV
# content — confirm downstream code does not rely on '#CHROM' being a string.
dbsnp_df = pd.read_csv('../data/common_SNPs/dbsnp_df.csv')
print(dbsnp_df.shape)
dbsnp_df.head()

## gnomAD database

In [None]:
#genomad_df = read_vcf('../data/common_SNPs/gnomad.genomes.r2.1.1.sites.22.vcf')
#print(genomad_df.shape)
#genomad_df.head()

In [None]:
#genomad_df.to_csv('../data/common_SNPs/genomad_df.csv')

In [None]:
#genomad_df_new = pd.read_csv('../data/common_SNPs/genomad_df.csv', memory_map=True)
#genomad_df_new.head()

# Read patient SNPs

In [None]:
def read_vcf(path):
    """Load every record of a VCF file into a single DataFrame.

    Lines starting with ``##`` (meta-information) are skipped; the first
    remaining line provides the column names. The eight fixed VCF columns are
    read with explicit dtypes (``POS`` as int, the others as str); any
    additional column is left to pandas' type inference.

    Parameters
    ----------
    path : str
        Path to an uncompressed VCF file.

    Returns
    -------
    pandas.DataFrame
        One row per VCF record.
    """
    column_types = {'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
                    'QUAL': str, 'FILTER': str, 'INFO': str}
    buffer = io.StringIO()
    with open(path, 'r') as handle:
        for row in handle:
            if row.startswith('##'):
                continue
            buffer.write(row)
    buffer.seek(0)
    return pd.read_csv(buffer, sep='\t', dtype=column_types)

def foo_vaf(x):
    """Return the value of the ``AF=`` key of an INFO string as a one-element Series.

    The text between the first ``;AF=`` and the next ``;`` is extracted and
    kept as a string (it may hold several comma-separated values).
    Raises ``IndexError`` when ``;AF=`` does not occur in ``x``.
    """
    after_key = x.split(';AF=')[1]
    value = after_key.split(';')[0]
    return pd.Series(value)

# Read SNPs detected in cancer patient

patient_snps = read_vcf('../data/2015-07-31_NCC_CRC-809_110914-CW/NCC_CRC-809_110914-CW-gatk-haplotype-annotated.vcf')
print(patient_snps.shape)
# Keep chromosome 22 only. The .copy() makes the subset an independent frame,
# so the column assignment below does not act on a slice of the full table
# (avoids pandas' SettingWithCopyWarning).
patient_snps = patient_snps[patient_snps['#CHROM'] == '22'].copy()
print(patient_snps.shape)

# Extract the AF=... value of the INFO field of every record as the VAF column
patient_snps['VAF'] = patient_snps['INFO'].apply(foo_vaf)
patient_snps = patient_snps[['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'VAF']]

patient_snps.to_csv('../data/patient_SNPs/patient_809_snps.csv', index=False)
# print types of VAF
# heterozygote (=0.5), homozygote (1), double heterozygotes (0.5,0.5)
patient_snps['VAF'].value_counts()

In [None]:
# Reload the patient SNP table from the CSV written by the previous cell.
# NOTE(review): without a dtype argument pandas re-infers the column types, so
# '#CHROM' (filtered to '22' before saving) will likely come back as integers
# rather than strings — confirm this is what downstream code expects.
patient_snps = pd.read_csv('../data/patient_SNPs/patient_809_snps.csv')

# Find reads to remove in pooled healthy sample

Remove the rare reads that support known SNPs which are not found in the cancer patients.

Important notes:
1. Indexing
    - SAM is a 1-based file format
    - VCF is a 1-based file format
    - pysam uses 0-based coordinates

2. Paired-End sequencing
    - BAM/SAM files store the reverse complement of reverse-strand reads as the sequence

3. Mapping issues
    - some reads are not mapped -> no CIGAR string + no read.reference_end

In [None]:
# Run the read-selection helper on the pooled healthy chr22 BAM.
# Only the positional row slice 3000-3999 of dbsnp_df (1000 records) is passed
# in, not the full dbSNP table.
# NOTE(review): the meaning of verbose=-1 is defined by list_reads_to_remove in
# the supporting_reads module — presumably it silences logging; confirm there.
reads2remove, log_pd = list_reads_to_remove(
    "../data/healthy_chr22_merged-ready.bam", dbsnp_df.iloc[3000:4000],
    patient_snps, verbose = -1)

In [None]:
# Percentage of log_pd rows whose 'vaf' exceeds 0.1
100 * int((log_pd['vaf'] > 0.1).sum()) / len(log_pd)

In [None]:
# One figure per variant type: probability-normalised histograms (100 bins) of
# the 'vaf', 'normal af' and 'noisy af' columns of log_pd.
for variant_type in ('SNV', 'DEL', 'INS'):
    plt.figure(figsize=(10,5))
    plt.title(variant_type)
    rows_of_type = log_pd[log_pd['type'] == variant_type]
    sns.histplot(data=rows_of_type[['vaf', 'normal af', 'noisy af']],
                 bins=100,  stat="probability")
log_pd.head()

# Patient's SNPs detected with GATK HaplotypeCaller

In [None]:
# Variant IDs present in both the patient call set and the dbSNP table
common_set = set(patient_snps.ID) & set(dbsnp_df.ID)
# Denominators are row counts: DataFrame.size is rows x columns and would
# understate both percentages by the number of columns.
# Printed: % of dbSNP rows shared, % of patient rows shared, number of dbSNP rows
print(100*len(common_set)/len(dbsnp_df), 100*len(common_set)/len(patient_snps), len(dbsnp_df))

In [None]:
# how many mutations are not known SNPs? (records whose ID column is '.')
# '{:.2f}' prints two decimals; '{:2f}' only set a minimum width of 2.
print('% of unknown SNPs: {:.2f}%'.format(100*patient_snps[patient_snps['ID'] == '.'].shape[0]/patient_snps.shape[0]))

In [None]:
# Build the two tables returned by prepare_bamsurgeon_inputs from the patient
# SNPs and the read-removal log.
# NOTE(review): max_vaf=0.1 is interpreted by the helper (supporting_reads
# module) — presumably an upper bound on the VAF in log_pd; confirm there.
bamsurgeon_snv_pd, bamsurgeon_indel_pd = prepare_bamsurgeon_inputs(patient_snps, log_pd, max_vaf=0.1)

In [None]:
# Preview the first 20 rows of the first table returned by prepare_bamsurgeon_inputs
bamsurgeon_snv_pd.head(20)

In [None]:
# Preview the first 20 rows of the second table returned by prepare_bamsurgeon_inputs
bamsurgeon_indel_pd.head(20)