In [None]:
import os
import io
import re
import numpy as np
import pandas as pd
import pysam
import matplotlib.pyplot as plt
import seaborn as sns
from pysam import FastaFile
from tqdm.notebook import tqdm 
from IPython.core.debugger import set_trace

from supporting_reads import list_supporting_reads

In [None]:
def read_vcf(path):
    """Load the chromosome-22 records of a VCF file into a DataFrame.

    Meta-information lines (starting with '##') are dropped; the first
    remaining line is used as the column header. The records are parsed in
    batches of 1000 lines (with a tqdm progress bar) and only rows whose
    '#CHROM' equals '22' are kept.

    NOTE: a second `read_vcf` defined further down in this notebook shadows
    this one as soon as its cell has been executed.

    Parameters
    ----------
    path : str
        Path to an uncompressed, tab-separated VCF file.

    Returns
    -------
    pandas.DataFrame or None
        The chr22 rows (row labels restart at 0 in every batch, so they are
        not unique), or None when the file contains no chr22 record.
    """
    batch_size = 1000
    column_types = {'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
                    'QUAL': str, 'FILTER': str, 'INFO': str}

    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    if not lines:
        return None
    header, records = lines[0], lines[1:]

    # Collect the matching batches and concatenate once at the end:
    # concatenating inside the loop re-copies the accumulated frame each time.
    chr22_batches = []
    for start in tqdm(range(0, len(records), batch_size)):
        chunk = ''.join([header] + records[start:start + batch_size])
        batch = pd.read_csv(io.StringIO(chunk), dtype=column_types, sep='\t')
        batch = batch[batch['#CHROM'] == '22']
        if not batch.empty:
            chr22_batches.append(batch)

    if not chr22_batches:
        return None
    return pd.concat(chr22_batches)

def foo_type(x):
    """Return the value of the `VC=` field of a VCF INFO string as a one-element Series.

    The value is the text between the first ';VC=' and the next ';'.
    Raises IndexError when the string contains no ';VC=' tag.
    """
    after_tag = x.split(';VC=')[1]
    variant_class = after_tag.split(';')[0]
    return pd.Series(variant_class)

# Read SNP databases

## dbSNP

common variants : ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/VCF/00-common_all.vcf

nb_lines = 500718

In [None]:
# One-off cache build, kept commented out: it parses the dbSNP VCF with
# read_vcf and writes the result to ../data/common_SNPs/dbsnp_df.csv, the file
# that the next cell reads back. Uncomment to regenerate that CSV.
# dbsnp_df = read_vcf('../data/common_SNPs/00-common_all.vcf')
# print(dbsnp_df.shape)
# dbsnp_df.to_csv('../data/common_SNPs/dbsnp_df.csv', index=False)
# dbsnp_df.head()

In [None]:
# Load the cached dbSNP table from CSV, report its dimensions and preview it.
dbsnp_df = pd.read_csv(filepath_or_buffer='../data/common_SNPs/dbsnp_df.csv')
print(dbsnp_df.shape)
dbsnp_df.head(n=5)

In [None]:
# type of SNP: the value of the `VC=` field of INFO (text between the first
# ';VC=' and the next ';'). Extracted in one vectorised pass instead of
# building a pd.Series per row; records without a ';VC=' tag get NaN rather
# than aborting the whole cell with an IndexError.
dbsnp_df['type'] = dbsnp_df['INFO'].str.extract(r';VC=([^;]*)', expand=False)
dbsnp_df['type'].value_counts()

## gnomAD database

In [None]:
#genomad_df = read_vcf('../data/common_SNPs/gnomad.genomes.r2.1.1.sites.22.vcf')
#print(genomad_df.shape)
#genomad_df.head()

In [None]:
#genomad_df.to_csv('../data/common_SNPs/genomad_df.csv')

In [None]:
#genomad_df_new = pd.read_csv('../data/common_SNPs/genomad_df.csv', memory_map=True)
#genomad_df_new.head()

# Import reference genome hg19.fa chr22

In [None]:
# Open the chr22 reference FASTA through pysam.
reference_genome = pysam.FastaFile('../data/reference_genome/chr22.fa')

# Find reads supporting known SNPs in merged healthy bam

Important notes:
1. Indexing
    - SAM uses 1-based coordinates
    - VCF uses 1-based coordinates
    - pysam uses 0-based coordinates

2. Paired-End sequencing
    - BAM/SAM files store reads aligned to the reverse strand as the reverse complement of the sequenced read

3. Mapping issues
    - some reads are not mapped -> no CIGAR string + no read.reference_end

In [None]:
# Run the imported helper on rows 3000-3499 of the dbSNP table against the
# merged healthy BAM; it returns a pair, unpacked as (reads2remove, log_pd).
reads2remove, log_pd = list_supporting_reads(
    "../data/healthy_chr22_merged-ready.bam",
    dbsnp_df.iloc[slice(3000, 3500)],
    verbose=0,
)


In [None]:
# Size of the sub-table of variants whose VAF exceeds 0.1.
# NOTE(review): DataFrame.size is rows x columns; use len(...) or .shape[0]
# if the number of variants is what is wanted.
log_pd[log_pd['vaf'] > 0.1].size

In [None]:
# One probability-normalised histogram per variant class, each in its own
# figure, showing the 'vaf', 'normal af' and 'noisy af' columns of log_pd.
af_columns = ['vaf', 'normal af', 'noisy af']
for variant_type in ('SNV', 'DEL', 'INS'):
    plt.figure(figsize=(10, 5))
    plt.title(variant_type)
    rows_of_type = log_pd[log_pd['type'] == variant_type]
    sns.histplot(data=rows_of_type[af_columns],
                 bins=100, stat="probability")
log_pd.head()

# Patient's SNPs detected with GATK Haplotype

In [None]:
def read_vcf(path):
    """Parse a whole VCF file into a DataFrame (no chromosome filtering).

    Lines beginning with '##' are discarded; the first remaining line is
    taken as the tab-separated header. The eight fixed VCF columns are read
    as strings, except 'POS' which is read as an integer.

    This definition replaces the batched `read_vcf` defined earlier in the
    notebook once this cell has run.
    """
    column_types = {'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
                    'QUAL': str, 'FILTER': str, 'INFO': str}
    with open(path, 'r') as handle:
        table_text = ''.join(row for row in handle if not row.startswith('##'))
    return pd.read_csv(io.StringIO(table_text), dtype=column_types, sep='\t')

def foo_vaf(x):
    """Return the value of the `AF=` field of a VCF INFO string as a one-element Series.

    The value is the text between the first ';AF=' and the next ';' and is
    kept as a string. Raises IndexError when the string contains no ';AF='.
    """
    after_tag = x.split(';AF=')[1]
    allele_frequency = after_tag.split(';')[0]
    return pd.Series(allele_frequency)

# Read SNPs detected in cancer patient

patient_snps = read_vcf('../data/2015-07-31_NCC_CRC-809_110914-CW/NCC_CRC-809_110914-CW-gatk-haplotype-annotated.vcf')
print(patient_snps.shape)
# Keep chromosome 22 only. The explicit .copy() makes the filtered table an
# independent frame, so adding the 'VAF' column below does not write into a
# slice of the full table (which triggers pandas' SettingWithCopyWarning).
patient_snps = patient_snps[patient_snps['#CHROM'] == '22'].copy()
print(patient_snps.shape)
patient_snps.head()

# VAF is the raw text of the `AF=` field of INFO (kept as a string).
patient_snps['VAF'] = patient_snps['INFO'].apply(foo_vaf)
patient_snps = patient_snps[['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'VAF']]
patient_snps

# print types of VAF
# heterozygote (=0.5), homozygote (1), double heterozygotes (0.5,0.5)
patient_snps['VAF'].value_counts()

In [None]:
# Preview the first five rows of the patient's chr22 variant table.
patient_snps.head(n=5)

In [None]:
# how many mutations are not known SNPs?
# A record with ID '.' is counted as unknown. The mean of the boolean mask is
# the fraction of such rows (NaN, not a ZeroDivisionError, on an empty table).
# '{:.2f}' prints two decimals; the previous '{:2f}' only set a minimum width.
print('% of unknown SNPs: {:.2f}%'.format(100 * (patient_snps['ID'] == '.').mean()))

In [None]:
# Count, for every patient variant, the reads of the healthy BAM that support
# the variant, support the reference, or carry something else.
#
# NOTE(review): the CIGAR offset arithmetic below is the notebook's original
# hand-rolled heuristic, kept unchanged (soft clips ignored, deletions counted
# negatively). It has not been validated against reads carrying several
# indels; pysam's AlignedSegment.get_aligned_pairs() would be a sturdier basis.

# initiate list of reads to remove
reads2remove = []
log_dict = {"position": [], "type": [],
            "total_reads": [], 'supporting_reads': [],
            'normal_reads': [], 'alternative_reads': [],
            "problematic_reads": []}


def _parse_cigar(cigar):
    """Split a CIGAR string into parallel lists of run lengths and operation codes."""
    lengths = [int(length) for length in re.split('M|I|D|N|S|H|P|=|X', cigar)[:-1]]
    states = re.split('[0-9]+', cigar)[1:]
    return lengths, states


def _snv_query_offset(lengths, states, pos):
    """Shift the reference offset `pos` by the indel runs preceding it (original heuristic)."""
    cumul = 0
    for length, state in zip(lengths, states):
        if state != 'S' and cumul <= pos:
            if state == 'D':
                cumul -= length
                pos -= length
            else:
                cumul += length
                if state == 'I':
                    pos += length
    return pos


def _indel_starts_after(lengths, states, op, pos):
    """Tell whether any run of operation `op` begins right after offset `pos`.

    Uses the original arithmetic: soft clips count 0, deletions count negatively.
    Every run of `op` is examined, not only the first one.
    """
    signed = [0 if state == 'S' else (-length if state == 'D' else length)
              for length, state in zip(lengths, states)]
    return any(sum(signed[:k]) == pos + 1
               for k, state in enumerate(states) if state == op)


# The context manager closes the BAM even if an error interrupts the loop.
with pysam.AlignmentFile("../data/healthy_chr22_merged-ready.bam", "rb") as samfile:

    # iterate over positions
    for _, mutation in tqdm(patient_snps.iterrows(), total=patient_snps.shape[0]):

        ref_allele = mutation['REF']
        # ALT may list several comma-separated alleles; classify the record by
        # the first one so the commas do not inflate the length comparison.
        alt_alleles = mutation['ALT'].split(',')
        length_delta = len(alt_alleles[0]) - len(ref_allele)
        if length_delta == 0:
            mutation_type = 'SNV'
        elif length_delta > 0:
            mutation_type = 'INS'  # insertion
        else:
            mutation_type = 'DEL'  # deletion

        genotype = {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'N': 0}
        c = 0  # number of reads supporting the considered mutation
        t = 0  # total number of reads at that position
        p = 0  # number of reads with issues
        n = 0  # number of reads supporting the reference genome
        a = 0  # number of reads with alternative nucleotide (not ref, not alt snp)

        # iterate over reads that fall into the mutation position
        for read in samfile.fetch(str(mutation['#CHROM']), mutation['POS']-1, mutation['POS']):
            t += 1
            cigar = read.cigarstring
            seq = read.query_alignment_sequence
            if cigar is None or not seq:
                # no alignment information: nothing can be compared
                p += 1
                continue
            pos = (mutation['POS']-1) - read.reference_start
            lengths, states = _parse_cigar(cigar)

            ######## SNV ##########
            if mutation_type == 'SNV':
                pos = _snv_query_offset(lengths, states, pos)
                if not 0 <= pos < len(seq):
                    # the adjusted offset falls outside the aligned bases
                    p += 1
                    continue
                base = seq[pos]
                genotype[base] = genotype.get(base, 0) + 1
                if any(seq[pos:pos+len(alt)] == alt for alt in alt_alleles):
                    c += 1
                    reads2remove.append(read.query_name)
                elif seq[pos:pos+len(ref_allele)] == ref_allele:
                    n += 1
                else:
                    a += 1

            ######## INSERTION / DELETION ##########
            else:
                # cond1 = cigar string indicates an insertion (resp. deletion)
                #         starting right after the anchor base
                op = 'I' if mutation_type == 'INS' else 'D'
                cond1 = _indel_starts_after(lengths, states, op, pos)
                # cond2 = nucleotide sequence comparison against any ALT allele
                cond2 = cond1 and any(seq[pos:pos+len(alt)] == alt for alt in alt_alleles)
                if cond1 and cond2:
                    c += 1
                    reads2remove.append(read.query_name)
                elif not cond1 and (seq[pos:pos+len(ref_allele)] == ref_allele):
                    n += 1
                else:
                    a += 1

        # guard against positions that no read covers
        vaf = round(c/t, 2) if t else float('nan')
        print(mutation_type, c, t, p, n, 'VAF:', vaf)
        if mutation_type == 'SNV':
            print('REF:', mutation['REF'], 'ALT:', mutation['ALT'], mutation['VAF'], genotype)
        log_dict["position"].append(mutation['POS'])
        log_dict["type"].append(mutation_type)
        log_dict["total_reads"].append(t)
        log_dict["supporting_reads"].append(c)
        log_dict["normal_reads"].append(n)
        log_dict["alternative_reads"].append(a)
        log_dict["problematic_reads"].append(p)