In [None]:
import os
import io
import re
import numpy as np
import pandas as pd
import pysam
from tqdm.notebook import tqdm 

from pdbio.vcfdataframe import VcfDataFrame

In [None]:
def read_vcf(path, chrom='22', batch_size=1000):
    """Load the records of a single chromosome from an uncompressed VCF file.

    Meta-information lines (those starting with ``##``) are discarded and the
    first remaining line is used as the column header. The data lines are then
    parsed in consecutive batches, and only rows whose ``#CHROM`` value equals
    ``chrom`` are kept.

    Parameters
    ----------
    path : str
        Path to a plain-text, tab-separated VCF file.
    chrom : str, default '22'
        Value of the ``#CHROM`` column to keep.
    batch_size : int, default 1000
        Number of data lines handed to each ``pd.read_csv`` call.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows, indexed by their 0-based data-line number in the
        file, or ``None`` when the file has no header line or no row matches
        (``None`` is kept for compatibility with existing callers).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    if not lines:
        return None

    header = lines[0]
    n_records = len(lines) - 1
    dtypes = {'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
              'QUAL': str, 'FILTER': str, 'INFO': str}

    kept = []
    for start in tqdm(range(0, n_records, batch_size)):
        # Data lines live at lines[1:]; consecutive windows must not leave a gap
        # between one batch and the next.
        chunk = lines[start + 1:start + 1 + batch_size]
        batch = pd.read_csv(io.StringIO(header + ''.join(chunk)), dtype=dtypes, sep='\t')
        # Replace the batch-local row labels with file-wide ones so the final
        # index is unique across batches.
        batch.index = batch.index + start
        selected = batch[batch['#CHROM'] == chrom]
        if not selected.empty:
            kept.append(selected)

    # A single concat at the end avoids re-copying the accumulated frame per batch.
    return pd.concat(kept) if kept else None

def foo(x):
    """Return the value of the ``CALLERS=`` field of ``x`` as a one-element Series.

    ``x`` is a string containing ``CALLERS=<value>``; the value runs up to the
    next ``;`` (or to the end of the string). An ``IndexError`` is raised when
    ``CALLERS=`` does not occur in ``x``.
    """
    after_key = x.split('CALLERS=')[1]
    callers_value = after_key.split(';')[0]
    return pd.Series(callers_value)

# Read dbSNP database

Whole-genome table available at http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/snp151Common.txt.gz

here chr22 only: https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/chr_rpts/

00-common_all.vcf

nb_lines = 37302979

In [None]:
# Load the dbSNP common-variants VCF through read_vcf (which keeps only the
# chromosome 22 rows), then report the frame's shape and preview its first rows.
dpsnp_df = read_vcf('../data/common_SNPs/00-common_all.vcf')
print(dpsnp_df.shape)
dpsnp_df.head()

In [None]:
#dpsnp_df = VcfDataFrame('../data/common_SNPs/00-common_all.vcf')
#dpsnp_df.head()
#too large !!!

In [None]:
#dpsnp_df = pd.read_csv('../data/common_SNPs/chr_22.txt', sep='\t', skiprows=3)
#dpsnp_df.columns = dpsnp_df.columns.astype(str) + ' ' + dpsnp_df.iloc[0].astype(str)
#dbsnp_df.drop(0, inplace=True)
#dpsnp_df.head()

In [None]:
#dpsnp_df = pd.read_csv('../data/common_SNPs/snp151Common.txt', sep='\t', header=None)
# dpsnp_df = dpsnp_df[dpsnp_df[1] == 'chr22']
#column_list = ['bin', 'chrom',  'chromStart', 'chromEnd', 'name', \
#               'score', 'strand', 'refNCBI', 'refUCSC', 'observed', \
#               'molType', 'class', 'valid', 'avHet', 'avHetSE', \
#               'func', 'locType', 'weight', 'exceptions', 'submitterCount', \
#               'submitters', 'alleleFreqCount', 'alleles', 'alleleNs', \
#               'alleleFreqs', 'bitefields']
#dpsnp_df.columns = column_list
# Preview the first 20 rows of the variant table currently held in dpsnp_df.
dpsnp_df.head(20)

# Read gnomAD database

In [None]:
# Load the gnomAD r2.1.1 chromosome 22 sites VCF through read_vcf and display
# the resulting frame.
genomad_df = read_vcf('../data/common_SNPs/gnomad.genomes.r2.1.1.sites.22.vcf')
genomad_df

# Find reads supporting known SNPs in lpWGS healthy bams

Important notes:
1. Indexing
    - SAM uses 1-based coordinates
    - VCF uses 1-based coordinates
    - pysam exposes 0-based coordinates (e.g. `read.reference_start`, `fetch` intervals)

2. Paired-End sequencing
    - BAM/SAM store reads mapped to the reverse strand as their reverse complement in the sequence field

3. Mapping issues
    - some reads are not mapped -> no CIGAR string + no read.reference_end

In [None]:
# Query names of reads found to support a known variant.
reads2remove = []
# Per-variant counters; one entry is appended to every list for each row of dpsnp_df.
log_dict = {"position":[],"type":[],
            "total_reads":[], 'supporting_reads':[],
            "problematic_reads":[]}

# NOTE(review): the row keys used below mix two schemas. 'chrom', 'chromStart' and
# 'class' match the UCSC snp151Common column list, whereas 'POS', 'REF' and 'ALT'
# are VCF columns (as produced by read_vcf) and 'type' belongs to neither.
# Confirm which table dpsnp_df is meant to hold and align the keys accordingly;
# they are left unchanged here.
# NOTE(review): the first branch is selected by class == 'single' but looks for an
# insertion ('I') in the CIGAR - confirm the intended variant class.

# The context manager closes the BAM file even if the scan raises.
with pysam.AlignmentFile("../data/healthy_chr22_merged-ready.bam", "rb") as samfile:
    # iterate over positions
    for ci, mutation in tqdm(dpsnp_df.iterrows(), total=dpsnp_df.shape[0]):
        c = 0 # number of reads supporting the considered mutation
        t = 0 # total number of reads at that position
        p = 0 # number of reads with issues (no CIGAR string)
        for read in samfile.fetch(mutation['chrom'], mutation['chromStart']-1, mutation['chromStart']):
            t += 1
            seq = read.query_alignment_sequence
            # Offset of the variant from the read's alignment start, such that
            # seq[pos-1] is the base at the variant when no indel precedes it.
            pos = (mutation['chromStart'] -1) - read.reference_start + 1

            # Every fetched read is classified individually, so `c` and `p`
            # count reads rather than reflecting only the last read fetched.
            if mutation['class'] == 'single':
                cond1 = (seq[pos-1:  pos-1+len(mutation['ALT'])] == mutation['ALT'])
                if cond1:
                    cigar = read.cigarstring
                    if cigar is None:
                        p += 1
                    else:
                        # Split e.g. '5S40M2I30M' into lengths ['5','40','2','30']
                        # and operations ['S','M','I','M'].
                        cigar_pos = re.split('M|I|D|N|S|H|P|=|X',cigar)[:-1]
                        cigar_states = re.split('[0-9]+',cigar)[1:]
                        cond2 = False
                        if 'I' in cigar_states:
                            cumul = 0
                            indel_pos = None
                            # Walk the non-soft-clipped operations until the
                            # accumulated length passes `pos`; indel_pos ends on
                            # the last operation reached.
                            for i, cp in enumerate(cigar_pos):
                                if (cigar_states[i] != 'S') and (cumul <= pos):
                                    # NOTE(review): the original tested `cp == 'D'`
                                    # to subtract deletion lengths, but `cp` is a
                                    # length string so that test was never true;
                                    # confirm whether deletions should be excluded.
                                    cumul += int(cp)
                                    indel_pos = i
                            if indel_pos is not None and cigar_states[indel_pos] == 'I':
                                cond2 = True
                        if cond1 and cond2:
                            c += 1
                            reads2remove.append(read.query_name)

            elif mutation['type'] in ('SNV', 'SNP'):
                cigar = read.cigarstring
                if cigar is None:
                    p += 1
                else:
                    cigar_pos = re.split('M|I|D|N|S|H|P|=|X',cigar)[:-1]
                    cigar_states = re.split('[0-9]+',cigar)[1:]
                    if 'D' in cigar_states:
                        if 'S' in cigar_states:
                            # Drop soft-clip entries from both parallel lists.
                            Spos = [i for i, x in enumerate(cigar_states) if x == "S"]
                            cigar_pos = [i for j, i in enumerate(cigar_pos) if j not in Spos]
                            cigar_states = [i for j, i in enumerate(cigar_states) if j not in Spos]
                        Dpos = [i for i, x in enumerate(cigar_states) if x == "D"]
                        for dpos in Dpos:
                            # Shift the offset left by each deletion that starts
                            # before it.
                            if sum(list(map(int, cigar_pos[:dpos]))) < pos:
                                pos = pos - int(cigar_pos[dpos])
                # As in the original, the base comparison also runs for reads
                # without a CIGAR string.
                if (seq[pos-1:pos-1+len(mutation['ALT'])] == mutation['ALT']):
                    c += 1
                    reads2remove.append(read.query_name)

            elif mutation['type'] == 'DEL':
                cond1 = (seq[pos-1:  pos-1+len(mutation['REF'])] != mutation['REF'])
                if cond1:
                    cigar = read.cigarstring
                    if cigar is None:
                        p += 1
                    else:
                        cigar_pos = re.split('M|I|D|N|S|H|P|=|X',cigar)[:-1]
                        cigar_states = re.split('[0-9]+',cigar)[1:]
                        cond2 = False
                        if 'D' in cigar_states:
                            cumul = 0
                            indel_pos = None
                            for i, cp in enumerate(cigar_pos):
                                # Skip soft clips by checking the operation
                                # letter, as the insertion branch does.
                                if (cigar_states[i] != 'S') and (cumul <= pos):
                                    # NOTE(review): same never-true `cp == 'D'`
                                    # test removed here; see the insertion branch.
                                    cumul += int(cp)
                                    indel_pos = i
                            if indel_pos is not None and cigar_states[indel_pos] == 'D':
                                cond2 = True
                        if cond1 and cond2:
                            c += 1
                            reads2remove.append(read.query_name)

        log_dict["position"].append(mutation['POS'])
        log_dict["type"].append(mutation['type'])
        log_dict["total_reads"].append(t)
        log_dict["supporting_reads"].append(c)
        log_dict["problematic_reads"].append(p)

In [None]:
# Tabulate the per-variant counters gathered in log_dict (one column per key).
log_pd = pd.DataFrame(log_dict)
log_pd

In [None]:
# Fraction of logged positions for which at least one read was fetched.
len(log_pd.loc[log_pd['total_reads'] > 0]) / len(log_pd)

In [None]:
# Percentage of positions with at least one fetched read but no supporting read.
100 * len(log_pd.loc[(log_pd['total_reads'] > 0) & (log_pd['supporting_reads'] == 0)]) / len(log_pd.loc[log_pd['total_reads'] > 0])

In [None]:
# Shape of the variant table iterated by the read-scanning loop.
# NOTE(review): this cell referenced `vcf_pd`, a name never assigned in this
# notebook; dpsnp_df is assumed to be the intended frame - confirm.
print(dpsnp_df.shape)

In [None]:
# Scratch check: are these two nucleotide strings identical character for character?
a = 'GCAGCCACTCAGGATGTTGGAACCTGGCCATCCCTGCTTCTTTCAGTGGGTGAGGTTGGTGGCTGCTCCACCTGTTCCAGGCACACCCTTAACAGAGGTGGCTGCTTGCTCTTTAAGCCAGCTTGGCCTTGCCTGGCATGCACAGGCCCCG'
b = 'GCAGCCACTCAGGATGTTGGAACCTGGCCATCCCTGCTTCTTTCAGTGGGTGAGGTTGGTGGCTGCTCCACCTGTTCCAGGCACACCCTTAACAGAGGTGGCTGCTTGCTCTTTAAGCCAGCTTGGCCTTGCCTGGCATGCACAGGCCCCG'

a == b