In [50]:
import os
import itertools
from collections import Counter, defaultdict

import pandas as pd 
import numpy as np

import pysam 
from Bio import SeqIO 

## Inputs: 

`BAM` files with the sequencing aligned to the SSPE reference. 

`fasta` file for the SSPE reference made from all tissues. 

`csv` file with the variants labeled by whether they're genome-1 or genome-2 

Eventually, these will all be inputs in the `Snakemake` pipeline. 

In [6]:
# Bam files - will be supplied by snakemake 
bams = list()
for root, dirs, files in os.walk("../../results/realigned/"):
    for file in files:
        if file.endswith(".bam"):
             bams.append(os.path.join(root, file))
                
# Base inputs for analysis
ref_path = "../../config/ref/MeVChiTok-SSPE.fa" 
contig = "MeVChiTok"
minimum_qual = 25

# Import the dataframe with mutations labled by identity
snps_path = "../../results/spatial/labeled_variants.csv" # Path to the major haplotypes
snps_df = pd.read_csv(snps_path)

## Functions 

Main functions need to run the analysis – these include phasing reads and several helper functions. 

In [3]:
def check_read(read):
    """
    Helper function to decide what reads should
    be keep when parsing alignment file with `pysam`. 

    Parameters
    ----------
    read : AlignedSegment
        read from alignment file parsed with `pysam`.

    Returns
    -------
    bool
        True/False if read should be included
        
    """
    # Exclude Quality Failures
    if read.is_qcfail:
        return False
    # Exclude Secondary Mappings
    if read.is_secondary:
        return False
    # Exclude Unmapped Reads
    if read.is_unmapped:
        return False
    else:
        return True
    


## Testing; 1, 2, 3 ... 

In [None]:
def phase_variants(SNPs_df, bampath, contig, maxdepth = 500):
    
    ## ===== Format inputs and get SNP list ===== ##

    SNPs_set = set(pair for pair in zip(SNPs_df.POS, SNPs_df.ALT))                   
    
    ## ===== Move through the BAM file and get haplotypes ===== ##
    
    # Save the haplotypes in a dictionary
    haplotype_dict = {}

    # Get the start and stop
    start = sorted([pos for pos, alt in SNPs_set])[0]
    stop = sorted([pos for pos, alt in SNPs_set])[-1]

    # Open alignment with pysam
    with pysam.AlignmentFile(bampath, "rb") as bamfile:

        # Get the pileup column for a specific region
        for pileupcolumn in bamfile.pileup(contig, max_depth = maxdepth, start = start, stop = stop, stepper = 'nofilter'):

            # Check the if the position has a target SNP (converted to 0-indexed)
            if pileupcolumn.pos in [pos-1 for pos, alt in SNPs_set]:

                # Iterate over every alignment in the pileup column. 
                for pileupread in pileupcolumn.pileups:

                    # Check if the read is valid and can be parsed
                    if check_read(pileupread.alignment) and not pileupread.is_del and not pileupread.is_refskip:

                        # Save the query name
                        qname = pileupread.alignment.query_name
                        
                        # Save the 1-indexed position
                        pos = pileupcolumn.pos + 1

                        # Save the base at that position in the read
                        alt = pileupread.alignment.query_sequence[pileupread.query_position]

                        # Check if this read has the SNP or not
                        if (pos, alt) in SNPs_set:
                            phase = 1 # The read has the SNP

                        else:
                            phase = 0 # The read doesn't have the SNP

                        # Add the readname to the dictionary  
                        if qname in haplotype_dict.keys():
                            haplotype_dict[qname].append((pos, phase, alt)) 
                        else:
                            haplotype_dict[qname] = [(pos, phase, alt)]
                     
    ## ===== Convert the haplotype dictionary to a dataframe filling in missings ===== ##
    
    # Save the haplotypes with missing values (not covered by reads) filled
    completed_haplotype_dict = {}

    # Iterate over the haplotype dictionary created above
    for qname, alleles in haplotype_dict.items():
        
        # Make a dictionary to fill with observed SNPs for each read
        aa_dict = {tup:"-" for tup in SNPs_set}
        
        # Iterate over all observed alleles
        for allele in alleles:
            
            # If it's 1, then an allele was observed
            if allele[1] == 1:
                # Add this observation based on the key
                aa_dict[(allele[0], allele[2])] = 1
            # If it's a wild type allele 
            elif allele[1] == 0:
                # Check every possible wt allele
                for key in aa_dict.keys():
                    # Check by position
                    if key[0] == allele[0]:
                        # If it's the same positon add a 0
                        aa_dict[key] = 0

        # Add this populated dictionary to the haplotype dictionary
        completed_haplotype_dict[qname] = aa_dict

    
    # Convert this dicitonary into a dataframe and take the transpose
    haplotype_df = pd.DataFrame(completed_haplotype_dict).T
    
    # Replace the '-' with 'NaN'
    haplotype_df = haplotype_df.replace('-', np.nan)
    
    # Return the phased SNP df
    return haplotype_df

In [116]:
def phase_variants(bampath, snps):
    """
    Get the phase of variants by making a dictionary keyed by 
    read name and add allele counts for all polymorphic sites.
    
    Parameters
    ----------
    
    bampath: str
        Path to a BAM file to phase variants for.
    
    snps: list
        A sorted list of the SNPs to phase.
    
        
    Returns
    -------
    
    pandas.DataFrame
        DataFrame with the pairwise phase of all targeted SNPs
    
    """
    
    # Searchable set of alternative alleles at each position 
    alt_alleles = set((pos, alt) for ref, pos, alt in snps)
    # Searchable set of reference alleles at each position 
    ref_alleles = set((pos, ref) for ref, pos, alt in snps)
    
    # First and last position to visit in the pileup column
    start = snps[0][1]
    stop = snps[-1][1]
    
    # Empty dict to store qnames
    qnames = defaultdict(list)
    # Empty dict to store snp pairs
    phased = defaultdict(list)
    
    ## ==== Go through the bam file and identify haplotypes ==== ##
    
    # Open the alignment file
    with pysam.AlignmentFile(bampath, "rb") as bamfile:
        
        # Iterate over the pileup column at each position
        for pileupcolumn in bamfile.pileup(stepper = 'nofilter',
                                           flag_filter = 0,
                                           min_base_quality = 25,
                                           start = start,
                                           stop = stop,):
            
            # Check if the position contains a polymorphic position
            if pileupcolumn.reference_pos + 1 in set(pos for ref, pos, alt in snps):
                
                # Iterate over each read in the pileup column 
                for pileupread in pileupcolumn.pileups:
                    
                    # Check that it's a good read - no deletions, qfails, or skips
                    if check_read(pileupread.alignment) and not pileupread.is_del and not pileupread.is_refskip:

                        # Save the read name
                        qname = pileupread.alignment.query_name

                        # Save the 1-indexed position
                        position = pileupcolumn.pos + 1

                        # Save the base at that position in the read
                        allele = pileupread.alignment.query_sequence[pileupread.query_position]

                        # Check if this read has the SNP or not - assuming bialleleic! 
                        if (position, allele) in alt_alleles:
                            phase = 1 # The read has the alt allele at this position

                        elif (position, allele) in ref_alleles:
                            phase = 0 # The read has the ref allele at this position
                        else:
                            continue 

                        # Add the qname to the dictionary along with phase at position 
                        qnames[qname].append((position, phase, allele))

    ## ==== Collate haplotypes and count for all SNP paris observed ==== ##
    
    for qname, alleles in qnames.items(): 

        # Can't phase anything with a single SNP
        if len(set(alleles)) == 1: 
            continue 

        haplotype = defaultdict(set)
        for pos, phase, allele in alleles:
            haplotype[pos].add(phase)

        # Don't use reads with read pairs that disagree 
        for phases in haplotype.values():
            if len(phases) > 1:
                continue

        # Get a list of allele observations for this read
        allele_obsvs = sorted([(position, list(phase)[0])
                               for position, phase 
                               in haplotype.items()], key = lambda x: x[0])

        # Get the phases of all combinations represented on this read
        for allele_one, allele_two in itertools.combinations(allele_obsvs, 2): 

            allele_pair = (allele_one[0], allele_two[0])

            phasing = f"{allele_one[1]}{allele_two[1]}"

            # Add them to a dictionary indexed by the combination of positions 
            phased[allele_pair].append(phasing)

    # Count the haplotypes for each pair of positions with overlaping reads 
    counts = {comp: Counter(haps) for comp, haps in phased.items()}

    # Convert to a dataframe 
    counts_df = (pd.DataFrame(counts)
                 .T
                 .fillna(0)
                 .reset_index()
                 .rename(columns = {"level_0": "snp_1", "level_1": "snp_2"})
                 .sort_values(by=['snp_1', 'snp_2'])
    )

    # Check that there are no redundant pairs - these would need to be combined 
    assert len(set(
        Counter(
            tuple(sorted((pos_1, pos_2))) 
            for pos_1, pos_2 
            in zip(counts_df.snp_1, counts_df.snp_2))
        .values())) == 1

    return counts_df
                            

In [42]:
# List of all SNPs including the reference allele - assume all sites are biallelic 
snps_list = sorted(
                list(
                    {(REF, POS, ALT) for REF, POS, ALT in
                     zip(snps_df.REF, snps_df.POS, snps_df.ALT)}
                ),
                key = lambda x: x[1])


In [121]:
phase_list = []

for bam in bams:
    
    tissue = " ".join(os.path.basename(bam).split(".")[0].split("_"))
    print(f'Assigning reads for {tissue}')
    
    phase_df = phase_variants(bam, snps_list)
    phase_df["Tissue"] = tissue
    
    phase_list.append(phase_df)
    
final_df = pd.concat(phase_list)
final_df.to_csv("../../config/snp_pairs.csv", index=False)


Assigning reads for Frontal Cortex 3
Assigning reads for Internal Capsule
Assigning reads for Brain Stem
Assigning reads for Cerebellum Nucleus
Assigning reads for Frontal Cortex 1
Assigning reads for UBS
Assigning reads for Parietal Lobe
Assigning reads for Midbrain
Assigning reads for Occipital Lobe
Assigning reads for Temporal Lobe
Assigning reads for Frontal Cortex 2


In [122]:
final_df

Unnamed: 0,snp_1,snp_2,00,10,01,11,Tissue
0,42,96,6402.0,18.0,1.0,0.0,Frontal Cortex 3
1,42,152,5180.0,15.0,162.0,1.0,Frontal Cortex 3
79,42,167,115.0,2.0,1.0,0.0,Frontal Cortex 3
3,42,242,78.0,0.0,0.0,0.0,Frontal Cortex 3
4,42,260,77.0,0.0,0.0,0.0,Frontal Cortex 3
...,...,...,...,...,...,...,...
20995,15382,15799,22.0,0.0,0.0,0.0,Frontal Cortex 2
21005,15382,15855,5.0,0.0,0.0,0.0,Frontal Cortex 2
3,15795,15799,7144.0,1.0,5.0,0.0,Frontal Cortex 2
3808,15795,15855,2989.0,1.0,24.0,0.0,Frontal Cortex 2
