In [1]:
import pysam

In [2]:
# Absolute path to the BAM file opened in the next cell. The "_MD" suffix in the
# file name suggests MD tags were added to the alignments — TODO confirm.
bampath = '/projects/ps-yeolab5/ekofman/Sammi/MouseBrainEF1A_SingleCell_EPR_batch2/filtered_possorted_ms_hippo_stamp_bam/filtered_keep_xf25_possorted_genome_with_header.bam_MD.bam'


In [3]:
# Open the BAM in binary read mode ("rb"). The handle is kept open because later
# cells run region queries against it with samfile.fetch().
samfile = pysam.AlignmentFile(bampath, "rb")

In [4]:
# Watson-Crick complement for each upper-case base. 'N' (unknown base call)
# is its own complement so reads containing no-calls can still be processed.
complements = {
    'A': 'T',
    'C': 'G',
    'G': 'C',
    'T': 'A',
    'N': 'N'
}

def reverse_complement(input_seq):
    """
    Get the reverse complement of a sequence.

    Parameters
    ----------
    input_seq : str
        Nucleotide sequence. Upper- and lower-case bases are both accepted.

    Returns
    -------
    str
        The reverse complement, same length as ``input_seq``. The case of each
        base is preserved. Any symbol without a known complement is emitted as
        'N' (or 'n' for a lower-case input symbol) rather than raising.
    """
    new_seq = []
    for base in reversed(input_seq):
        # Look up by upper-case base; fall back to 'N' so that an unexpected
        # symbol never puts None into the list handed to ''.join below.
        complement = complements.get(base.upper(), 'N')
        new_seq.append(complement.lower() if base.islower() else complement)

    return ''.join(new_seq)

def get_positions_from_md_tag(md_tag):
    """
    Figure out which positions are replaced, from the MD tag.

    The tag is walked left to right while tracking a 0-based offset into the
    reference span described by the tag:

    * a run of digits advances the offset by that many matching bases;
    * a base letter is a mismatch: its offset is recorded, then the offset
      advances by one;
    * ``^`` followed by base letters is a deletion from the reference: each
      deleted base advances the offset by one but is NOT recorded.

    Parameters
    ----------
    md_tag : str
        MD tag value, e.g. ``'14G3G25'`` or ``'10^AC5T3'``.

    Returns
    -------
    list of int
        Offsets of every mismatch, in order, followed by one final entry: the
        total number of reference bases described by the tag. That last value
        is an end offset, not a mismatch; it is kept so results for
        deletion-free tags are identical to what this function always returned.

    Raises
    ------
    ValueError
        If ``md_tag`` is empty.
    """
    if not md_tag:
        raise ValueError('Empty MD tag')

    positions = []
    offset = 0           # reference offset of the next base to be described
    digits = ''          # digits of the match-run currently being read
    in_deletion = False  # True while reading the bases that follow a '^'

    for c in md_tag:
        if '0' <= c <= '9':
            digits += c
            # Any number terminates a deletion block.
            in_deletion = False
            continue

        # A non-digit ends the pending match run, if there is one.
        if digits:
            offset += int(digits)
            digits = ''

        if c == '^':
            in_deletion = True
        elif in_deletion:
            # Deleted reference base: occupies a reference position but is
            # not a substitution.
            offset += 1
        else:
            positions.append(offset)
            offset += 1

    if digits:
        offset += int(digits)

    # Trailing end offset (see Returns).
    positions.append(offset)
    return positions


def incorporate_insertions(aligned_sequence, cigar_tuples):
    """
    Update an aligned sequence to reflect any insertions (take away those positions) such
    that it can be better compared base-to-base to a reference sequence.

    Only bases covered by an alignment-match operation (CIGAR ``M``, ``=`` or
    ``X``) are kept; inserted and soft-clipped bases are dropped.

    Parameters
    ----------
    aligned_sequence : str
        Read bases in the orientation they are stored in the alignment,
        including any soft-clipped bases.
    cigar_tuples : iterable of (int, int)
        ``(operation, length)`` pairs using the numeric BAM CIGAR codes
        (0=M, 1=I, 2=D, 3=N, 4=S, 5=H, 6=P, 7==, 8=X).

    Returns
    -------
    str
        The concatenation of all match segments of the read. Deleted reference
        bases are not padded, so for reads with deletions the result is shorter
        than the reference span.
    """
    # Operations whose bases are aligned to the reference and therefore kept.
    kept_ops = {0, 7, 8}
    # Operations that consume bases of the read. D (2), N (3), H (5) and P (6)
    # do not, so the read cursor must stay put for them; advancing on them
    # would skip over (or run past) real read bases, e.g. on spliced reads.
    query_consuming_ops = {0, 1, 4, 7, 8}

    segments = []
    current_pos = 0
    for op, length in cigar_tuples:
        if op in kept_ops:
            segments.append(aligned_sequence[current_pos:current_pos + length])
        if op in query_consuming_ops:
            current_pos += length
    return ''.join(segments)


def incorporate_replaced_pos_info(aligned_seq, positions_replaced):
    """
    Mark the specified positions of a sequence by letter case.

    Parameters
    ----------
    aligned_seq : str
        Sequence to annotate.
    positions_replaced : iterable of int
        0-based indices into ``aligned_seq`` to highlight. Indices outside the
        sequence are ignored.

    Returns
    -------
    indicated_aligned_seq : list of str
        One single-character string per base of ``aligned_seq``: upper case at
        the highlighted positions, lower case everywhere else. Callers join
        this list to obtain a printable string.
    bases_at_pos : list of str
        The upper-cased bases found at the highlighted positions, in sequence
        order.
    """
    # Set membership keeps the per-base check O(1) instead of scanning the
    # position list once for every base.
    highlighted = set(positions_replaced)

    indicated_aligned_seq = []
    bases_at_pos = []
    for i, base in enumerate(aligned_seq):
        if i in highlighted:
            indicated_aligned_seq.append(base.upper())
            bases_at_pos.append(base.upper())
        else:
            indicated_aligned_seq.append(base.lower())
    return indicated_aligned_seq, bases_at_pos

In [5]:

# For every read overlapping the region, print the read and reference sequences
# with mismatching positions in upper case, followed by a table of the
# substitutions reported by the MD tag.
for read in samfile.fetch('16', 3495912, 3505912):
    # get_tag raises KeyError when a tag is absent; without an MD tag there is
    # nothing to compare, so skip the read.
    if not read.has_tag('MD'):
        continue
    md_tag = read.get_tag('MD')
    print('MD tag', md_tag)

    # Are there any replacements? An MD tag made only of digits means every
    # aligned base matches the reference.
    if not any(base in md_tag for base in 'GATC'):
        print('\t\t\t\t\t\tMaybe no edits')
        continue

    positions_replaced = get_positions_from_md_tag(md_tag)

    print(read.cigarstring)
    cigar_tuples = read.cigartuples

    print('Reverse?', read.is_reverse)

    # query_sequence is in reference orientation for reads on either strand.
    # Reverse-complementing get_forward_sequence() only gives that orientation
    # for reverse-strand reads and flips forward-strand reads the wrong way.
    aligned_seq = read.query_sequence
    aligned_seq = incorporate_insertions(aligned_seq, cigar_tuples)
    indicated_aligned_seq, alt_bases = incorporate_replaced_pos_info(aligned_seq, positions_replaced)

    reference_seq = read.get_reference_sequence().lower()
    indicated_reference_seq, ref_bases = incorporate_replaced_pos_info(reference_seq, positions_replaced)

    print('Forward aligned sequence\n', ''.join(indicated_aligned_seq))
    print('Reference sequence\n', ''.join(indicated_reference_seq))
    # 'CR' is printed as the barcode; fall back to a placeholder when absent.
    print('barcode', read.get_tag('CR') if read.has_tag('CR') else 'NA')
    print("Alt\tRef\tPosition")
    # zip stops at the shortest input, which drops the trailing end offset
    # that get_positions_from_md_tag appends after the real mismatches.
    for alt, ref, pos in zip(alt_bases, ref_bases, positions_replaced):
        print(alt, '\t', ref, '\t', pos)

    print()

MD tag 14G3G25G2G26C24
18M2I81M
Reverse? True
Forward aligned sequence
 ttattttagttcctTaaaTttttttctggactacataagtgatgTgaAaaagaattaagtggttagaacatgtaAtttccacccatcaactatgtgtat
Reference sequence
 ttattttagttcctGaaaGttttttctggactacataagtgatgGgaGaaagaattaagtggttagaacatgtaCtttccacccatcaactatgtgtat
barcode CAGCACGGTTCCTAAG
Alt	Ref	Position
T 	 G 	 14
T 	 G 	 18
T 	 G 	 44
A 	 G 	 47
A 	 C 	 74

MD tag 6A2G0T2T5G1C23T27
73M28S
Reverse? True
Forward aligned sequence
 ttcctgGaaCGttAttctgAaAtacataagtgatgggagaaagaaCtaagtggttagaacatgtactttccac
Reference sequence
 ttcctgAaaGTttTttctgGaCtacataagtgatgggagaaagaaTtaagtggttagaacatgtactttccac
barcode CATCAAGCAAACGTGG
Alt	Ref	Position
G 	 A 	 6
C 	 G 	 9
G 	 T 	 10
A 	 T 	 13
A 	 G 	 19
A 	 C 	 21
C 	 T 	 45

MD tag 3G2T12T1C7A3A29
63M38S
Reverse? True
Forward aligned sequence
 ttgAagAtttgttccattcAtTatctctcCacaTatgtataagtgtaaggatgtctggctgtg
Reference sequence
 ttgGagTtttgttccattcTtCatctctcAacaAatgtataagtgtaaggatgtctggctgtg
barcode GTCAAACCATGACAAA
Alt	Ref	P