In [1]:
from Bio import Align
import numpy as np

### Aligner 1

In [None]:
# Aligner 1: only gaps in the query (observed) sequence are tolerated.
a = Align.PairwiseAligner()

# Don't allow gaps in the target sequence or mismatches: both score -inf.
penalty = -np.inf
a.target_gap_score = penalty
a.mismatch_score = penalty
a.match_score = 100

GAP_BASE = 1

# Generally, prefer extending an existing gap over opening a new one
# (extending scores higher than opening).
a.query_internal_extend_gap_score = GAP_BASE + 1
a.query_internal_open_gap_score = GAP_BASE

# Edge gaps score slightly higher than internal gaps, and extending an edge
# gap scores higher than opening one, so a single long edge gap is favoured.
a.query_right_extend_gap_score = a.query_internal_extend_gap_score + 2
a.query_left_extend_gap_score = a.query_right_extend_gap_score
a.query_right_open_gap_score = a.query_internal_extend_gap_score + 1
a.query_left_open_gap_score = a.query_right_open_gap_score



In [None]:
# Display the mismatch and target-gap scores currently set on the aligner.
(a.mismatch_score, a.target_gap_score)

### Aligner 2

In [6]:
# Aligner 2: target gaps and mismatches score -inf; query gaps are rewarded.
a = Align.PairwiseAligner()

# Don't allow gaps in the target sequence or mismatches.
a.target_gap_score = -np.inf
a.mismatch_score = -np.inf
a.match_score = 5

# Generally, prefer extending an existing gap over opening a new one.
# NOTE(review): the general query_* scores are assigned before the end-gap
# scores below; presumably the general setters also touch the end-gap
# values, so keep this order -- confirm against the Biopython docs.
a.query_extend_gap_score = 99
a.query_open_gap_score = 49

# Edge gaps score slightly higher than internal ones, and extending an edge
# gap scores higher than opening one, so a single long edge gap is favoured.
a.query_end_open_gap_score = 50
a.query_end_extend_gap_score = 100

In [21]:
# Aligner 2 (finite penalties): same layout as the -inf variant, but target
# gaps and mismatches get large finite negative scores instead.
a = Align.PairwiseAligner()

# Make gaps in the target sequence and mismatches prohibitively expensive.
a.target_gap_score = -9000000000
a.mismatch_score = -9999
a.match_score = 5

# Generally, prefer extending an existing gap over opening a new one.
# NOTE(review): the general query_* scores are assigned before the end-gap
# scores below; presumably the general setters also touch the end-gap
# values, so keep this order -- confirm against the Biopython docs.
a.query_extend_gap_score = 99
a.query_open_gap_score = 49

# Edge gaps score slightly higher than internal ones, and extending an edge
# gap scores higher than opening one, so a single long edge gap is favoured.
a.query_end_open_gap_score = 50
a.query_end_extend_gap_score = 100

In [39]:
# Aligner 2 (small scores): +1 per match, opening a query gap costs a few
# points, extending one is free (internal) or slightly rewarded (edges).
a = Align.PairwiseAligner()

# Make gaps in the target sequence and mismatches prohibitively expensive.
a.target_gap_score = -9000000000
a.mismatch_score = -9999
a.match_score = 1

# Generally, prefer extending an existing gap over opening a new one.
# NOTE(review): the general query_* scores are assigned before the end-gap
# scores below; presumably the general setters also touch the end-gap
# values, so keep this order -- confirm against the Biopython docs.
a.query_extend_gap_score = 0
a.query_open_gap_score = -4

# Opening a gap at an edge is slightly cheaper than opening an internal one,
# and extending an edge gap earns a small bonus, so a single long edge gap
# is favoured.
a.query_end_open_gap_score = -3
a.query_end_extend_gap_score = .2

### Aligner 3 - no negatives

In [None]:
# Aligner 3: every score is non-negative.
a = Align.PairwiseAligner()

# Gaps in the target sequence and mismatches are not forbidden here; they
# simply score 0, while each match scores 101.
a.target_gap_score = 0
a.mismatch_score = 0
a.match_score = 101

# Generally, prefer extending an existing gap over opening a new one.
# NOTE(review): the general query_* scores are assigned before the end-gap
# scores below; presumably the general setters also touch the end-gap
# values, so keep this order -- confirm against the Biopython docs.
a.query_extend_gap_score = 99
a.query_open_gap_score = 49

# Edge gaps score slightly higher than internal ones, and extending an edge
# gap scores higher than opening one, so a single long edge gap is favoured.
a.query_end_extend_gap_score = 100
a.query_end_open_gap_score = 50


In [None]:
# Display the mismatch and target-gap scores currently set on the aligner.
a.mismatch_score, a.target_gap_score

# Method

In [40]:
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    """Return an iterator over consecutive ``n``-tuples taken from ``iterable``.

    If the final tuple comes up short it is padded with ``fillvalue``.
    """
    # Every slot of an output tuple pulls from the same iterator, so
    # consecutive items of the input land in consecutive slots.
    source = iter(iterable)
    slots = (source,) * n
    return zip_longest(*slots, fillvalue=fillvalue)

def get_mask(true, observed, as_numpy=False):
    """Mask the positions of ``true`` that are covered by ``observed``.

    ``true`` (target) and ``observed`` (query) are aligned with the
    notebook-level aligner ``a`` and the first alignment returned is turned
    into a mask over ``true``: ``"+"`` where a residue is aligned to
    ``observed`` and ``"-"`` where ``observed`` has a gap.

    Every alignment is printed, followed by the score of the one that is
    used. A notice is printed when the first two alignments have equal scores.

    Parameters
    ----------
    true : str
        The complete sequence.
    observed : str
        The sequence to locate inside ``true``; must not be longer than it.
    as_numpy : bool, optional
        If true, return the mask as an integer array (1 for ``"+"``,
        0 for ``"-"``) instead of a string.

    Returns
    -------
    str or numpy.ndarray
        A mask with one entry per position of ``true``.

    Raises
    ------
    AssertionError
        If ``observed`` is longer than ``true``, the alignment path does not
        start at ``(0, 0)``, the score is not positive, or a painted segment
        is neither a match nor a gap in ``observed``.
    """
    # Checked before aligning so an over-long query is reported as such.
    assert len(observed) <= len(true), "Obs seq must be shorter than or same length as true seq."

    alignments = a.align(true, observed)
    if len(alignments) > 1 and alignments[0].score == alignments[1].score:
        print("Multiple equally-scored alignments found.")

    best = alignments[0]
    for alignment in alignments:
        print(alignment)
    print(best.score)

    path = best.path
    assert path[0] == (0, 0), "Path must start at (0,0)."
    assert best.score > 0, "Alignment must be positive"

    # Each path point is (index into true, index into observed). If the first
    # segment advances in both sequences, the alignment starts with a match:
    # start the mask as all "-" and paint the matched segments with "+".
    # Otherwise it starts with a gap: start as all "+" and paint gaps with "-".
    if path[1][0] != 0 and path[1][1] != 0:
        seqmask = "-" * len(true)
        fill_char = "+"
    else:
        seqmask = "+" * len(true)
        fill_char = "-"

    # Pairing the points as (0, 1), (2, 3), ... visits every other segment,
    # i.e. the segments of the same kind as the first one.
    for seg_start, seg_end in grouper(path, 2):
        if seg_end is None:
            # Odd number of points: the last one has no partner.
            break
        assert seg_start[1] == seg_end[1] or \
        (seg_end[0] - seg_start[0] == seg_end[1] - seg_start[1]), f"There must be a gap in the observed seq only, {seg_start}, {seg_end}"

        span = seg_end[0] - seg_start[0]
        seqmask = seqmask[:seg_start[0]] + fill_char * span + seqmask[seg_end[0]:]

    if as_numpy:
        # Compare by value; identity checks on string literals are unreliable.
        return np.asarray([1 if ch == "+" else 0 for ch in seqmask])
    return seqmask
    
    

# Examples

In [41]:
get_mask("SLKIIRHHHSNAAAAANNALDITO", "SLKIIRHHHSNNALDITO")

Multiple equally-scored alignments found.
SLKIIRHHHSNAAAAANNALDITO
||||||||||------||||||||
SLKIIRHHHS------NNALDITO

SLKIIRHHHSNAAAAANNALDITO
|||||||||||------|||||||
SLKIIRHHHSN------NALDITO

14.0


'++++++++++------++++++++'

In [42]:
get_mask("AAAAAAAAGAPAAAAAAA", "AAAAAAAAAAAAAAA", as_numpy=True)

AAAAAAAAGAPAAAAAAA
||||||||---|||||||
AAAAAAAA---AAAAAAA

11.0


array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])

In [43]:
get_mask("AAAAAAAAGAAAAAAAPAAAAAAA", "AAAAAAAAAAAAAAA")

AAAAAAAAGAAAAAAAPAAAAAAA
||||||||---------|||||||
AAAAAAAA---------AAAAAAA

11.0


'++++++++---------+++++++'

In [44]:
get_mask("STARTAAAAAAAAAAAAAAA", "AAAAAAAAAAAAAAA")

STARTAAAAAAAAAAAAAAA
-----|||||||||||||||
-----AAAAAAAAAAAAAAA

12.8


'-----+++++++++++++++'

In [45]:
get_mask("AAAAAAAAAAAAAAAEND", "AAAAAAAAAAAAAAA")

AAAAAAAAAAAAAAAEND
|||||||||||||||---
AAAAAAAAAAAAAAA---

12.399999999999999


'+++++++++++++++---'

In [46]:
get_mask("STARTAAAAAAAAAAAAAAA", "AAAAAAAAAAAAAAA")

STARTAAAAAAAAAAAAAAA
-----|||||||||||||||
-----AAAAAAAAAAAAAAA

12.8


'-----+++++++++++++++'

In [47]:
get_mask("STARTAAAAAAAAAGAPAAAAAA", "AAAAAAAAAAAAAAA")

STARTAAAAAAAAAGAPAAAAAA
-----|||||||||---||||||
-----AAAAAAAAA---AAAAAA

8.8


'-----+++++++++---++++++'

In [48]:
get_mask("STARTAAAAAAAAAGAPAAAAEAAEND", "AAAAAAAAAAAAAAA")

STARTAAAAAAAAAGAPAAAAEAAEND
-----|||||||||---||||-||---
-----AAAAAAAAA---AAAA-AA---

2.1999999999999997


'-----+++++++++---++++-++---'

In [49]:
get_mask("STARTAAAAAAAGAAAAPAAAAAAAAAEND", "AAAAAAAAAAAAAAAA")

STARTAAAAAAAGAAAAPAAAAAAAAAEND
-----|||||||------|||||||||---
-----AAAAAAA------AAAAAAAAA---

7.200000000000001


'-----+++++++------+++++++++---'

In [61]:
get_mask("AAAAAAAAAAAAAAAENDDDAA", "AAAAAAAAAAAAAAAA")

Multiple equally-scored alignments found.
AAAAAAAAAAAAAAAENDDDAA
||||||||||||||------||
AAAAAAAAAAAAAA------AA

AAAAAAAAAAAAAAAENDDDAA
|||||||||||||||------|
AAAAAAAAAAAAAAA------A

12.0


'++++++++++++++------++'

In [51]:
tru = "HHHHHHFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQFQLSAESVGEVYIKSTETGQYLAMDTDGLVYGSQTPNEECLFLERLEENHYNTYISKKHAEKNWFLGLKKNGSVKRGPRTHYGQKAILFLPLPVSSD"
obs = "HHHHHFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQFQLSAESVGEVYIKSTETGQYLAMDTDGLVYGSQTPNEECLFLERLEENHYNTYISKKHAEKNWFLGLKKNGSVKRGPRTHYGQKAILFLPLPV"

In [52]:
get_mask('HHHHHHFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQFQLSAESVGEVYIKSTETGQYLAMDTDGLVYGSQTPNEECLFLERLEENHYNTYISKKHAEKNWFLGLKKNGSVKRGPRTHYGQKAILFLPLPVSSD', 'HHHHHFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQFQLSAESVGEVYIKSTETGQYLAMDTDGLVYGSQTPNEECLFLERLEENHYNTYISKKHAEKNWFLGLKKNGSVKRGPRTHYGQKAILFLPLPV')

HHHHHHFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQFQLSAESVGEVYIKSTETGQYLAMDTDGLVYGSQTPNEECLFLERLEENHYNTYISKKHAEKNWFLGLKKNGSVKRGPRTHYGQKAILFLPLPVSSD
-||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||---
-HHHHHFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQFQLSAESVGEVYIKSTETGQYLAMDTDGLVYGSQTPNEECLFLERLEENHYNTYISKKHAEKNWFLGLKKNGSVKRGPRTHYGQKAILFLPLPV---

136.39999999999998


'-++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---'

In [55]:
obs = 'AFIEWYPRGYGVAFKIKKKIYEKLSKYQKIEVYETEGFGRLLALDGTVQLVTLGERSYHEPLVHPAMLAHPKPKRVLVIGGGDGGTVREVLQHDVDEVIMVEIDEDVIMVSKDLIKIDNGLLEAMLNGKHEKAKLTIGDGFEFIKNNRGFDVIIADSTDPVLFSEEFYRYVYDALNNPGIYVTQAGSVYLFTDELISAYKEMKKVFDRVYYYSFPVIGYASPWAFLVGVKGDIDFTKIDRERAKKLQLEYYDPLMHETLFQMPKYIRETLQ'
tru = 'MERAFIEWYPRGYGVAFKIKKKIYEKLSKYQKIEVYETEGFGRLLALDGTVQLVTLGERSYHEPLVHPAMLAHPKPKRVLVIGGGDGGTVREVLQHDVDEVIMVEIDEDVIMVSKDLIKIDNGLLEAMLNGKHEKAKLTIGDGFEFIKNNRGFDVIIADSTDPVGPAKVLFSEEFYRYVYDALNNPGIYVTQAGSVYLFTDELISAYKEMKKVFDRVYYYSFPVIGYASPWAFLVGVKGDIDFTKIDRERAKKLQLEYYDPLMHETLFQMPKYIRETLQRL'

In [56]:
print(a.align(tru, obs)[0])

MERAFIEWYPRGYGVAFKIKKKIYEKLSKYQKIEVYETEGFGRLLALDGTVQLVTLGERSYHEPLVHPAMLAHPKPKRVLVIGGGDGGTVREVLQHDVDEVIMVEIDEDVIMVSKDLIKIDNGLLEAMLNGKHEKAKLTIGDGFEFIKNNRGFDVIIADSTDPVGPAKVLFSEEFYRYVYDALNNPGIYVTQAGSVYLFTDELISAYKEMKKVFDRVYYYSFPVIGYASPWAFLVGVKGDIDFTKIDRERAKKLQLEYYDPLMHETLFQMPKYIRETLQRL
---||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||-----|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||--
---AFIEWYPRGYGVAFKIKKKIYEKLSKYQKIEVYETEGFGRLLALDGTVQLVTLGERSYHEPLVHPAMLAHPKPKRVLVIGGGDGGTVREVLQHDVDEVIMVEIDEDVIMVSKDLIKIDNGLLEAMLNGKHEKAKLTIGDGFEFIKNNRGFDVIIADSTDP-----VLFSEEFYRYVYDALNNPGIYVTQAGSVYLFTDELISAYKEMKKVFDRVYYYSFPVIGYASPWAFLVGVKGDIDFTKIDRERAKKLQLEYYDPLMHETLFQMPKYIRETLQ--



In [57]:
print(a.align(tru, obs)[1])

MERAFIEWYPRGYGVAFKIKKKIYEKLSKYQKIEVYETEGFGRLLALDGTVQLVTLGERSYHEPLVHPAMLAHPKPKRVLVIGGGDGGTVREVLQHDVDEVIMVEIDEDVIMVSKDLIKIDNGLLEAMLNGKHEKAKLTIGDGFEFIKNNRGFDVIIADSTDPVGPAKVLFSEEFYRYVYDALNNPGIYVTQAGSVYLFTDELISAYKEMKKVFDRVYYYSFPVIGYASPWAFLVGVKGDIDFTKIDRERAKKLQLEYYDPLMHETLFQMPKYIRETLQRL
---|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||-----||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||--
---AFIEWYPRGYGVAFKIKKKIYEKLSKYQKIEVYETEGFGRLLALDGTVQLVTLGERSYHEPLVHPAMLAHPKPKRVLVIGGGDGGTVREVLQHDVDEVIMVEIDEDVIMVSKDLIKIDNGLLEAMLNGKHEKAKLTIGDGFEFIKNNRGFDVIIADSTDPV-----LFSEEFYRYVYDALNNPGIYVTQAGSVYLFTDELISAYKEMKKVFDRVYYYSFPVIGYASPWAFLVGVKGDIDFTKIDRERAKKLQLEYYDPLMHETLFQMPKYIRETLQ--



# Method 2 `pairwise2`

In [38]:
from Bio import pairwise2 as pw

In [60]:
pw.align.globalms("STARTAAAAAAAAAAAAAA", "AAAAAAAAAAAAAAA", 500, -9999, -10, -1, penalize_end_gaps=True)

[('STARTAAAAAAAAAAAAAA', '--A--AAAAAAAAAAAAAA', 7478.0, 0, 19)]