# Needleman-Wunsch Sequence Alignment Algorithm

In [445]:
import numpy as np
import pandas as pd
import sys

In [446]:
# Scoring scheme read by get_cell_score / needleman_wunsch:
# added for an identical character pair, for a gap, and for a differing pair.
match = 1
gap = -2
mismatch = -1


# Alternative input pair (disabled)
#sequence1 = 'GATTACA'
#sequence2 = 'GTCGACGCAAAAAAAAAAA'

# Active input pair used by the driver cells further down
sequence1 = 'TTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGG'
sequence2 = 'CCAAGGTTAASNEUDHTTBKSBDJJGCCCAAGGTTGCDDD'


# Alternative input pair (disabled)
#sequence1 = 'HEAHEE'
#sequence2 = 'PAHE'

# Alternative input pair (disabled)
#sequence1 = 'AGT'
#sequence2 = 'AAGC'


In [447]:

# Direction codes stored in the arrow grid:
# 0 = origin (stop), 1 = up, 2 = left, 4 = diagonal
TB_ORIG = 0
TB_LEFT = 2
TB_UP = 1
TB_DIAG = 4

def needleman_wunsch(seq1: str, seq2: str):
    """Fill the score grid and the traceback arrows for two sequences.

    The grid has one extra row and column for the leading gap border.
    Scores use the module-level ``gap`` value for the border and
    ``get_cell_score`` for every interior cell.

    Args:
        seq1: Sequence laid out along the x axis (first index of the grid).
        seq2: Sequence laid out along the y axis (second index of the grid).

    Returns:
        A ``(score_grid, arrows_list)`` tuple.  ``score_grid`` is an int64
        array of shape ``(len(seq1) + 1, len(seq2) + 1)`` indexed ``[x][y]``.
        ``arrows_list`` is a list of rows indexed ``[y][x]``; every cell is a
        list of ``TB_*`` codes naming the predecessor(s) that gave the
        cell's score.
    """
    len1 = len(seq1)
    len2 = len(seq2)

    # One extra row/column holds the origin and the leading-gap border.
    score_grid = np.zeros((len1 + 1, len2 + 1), dtype=np.int64)
    arrows_list = [[] for _ in range(len2 + 1)]

    # Origin cell: the zero score comes from np.zeros; the arrow marks the
    # end of a traceback.
    arrows_list[0].append([TB_ORIG])

    # Border along seq1 (y == 0): each cell extends the gap run from its left.
    # No cell is written outside the loops, so empty sequences are safe.
    for x in range(1, len1 + 1):
        score_grid[x, 0] = gap + score_grid[x - 1, 0]
        arrows_list[0].append([TB_LEFT])

    # Border along seq2 (x == 0): each cell extends the gap run from above.
    for y in range(1, len2 + 1):
        score_grid[0, y] = gap + score_grid[0, y - 1]
        arrows_list[y].append([TB_UP])

    # Interior cells, filled row by row so arrows_list[y] grows in x order.
    for y in range(1, len2 + 1):
        for x in range(1, len1 + 1):
            above = score_grid[x, y - 1]
            left = score_grid[x - 1, y]
            diag = score_grid[x - 1, y - 1]

            score, arrows = get_cell_score(
                seq1[x - 1], seq2[y - 1], above, left, diag
            )

            score_grid[x, y] = score
            arrows_list[y].append(arrows)

    return score_grid, arrows_list
    
    

In [448]:
def get_cell_score(char1: str, char2: str, above: int, left: int, diag: int):
    """Score one grid cell from its three neighbours.

    Args:
        char1: Character of the first sequence at this cell.
        char2: Character of the second sequence at this cell.
        above: Score of the neighbour reached by a TB_UP step.
        left: Score of the neighbour reached by a TB_LEFT step.
        diag: Score of the neighbour reached by a TB_DIAG step.

    Returns:
        ``(best_score, arrows)`` where ``arrows`` lists every direction code
        whose candidate score equals ``best_score``, in the order
        left, up, diagonal.
    """
    pair_score = match if char1 == char2 else mismatch

    via_diag = diag + pair_score
    via_up = above + gap
    via_left = left + gap

    best = max(via_diag, via_up, via_left)

    # Ties are kept on purpose: each tied direction is a valid predecessor.
    candidates = ((via_left, TB_LEFT), (via_up, TB_UP), (via_diag, TB_DIAG))
    tied_arrows = [code for value, code in candidates if value == best]

    return best, tied_arrows
            

In [449]:
# Glyphs used to render direction codes.
STOP = '⚬'
ARROW_UP = '↑'
ARROW_LEFT = '←'
ARROW_DIAG = '⬉'

# Direction code -> glyph.
arrow_sym = {
    0: STOP,
    1: ARROW_UP,
    2: ARROW_LEFT,
    4: ARROW_DIAG,
}


def arrow_chart(numbers):
    """Translate a 2-D grid of direction codes into a grid of glyphs.

    Prints the shape of ``numbers`` as a side effect.

    Args:
        numbers: Object exposing ``.shape`` and ``[x][y]`` indexing whose
            entries are keys of ``arrow_sym``.

    Returns:
        A list of rows (one per first-axis index), each a list of glyphs.

    Raises:
        KeyError: If an entry is not a key of ``arrow_sym``.
    """
    dims = numbers.shape

    print('shape = ' + str(numbers.shape))

    return [
        [arrow_sym[numbers[row][col]] for col in range(0, dims[1])]
        for row in range(0, dims[0])
    ]

In [450]:
def traceback(sequence1, sequence2, arrow_list):
    """Walk the arrow grid from the last cell to the origin, building alignments.

    Args:
        sequence1: Sequence indexed by the x coordinate of ``arrow_list``.
        sequence2: Sequence indexed by the y coordinate of ``arrow_list``.
        arrow_list: Grid indexed ``[y][x]``; each cell is a non-empty list of
            ``TB_*`` direction codes.

    Returns:
        ``(seq1_list, seq2_list)``: two parallel lists of aligned strings in
        which ``'-'`` marks a gap.  Entry ``i`` of each list forms one
        alignment.

    Note:
        The alternative arrows of a cell are queued only the first time any
        path reaches that cell, so the result can be a subset of all
        alignments encoded in ``arrow_list``.
    """
    from collections import deque

    start_x, start_y = len(arrow_list[0]) - 1, len(arrow_list) - 1

    # Work queue of (x, y, aligned tail 1, aligned tail 2, arrow to follow).
    pending = deque(
        (start_x, start_y, '', '', arrow)
        for arrow in arrow_list[start_y][start_x]
    )

    # Cells whose alternative arrows have already been queued.
    expanded = set()

    # Completed alignments.
    seq1_list = []
    seq2_list = []

    while pending:
        curr_x, curr_y, new_seq_1, new_seq_2, curr_arrow = pending.popleft()

        while curr_x + curr_y > 0:
            # Characters are read only in the branch that consumes them, so a
            # border cell never indexes into an empty sequence.
            if curr_arrow == TB_DIAG:
                new_seq_1 = sequence1[curr_x - 1] + new_seq_1
                new_seq_2 = sequence2[curr_y - 1] + new_seq_2
                curr_x -= 1
                curr_y -= 1
            elif curr_arrow == TB_LEFT:
                new_seq_1 = sequence1[curr_x - 1] + new_seq_1
                new_seq_2 = '-' + new_seq_2
                curr_x -= 1
            else:
                new_seq_1 = '-' + new_seq_1
                new_seq_2 = sequence2[curr_y - 1] + new_seq_2
                curr_y -= 1

            cell_arrows = arrow_list[curr_y][curr_x]
            # This path continues along the first arrow of the new cell.
            curr_arrow = cell_arrows[0]

            # Queue the remaining arrows once per cell, carrying the tail
            # built so far.
            if len(cell_arrows) > 1 and (curr_x, curr_y) not in expanded:
                expanded.add((curr_x, curr_y))
                for alternative in cell_arrows[1:]:
                    pending.append(
                        (curr_x, curr_y, new_seq_1, new_seq_2, alternative)
                    )

        seq1_list.append(new_seq_1)
        seq2_list.append(new_seq_2)

    return seq1_list, seq2_list


In [451]:

# Build the score grid and the traceback arrows for the active input pair.
score_grid, arrows_list = needleman_wunsch(sequence1, sequence2)


In [452]:
s1, s2 = traceback(sequence1, sequence2, arrows_list)

# Print every alignment as two stacked lines followed by a blank separator.
for aligned_first, aligned_second in zip(s1, s2):
    print(aligned_first)
    print(aligned_second)
    print()
    

TTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGG
---CCAAGGTT---AASNEUDHT----TBKSBDJJGC--CCAAGGTTGCDDD--

TTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGG
---CCAAGGTT---AASNEUDHT----TBKSBDJJGC--CCAAGGTTGCDD--D

TTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGG
---CCAAGGTT---AASNEUDHT----TBKSBDJJGC--CCAAGGTTGCDD-D-

TTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGG
---CCAAGGTT---AASNEUDHT----TBKSBDJJG--CCCAAGGTTGCDDD--

TTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGG
---CCAAGGTT---AASNEUDHT----TBKSBDJJG-C-CCAAGGTTGCDDD--

TTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGG
---CCAAGGTT---AASNEUDH----TTBKSBDJJGC--CCAAGGTTGCDDD--

TTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGG
---CCAAGGTT---AASNEUDH---T-TBKSBDJJGC--CCAAGGTTGCDDD--

TTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGG
---CCAAGGTT---AASNEUDH--T--TBKSBDJJGC--CCAAGGTTGCDDD--

TTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGGTTGCCAAGG
---CCAAGGTT---AASNEUDH-T---TBKSBDJJGC--CCAAGGTTGCDDD--

T