# Smith-Waterman Algorithm

In [1]:
import numpy as np
import pandas as pd

In [2]:
def read_fasta(filename):
    """
    Read a FASTA file and return its sequence data as one string.

    Every line is stripped of surrounding whitespace. Lines beginning with
    ">" (the label/header lines) are skipped, and all remaining lines are
    concatenated in file order.
    """
    with open(filename, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # Header lines carry only the record label, not residues.
        return "".join(
            text for text in stripped_lines if not text.startswith(">")
        )

In [None]:
class SmithWaterman:
    """
    Performs the Smith-Waterman local alignment algorithm.

    Constructing an instance runs the whole pipeline: the scoring matrix is
    filled, the best-scoring cell is located, and the alignment is traced
    back from it. Results are exposed as attributes:

    - ``matrix``: (len(seq1) + 1) x (len(seq2) + 1) integer scoring matrix.
    - ``score`` / ``max_score``: highest value in the matrix.
    - ``max_pos``: (row, col) of the first cell (row-major order) holding
      that value.
    - ``alignment``: tuple ``(aligned_seq1, aligned_seq2)`` of gapped strings.
    - ``traceback_coords``: list of (row, col) cells visited by the traceback,
      starting at ``max_pos``.

    Sign conventions: ``match_score`` and ``mismatch_penalty`` are *added* to
    the diagonal cell (so a penalising mismatch is negative), whereas
    ``gap_penalty`` is *subtracted* from the neighbouring cell (so a
    penalising gap is positive).
    """

    def __init__(self, seq1, seq2, match_score=1, mismatch_penalty=-1, gap_penalty=2):

        self.seq1 = seq1
        self.seq2 = seq2
        self.match_score = match_score
        self.mismatch_penalty = mismatch_penalty
        self.gap_penalty = gap_penalty

        # One extra row/column for the all-zero border of the matrix.
        self.rows = len(seq1) + 1
        self.cols = len(seq2) + 1

        self.max_score = 0
        self.max_pos = (0, 0)

        self.fill_matrix()

        self.score = self.find_score()
        self.alignment = self.traceback()

    def _substitution_score(self, i, j):
        """Score for pairing seq1[i - 1] with seq2[j - 1] (matrix cell i, j)."""
        if self.seq1[i - 1] == self.seq2[j - 1]:
            return self.match_score
        return self.mismatch_penalty

    def fill_matrix(self):
        """
        Build ``self.matrix`` using the Smith-Waterman recurrence.

        Each cell is the maximum of 0, the diagonal neighbour plus the
        match/mismatch score, and the upper/left neighbours minus the gap
        penalty. Row 0 and column 0 stay zero.
        """
        # Initialize the scoring matrix with zeros
        self.matrix = np.zeros((self.rows, self.cols), dtype=int)

        # Fill the scoring matrix
        for i in range(1, self.rows):
            for j in range(1, self.cols):
                score_diag = self.matrix[i - 1, j - 1] + self._substitution_score(i, j)
                score_up = self.matrix[i - 1, j] - self.gap_penalty
                score_left = self.matrix[i, j - 1] - self.gap_penalty

                self.matrix[i, j] = max(0, score_diag, score_up, score_left)

    def find_score(self):
        """
        Locate the highest-scoring cell and return its score.

        Updates ``self.max_score`` and ``self.max_pos``. Ties are resolved in
        favour of the first cell in row-major order; an all-zero matrix gives
        a score of 0 at position (0, 0).
        """
        # argmax scans the flattened (row-major) matrix and returns the first
        # maximum, which is then converted back to a (row, col) pair.
        flat_index = int(np.argmax(self.matrix))
        i, j = divmod(flat_index, self.cols)

        self.max_score = int(self.matrix[i, j])
        self.max_pos = (i, j)

        return self.max_score

    def traceback(self):
        """
        Trace the alignment back from ``self.max_pos`` until a zero cell.

        Stores the visited cells in ``self.traceback_coords`` and returns
        ``(aligned_seq1, aligned_seq2)``. After the local alignment ends, the
        unaligned prefix of seq1 and then of seq2 is prepended, each paired
        with gaps in the other string, so both returned strings have equal
        length. Nothing after ``max_pos`` is appended.
        """
        i, j = self.max_pos
        # Columns are collected end-to-start and reversed once at the end,
        # which avoids repeatedly prepending to a string.
        rev_seq1, rev_seq2 = [], []
        traceback_coords = []

        # Traceback and create the alignment
        while self.matrix[i, j] != 0:
            traceback_coords.append((i, j))
            current_score = self.matrix[i, j]
            if current_score == self.matrix[i - 1, j - 1] + self._substitution_score(i, j):
                rev_seq1.append(self.seq1[i - 1])
                rev_seq2.append(self.seq2[j - 1])
                i -= 1
                j -= 1
            # The gap penalty is subtracted when the matrix is filled, so the
            # same sign must be used here to recognise a vertical move.
            elif current_score == self.matrix[i - 1, j] - self.gap_penalty:
                rev_seq1.append(self.seq1[i - 1])
                rev_seq2.append("-")
                i -= 1
            else:
                # A non-zero cell that is neither diagonal nor vertical must
                # have come from the left neighbour.
                rev_seq1.append("-")
                rev_seq2.append(self.seq2[j - 1])
                j -= 1

        self.traceback_coords = traceback_coords

        # Pad with the remaining (unaligned) prefix of seq1 ...
        while i > 0:
            rev_seq1.append(self.seq1[i - 1])
            rev_seq2.append("-")
            i -= 1

        # ... and then with the remaining prefix of seq2.
        while j > 0:
            rev_seq1.append("-")
            rev_seq2.append(self.seq2[j - 1])
            j -= 1

        return "".join(reversed(rev_seq1)), "".join(reversed(rev_seq2))

    def display_matrix(self):
        '''
        Returns the scoring matrix with the traceback labeled in cyan
        '''
        df = pd.DataFrame(self.matrix)
        # Set membership keeps the per-cell check O(1).
        path_cells = set(self.traceback_coords)
        n_cols = df.shape[1]

        def highlight(row):
            # row.name is the row's index label, i.e. the matrix row number.
            return [
                "background-color: cyan" if (row.name, col_idx) in path_cells else ""
                for col_idx in range(n_cols)
            ]

        return df.style.apply(highlight, axis=1)

In [34]:
# Small worked example using the default scoring parameters.
example = SmithWaterman("AGGTA", "ACGT")
print(f"Score: {example.score}\n")
# alignment is a pair of gapped strings: index 0 is built from the first
# sequence passed in, index 1 from the second.
print(f"Alignment:\n{example.alignment[0]}\n{example.alignment[1]}")

# Trailing expression: yields the styled scoring matrix with the traceback
# cells highlighted.
example.display_matrix()

Score: 2

Alignment:
--AGGT
AC--GT


Unnamed: 0,0,1,2,3,4
0,0,0,0,0,0
1,0,1,0,0,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,0,2
5,0,1,0,0,0


## Real-World Examples

In [35]:
# Load one sequence per species from FASTA files under data/.
# NOTE(review): presumably beta-globin (HBB) protein sequences for human,
# mouse and pigeon, judging by the file names — confirm against the data.
human_hbb = read_fasta("data/HBB_HUMAN.fasta")
mouse_hbb = read_fasta("data/HBB1_MOUSE.fasta")
pigeon_hbb = read_fasta("data/HBB_COLLI.fasta")

In [36]:
# Align the human sequence against the mouse sequence with the default
# scoring parameters, then print the best score and both gapped strings.
humanVmouse = SmithWaterman(human_hbb, mouse_hbb)
print(f"Score: {humanVmouse.score}\n")
print(f"Alignment:\n{humanVmouse.alignment[0]}\n{humanVmouse.alignment[1]}")

Score: 89

Alignment:
MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH
MVHLTDAEKAAVSCLWGKVNSDEVGGEALGRLLVVYPWTQRYFDSFGDLSSASAIMGNAKVKAHGKKVITAFNDGLNHLDSLKGTFASLSELHCDKLHVDPENFRLLGNMIVIVLGHHLGKDFTPAAQAAFQKVVAGVATALAHKYH


In [37]:
# Align the human sequence against the pigeon sequence with the default
# scoring parameters, then print the best score and both gapped strings.
humanVpigeon = SmithWaterman(human_hbb, pigeon_hbb)
print(f"Score: {humanVpigeon.score}\n")
print(f"Alignment:\n{humanVpigeon.alignment[0]}\n{humanVpigeon.alignment[1]}")

Score: 58

Alignment:
--------------MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH
VHWSAEEKQLITSI---------------WGKVNVADCGAEALARLLIVYPWTQRFFSSFGNLSSATAISGNPNVKAHGKKVLTSFGDAVKNLDNIKGTFAQLSELHCDKLHVDPENFRLLGDILVIILAAHFGKDFTPECQAAWQKLVRVVAHALARKYH
