In [1]:
import numpy as np
import pandas as pd

# Week 1: Introduction to Sequence Alignment

In [2]:
'''
Code Challenge: Solve the Change Problem. The DPChange pseudocode is reproduced below for your convenience.
Input: An integer money and an array Coins = (coin1, ..., coind).
Output: The minimum number of coins with denominations Coins that changes money.
'''

def DPChange(money, coins):
    """Return the minimum number of coins needed to change ``money``.

    ``coins`` is either a comma-separated string such as ``'18,5,3,1'`` or an
    iterable of values that ``int`` accepts.  When no combination of the
    denominations adds up to ``money`` the result is ``float('Inf')``.
    """
    if type(coins) == str:
        coins = coins.split(',')
    denominations = [int(value) for value in coins]

    # fewest[a] = minimum number of coins that change the amount a
    fewest = [float('Inf')] * (money + 1)
    fewest[0] = 0

    for amount in range(1, money + 1):
        options = [fewest[amount - coin] + 1
                   for coin in denominations if coin <= amount]
        fewest[amount] = min(options, default=float('Inf'))
    return fewest[money]
    
# Test
# Sample run of DPChange; the output recorded below this cell is 966.
money = 17373
coins = '18,5,3,1'

DPChange(money, coins)

966

In [3]:
'''
Code Challenge: Find the length of a longest path in the Manhattan Tourist Problem.
Input: Integers n and m, followed by an n × (m + 1) matrix Down and an (n + 1) × m matrix Right.
Output: The length of a longest path from source (0, 0) to sink (n, m) in the n × m rectangular grid whose edges are defined by the matrices Down and Right.
'''

def ManhattanTourist(n, m, down, right):
    """Length of a longest source-to-sink path in an n x m Manhattan grid.

    ``down`` (n rows, m + 1 columns) and ``right`` (n + 1 rows, m columns)
    hold the edge weights.  Each may be given as a NumPy array or as text
    with one row per line and single spaces between the entries.
    """
    # Text input is split into rows/entries and reshaped to the expected size.
    if type(down) == str:
        down_rows = [line.split(' ') for line in down.split('\n')]
        down = np.array(down_rows).reshape(n, m + 1)
    down = down.astype(int)

    if type(right) == str:
        right_rows = [line.split(' ') for line in right.split('\n')]
        right = np.array(right_rows).reshape(n + 1, m)
    right = right.astype(int)

    # longest[r, c] = weight of the heaviest path from (0, 0) to (r, c)
    longest = np.zeros((n + 1, m + 1))

    # The first column is reachable only by moving down, the first row only
    # by moving right.
    for r in range(1, n + 1):
        longest[r, 0] = longest[r - 1, 0] + down[r - 1, 0]
    for c in range(1, m + 1):
        longest[0, c] = longest[0, c - 1] + right[0, c - 1]

    for r in range(1, n + 1):
        for c in range(1, m + 1):
            from_above = longest[r - 1, c] + down[r - 1, c]
            from_left = longest[r, c - 1] + right[r, c - 1]
            longest[r, c] = max(from_above, from_left)

    return int(longest[n, m])

# Test
# 4 x 4 grid given as text (one matrix row per line); the output recorded
# below this cell is 34.
n = 4
m = 4
down = '''1 0 2 4 3
4 6 5 2 1
4 4 5 2 1
5 6 8 5 3'''

right = '''3 2 4 0
3 2 4 2
0 7 3 3
3 3 0 2
1 3 2 2'''

ManhattanTourist(n, m, down, right)

34

In [4]:
"""
Code Challenge: Use OutputLCS (reproduced below) to solve the Longest Common Subsequence Problem.
Input: Two strings s and t.
Output: A longest common subsequence of s and t.
"""

def LCSBackTrack(v, w):
    """Build the backtracking table for a longest common subsequence.

    Returns a ``(len(v) + 1) x (len(w) + 1)`` array of strings.  Each interior
    cell is 'deletion' (come from the cell above), 'insertion' (come from
    the cell to the left) or 'diagonal' (matching characters); row 0 and
    column 0 stay empty strings.
    """
    n_rows, n_cols = len(v) + 1, len(w) + 1
    lengths = np.zeros((n_rows, n_cols))
    backtrack = np.zeros((n_rows, n_cols), '<U32')

    for i in range(1, n_rows):
        for j in range(1, n_cols):
            same = v[i - 1] == w[j - 1]
            above = lengths[i - 1, j]
            left = lengths[i, j - 1]
            lengths[i, j] = max(above, left,
                                lengths[i - 1, j - 1] + (1 if same else 0))

            # Ties are resolved in the order: above, left, diagonal.
            if lengths[i, j] == above:
                backtrack[i, j] = 'deletion'
            elif lengths[i, j] == left:
                backtrack[i, j] = 'insertion'
            elif same and lengths[i, j] == lengths[i - 1, j - 1] + 1:
                backtrack[i, j] = 'diagonal'
    return backtrack

def OutputLCS(backtrack, v, i, j, longest_string):
    """Append the common subsequence encoded in ``backtrack`` to a list.

    Starting at cell ``(i, j)`` the table is followed back until row 0 or
    column 0 is reached: 'deletion' moves up, 'insertion' moves left and any
    other entry moves diagonally and selects ``v[i - 1]``.  The selected
    characters are appended to ``longest_string`` in left-to-right order.
    The function returns ``None``; the result is delivered through the list.

    The walk is iterative rather than recursive so that long inputs do not
    exceed Python's recursion limit.
    """
    picked = []  # characters collected back-to-front
    while i > 0 and j > 0:
        move = backtrack[i, j]
        if move == 'deletion':
            i -= 1
        elif move == 'insertion':
            j -= 1
        else:
            picked.append(v[i - 1])
            i -= 1
            j -= 1
    longest_string.extend(reversed(picked))

def LongestCommonSubsequence(v, w):
    """Return a longest common subsequence of the strings ``v`` and ``w``.

    The backtracking table comes from ``LCSBackTrack`` and is read back with
    ``OutputLCS`` starting at the bottom-right cell.
    """
    collected = []
    OutputLCS(LCSBackTrack(v, w), v, len(v), len(w), collected)
    return ''.join(collected)

# Test
# Sample run; the output recorded below this cell is 'AACTTG'.
v = 'AACCTTGG'
w = 'ACACTGTGA'

LongestCommonSubsequence(v, w)

'AACTTG'

In [5]:
'''
Code Challenge: Solve the Longest Path in a DAG Problem.
Input: An integer representing the starting node to consider in a graph, followed by an integer representing the ending node to 
    consider, followed by a list of edges in the graph. The edge notation "0->1:7" indicates that an edge connects node 0 to node 1
    with weight 7.  You may assume a given topological order corresponding to nodes in increasing order.
Output: The length of a longest path in the graph, followed by a longest path. 
'''

def GraphString2Dict(graph):
    """Parse a newline-separated edge list such as ``"0->1:7"``.

    Returns a pair ``(adjacency, weights)``: ``adjacency`` maps a source
    label to the list of its target labels (in input order) and ``weights``
    maps the key ``"source->target"`` to the integer edge weight.  All node
    labels are kept as strings.
    """
    adjacency = {}
    weights = {}

    for edge in graph.split('\n'):
        source, remainder = edge.split('->')
        target, weight = remainder.split(':')
        weights[source + '->' + target] = int(weight)
        adjacency.setdefault(source, []).append(target)

    return adjacency, weights

def Longest_Path(start_point, end_point, graph):
    """Find a longest path between two nodes of a weighted DAG.

    Parameters
    ----------
    start_point, end_point : int or str
        Labels of the first and last node of the path (converted with
        ``str``).
    graph : str
        Newline-separated edges in the form ``"0->1:7"``, parsed with
        ``GraphString2Dict``.

    Returns
    -------
    tuple
        ``(length, path)`` where ``path`` is a string such as
        ``'0->2->3->4'``.

    Raises
    ------
    ValueError
        If ``end_point`` cannot be reached from ``start_point`` or if a
        cycle is reachable from ``start_point``.

    Nodes are processed in topological order, so every edge is relaxed
    exactly once.  Scores are kept in dictionaries, so node labels are not
    limited to any fixed range and negative weights are handled.
    """
    start_point, end_point = str(start_point), str(end_point)
    graph, graph_weight = GraphString2Dict(graph)

    # Only nodes reachable from the start can lie on a path.
    reachable = {start_point}
    pending = [start_point]
    while pending:
        node = pending.pop()
        for successor in graph.get(node, []):
            if successor not in reachable:
                reachable.add(successor)
                pending.append(successor)

    # In-degrees counted inside the reachable subgraph only.
    indegree = dict.fromkeys(reachable, 0)
    for node in reachable:
        for successor in graph.get(node, []):
            indegree[successor] += 1
    if indegree[start_point] != 0:
        raise ValueError('graph contains a cycle through the start node')

    best_score = {start_point: 0}
    predecessor = {}
    ready = [start_point]   # nodes whose incoming edges are all relaxed
    processed = 0
    while ready:
        node = ready.pop()
        processed += 1
        for successor in graph.get(node, []):
            candidate = best_score[node] + graph_weight[node + '->' + successor]
            if successor not in best_score or candidate >= best_score[successor]:
                best_score[successor] = candidate
                predecessor[successor] = node
            indegree[successor] -= 1
            if indegree[successor] == 0:
                ready.append(successor)
    if processed != len(reachable):
        raise ValueError('graph contains a cycle reachable from the start node')

    if end_point not in best_score:
        raise ValueError('no path from %s to %s' % (start_point, end_point))

    # Follow the predecessor links back from the end to the start.
    longest_path = [end_point]
    while longest_path[-1] != start_point:
        longest_path.append(predecessor[longest_path[-1]])

    return (best_score[end_point], '->'.join(reversed(longest_path)))

# Test
# Sample run on a five-node graph; the output recorded below this cell is
# (9, '0->2->3->4').
start_point = 0
end_point   = 4
graph = '''0->1:7
0->2:4
2->3:2
1->4:1
3->4:3'''

Longest_Path(start_point, end_point, graph)

(9, '0->2->3->4')

# Week 2: From Finding a Longest Path to Aligning DNA Strings

In [6]:
'''
Code Challenge: Solve the Global Alignment Problem.
Input: Two protein strings written in the single-letter amino acid alphabet.
Output: The maximum alignment score of these strings followed by an alignment achieving this maximum score. Use the BLOSUM62 scoring matrix for matches and mismatches as well as the indel penalty σ = 5.
'''

# Scoring table read from CSV; the column named 'X' becomes the row index so
# that entries can be looked up by a (row label, column label) pair.
BLOSUM62 = pd.read_csv('data/BLOSUM62.csv').set_index('X')

def BackTrack_ScoreMatrix_Global(v, w, score_matrix, indel_pentalty):
    """Score a global alignment of ``v`` and ``w`` and record the moves.

    ``score_matrix`` is a label-indexed table read with
    ``score_matrix.at[a, b]``; ``indel_pentalty`` is subtracted for every
    gap.  Returns ``(score, backtrack)`` where ``score`` is the bottom-right
    cell of the integer score table and ``backtrack`` holds 'deletion',
    'insertion' or 'diagonal' for every interior cell (borders stay empty).
    """
    n_rows, n_cols = len(v) + 1, len(w) + 1
    scores = np.full((n_rows, n_cols), 0, 'int')
    backtrack = np.zeros((n_rows, n_cols), '<U32')

    # Borders: aligning a prefix against nothing costs one gap per character.
    for i in range(1, n_rows):
        scores[i, 0] = scores[i - 1, 0] - indel_pentalty
    for j in range(1, n_cols):
        scores[0, j] = scores[0, j - 1] - indel_pentalty

    for i in range(1, n_rows):
        for j in range(1, n_cols):
            substitution = score_matrix.at[v[i - 1], w[j - 1]]
            # Listed in tie-breaking order.
            candidates = (
                ('deletion', scores[i - 1, j] - indel_pentalty),
                ('insertion', scores[i, j - 1] - indel_pentalty),
                ('diagonal', scores[i - 1, j - 1] + substitution),
            )
            scores[i, j] = max(value for _, value in candidates)

            for label, value in candidates:
                if scores[i, j] == value:
                    backtrack[i, j] = label
                    break

    return (scores[len(v), len(w)], backtrack)

def AppendAlignment(backtrack, v, w, i, j, aligned_v, aligned_w):
    """Append the alignment columns encoded in ``backtrack`` to two lists.

    The table is followed back from cell ``(i, j)``:

    * 'deletion'  -> column ``v[i - 1]`` over '-', move up;
    * 'insertion' -> column '-' over ``w[j - 1]``, move left;
    * 'jump'      -> stop without emitting the current cell;
    * anything else -> column ``v[i - 1]`` over ``w[j - 1]``, move diagonally.

    Once row 0 or column 0 is reached, the remaining characters of the other
    string are emitted against gaps until ``(0, 0)``, so no prefix of either
    string is dropped.  Columns are appended to ``aligned_v`` / ``aligned_w``
    in left-to-right order; the function returns ``None``.

    The walk is iterative so long alignments do not exceed Python's
    recursion limit.
    """
    cols_v, cols_w = [], []  # columns collected back-to-front

    while i > 0 or j > 0:
        if i > 0 and j > 0:
            move = backtrack[i, j]
            if move == 'jump':
                break
        elif i > 0:
            # Column 0: only characters of v are left.
            move = 'deletion'
        else:
            # Row 0: only characters of w are left.
            move = 'insertion'

        if move == 'deletion':
            cols_v.append(v[i - 1])
            cols_w.append('-')
            i -= 1
        elif move == 'insertion':
            cols_v.append('-')
            cols_w.append(w[j - 1])
            j -= 1
        else:
            cols_v.append(v[i - 1])
            cols_w.append(w[j - 1])
            i -= 1
            j -= 1

    aligned_v.extend(reversed(cols_v))
    aligned_w.extend(reversed(cols_w))

def Score_Alignment_Global(v, w, score_matrix, indel_pentalty):
    """Globally align ``v`` and ``w``.

    Returns ``(score, aligned_v, aligned_w)``: the score computed by
    ``BackTrack_ScoreMatrix_Global`` and the two gapped strings produced by
    reading its backtracking table from the bottom-right cell with
    ``AppendAlignment``.
    """
    score, backtrack = BackTrack_ScoreMatrix_Global(v, w, score_matrix, indel_pentalty)

    top_row, bottom_row = [], []
    AppendAlignment(backtrack, v, w, len(v), len(w), top_row, bottom_row)

    return (score, ''.join(top_row), ''.join(bottom_row))

# Test
# Sample run with the BLOSUM62 table and an indel penalty of 5; the output
# recorded below this cell is (8, 'PLEASANTLY', '-MEA--N-LY').
v = 'PLEASANTLY'
w = 'MEANLY'
score_matrix = BLOSUM62
indel_pentalty = 5

Score_Alignment_Global(v, w, score_matrix, indel_pentalty)

(8, 'PLEASANTLY', '-MEA--N-LY')

In [7]:
'''
Code Challenge: Solve the Local Alignment Problem.
Input: Two protein strings written in the single-letter amino acid alphabet.
Output: The maximum score of a local alignment of the strings, followed by a local alignment of these strings achieving the maximum
     score. Use the PAM250 scoring matrix for matches and mismatches as well as the indel penalty σ = 5.
'''

# Scoring table read from CSV; the column named 'X' becomes the row index so
# that entries can be looked up by a (row label, column label) pair.
PAM250 = pd.read_csv('data/PAM250.csv').set_index('X')

def BackTrack_ScoreMatrix_Local(v, w, score_matrix, indel_pentalty=5):
    """Fill the local-alignment score table for ``v`` and ``w``.

    Parameters
    ----------
    v, w : str
        Strings to align.
    score_matrix : label-indexed table
        Substitution scores, read with ``score_matrix.at[a, b]``.
    indel_pentalty : number, optional
        Cost of one gap (default 5).

    Returns
    -------
    tuple
        ``(score, backtrack, i, j)``: the highest cell value, the table of
        moves ('jump', 'deletion', 'insertion', 'diagonal') and the row and
        column of the highest cell.  If several cells share the maximum, the
        first one in row-major order is returned.
    """
    n_rows, n_cols = len(v) + 1, len(w) + 1
    matrix = np.full((n_rows, n_cols), 0, 'int')
    backtrack = np.zeros((n_rows, n_cols), '<U32')

    # Row 0 and column 0 stay at 0: a local alignment may start anywhere
    # without paying for the skipped prefixes.
    for i in range(1, n_rows):
        for j in range(1, n_cols):
            from_above = matrix[i - 1, j] - indel_pentalty
            from_left = matrix[i, j - 1] - indel_pentalty
            from_diag = matrix[i - 1, j - 1] + score_matrix.at[v[i - 1], w[j - 1]]

            matrix[i, j] = max(0, from_above, from_left, from_diag)

            if matrix[i, j] == 0:
                backtrack[i, j] = 'jump'
            elif matrix[i, j] == from_above:
                backtrack[i, j] = 'deletion'
            elif matrix[i, j] == from_left:
                backtrack[i, j] = 'insertion'
            elif matrix[i, j] == from_diag:
                backtrack[i, j] = 'diagonal'

    # argmax yields exactly one position even when the maximum is tied.
    i_start, j_start = np.unravel_index(np.argmax(matrix), matrix.shape)

    return (matrix[i_start, j_start], backtrack, int(i_start), int(j_start))

def Score_Alignment_Local(v, w, score_matrix, indel_pentalty=5):
    """Best local alignment of ``v`` and ``w``.

    Returns ``(score, aligned_v, aligned_w)``.  The alignment is read back
    from the highest-scoring cell and ends at the first 'jump' cell or as
    soon as row 0 or column 0 is reached, so only the aligned substrings are
    reported.  With no positive-scoring pair the result is ``(0, '', '')``.
    """
    score, backtrack, i, j = BackTrack_ScoreMatrix_Local(v, w, score_matrix, indel_pentalty)

    cols_v, cols_w = [], []  # columns collected back-to-front
    while i > 0 and j > 0:
        move = backtrack[i, j]
        if move == 'jump':
            break
        if move == 'deletion':
            cols_v.append(v[i - 1])
            cols_w.append('-')
            i -= 1
        elif move == 'insertion':
            cols_v.append('-')
            cols_w.append(w[j - 1])
            j -= 1
        else:
            cols_v.append(v[i - 1])
            cols_w.append(w[j - 1])
            i -= 1
            j -= 1

    aligned_v = ''.join(reversed(cols_v))
    aligned_w = ''.join(reversed(cols_w))

    return (score, aligned_v, aligned_w)

# Test
# Sample run with the PAM250 table; the output recorded below this cell is
# (15, 'EANL-Y', 'ENALTY').
v = 'MEANLY'
w = 'PENALTY'
score_matrix = PAM250

Score_Alignment_Local(v, w, score_matrix)

(15, 'EANL-Y', 'ENALTY')

In [8]:
'''
Edit Distance Problem: Find the edit distance between two strings.
Input: Two strings.
Output: The edit distance between these strings.
'''

def HammingDistance(string1, string2):
    """Count the positions at which the two strings differ.

    Positions are paired with ``zip``, so when the lengths differ only the
    first ``min(len(string1), len(string2))`` positions are compared.
    """
    return sum(1 for base1, base2 in zip(string1, string2) if base1 != base2)

def EditDistance(v, w):
    """Return the edit (Levenshtein) distance between ``v`` and ``w``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions that turn ``v`` into ``w``.  It is computed
    directly with a two-row dynamic programme, so it works for any
    characters (not only the letters of a scoring table), for empty strings,
    and does not depend on reading back an alignment.
    """
    # previous[j] = distance between the processed prefix of v and w[:j]
    previous = list(range(len(w) + 1))
    for i, char_v in enumerate(v, start=1):
        current = [i]  # turning v[:i] into '' takes i deletions
        for j, char_w in enumerate(w, start=1):
            current.append(min(previous[j] + 1,                        # deletion
                               current[j - 1] + 1,                     # insertion
                               previous[j - 1] + (char_v != char_w)))  # (mis)match
        previous = current

    edit_distance = previous[len(w)]

    return edit_distance

# Test
# Sample run; the output recorded below this cell is 5.
v = 'PLEASANTLY'
w = 'MEANLY'

EditDistance(v, w)

5

In [9]:
'''
Code Challenge: Solve the Fitting Alignment Problem.
Input: Two nucleotide strings v and w, where v has length at most 1000 and w has length at most 100.
Output: A highest-scoring fitting alignment between v and w. Use the simple scoring method in which matches count +1 and both the
     mismatch and indel penalties are 1.
'''

def BackTrack_ScoreMatrix_Fitting(v, w, score_matrix, indel_pentalty):
    """Fill the score table for fitting ``w`` into a substring of ``v``.

    ``score_matrix`` is a label-indexed table read with
    ``score_matrix.at[a, b]``; ``indel_pentalty`` is the cost of one gap.

    Column 0 is 0 everywhere (skipping a prefix of ``v`` is free) while
    row 0 is charged one gap per character, because every character of ``w``
    has to take part in the alignment.

    Returns ``(matrix, backtrack)``: the integer score table and the table of
    moves ('deletion', 'insertion', 'diagonal'); row 0 is marked
    'insertion', column 0 stays empty.
    """
    n_rows, n_cols = len(v) + 1, len(w) + 1
    matrix = np.full((n_rows, n_cols), 0, 'int')
    backtrack = np.zeros((n_rows, n_cols), '<U32')

    for j in range(1, n_cols):
        matrix[0, j] = matrix[0, j - 1] - indel_pentalty
        backtrack[0, j] = 'insertion'

    for i in range(1, n_rows):
        for j in range(1, n_cols):
            from_above = matrix[i - 1, j] - indel_pentalty
            from_left = matrix[i, j - 1] - indel_pentalty
            from_diag = matrix[i - 1, j - 1] + score_matrix.at[v[i - 1], w[j - 1]]

            matrix[i, j] = max(from_above, from_left, from_diag)

            if matrix[i, j] == from_above:
                backtrack[i, j] = 'deletion'
            elif matrix[i, j] == from_left:
                backtrack[i, j] = 'insertion'
            elif matrix[i, j] == from_diag:
                backtrack[i, j] = 'diagonal'

    return (matrix, backtrack)

def AppendAlignment_Fitting(backtrack, v, w, i, j, aligned_v, aligned_w):
    """Append the fitting alignment that ends at cell ``(i, j)``.

    The table is followed back until column 0 is reached, i.e. until all of
    ``w[:j]`` has been placed.  In row 0 the remaining characters of ``w``
    are emitted against gaps.  Columns are appended to ``aligned_v`` /
    ``aligned_w`` in left-to-right order; the function returns ``None``.
    """
    cols_v, cols_w = [], []  # columns collected back-to-front

    while j > 0:
        move = backtrack[i, j] if i > 0 else 'insertion'
        if move == 'deletion':
            cols_v.append(v[i - 1])
            cols_w.append('-')
            i -= 1
        elif move == 'insertion':
            cols_v.append('-')
            cols_w.append(w[j - 1])
            j -= 1
        else:
            cols_v.append(v[i - 1])
            cols_w.append(w[j - 1])
            i -= 1
            j -= 1

    aligned_v.extend(reversed(cols_v))
    aligned_w.extend(reversed(cols_w))

def Score_Alignment_Fitting(v, w, score_matrix, indel_pentalty):
    """Highest-scoring fitting alignment of ``w`` against a substring of ``v``.

    Returns ``(score, aligned_v, aligned_w)``.  The alignment ends in the
    last column at the row with the highest score (the first such row when
    tied); the reported score is the value of exactly that cell.
    """
    matrix, backtrack = BackTrack_ScoreMatrix_Fitting(v, w, score_matrix, indel_pentalty)

    j = len(w)
    # Every row of the last column is a legal end point, including the last.
    i = int(np.argmax(matrix[:, j]))
    score = matrix[i, j]

    aligned_v = []
    aligned_w = []

    AppendAlignment_Fitting(backtrack, v, w, i, j, aligned_v, aligned_w)

    aligned_v = ''.join(aligned_v)
    aligned_w = ''.join(aligned_w)

    return (score, aligned_v, aligned_w)

# Test
# 20 x 20 scoring table with +1 on the diagonal and -1 elsewhere, labelled
# with PAM250's row and column labels.  The output recorded below this cell
# is (2, 'TAGGCTTA', 'TAGA-T-A').
score_matrix         = (pd.DataFrame(np.identity(20)) * 2) - 1
score_matrix.columns = PAM250.columns
score_matrix.index   = PAM250.index
v = 'GTAGGCTTAAGGTTA'
w = 'TAGATA'
indel_pentalty = 1

Score_Alignment_Fitting(v, w, score_matrix, indel_pentalty)

(2, 'TAGGCTTA', 'TAGA-T-A')

In [10]:
'''
Code Challenge: Solve the Overlap Alignment Problem.
Input: Two strings v and w, each of length at most 1000.
Output: The score of an optimal overlap alignment of v and w, followed by an alignment of a suffix v' of v and a prefix w' of w
     achieving this maximum score. Use an alignment score in which matches count +1 and both the mismatch and indel penalties are 2.
'''

def BackTrack_ScoreMatrix_Overlap(v, w, score_matrix, indel_pentalty):
    """Fill the score table for aligning a suffix of ``v`` to a prefix of ``w``.

    ``score_matrix`` is a label-indexed table read with
    ``score_matrix.at[a, b]``; ``indel_pentalty`` is the cost of one gap.

    Column 0 is 0 everywhere (the suffix of ``v`` may start anywhere for
    free) while row 0 is charged one gap per character, because the prefix
    of ``w`` always starts at its first character.

    Returns ``(matrix, backtrack)``: the integer score table and the table of
    moves ('deletion', 'insertion', 'diagonal'); row 0 is marked
    'insertion', column 0 stays empty.
    """
    n_rows, n_cols = len(v) + 1, len(w) + 1
    matrix = np.full((n_rows, n_cols), 0, 'int')
    backtrack = np.zeros((n_rows, n_cols), '<U32')

    for j in range(1, n_cols):
        matrix[0, j] = matrix[0, j - 1] - indel_pentalty
        backtrack[0, j] = 'insertion'

    for i in range(1, n_rows):
        for j in range(1, n_cols):
            from_above = matrix[i - 1, j] - indel_pentalty
            from_left = matrix[i, j - 1] - indel_pentalty
            from_diag = matrix[i - 1, j - 1] + score_matrix.at[v[i - 1], w[j - 1]]

            matrix[i, j] = max(from_above, from_left, from_diag)

            if matrix[i, j] == from_above:
                backtrack[i, j] = 'deletion'
            elif matrix[i, j] == from_left:
                backtrack[i, j] = 'insertion'
            elif matrix[i, j] == from_diag:
                backtrack[i, j] = 'diagonal'

    return (matrix, backtrack)

def Score_Alignment_Overlap(v, w, score_matrix, indel_pentalty):
    """Best overlap alignment of a suffix of ``v`` with a prefix of ``w``.

    Returns ``(score, aligned_v, aligned_w)``.  The alignment ends in the
    last row at the column with the highest score (the right-most such
    column when tied) and is read back until column 0 is reached.  When no
    overlap scores above 0 the result is the empty overlap ``(0, '', '')``.
    """
    matrix, backtrack = BackTrack_ScoreMatrix_Overlap(v, w, score_matrix, indel_pentalty)

    i = len(v)
    last_row = matrix[i, :]
    j = int(np.where(last_row == np.max(last_row))[0][-1])
    score = matrix[i, j]

    cols_v, cols_w = [], []  # columns collected back-to-front
    while j > 0:
        # In row 0 only characters of w are left to place.
        move = backtrack[i, j] if i > 0 else 'insertion'
        if move == 'deletion':
            cols_v.append(v[i - 1])
            cols_w.append('-')
            i -= 1
        elif move == 'insertion':
            cols_v.append('-')
            cols_w.append(w[j - 1])
            j -= 1
        else:
            cols_v.append(v[i - 1])
            cols_w.append(w[j - 1])
            i -= 1
            j -= 1

    aligned_v = ''.join(reversed(cols_v))
    aligned_w = ''.join(reversed(cols_w))

    return (score, aligned_v, aligned_w)

# Test
# 20 x 20 scoring table with +1 on the diagonal and -2 elsewhere, labelled
# with PAM250's row and column labels.  The output recorded below this cell
# is (1, 'HEAE', 'HEAG').
score_matrix         = (pd.DataFrame(np.identity(20)) * 3) - 2
score_matrix.columns = PAM250.columns
score_matrix.index   = PAM250.index

v = 'PAWHEAE'
w = 'HEAGAWGHEE'
indel_pentalty = 2

Score_Alignment_Overlap(v, w, score_matrix, indel_pentalty)

(1, 'HEAE', 'HEAG')