In [1]:
import numpy as np
import pandas as pd
import copy

# Week 1: Introduction to Sequence Alignment

In [2]:
'''
Code Challenge: Solve the Change Problem. The DPChange pseudocode is reproduced below for your convenience.
Input: An integer money and an array Coins = (coin1, ..., coind).
Output: The minimum number of coins with denominations Coins that changes money.
'''

def DPChange(money, coins):
    """Return the minimum number of coins needed to change ``money``.

    ``coins`` is either a comma-separated string such as ``'18,5,3,1'`` or an
    iterable of values accepted by ``int``.  When the amount cannot be formed
    from the denominations the result is ``float('Inf')``.
    """
    if type(coins) is str:
        coins = coins.split(',')
    denominations = [int(value) for value in coins]

    # fewest[a] is the best coin count found for amount a; amount 0 needs none.
    fewest = [float('Inf')] * (money + 1)
    fewest[0] = 0

    for amount in range(1, money + 1):
        # One candidate per denomination that still fits into the amount.
        reachable = [fewest[amount - coin] + 1
                     for coin in denominations if coin <= amount]
        fewest[amount] = min(reachable, default=float('Inf'))

    return fewest[money]
    
# Test
money = 17373
coins = '18,5,3,1'

DPChange(money, coins)

966

In [3]:
'''
Code Challenge: Find the length of a longest path in the Manhattan Tourist Problem.
Input: Integers n and m, followed by an n × (m + 1) matrix Down and an (n + 1) × m matrix Right.
Output: The length of a longest path from source (0, 0) to sink (n, m) in the n × m rectangular grid whose edges are defined by the matrices Down and Right.
'''

def _as_int_grid(grid, shape):
    """Coerce ``grid`` (text block, nested sequence or array) to an int array of ``shape``."""
    if isinstance(grid, str):
        # split() without an argument tolerates repeated and trailing blanks,
        # and strip() drops a trailing newline that would create an empty row.
        grid = [row.split() for row in grid.strip().splitlines()]
    return np.array(grid).astype(int).reshape(shape)


def ManhattanTourist(n, m, down, right):
    """Length of a longest source-to-sink path in an n x m Manhattan grid.

    Parameters
    ----------
    n, m : int
        Grid dimensions; nodes are (0, 0) .. (n, m).
    down : str, nested sequence or ndarray
        n x (m + 1) weights of the vertical edges.  A string holds one row per
        line with whitespace-separated numbers.
    right : str, nested sequence or ndarray
        (n + 1) x m weights of the horizontal edges, same formats as ``down``.

    Returns
    -------
    int
        Weight of the heaviest path from (0, 0) to (n, m) that only moves down
        or right.
    """
    down = _as_int_grid(down, (n, m + 1))
    right = _as_int_grid(right, (n + 1, m))

    longest = np.zeros((n + 1, m + 1), dtype=int)
    # The first column / row can only be reached by walking straight down / right.
    longest[1:, 0] = np.cumsum(down[:, 0])
    longest[0, 1:] = np.cumsum(right[0, :])

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            longest[i, j] = max(longest[i - 1, j] + down[i - 1, j],
                                longest[i, j - 1] + right[i, j - 1])

    return int(longest[n, m])

# Test
n = 4
m = 4
down = '''1 0 2 4 3
4 6 5 2 1
4 4 5 2 1
5 6 8 5 3'''

right = '''3 2 4 0
3 2 4 2
0 7 3 3
3 3 0 2
1 3 2 2'''

ManhattanTourist(n, m, down, right)

34

In [4]:
"""
Code Challenge: Use OutputLCS (reproduced below) to solve the Longest Common Subsequence Problem.
Input: Two strings s and t.
Output: A longest common subsequence of s and t.
"""

def LCSBackTrack(v, w):
    """Build the backtracking table for a longest common subsequence of v and w.

    Returns a (len(v) + 1) x (len(w) + 1) array of strings.  Interior cells
    hold 'deletion' (came from the cell above), 'insertion' (from the left) or
    'diagonal' (matched symbol); cells in the first row and column stay ''.
    """
    rows, cols = len(v) + 1, len(w) + 1
    lengths = np.zeros((rows, cols))
    pointers = np.zeros((rows, cols), '<U32')

    for i in range(1, rows):
        for j in range(1, cols):
            symbols_match = v[i - 1] == w[j - 1]
            above = lengths[i - 1, j]
            left = lengths[i, j - 1]
            corner = lengths[i - 1, j - 1]

            best = max(above, left, corner + (1 if symbols_match else 0))
            lengths[i, j] = best

            # Ties are resolved in the order deletion, insertion, diagonal.
            if best == above:
                pointers[i, j] = 'deletion'
            elif best == left:
                pointers[i, j] = 'insertion'
            elif symbols_match and best == corner + 1:
                pointers[i, j] = 'diagonal'

    return pointers

def OutputLCS(backtrack, v, i, j, longest_string):
    """Append the common subsequence encoded in ``backtrack`` to ``longest_string``.

    Parameters
    ----------
    backtrack : 2-D table indexable as ``backtrack[i, j]``
        Holds 'deletion', 'insertion' or any other value (treated as a
        diagonal / match step).
    v : sequence
        The string whose symbols are emitted on diagonal steps.
    i, j : int
        Cell to start walking back from.
    longest_string : list
        Output list; matched symbols are appended in left-to-right order.

    The walk is iterative so that inputs longer than the interpreter's
    recursion limit do not raise RecursionError.
    """
    matched = []  # collected back to front while walking towards the origin
    while i != 0 and j != 0:
        step = backtrack[i, j]
        if step == 'deletion':
            i -= 1
        elif step == 'insertion':
            j -= 1
        else:
            matched.append(v[i - 1])
            i -= 1
            j -= 1
    longest_string.extend(reversed(matched))

def LongestCommonSubsequence(v, w):
    """Return one longest common subsequence of ``v`` and ``w`` as a string.

    The backtracking table comes from ``LCSBackTrack`` and is walked from the
    bottom-right cell by ``OutputLCS``.
    """
    collected = []
    OutputLCS(LCSBackTrack(v, w), v, len(v), len(w), collected)
    return ''.join(collected)

# Test
v = 'AACCTTGG'
w = 'ACACTGTGA'

LongestCommonSubsequence(v, w)

'AACTTG'

In [5]:
'''
Code Challenge: Solve the Longest Path in a DAG Problem.
Input: An integer representing the starting node to consider in a graph, followed by an integer representing the ending node to 
    consider, followed by a list of edges in the graph. The edge notation "0->1:7" indicates that an edge connects node 0 to node 1
    with weight 7.  You may assume a given topological order corresponding to nodes in increasing order.
Output: The length of a longest path in the graph, followed by a longest path. 
'''

def GraphString2Dict(graph):
    """Parse an edge list such as ``'0->1:7\\n0->2:4'``.

    Each non-blank line has the form ``from->to:weight``.  Blank lines and
    surrounding whitespace (e.g. a trailing newline) are ignored.

    Returns
    -------
    (graph_dict, graph_weight)
        ``graph_dict`` maps a node label to the list of its successor labels
        (in input order); ``graph_weight`` maps the string ``'from->to'`` to
        the integer edge weight.  Node labels are kept as strings.
    """
    graph_dict = {}
    graph_weight = {}

    for line in graph.splitlines():
        line = line.strip()
        if not line:
            continue
        from_, remainder = line.split('->')
        to_, weight = remainder.split(':')
        graph_weight[from_ + '->' + to_] = int(weight)
        graph_dict.setdefault(from_, []).append(to_)

    return (graph_dict, graph_weight)

def Longest_Path(start_point, end_point, graph):
    """Longest weighted path between two nodes of a directed acyclic graph.

    Parameters
    ----------
    start_point, end_point : int or str
        Node labels; they are compared as strings with the labels in ``graph``.
    graph : str
        Edge list in the ``from->to:weight`` format read by ``GraphString2Dict``.

    Returns
    -------
    (length, path)
        ``length`` is the total weight of a longest path and ``path`` is that
        path written as ``'a->b->c'``.

    Raises
    ------
    ValueError
        If ``end_point`` cannot be reached from ``start_point``, or if a cycle
        prevents its score from being finalised.

    Nodes are scored in a topological order of the part of the graph that is
    reachable from the start, so each edge is relaxed exactly once and node
    labels are not limited to any fixed range.
    """
    start_point, end_point = str(start_point), str(end_point)
    graph, graph_weight = GraphString2Dict(graph)

    # Restrict the work to nodes that can actually be reached from the start.
    reachable = {start_point}
    pending = [start_point]
    while pending:
        node = pending.pop()
        for successor in graph.get(node, []):
            if successor not in reachable:
                reachable.add(successor)
                pending.append(successor)
    if end_point not in reachable:
        raise ValueError('node %s is not reachable from node %s' % (end_point, start_point))

    # In-degrees counted inside the reachable subgraph only.
    indegree = dict.fromkeys(reachable, 0)
    for node in reachable:
        for successor in graph.get(node, []):
            indegree[successor] += 1

    score = {start_point: 0}
    track_back = {}
    finalised = set()
    # In an acyclic graph the start is the only reachable node without a
    # reachable predecessor.
    ready = [start_point] if indegree[start_point] == 0 else []
    while ready:
        node = ready.pop()
        finalised.add(node)
        for successor in graph.get(node, []):
            candidate = score[node] + graph_weight[node + '->' + successor]
            if successor not in score or candidate > score[successor]:
                score[successor] = candidate
                track_back[successor] = node
            indegree[successor] -= 1
            if indegree[successor] == 0:
                ready.append(successor)

    if end_point not in finalised:
        raise ValueError('graph contains a cycle on the way to node %s' % end_point)

    # Walk the predecessor links back from the end to recover the path.
    longest_path = [end_point]
    while longest_path[-1] != start_point:
        longest_path.append(track_back[longest_path[-1]])

    return (score[end_point], '->'.join(reversed(longest_path)))

# Test
start_point = 0
end_point   = 4
graph = '''0->1:7
0->2:4
2->3:2
1->4:1
3->4:3'''

Longest_Path(start_point, end_point, graph)

(9, '0->2->3->4')

# Week 2: From Finding a Longest Path to Aligning DNA Strings

In [6]:
'''
Code Challenge: Solve the Global Alignment Problem.
Input: Two protein strings written in the single-letter amino acid alphabet.
Output: The maximum alignment score of these strings followed by an alignment achieving this maximum score. Use the BLOSUM62 scoring matrix for matches and mismatches as well as the indel penalty σ = 5.
'''

BLOSUM62 = pd.read_csv('data/BLOSUM62.csv')
BLOSUM62 = BLOSUM62.set_index('X')

def BackTrack_ScoreMatrix_Global(v, w, score_matrix, indel_pentalty):
    """Fill the global-alignment score table and its backtracking pointers.

    ``score_matrix`` is looked up as ``score_matrix.at[v_symbol, w_symbol]``
    and ``indel_pentalty`` is subtracted for every gap.  The first row and
    column are charged cumulative gap penalties.

    Returns ``(score, backtrack)``: the value in the bottom-right cell and a
    string array whose interior cells hold 'deletion', 'insertion' or
    'diagonal' (ties resolved in that order).  Border cells stay ''.
    """
    rows, cols = len(v) + 1, len(w) + 1
    scores = np.full((rows, cols), 0, 'int')
    pointers = np.zeros((rows, cols), '<U32')

    for row in range(1, rows):
        scores[row, 0] = scores[row - 1, 0] - indel_pentalty
    for col in range(1, cols):
        scores[0, col] = scores[0, col - 1] - indel_pentalty

    for row in range(1, rows):
        for col in range(1, cols):
            from_above = scores[row - 1, col] - indel_pentalty
            from_left = scores[row, col - 1] - indel_pentalty
            from_corner = scores[row - 1, col - 1] + score_matrix.at[v[row - 1], w[col - 1]]

            scores[row, col] = max(from_above, from_left, from_corner)
            cell = scores[row, col]

            if cell == from_above:
                pointers[row, col] = 'deletion'
            elif cell == from_left:
                pointers[row, col] = 'insertion'
            elif cell == from_corner:
                pointers[row, col] = 'diagonal'

    return (scores[len(v), len(w)], pointers)

def AppendAlignment(backtrack, v, w, i, j, aligned_v, aligned_w):
    """Walk ``backtrack`` from cell (i, j) and append the alignment columns.

    Parameters
    ----------
    backtrack : 2-D table indexable as ``backtrack[i, j]``
        Holds 'deletion', 'insertion', 'jump' or any other value (treated as a
        diagonal step).
    v, w : sequences
        The two aligned strings.
    i, j : int
        Cell to start from.
    aligned_v, aligned_w : list
        Output lists; one symbol (or '-') per alignment column is appended to
        each, in left-to-right order.

    Behaviour
    ---------
    * A 'jump' cell ends the walk without emitting a column.
    * Reaching the origin (0, 0) ends the walk.
    * Reaching the first row or column anywhere else emits exactly one more
      column for that border cell and then ends the walk.

    The walk is iterative, so long alignments do not hit the recursion limit.
    """
    columns = []  # collected back to front, reversed before appending

    while True:
        if i == 0 or j == 0:
            if not (i == 0 and j == 0):
                # Symbols available at a border cell; the missing side is a gap.
                v_symbol = v[i - 1] if i != 0 else '-'
                w_symbol = w[j - 1] if j != 0 else '-'
                step = backtrack[i, j]
                if step == 'deletion':
                    columns.append((v_symbol, '-'))
                elif step == 'insertion':
                    columns.append(('-', w_symbol))
                else:
                    columns.append((v_symbol, w_symbol))
            break

        step = backtrack[i, j]
        if step == 'jump':
            break
        if step == 'deletion':
            columns.append((v[i - 1], '-'))
            i -= 1
        elif step == 'insertion':
            columns.append(('-', w[j - 1]))
            j -= 1
        else:
            columns.append((v[i - 1], w[j - 1]))
            i -= 1
            j -= 1

    for v_symbol, w_symbol in reversed(columns):
        aligned_v.append(v_symbol)
        aligned_w.append(w_symbol)

def Score_Alignment_Global(v, w, score_matrix, indel_pentalty):
    """Globally align ``v`` and ``w`` and return ``(score, aligned_v, aligned_w)``.

    The score table and pointers come from ``BackTrack_ScoreMatrix_Global``.
    The traceback is done here, iteratively, all the way to the origin: once
    the first row or column is reached the remaining prefix of the other
    string is emitted against gaps, so both aligned strings always spell the
    complete inputs.
    """
    score, backtrack = BackTrack_ScoreMatrix_Global(v, w, score_matrix, indel_pentalty)

    i, j = len(v), len(w)
    columns_v = []
    columns_w = []

    while i > 0 or j > 0:
        if j == 0:
            step = 'deletion'      # only symbols of v are left
        elif i == 0:
            step = 'insertion'     # only symbols of w are left
        else:
            step = backtrack[i, j]

        if step == 'deletion':
            columns_v.append(v[i - 1])
            columns_w.append('-')
            i -= 1
        elif step == 'insertion':
            columns_v.append('-')
            columns_w.append(w[j - 1])
            j -= 1
        else:
            columns_v.append(v[i - 1])
            columns_w.append(w[j - 1])
            i -= 1
            j -= 1

    # Columns were collected from the end of the alignment backwards.
    aligned_v = ''.join(reversed(columns_v))
    aligned_w = ''.join(reversed(columns_w))

    return (score, aligned_v, aligned_w)

# Test
v = 'PLEASANTLY'
w = 'MEANLY'
score_matrix = BLOSUM62
indel_pentalty = 5

Score_Alignment_Global(v, w, score_matrix, indel_pentalty)

(8, 'PLEASANTLY', '-MEA--N-LY')

In [7]:
'''
Code Challenge: Solve the Local Alignment Problem.
Input: Two protein strings written in the single-letter amino acid alphabet.
Output: The maximum score of a local alignment of the strings, followed by a local alignment of these strings achieving the maximum
     score. Use the PAM250 scoring matrix for matches and mismatches as well as the indel penalty σ = 5.
'''

PAM250 = pd.read_csv('data/PAM250.csv')
PAM250 = PAM250.set_index('X')

def BackTrack_ScoreMatrix_Local(v, w, score_matrix, indel_pentalty=5):
    """Fill the local-alignment score table and its backtracking pointers.

    Parameters
    ----------
    v, w : str
        Strings to align.
    score_matrix : table looked up as ``score_matrix.at[v_symbol, w_symbol]``
    indel_pentalty : number, optional
        Penalty subtracted per gap symbol (default 5).

    Returns
    -------
    (score, backtrack, i, j)
        ``score`` is the largest cell value, ``(i, j)`` the first cell (in
        row-major order) holding it, and ``backtrack`` a string array whose
        interior cells hold 'jump' (score 0, alignment starts here),
        'deletion', 'insertion' or 'diagonal'.

    The first row and column are left at 0: a local alignment may start at any
    cell for free, so no gap penalty is charged along the borders.
    """
    rows, cols = len(v) + 1, len(w) + 1
    matrix = np.full((rows, cols), 0, 'int')
    backtrack = np.zeros((rows, cols), '<U32')

    for i in range(1, rows):
        for j in range(1, cols):
            deletion = matrix[i - 1, j] - indel_pentalty
            insertion = matrix[i, j - 1] - indel_pentalty
            diagonal = matrix[i - 1, j - 1] + score_matrix.at[v[i - 1], w[j - 1]]

            matrix[i, j] = max(0, deletion, insertion, diagonal)

            if matrix[i, j] == 0:
                backtrack[i, j] = 'jump'
            elif matrix[i, j] == deletion:
                backtrack[i, j] = 'deletion'
            elif matrix[i, j] == insertion:
                backtrack[i, j] = 'insertion'
            else:
                backtrack[i, j] = 'diagonal'

    # argmax returns the first maximum, so tied maxima no longer break the
    # conversion to a single (row, column) pair.
    i_start, j_start = np.unravel_index(np.argmax(matrix), matrix.shape)

    return (int(matrix[i_start, j_start]), backtrack, int(i_start), int(j_start))


def Score_Alignment_Local(v, w, score_matrix, indel_pentalty=5):
    """Locally align ``v`` and ``w`` and return ``(score, aligned_v, aligned_w)``.

    The traceback starts at the best-scoring cell and stops at the first
    'jump' cell or as soon as the first row or column is reached; nothing is
    emitted for the cell where it stops.
    """
    score, backtrack, i, j = BackTrack_ScoreMatrix_Local(v, w, score_matrix, indel_pentalty)

    columns_v = []
    columns_w = []

    while i > 0 and j > 0:
        step = backtrack[i, j]
        if step == 'jump':
            break
        if step == 'deletion':
            columns_v.append(v[i - 1])
            columns_w.append('-')
            i -= 1
        elif step == 'insertion':
            columns_v.append('-')
            columns_w.append(w[j - 1])
            j -= 1
        else:
            columns_v.append(v[i - 1])
            columns_w.append(w[j - 1])
            i -= 1
            j -= 1

    # Columns were collected from the end of the alignment backwards.
    aligned_v = ''.join(reversed(columns_v))
    aligned_w = ''.join(reversed(columns_w))

    return (score, aligned_v, aligned_w)

# Test
v = 'MEANLY'
w = 'PENALTY'
score_matrix = PAM250

Score_Alignment_Local(v, w, score_matrix)

(15, 'EANL-Y', 'ENALTY')

In [8]:
'''
Edit Distance Problem: Find the edit distance between two strings.
Input: Two strings.
Output: The edit distance between these strings.
'''

def HammingDistance(string1, string2):
    """Count the positions at which the two strings differ.

    Positions are paired with ``zip``, so any surplus tail of the longer
    string is not compared.
    """
    return sum(1 for base1, base2 in zip(string1, string2) if base1 != base2)

def EditDistance(v, w):
    """Return the edit (Levenshtein) distance between ``v`` and ``w``.

    The distance is the minimum number of single-symbol insertions, deletions
    and substitutions that turn one string into the other.  It is computed
    directly with a two-row dynamic programme, so it works for any alphabet
    and needs no scoring matrix.
    """
    # Keep the shorter string along the row to minimise memory.
    if len(w) > len(v):
        v, w = w, v

    previous = list(range(len(w) + 1))  # distance from '' to each prefix of w
    for i, v_symbol in enumerate(v, start=1):
        current = [i]  # distance from v[:i] to ''
        for j, w_symbol in enumerate(w, start=1):
            substitution = previous[j - 1] + (0 if v_symbol == w_symbol else 1)
            current.append(min(previous[j] + 1,      # delete v_symbol
                               current[j - 1] + 1,   # insert w_symbol
                               substitution))
        previous = current

    return previous[len(w)]

# Test
v = 'PLEASANTLY'
w = 'MEANLY'

EditDistance(v, w)

5

In [9]:
'''
Code Challenge: Solve the Fitting Alignment Problem.
Input: Two nucleotide strings v and w, where v has length at most 1000 and w has length at most 100.
Output: A highest-scoring fitting alignment between v and w. Use the simple scoring method in which matches count +1 and both the
     mismatch and indel penalties are 1.
'''

def BackTrack_ScoreMatrix_Fitting(v, w, score_matrix, indel_pentalty):
    """Fill the score table and pointers for fitting ``w`` into ``v``.

    Column 0 is left at 0 (the alignment may start anywhere in ``v`` for
    free) while row 0 is charged cumulative gap penalties, because every
    symbol of ``w`` has to take part in the alignment.

    Returns ``(matrix, backtrack)``.  ``backtrack`` holds 'deletion',
    'insertion' or 'diagonal' for interior cells (ties resolved in that
    order) and 'insertion' along row 0.
    """
    rows, cols = len(v) + 1, len(w) + 1
    matrix = np.full((rows, cols), 0, 'int')
    backtrack = np.zeros((rows, cols), '<U32')

    for j in range(1, cols):
        matrix[0, j] = matrix[0, j - 1] - indel_pentalty
        backtrack[0, j] = 'insertion'

    for i in range(1, rows):
        for j in range(1, cols):
            deletion = matrix[i - 1, j] - indel_pentalty
            insertion = matrix[i, j - 1] - indel_pentalty
            diagonal = matrix[i - 1, j - 1] + score_matrix.at[v[i - 1], w[j - 1]]

            matrix[i, j] = max(deletion, insertion, diagonal)

            if matrix[i, j] == deletion:
                backtrack[i, j] = 'deletion'
            elif matrix[i, j] == insertion:
                backtrack[i, j] = 'insertion'
            else:
                backtrack[i, j] = 'diagonal'

    return (matrix, backtrack)


def AppendAlignment_Fitting(backtrack, v, w, i, j, aligned_v, aligned_w):
    """Append the fitting-alignment columns ending at cell (i, j).

    The walk follows ``backtrack`` until column 0 is reached, i.e. until all
    of ``w[:j]`` has been consumed.  If row 0 is reached first, the remaining
    symbols of ``w`` are emitted against gaps.  Columns are appended to
    ``aligned_v`` / ``aligned_w`` in left-to-right order.
    """
    columns = []  # collected back to front

    while j > 0:
        step = 'insertion' if i == 0 else backtrack[i, j]
        if step == 'deletion':
            columns.append((v[i - 1], '-'))
            i -= 1
        elif step == 'insertion':
            columns.append(('-', w[j - 1]))
            j -= 1
        else:
            columns.append((v[i - 1], w[j - 1]))
            i -= 1
            j -= 1

    for v_symbol, w_symbol in reversed(columns):
        aligned_v.append(v_symbol)
        aligned_w.append(w_symbol)


def Score_Alignment_Fitting(v, w, score_matrix, indel_pentalty):
    """Best fitting alignment of ``w`` against a substring of ``v``.

    Returns ``(score, aligned_v, aligned_w)``.  The alignment ends in the
    last column of the table at the first row holding the column maximum
    (every row is a candidate), and the reported score is the value of that
    same cell.
    """
    matrix, backtrack = BackTrack_ScoreMatrix_Fitting(v, w, score_matrix, indel_pentalty)

    j = len(w)
    i = int(np.argmax(matrix[:, j]))
    score = int(matrix[i, j])

    aligned_v = []
    aligned_w = []

    AppendAlignment_Fitting(backtrack, v, w, i, j, aligned_v, aligned_w)

    aligned_v = ''.join(aligned_v)
    aligned_w = ''.join(aligned_w)

    return (score, aligned_v, aligned_w)

# Test
score_matrix         = (pd.DataFrame(np.identity(20)) * 2) - 1
score_matrix.columns = PAM250.columns
score_matrix.index   = PAM250.index
v = 'GTAGGCTTAAGGTTA'
w = 'TAGATA'
indel_pentalty = 1

Score_Alignment_Fitting(v, w, score_matrix, indel_pentalty)

(2, 'TAGGCTTA', 'TAGA-T-A')

In [10]:
'''
Code Challenge: Solve the Overlap Alignment Problem.
Input: Two strings v and w, each of length at most 1000.
Output: The score of an optimal overlap alignment of v and w, followed by an alignment of a suffix v' of v and a prefix w' of w
     achieving this maximum score. Use an alignment score in which matches count +1 and both the mismatch and indel penalties are 2.
'''

def BackTrack_ScoreMatrix_Overlap(v, w, score_matrix, indel_pentalty):
    """Fill the score table and pointers for an overlap alignment.

    An overlap alignment aligns a suffix of ``v`` with a prefix of ``w``.
    Column 0 is therefore left at 0 (the suffix may start anywhere in ``v``)
    while row 0 is charged cumulative gap penalties (the prefix always starts
    at the first symbol of ``w``).

    Returns ``(matrix, backtrack)``.  ``backtrack`` holds 'deletion',
    'insertion' or 'diagonal' for interior cells (ties resolved in that
    order) and 'insertion' along row 0.
    """
    rows, cols = len(v) + 1, len(w) + 1
    matrix = np.full((rows, cols), 0, 'int')
    backtrack = np.zeros((rows, cols), '<U32')

    for j in range(1, cols):
        matrix[0, j] = matrix[0, j - 1] - indel_pentalty
        backtrack[0, j] = 'insertion'

    for i in range(1, rows):
        for j in range(1, cols):
            deletion = matrix[i - 1, j] - indel_pentalty
            insertion = matrix[i, j - 1] - indel_pentalty
            diagonal = matrix[i - 1, j - 1] + score_matrix.at[v[i - 1], w[j - 1]]

            matrix[i, j] = max(deletion, insertion, diagonal)

            if matrix[i, j] == deletion:
                backtrack[i, j] = 'deletion'
            elif matrix[i, j] == insertion:
                backtrack[i, j] = 'insertion'
            else:
                backtrack[i, j] = 'diagonal'

    return (matrix, backtrack)


def Score_Alignment_Overlap(v, w, score_matrix, indel_pentalty):
    """Best overlap alignment of a suffix of ``v`` with a prefix of ``w``.

    Returns ``(score, aligned_v, aligned_w)``.  The alignment ends in the last
    row of the table at the right-most column holding the row maximum; the
    traceback runs until column 0 is reached.  An empty overlap (score 0,
    two empty strings) is returned when nothing scores better.
    """
    matrix, backtrack = BackTrack_ScoreMatrix_Overlap(v, w, score_matrix, indel_pentalty)

    i = len(v)
    last_row = matrix[i, :]
    j = int(np.where(last_row == np.max(last_row))[0][-1])
    score = int(matrix[i, j])

    columns = []  # collected back to front
    while j > 0:
        # On row 0 only symbols of w are left; they are aligned against gaps.
        step = 'insertion' if i == 0 else backtrack[i, j]
        if step == 'deletion':
            columns.append((v[i - 1], '-'))
            i -= 1
        elif step == 'insertion':
            columns.append(('-', w[j - 1]))
            j -= 1
        else:
            columns.append((v[i - 1], w[j - 1]))
            i -= 1
            j -= 1

    aligned_v = ''.join(column[0] for column in reversed(columns))
    aligned_w = ''.join(column[1] for column in reversed(columns))

    return (score, aligned_v, aligned_w)

# Test
score_matrix         = (pd.DataFrame(np.identity(20)) * 3) - 2
score_matrix.columns = PAM250.columns
score_matrix.index   = PAM250.index

v = 'PAWHEAE'
w = 'HEAGAWGHEE'
indel_pentalty = 2

Score_Alignment_Overlap(v, w, score_matrix, indel_pentalty)

(1, 'HEAE', 'HEAG')

# Week 3: Advanced Topics in Sequence Alignment

In [11]:
'''
Code Challenge: Solve the Alignment with Affine Gap Penalties Problem.
Input: Two amino acid strings v and w (each of length at most 100).
Output: The maximum alignment score between v and w, followed by an alignment of v and w achieving this maximum score. Use the
     BLOSUM62 scoring matrix, a gap opening penalty of 11, and a gap extension penalty of 1.
'''

def Global_Alignment_with_Affine_Gap(v, w, score_matrix, opening_penalty, extension_penalty):
    """Global alignment of ``v`` and ``w`` with affine gap penalties.

    Parameters
    ----------
    v, w : str
        Strings to align (either may be empty).
    score_matrix : table looked up as ``score_matrix.at[v_symbol, w_symbol]``
    opening_penalty : number
        Cost of the first symbol of a gap.
    extension_penalty : number
        Cost of every further symbol of the same gap.

    Returns
    -------
    (score, aligned_v, aligned_w)
        ``score`` is converted with ``int``; both aligned strings spell the
        complete inputs with '-' marking gaps.

    Three tables are kept: ``lower`` (paths ending in a gap in ``w``),
    ``upper`` (paths ending in a gap in ``v``) and ``middle`` (best of the
    two and of a match/mismatch step).
    """
    rows, cols = len(v) + 1, len(w) + 1

    matrix_upper = np.full((rows, cols), 0, 'float')
    matrix_middle = np.full((rows, cols), 0, 'float')
    matrix_lower = np.full((rows, cols), 0, 'float')

    backtrack_upper = np.full((rows, cols), 0, '<U32')
    backtrack_middle = np.full((rows, cols), 0, '<U32')
    backtrack_lower = np.full((rows, cols), 0, '<U32')

    # A gap in v cannot end in column 0 and a gap in w cannot end in row 0.
    matrix_upper[:, 0] = -1 * float('Inf')
    matrix_lower[0, :] = -1 * float('Inf')

    # Borders are a single gap: opening cost once, extension cost afterwards.
    for i in range(1, rows):
        step_cost = opening_penalty if i == 1 else extension_penalty
        matrix_middle[i, 0] = matrix_middle[i - 1, 0] - step_cost
        matrix_lower[i, 0] = matrix_middle[i, 0]
    for j in range(1, cols):
        step_cost = opening_penalty if j == 1 else extension_penalty
        matrix_middle[0, j] = matrix_middle[0, j - 1] - step_cost
        matrix_upper[0, j] = matrix_middle[0, j]

    # Fill the three tables.
    for i in range(1, rows):
        for j in range(1, cols):
            matrix_lower[i, j] = max(matrix_lower[i - 1, j] - extension_penalty,
                                     matrix_middle[i - 1, j] - opening_penalty)

            matrix_upper[i, j] = max(matrix_upper[i, j - 1] - extension_penalty,
                                     matrix_middle[i, j - 1] - opening_penalty)

            matrix_middle[i, j] = max(matrix_lower[i, j],
                                      matrix_middle[i - 1, j - 1] + score_matrix.at[v[i - 1], w[j - 1]],
                                      matrix_upper[i, j])

            if matrix_lower[i, j] == (matrix_lower[i - 1, j] - extension_penalty):
                backtrack_lower[i, j] = 'down'
            else:
                backtrack_lower[i, j] = 'lower_2_middle'

            if matrix_upper[i, j] == (matrix_upper[i, j - 1] - extension_penalty):
                backtrack_upper[i, j] = 'right'
            else:
                backtrack_upper[i, j] = 'upper_2_middle'

            if matrix_middle[i, j] == matrix_lower[i, j]:
                backtrack_middle[i, j] = 'lower'
            elif matrix_middle[i, j] == matrix_upper[i, j]:
                backtrack_middle[i, j] = 'upper'
            else:
                backtrack_middle[i, j] = 'diagonal'

    i = len(v)
    j = len(w)
    score = max(matrix_lower[i, j],
                matrix_middle[i, j],
                matrix_upper[i, j])

    if score == matrix_lower[i, j]:
        current_matrix = backtrack_lower
    elif score == matrix_upper[i, j]:
        current_matrix = backtrack_upper
    else:
        current_matrix = backtrack_middle

    # Traceback; columns are collected from the end of the alignment backwards.
    aligned_v = []
    aligned_w = []

    while i > 0 and j > 0:
        step = current_matrix[i, j]

        if step == 'lower':
            # Zero-cost switch of table; no column is emitted.
            current_matrix = backtrack_lower
        elif step == 'upper':
            current_matrix = backtrack_upper
        elif step in ('lower_2_middle', 'down'):
            aligned_v.append(v[i - 1])
            aligned_w.append('-')
            i = i - 1
            if step == 'lower_2_middle':
                current_matrix = backtrack_middle
        elif step in ('upper_2_middle', 'right'):
            aligned_v.append('-')
            aligned_w.append(w[j - 1])
            j = j - 1
            if step == 'upper_2_middle':
                current_matrix = backtrack_middle
        else:
            aligned_v.append(v[i - 1])
            aligned_w.append(w[j - 1])
            i = i - 1
            j = j - 1

    # Once a border is reached the rest of the other string is one leading gap.
    while i > 0:
        aligned_v.append(v[i - 1])
        aligned_w.append('-')
        i = i - 1
    while j > 0:
        aligned_v.append('-')
        aligned_w.append(w[j - 1])
        j = j - 1

    aligned_v = ''.join(aligned_v[::-1])
    aligned_w = ''.join(aligned_w[::-1])

    return (int(score), aligned_v, aligned_w)

# Test
v = 'PRTEINS'
w = 'PRTWPSEIN'
score_matrix      = BLOSUM62
opening_penalty   = 11
extension_penalty = 1

Global_Alignment_with_Affine_Gap(v, w, score_matrix, opening_penalty, extension_penalty)

(8, 'PRT---EINS', 'PRTWPSEIN-')

In [12]:
'''
Code Challenge: Solve the Middle Edge in Linear Space Problem (for protein strings).
Input: Two amino acid strings.
Output: A middle edge in the alignment graph in the form "(i, j) (k, l)", where (i, j) connects to (k, l). To compute scores, use the
     BLOSUM62 scoring matrix and a (linear) indel penalty equal to 5.
'''

def _advance_column(previous, v, w_char, score_matrix, indel_pentalty):
    """Scores of the next alignment-graph column, given the ``previous`` one."""
    column = [previous[0] - indel_pentalty]
    for i in range(1, len(v) + 1):
        column.append(max(column[i - 1] - indel_pentalty,
                          previous[i] - indel_pentalty,
                          previous[i - 1] + score_matrix.at[v[i - 1], w_char]))
    return column


def _last_column(v, w, score_matrix, indel_pentalty):
    """Best global-alignment score from (0, 0) to every node (i, len(w))."""
    column = [-indel_pentalty * i for i in range(len(v) + 1)]
    for w_char in w:
        column = _advance_column(column, v, w_char, score_matrix, indel_pentalty)
    return column


def Middle_Edge(v, w, score_matrix, indel_pentalty):
    """Find a middle edge of the global-alignment graph of ``v`` and ``w``.

    The middle column is ``len(w) // 2``.  The middle node is the first row
    ``i`` of that column maximising (best score from the source to the node)
    + (best score from the node to the sink); the middle edge is an edge
    leaving it that lies on such a best path.  Candidates are tried in the
    order diagonal, right, bottom.

    Prints ``[i, j] [k, l]`` (the edge's end points) and returns
    ``(direction, i)`` with ``direction`` one of 'diagonal', 'right', 'bottom'.

    Only single columns of scores are kept at any time.

    Raises
    ------
    ValueError
        If both strings are empty (the graph has no edge).
    """
    n, m = len(v), len(w)
    if n == 0 and m == 0:
        raise ValueError('the alignment graph of two empty strings has no edges')

    middle_col = m // 2

    # Scores from the source to every node of the middle column.
    from_source = _last_column(v, w[:middle_col], score_matrix, indel_pentalty)

    # Scores to the sink are source scores of the reversed strings.  The
    # column right of the middle is computed first, the middle column is one
    # more step from it.
    v_reversed = v[::-1]
    reversed_next = _last_column(v_reversed, w[middle_col + 1:][::-1], score_matrix, indel_pentalty)
    if m > 0:
        reversed_mid = _advance_column(reversed_next, v_reversed, w[middle_col],
                                       score_matrix, indel_pentalty)
    else:
        # w is empty: the only column is both the middle and the sink column.
        reversed_mid = reversed_next
    to_sink = reversed_mid[::-1]
    to_sink_next = reversed_next[::-1]

    # max() keeps the first of several equally good rows.
    i = max(range(n + 1), key=lambda row: from_source[row] + to_sink[row])
    j = middle_col

    # (score of continuing through the edge, direction, end row, end column)
    candidates = []
    if m > 0 and i < n:
        candidates.append((score_matrix.at[v[i], w[j]] + to_sink_next[i + 1],
                           'diagonal', i + 1, j + 1))
    if m > 0:
        candidates.append((to_sink_next[i] - indel_pentalty, 'right', i, j + 1))
    if i < n:
        candidates.append((to_sink[i + 1] - indel_pentalty, 'bottom', i + 1, j))

    _, direction, k, l = max(candidates, key=lambda candidate: candidate[0])

    print([i, j], [k, l])
    return (direction, i)

# Test
v = 'PLEASANTLY'
w = 'MEASNLY'
score_matrix   = BLOSUM62
indel_pentalty = 5

Middle_Edge(v, w, score_matrix, indel_pentalty)

[4, 3] [5, 4]


('diagonal', 4)

In [13]:
'''
Code Challenge: Solve the Multiple Longest Common Subsequence Problem.
Input: Three DNA strings of length at most 10.
Output: The length of a longest common subsequence of these three strings, followed by a multiple alignment of the three strings
     corresponding to such an alignment.
'''


def Three_String_Global_Alignment(s1, s2, s3 ,match_score, unmatch_score):
    """Align three strings, rewarding columns in which all three symbols agree.

    A column scores ``match_score`` when it consumes one symbol of each
    string and the three symbols are equal; every other column scores
    ``unmatch_score``.  The first planes of the score cube are left at 0.

    Returns ``(score, aligned_s1, aligned_s2, aligned_s3)`` where ``score`` is
    the value of the last cell converted with ``int``.
    """
    # Predecessor offsets (di, dj, dk) in the order the candidates are ranked;
    # the index of a move is what gets stored in the backtracking cube.
    moves = [(1, 0, 0), (0, 1, 0), (0, 0, 1),
             (1, 1, 0), (0, 1, 1), (1, 0, 1),
             (1, 1, 1)]

    n1, n2, n3 = len(s1), len(s2), len(s3)
    matrix = np.full((n1 + 1, n2 + 1, n3 + 1), 0, 'float')
    backtrack = np.zeros((n1 + 1, n2 + 1, n3 + 1), dtype='int')

    # Fill the score cube.
    for i in range(1, n1 + 1):
        for j in range(1, n2 + 1):
            for k in range(1, n3 + 1):
                all_equal = s1[i - 1] == s2[j - 1] == s3[k - 1]
                corner_score = match_score if all_equal else unmatch_score

                scores = [matrix[i - di, j - dj, k - dk] + unmatch_score
                          for di, dj, dk in moves[:-1]]
                scores.append(matrix[i - 1, j - 1, k - 1] + corner_score)

                best = max(scores)
                matrix[i, j, k] = best
                backtrack[i, j, k] = scores.index(best)

    # Trace back; columns are collected from the end of the alignment backwards.
    columns = []
    i, j, k = n1, n2, n3

    while (i * j * k) != 0:
        di, dj, dk = moves[backtrack[i, j, k]]
        columns.append((s1[i - 1] if di else '-',
                        s2[j - 1] if dj else '-',
                        s3[k - 1] if dk else '-'))
        i, j, k = i - di, j - dj, k - dk

    # One index hit zero: consume what is left of each string side by side.
    while (i + j + k) != 0:
        column = []
        if i > 0:
            column.append(s1[i - 1])
            i -= 1
        else:
            column.append('-')
        if j > 0:
            column.append(s2[j - 1])
            j -= 1
        else:
            column.append('-')
        if k > 0:
            column.append(s3[k - 1])
            k -= 1
        else:
            column.append('-')
        columns.append(tuple(column))

    columns.reverse()
    aligned_s1 = ''.join(column[0] for column in columns)
    aligned_s2 = ''.join(column[1] for column in columns)
    aligned_s3 = ''.join(column[2] for column in columns)

    output_score = int(matrix[n1, n2, n3])

    return (output_score, aligned_s1, aligned_s2, aligned_s3)

# Test
s1 = 'ATATCCG'
s2 = 'TCCGA'
s3 = 'ATGTACTG'
match_score   = 1
unmatch_score = 0

Three_String_Global_Alignment(s1, s2, s3 ,match_score, unmatch_score)

(3, 'AT---ATC--CG-', '-T-----C-C-GA', 'ATGTA--CT--G-')

# Week 4: Genome Rearrangements and Fragility

In [14]:
'''
Code Challenge: Implement GreedySorting.
Input: A permutation P.
Output: The sequence of permutations corresponding to applying GreedySorting to P, ending with the identity permutation.
'''

def reverse(p_list):
    """Return the signed reversal of ``p_list``.

    The result is a new list with the elements in reverse order and every
    sign flipped.  The argument itself is left unchanged.
    """
    return [element * -1 for element in reversed(p_list)]

def print_p(p_list):
    """Print a signed permutation as ``(+1 -2 ...)``.

    The first element of ``p_list`` is skipped; every remaining element is
    written with an explicit sign and the elements are separated by blanks.
    """
    as_text = [str(element) for element in p_list][1:]
    signed = [text if text[0] == '-' else '+' + text for text in as_text]
    print('(' + ' '.join(signed) + ')')

def GreedySorting(p):
    """Sort a signed permutation greedily by reversals, printing every state.

    ``p`` is a blank-separated string such as ``'-3 +4 +1 +5 -2'`` or an
    iterable of values accepted by ``int``.  The current permutation is
    printed (via ``print_p``) before each step and once more at the end; the
    number of steps performed is returned.
    """
    if type(p) is str:
        p = p.split(' ')
    # A leading 0 makes list positions coincide with the 1-based element labels.
    p = [0] + [int(token) for token in p]

    approxReversalDistance = 0

    for k in range(1, len(p)):
        while p[k] != k:
            print_p(p)
            approxReversalDistance += 1

            if p[k] == -k:
                # Element is in place but has the wrong sign.
                p[k] = p[k] * -1
                continue

            # Bring element k (with either sign) to position k by reversing
            # the segment that starts at k and ends where it currently sits.
            k_index = p.index(-k) if (-k) in p else p.index(k)
            p = p[: k] + reverse(p[k : k_index + 1]) + p[k_index + 1 :]

    print_p(p)
    return approxReversalDistance
    
# Test
p = '-3 +4 +1 +5 -2'

GreedySorting(p)

(-3 +4 +1 +5 -2)
(-1 -4 +3 +5 -2)
(+1 -4 +3 +5 -2)
(+1 +2 -5 -3 +4)
(+1 +2 +3 +5 +4)
(+1 +2 +3 -4 -5)
(+1 +2 +3 +4 -5)
(+1 +2 +3 +4 +5)


7

In [15]:
'''
Number of Breakpoints Problem: Find the number of breakpoints in a permutation.
Input: A permutation.
Output: The number of breakpoints in this permutation.
'''

def Count_Breakpoint(permutation):
    """Count the breakpoints of a signed permutation.

    ``permutation`` is a blank-separated string such as ``'+3 +4 -1'`` or an
    iterable of values accepted by ``int``.  The permutation is framed by 0 in
    front and ``n + 1`` behind; every adjacent pair whose second element is
    not the first plus one counts as a breakpoint.
    """
    if type(permutation) is str:
        permutation = permutation.split(' ')
    elements = [int(token) for token in permutation]

    framed = [0] + elements + [len(elements) + 1]

    return sum(1 for left, right in zip(framed, framed[1:]) if left != right - 1)

# Test
permutation = '+3 +4 +5 -12 -8 -7 -6 +1 +2 +10 +9 -11 +13 +14'

Count_Breakpoint(permutation)

8

# Week 5: Applying Genome Rearrangement Analysis to Find Genome Fragility

In [16]:
'''
Code Challenge: Implement ChromosomeToCycle.
Input: A chromosome Chromosome containing n synteny blocks.
Output: The sequence Nodes of integers between 1 and 2n resulting from applying ChromosomeToCycle to Chromosome.
'''

def ChromosomeToCycle(chromosome):
    '''
    Expand every synteny block of a chromosome into its two node ids.

    chromosome is either a string of signed integers separated by single
    spaces (e.g. '+1 -2 -3 +4') or an iterable of values accepted by int().
    A block b > 0 contributes (2b - 1, 2b); any other block contributes
    (-2b, -2b - 1), i.e. the same two nodes in the opposite order.

    Returns the flat list of node ids.
    '''
    if type(chromosome) is str:
        chromosome = chromosome.split(' ')
    blocks = [int(token) for token in chromosome]

    cycle = []
    for block in blocks:
        upper = 2 * abs(block)
        # Forward blocks go low -> high, all others high -> low.
        pair = (upper - 1, upper) if block > 0 else (upper, upper - 1)
        cycle += pair

    return cycle

# Test
chromosome = '+1 -2 -3 +4'

ChromosomeToCycle(chromosome)

[1, 2, 4, 3, 6, 5, 7, 8]

In [17]:
'''
Code Challenge: Implement CycleToChromosome.
Input: A sequence Nodes of integers between 1 and 2n.
Output: The chromosome Chromosome containing n synteny blocks resulting from applying CycleToChromosome to Nodes.
'''

def CycleToChromosome(nodes):
    '''
    Collapse a sequence of node ids back into signed synteny blocks.

    nodes is either a string of integers separated by whitespace
    (e.g. '1 2 4 3 6 5 7 8') or an iterable of values accepted by int().
    Nodes are read in consecutive pairs (first, second): an ascending pair
    yields block +second/2, any other pair yields block -first/2.

    Returns the list of signed blocks as ints. An unpaired trailing node
    raises IndexError.
    '''
    if isinstance(nodes, str):
        nodes = nodes.split()
    node_ids = [int(token) for token in nodes]

    chromosome = []
    # Rounding the pair count up makes an unpaired trailing node fail loudly
    # (IndexError below) instead of being dropped silently.
    for pair_index in range((len(node_ids) + 1) // 2):
        first = node_ids[2 * pair_index]
        second = node_ids[2 * pair_index + 1]

        # Integer division keeps the result exact for arbitrarily large ids;
        # going through float division loses precision beyond 2**53.
        if first < second:
            chromosome.append(second // 2)
        else:
            chromosome.append(-(first // 2))

    return chromosome

# Test
nodes = '1 2 4 3 6 5 7 8'

CycleToChromosome(nodes)

[1, -2, -3, 4]

In [18]:
'''
Code Challenge: Implement ColoredEdges.
Input: A genome P.
Output: The collection of colored edges in the genome graph of P in the form (x, y).
'''

def ColoredEdges(p):
    '''
    List the colored edges of the genome graph of genome p.

    p is either a string such as '(+1 -2 -3)(+4 +5 -6)' or an iterable of
    chromosomes, each one a single-space-separated string or a sequence of
    signed integers. Every chromosome is treated as circular: consecutive
    blocks are joined by an edge and the last block is joined back to the first.

    Returns a list of [x, y] node pairs, chromosome after chromosome.
    '''
    if type(p) is str:
        # Drop the outermost parentheses, then cut between chromosomes.
        p = p[1:-1].split(')(')

    colored = []
    for chrom in p:
        if type(chrom) is str:
            chrom = [int(token) for token in chrom.split(' ')]

        cycle = ChromosomeToCycle(chrom)

        # Edges between neighbouring blocks: end of block j -> start of block j + 1.
        colored.extend([cycle[2 * j - 1], cycle[2 * j]] for j in range(1, len(chrom)))
        # Closing edge: end of the last block -> start of the first block.
        colored.append([cycle[-1], cycle[0]])

    return colored

# Test
p = '(+1 -2 -3)(+4 +5 -6)'

ColoredEdges(p)

[[2, 4], [3, 6], [5, 1], [8, 9], [10, 12], [11, 7]]

In [19]:
'''
Code Challenge: Implement GraphToGenome.
Input: The colored edges ColoredEdges of a genome graph.
Output: The genome P corresponding to this genome graph.
'''

def GraphToGenome(genome_graph):
    '''
    Rebuild a genome from the colored edges of its genome graph.

    genome_graph is either a string such as '(2, 4), (3, 6), (5, 1)' or an
    iterable of (x, y) node pairs. The edges of each chromosome must be listed
    consecutively, in cycle order, each oriented in the direction of travel.

    A chromosome is complete as soon as an edge ends on the node that shares a
    synteny block with the first node of that chromosome (nodes 2b - 1 and 2b
    belong to block b). Edges left over after the last complete chromosome are
    decoded as one final chromosome.

    Returns a list of chromosomes, each a list of signed blocks as produced by
    CycleToChromosome. Raises ValueError if the nodes do not pair up.
    '''
    if isinstance(genome_graph, str):
        # Treat every bracket and comma as a separator, whatever the spacing.
        for symbol in '(),':
            genome_graph = genome_graph.replace(symbol, ' ')
        flat_nodes = [int(token) for token in genome_graph.split()]
    else:
        flat_nodes = [node for edge in genome_graph for node in edge]

    if len(flat_nodes) % 2 != 0:
        raise ValueError('colored edges must be pairs of nodes')

    p = []
    cycle_nodes = []
    closing_node = None

    for first, second in zip(flat_nodes[0::2], flat_nodes[1::2]):
        if not cycle_nodes:
            # The cycle closes on the other end of the block it started in.
            # Comparing neighbouring values instead would merge chromosomes
            # whose boundary nodes happen to be adjacent, e.g. (-1 +3)(-2 +4).
            closing_node = first + 1 if first % 2 == 1 else first - 1

        cycle_nodes.extend([first, second])

        if second == closing_node:
            # Rotate right by one so each block's two nodes sit next to each other.
            p.append(CycleToChromosome(cycle_nodes[-1:] + cycle_nodes[:-1]))
            cycle_nodes = []

    if cycle_nodes:
        p.append(CycleToChromosome(cycle_nodes[-1:] + cycle_nodes[:-1]))

    return p

# Test
genome_graph = '(2, 4), (3, 6), (5, 1), (7, 9), (10, 12), (11, 8)'

GraphToGenome(genome_graph)

[[1, -2, -3], [-4, 5, -6]]

In [20]:
'''
Code Challenge: Solve the 2-Break Distance Problem.
Input: Genomes P and Q.
Output: The 2-break distance d(P, Q).
'''

def Two_Break_Distance(P, Q):
    '''
    Compute the 2-break distance between genomes P and Q.

    P and Q are passed unchanged to ColoredEdges, so they may be anything it
    accepts. Both genomes are expected to be built from the same synteny blocks.

    The distance is blocks(P, Q) - cycles(P, Q): the number of blocks (one
    colored edge of Q per block) minus the number of cycles of the breakpoint
    graph formed by the colored edges of P and Q together.

    Returns the distance as an int.
    '''
    red_edges = ColoredEdges(P)
    blue_edges = ColoredEdges(Q)

    # Adjacency of the breakpoint graph: every node touches one red and one
    # blue edge, so each connected component is exactly one alternating cycle.
    neighbours = {}
    for first, second in red_edges + blue_edges:
        neighbours.setdefault(first, []).append(second)
        neighbours.setdefault(second, []).append(first)

    n_cycle = 0
    visited = set()
    for start in neighbours:
        if start in visited:
            continue

        # Walk the whole component once; each edge is examined a constant
        # number of times instead of rescanning the edge list per node.
        n_cycle += 1
        visited.add(start)
        pending = [start]
        while pending:
            node = pending.pop()
            for other in neighbours[node]:
                if other not in visited:
                    visited.add(other)
                    pending.append(other)

    return len(blue_edges) - n_cycle

# Test
P = '(+1 +2 +3 +4 +5 +6)'
Q = '(+1 -3 -6 -5)(+2 -4)'

Two_Break_Distance(P, Q)

3

In [21]:
'''
Code Challenge: Implement 2-BreakOnGenomeGraph.
Input: The colored edges of a genome graph GenomeGraph, followed by indices i1, i2, i3, and i4.
Output: The colored edges of the genome graph resulting from applying the 2-break operation
    2-BreakOnGenomeGraph(GenomeGraph, i1, i2, i3, i4).
'''

def Two_BreakOnGenomeGraph(genome_graph, i1, i2, i3, i4):
    '''
    Apply one 2-break to the colored edges of a genome graph.

    genome_graph is either a string such as '(2, 4), (3, 8), (7, 5), (6, 1)'
    or a list of two-node edges. Every edge containing both i1 and i2, or both
    i3 and i4, is dropped; the edges [i1, i3] and [i2, i4] are appended.

    Returns a new list of edges; the list passed in is left untouched.
    '''
    if type(genome_graph) is str:
        pieces = genome_graph[1:-1].split('), (')
        genome_graph = [list(map(int, piece.split(','))) for piece in pieces]

    def is_broken(edge):
        # True for the two edges that this 2-break cuts.
        return (i1 in edge and i2 in edge) or (i3 in edge and i4 in edge)

    updated = copy.deepcopy(genome_graph)
    for edge in genome_graph:
        if is_broken(edge):
            updated.remove(edge)

    updated += [[i1, i3], [i2, i4]]
    return updated

# Test
genome_graph = '(2, 4), (3, 8), (7, 5), (6, 1)'
i1 , i2 , i3 , i4 = 1, 6, 3, 8

Two_BreakOnGenomeGraph(genome_graph, i1 , i2 , i3 , i4)

[[2, 4], [7, 5], [1, 3], [6, 8]]

In [22]:
'''
2-Break Sorting Problem: Find a shortest transformation of one genome into another by 2-breaks.
Input: Two genomes with circular chromosomes on the same set of synteny blocks.
Output: The sequence of genomes resulting from applying a shortest sequence of 2-breaks
    transforming one genome into the other.
'''

def Sort_Graphs(edges):
    '''
    Reorder (and, where needed, flip) colored edges so that the edges of each
    cycle are listed consecutively, each one continuing from the previous one.

    Works on a deep copy of edges, so the argument itself is not modified.
    Node ids are treated as block ends: for an even node x its partner is
    x - 1, for an odd node x its partner is x + 1.

    Returns one flat list of edges, cycle after cycle, with no separator
    between cycles.

    NOTE(review): if a chain can neither be closed nor extended (no remaining
    edge contains the partner of its last node), the inner `while True` loop
    never exits.
    '''
    tmp_edges = copy.deepcopy(edges)
    master_output_edges = []
     
    # Each pass of this loop extracts one chain, seeded with the first edge still unused.
    while len(tmp_edges) != 0:
        output_edges = [copy.deepcopy(tmp_edges[0])]
        tmp_edges.pop(0)
        
        while True:
            # The chain is closed once its last node is the partner of its
            # first node: first - 1 when the first node is even ...
            if (output_edges[0][0]) % 2 == 0:
                if (output_edges[0][0] - output_edges[-1][1]) == 1:
                    break
            # ... and first + 1 when the first node is odd.
            if (output_edges[0][0]) % 2 == 1:
                if (output_edges[0][0] - output_edges[-1][1]) == -1:
                    break
            
            # Scan the unused edges for one containing the partner of the
            # chain's current last node. tmp_edges is modified while it is
            # being iterated, so the element right after a removed one is
            # skipped in this pass; the enclosing loop scans again.
            for edge in tmp_edges:

                # Re-read on every iteration: an edge appended earlier in this
                # same pass changes the node the chain has to continue from.
                last_number  = output_edges[-1][1]
                if (last_number % 2) == 0:
                    if (last_number -1) in edge:
                        # Orient the edge so that the partner node comes first.
                        if last_number - 1 == edge[0]:
                            output_edges.append(edge)
                        else:
                            output_edges.append(edge[::-1])
                        tmp_edges.remove(edge)
                
                else:
                    if ( last_number + 1) in edge:
                        # Orient the edge so that the partner node comes first.
                        if last_number + 1 == edge[0]:
                            output_edges.append(edge)
                        else:
                            output_edges.append(edge[::-1])
                        tmp_edges.remove(edge)

        master_output_edges.extend(output_edges)
    
    return(master_output_edges)

def print_genome(P):
    '''
    Print a genome on one line in the notation (+1 -2 -3)(+4 +5 -6).

    P is an iterable of chromosomes, each an iterable of signed integers.
    Positive blocks get an explicit '+'; all other values are printed as they
    are. An empty chromosome is printed as '()'. P is not modified.
    '''
    rendered = ''
    for chromosome in P:
        blocks = ' '.join(
            '+' + str(block) if block > 0 else str(block)
            for block in chromosome
        )
        # Joining (rather than trimming a trailing space) keeps the opening
        # parenthesis intact when the chromosome has no blocks.
        rendered += '(' + blocks + ')'

    print(rendered)
    

def ShortestRearrangementScenario(P, Q):
    '''
    Print a shortest sequence of genomes transforming P into Q by 2-breaks.

    P and Q are passed to ColoredEdges, so they may be anything it accepts;
    both are expected to be built from the same synteny blocks. The starting
    genome is printed first, then the genome obtained after every 2-break.

    Each step picks the first colored edge of Q that is still missing from the
    current genome and applies the 2-break that creates it. The loop ends when
    every edge of Q is present, which is exactly when the 2-break distance is
    zero. Returns None.
    '''
    # Show the starting genome in the same notation as all later steps.
    if isinstance(P, str):
        print(P)
    else:
        print_genome(P)

    red_edges = ColoredEdges(P)
    blue_edges = ColoredEdges(Q)

    while True:
        # First blue edge that is not yet a red edge (in either orientation).
        target = next(
            (edge for edge in blue_edges
             if edge not in red_edges and edge[::-1] not in red_edges),
            None,
        )
        if target is None:
            break

        # Node -> node at the other end of its red edge. A node without a red
        # edge raises KeyError below rather than silently reusing a stale value.
        red_partner = {}
        for first, second in red_edges:
            red_partner[first] = second
            red_partner[second] = first

        i1, i3 = target[0], target[1]
        i2, i4 = red_partner[i1], red_partner[i3]

        # Replace red edges (i1, i2) and (i3, i4) by (i1, i3) and (i2, i4),
        # then put the edges back in cycle order so they can be decoded.
        red_edges = Two_BreakOnGenomeGraph(red_edges, i1, i2, i3, i4)
        red_edges = Sort_Graphs(red_edges)
        print_genome(GraphToGenome(red_edges))

# Test 
P = '(+1 -2 -3 +4)'
Q = '(+1 +2 -4 -3)'

ShortestRearrangementScenario(P, Q)

(+1 -2 -3 +4)
(-3 +4 +1 +2)
(-3 +4 -2 -1)
(+4 -2 -1 +3)


In [23]:
'''
Shared k-mers Problem: Given two strings, find all their shared k-mers.
Input: An integer k and two strings.
Output: All k-mers shared by these strings, in the form of ordered pairs (x, y) corresponding to starting positions
     of these k-mers in the respective strings.
'''

def ReverseComplement(DNA_string):
    '''
    Return the reverse complement of a DNA string.

    Only the upper-case bases A, C, G and T are known; any other character
    raises KeyError.
    '''
    pairing = dict(zip('ATCG', 'TAGC'))

    # Complement left to right first, then flip the whole strand.
    complemented = [pairing[base] for base in DNA_string]

    return ''.join(reversed(complemented))

def _index_kmers(text, k):
    '''Map every k-mer of text to the ascending list of its start positions.'''
    positions = {}
    for start in range(len(text) - k + 1):
        positions.setdefault(text[start : start + k], []).append(start)
    return positions


def Shared_Kmers(string1, string2, k):
    '''
    Find all k-mers shared by two DNA strings.

    A k-mer of string1 starting at x is shared with string2 at y when the
    k-mer of string2 starting at y equals it or equals its reverse complement.

    Returns a list of [x, y] pairs. Pairs are grouped by k-mer in order of
    first appearance in string1; within a group, direct matches come before
    reverse-complement matches. Each pair is reported once, even for a k-mer
    that is its own reverse complement.
    '''
    kmers_position_1 = _index_kmers(string1, k)
    kmers_position_2 = _index_kmers(string2, k)
    shared_positions = []

    for pattern, positions in kmers_position_1.items():
        reverse_pattern = ReverseComplement(pattern)

        # A k-mer such as 'AT' or 'ACGT' equals its own reverse complement;
        # looking it up twice would report every pair twice.
        if reverse_pattern == pattern:
            candidates = [pattern]
        else:
            candidates = [pattern, reverse_pattern]

        for candidate in candidates:
            for position_1 in positions:
                for position_2 in kmers_position_2.get(candidate, ()):
                    shared_positions.append([position_1, position_2])

    return shared_positions

# Test
k = 3
string1 = 'AAACTCATC'
string2 = 'TTTCAAATC'

Shared_Kmers(string1, string2, k)

[[0, 4], [0, 0], [4, 2], [6, 6]]