### solve the global sequence alignment problem using needleman-wunsch algorithm

In [1]:
import numpy as np
equal_score = 1
unequal_score = -1
space_score = -2
# needleman-wunsch 算法可能出现负分的情况

In [7]:
def createScoreMatrix(list1, list2, debug=False):
    lenList1, lenList2 = len(list1), len(list2)
    #initialize matrix
    scoreMatrix = np.zeros((lenList1+1, lenList2+1), dtype=int)
    for i in range(1, lenList1+1):
        scoreMatrix[i][0] = i * space_score
    for j in range(1, lenList2+1):
        scoreMatrix[0][j] = j * space_score
    #populate the matrix
    for i, x in enumerate(list1):
        for j, y in enumerate(list2):
            if x == y:
                scoreMatrix[i+1][j+1] = scoreMatrix[i][j]+equal_score
            else:
                scoreMatrix[i+1][j+1] = max(scoreMatrix[i][j+1]+space_score, scoreMatrix[i+1][j]+space_score, scoreMatrix[i][j]+unequal_score)
    if debug:
        print("score Matrix:")
        print(scoreMatrix)
    return scoreMatrix

list1=[1, 2, 4, 6,7,8,0]
list2=[4,5,7,1,2,0]
print(createScoreMatrix(list1, list2))

[[  0  -2  -4  -6  -8 -10 -12]
 [ -2  -1  -3  -5  -5  -7  -9]
 [ -4  -3  -2  -4  -6  -4  -6]
 [ -6  -3  -4  -3  -5  -6  -5]
 [ -8  -5  -4  -5  -4  -6  -7]
 [-10  -7  -6  -3  -5  -5  -7]
 [-12  -9  -8  -5  -4  -6  -6]
 [-14 -11 -10  -7  -6  -5  -5]]


In [8]:
list1=list("GCCCTAGCG")
list2=list("GCGCAATG")
print(createScoreMatrix(list1, list2))

[[  0  -2  -4  -6  -8 -10 -12 -14 -16]
 [ -2   1  -1  -3  -5  -7  -9 -11 -13]
 [ -4  -1   2   0  -2  -4  -6  -8 -10]
 [ -6  -3   0   1   1  -1  -3  -5  -7]
 [ -8  -5  -2  -1   2   0  -2  -4  -6]
 [-10  -7  -4  -3   0   1  -1  -1  -3]
 [-12  -9  -6  -5  -2   1   2   0  -2]
 [-14 -11  -8  -5  -4  -1   0   1   1]
 [-16 -13 -10  -7  -4  -3  -2  -1   0]
 [-18 -15 -12  -9  -6  -5  -4  -3   0]]


In [25]:
def traceBack(list1, list2, scoreMatrix):
    '''
    Return:
         alignedList1, alignedList2, commonSub
    '''
    commonSub = []
    alignedList1 = []
    alignedList2 = []
    i, j = scoreMatrix.shape[0]-1, scoreMatrix.shape[1]-1
    if i == 0 or j == 0:
        return list1, list2, commonSub
    else:
        while i != 0 and j != 0:  #顺序是左上，上，左
            if list1[i-1] == list2[j-1]:
                commonSub.append(list1[i-1])
                alignedList1.append(list1[i-1])
                alignedList2.append(list2[j-1])
                i -= 1
                j -= 1
            elif scoreMatrix[i][j] == scoreMatrix[i-1][j-1] + unequal_score:
                alignedList1.append(list1[i-1])
                alignedList2.append(list2[j-1])
                i -= 1
                j -= 1
            elif scoreMatrix[i][j] == scoreMatrix[i-1][j] + space_score:
                alignedList1.append(list1[i-1])
                alignedList2.append('_')
                i -= 1
            else:#scoreMatrix[i][j] == scoreMatrix[i][j-1] + space_score:
                alignedList1.append('_')
                alignedList2.append(list2[j-1])
                j -= 1
    #己回滋到最左一行，或最上一列，但未到达0, 0 位置
    while i > 0:
        alignedList1.append(list1[i-1])
        i -= 1
    while j > 0:
        alignedList2.append(list2[j-1])
        j -= 1
    alignedList1.reverse()
    alignedList2.reverse()
    commonSub.reverse()
    return alignedList1, alignedList2, commonSub

list1=[1, 2, 4, 6,7,8,0]
list2=[4,5,7,1,2,0]
alignedList1, alignedList2, commonSub= traceBack(list1, list2, createScoreMatrix(list1, list2))
print(alignedList1)
print(alignedList2)
print(commonSub)

[1, 2, 4, 6, 7, '_', 8, 0]
[4, 5, 7, 1, 2, 0]
[4, 7, 0]


In [26]:
def needleman_wunsch(list1, list2, debug=False):
    return traceBack(list1, list2, createScoreMatrix(list1, list2, debug))

In [27]:
list1 = list("GCCCTAGCG")
list2 = list("GCGCAATG")
alignedList1, alignedList2, commonSub = needleman_wunsch(list1, list2, True)
print(alignedList1)
print(alignedList2)
print(commonSub)

score Matrix:
[[  0  -2  -4  -6  -8 -10 -12 -14 -16]
 [ -2   1  -1  -3  -5  -7  -9 -11 -13]
 [ -4  -1   2   0  -2  -4  -6  -8 -10]
 [ -6  -3   0   1   1  -1  -3  -5  -7]
 [ -8  -5  -2  -1   2   0  -2  -4  -6]
 [-10  -7  -4  -3   0   1  -1  -1  -3]
 [-12  -9  -6  -5  -2   1   2   0  -2]
 [-14 -11  -8  -5  -4  -1   0   1   1]
 [-16 -13 -10  -7  -4  -3  -2  -1   0]
 [-18 -15 -12  -9  -6  -5  -4  -3   0]]
['G', 'C', 'C', 'C', 'T', 'A', 'G', 'C', 'G']
['G', 'C', 'G', 'C', '_', 'A', 'A', 'T', 'G']
['G', 'C', 'C', 'A', 'G']


In [29]:
text1 = "this is a test for text alignment from xxxx"
text2 = "Hi, try A test for alignment , Heirish"
list1 = text1.lower().split(" ")
list2 = text2.lower().split(" ")
alignedList1, alignedList2, commonSub = needleman_wunsch(list1, list2)
print(alignedList1)
print(alignedList2)
print(commonSub)

['this', 'is', 'a', 'test', 'for', 'text', 'alignment', 'from', 'xxxx']
['hi,', 'try', 'a', 'test', 'for', '_', 'alignment', ',', 'heirish']
['a', 'test', 'for', 'alignment']
