In [1]:
!pip install numpy pandas



In [2]:
import numpy as np
import pandas as pd

![SWPairHMMLeftSoftclip](figures/phmm_left_softclip.png)

In [3]:
# Transition probabilities (each one is used by the transition matrix `a`)

eta = 0.2      # B/RX1 -> a and a/RY1 -> b; B/RX1 -> RX1 and a/RY1 -> RY1 get 1 - eta
delta = 0.2    # b/M -> X and b/M -> Y
epsilon = 0.4  # X -> X and Y -> Y
tau = 0.1      # b/M/X/Y -> E

In [13]:
# transition matrix
#
# a.loc[src, dst] is the probability of moving from state `src` (row label)
# to state `dst` (column label).  E has no outgoing transitions, so its row
# is all zeros.

a = pd.DataFrame(
    data = np.array([
        # B    RX1      a    RY1      b    M                    X        Y        E
        [ 0.0, 1 - eta, eta, 0.0,     0.0, 0.0,                 0.0,     0.0,     0.0 ], # B
        [ 0.0, 1 - eta, eta, 0.0,     0.0, 0.0,                 0.0,     0.0,     0.0 ], # RX1
        [ 0.0, 0.0,     0.0, 1 - eta, eta, 0.0,                 0.0,     0.0,     0.0 ], # a
        [ 0.0, 0.0,     0.0, 1 - eta, eta, 0.0,                 0.0,     0.0,     0.0 ], # RY1
        [ 0.0, 0.0,     0.0, 0.0,     0.0, 1 - (2*delta) - tau, delta,   delta,   tau ], # b
        [ 0.0, 0.0,     0.0, 0.0,     0.0, 1 - (2*delta) - tau, delta,   delta,   tau ], # M
        [ 0.0, 0.0,     0.0, 0.0,     0.0, 1 - epsilon - tau,   epsilon, 0.0,     tau ], # X
        [ 0.0, 0.0,     0.0, 0.0,     0.0, 1 - epsilon - tau,   0.0,     epsilon, tau ], # Y
        [ 0.0, 0.0,     0.0, 0.0,     0.0, 0.0,                 0.0,     0.0,     0.0 ]  # E
    ]),
    index   = ['B', 'RX1', 'a', 'RY1', 'b', 'M', 'X', 'Y', 'E'],
    columns = ['B', 'RX1', 'a', 'RY1', 'b', 'M', 'X', 'Y', 'E']
)

a

Unnamed: 0,B,RX1,a,RY1,b,M,X,Y,E
B,0.0,0.8,0.2,0.0,0.0,0.0,0.0,0.0,0.0
RX1,0.0,0.8,0.2,0.0,0.0,0.0,0.0,0.0,0.0
a,0.0,0.0,0.0,0.8,0.2,0.0,0.0,0.0,0.0
RY1,0.0,0.0,0.0,0.8,0.2,0.0,0.0,0.0,0.0
b,0.0,0.0,0.0,0.0,0.0,0.5,0.2,0.2,0.1
M,0.0,0.0,0.0,0.0,0.0,0.5,0.2,0.2,0.1
X,0.0,0.0,0.0,0.0,0.0,0.5,0.4,0.0,0.1
Y,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.4,0.1
E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# emissions (match state)
#
# eM.loc[x, y] is the value the match state uses for the symbol pair (x, y);
# rows and columns are both labelled A, C, G, T.

eM = pd.DataFrame(
    [
        # A     C     G     T
        [ 0.50, 0.05, 0.15, 0.30 ], # A
        [ 0.05, 0.50, 0.30, 0.15 ], # C
        [ 0.15, 0.30, 0.50, 0.05 ], # G
        [ 0.30, 0.15, 0.05, 0.50 ]  # T
    ],
    index   = ['A', 'C', 'G', 'T'],
    columns = ['A', 'C', 'G', 'T']
)

eM

Unnamed: 0,A,C,G,T
A,0.5,0.05,0.15,0.3
C,0.05,0.5,0.3,0.15
G,0.15,0.3,0.5,0.05
T,0.3,0.15,0.05,0.5


In [15]:
# emissions (insert / delete states)
#
# Every nucleotide gets the same value, 0.25.

eXY = dict.fromkeys('ACGT', 0.25)

eXY

{'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

In [20]:
def max_vRX1(S1, S2, i, j, v_B, v_RX1, v_a, v_RY1, v_b, v_M, v_X, v_Y, tb):
    """Fill cell (i, j) of the RX1 Viterbi matrix and of its traceback matrix.

    The candidate predecessors are B and RX1, both read at cell (i, j - 1).
    The best candidate, multiplied by ``eXY[S2[0]]``, is stored in
    ``v_RX1[i][j]``.  Its slot in the eight-entry candidate vector (slots
    ordered B, RX1, a, RY1, b, M, X, Y) is stored in ``tb[i][j]``.

    Reads the module-level tables ``a`` (transitions) and ``eXY``
    (emissions).  S1, v_a, v_RY1, v_b, v_M, v_X and v_Y are accepted but not
    used.  Returns None; ``v_RX1`` and ``tb`` are updated in place.
    """
    # NOTE(review): the emission is always looked up for the first symbol of
    # S2, not for the symbol at the current position - confirm this is intended.
    emission = eXY[S2[0]]

    candidates = np.zeros(8)
    candidates[0] = a.loc['B', 'RX1'] * v_B[i][j - 1]
    candidates[1] = a.loc['RX1', 'RX1'] * v_RX1[i][j - 1]

    best = np.argmax(candidates)

    v_RX1[i][j] = emission * candidates[best]
    tb[i][j] = best
    
def max_va(S1, S2, i, j, v_B, v_RX1, v_a, v_RY1, v_b, v_M, v_X, v_Y, tb):
    """Fill cell (i, j) of the `a`-state Viterbi matrix and of its traceback matrix.

    The candidate predecessors are B and RX1, both read at cell (i, j - 1).
    The best candidate, multiplied by ``eXY[S2[0]]``, is stored in
    ``v_a[i][j]``.  Its slot in the eight-entry candidate vector (slots
    ordered B, RX1, a, RY1, b, M, X, Y) is stored in ``tb[i][j]``.

    Reads the module-level tables ``a`` (transitions) and ``eXY``
    (emissions).  S1, v_RY1, v_b, v_M, v_X and v_Y are accepted but not used.
    Returns None; ``v_a`` and ``tb`` are updated in place.
    """
    # NOTE(review): the emission is always looked up for the first symbol of
    # S2, not for the symbol at the current position - confirm this is intended.
    emission = eXY[S2[0]]

    candidates = np.zeros(8)
    candidates[0] = a.loc['B', 'a'] * v_B[i][j - 1]
    candidates[1] = a.loc['RX1', 'a'] * v_RX1[i][j - 1]

    best = np.argmax(candidates)

    v_a[i][j] = emission * candidates[best]
    tb[i][j] = best
    
def max_vRY1(S1, S2, i, j, v_B, v_RX1, v_a, v_RY1, v_b, v_M, v_X, v_Y, tb):
    """Fill cell (i, j) of the RY1 Viterbi matrix and of its traceback matrix.

    The candidate predecessors are `a` and RY1, both read at cell (i - 1, j).
    The best candidate, multiplied by ``eXY[S2[0]]``, is stored in
    ``v_RY1[i][j]``.  Its slot in the eight-entry candidate vector (slots
    ordered B, RX1, a, RY1, b, M, X, Y) is stored in ``tb[i][j]``.

    Reads the module-level tables ``a`` (transitions) and ``eXY``
    (emissions).  S1, v_B, v_RX1, v_b, v_M, v_X and v_Y are accepted but not
    used.  Returns None; ``v_RY1`` and ``tb`` are updated in place.
    """
    # NOTE(review): the predecessors are one step back along S1 (row i - 1),
    # yet the emission is looked up for the first symbol of S2 - confirm which
    # sequence and position this state is meant to emit.
    emission = eXY[S2[0]]

    candidates = np.zeros(8)
    candidates[2] = a.loc['a', 'RY1'] * v_a[i - 1][j]
    candidates[3] = a.loc['RY1', 'RY1'] * v_RY1[i - 1][j]

    best = np.argmax(candidates)

    v_RY1[i][j] = emission * candidates[best]
    tb[i][j] = best

def max_vb(S1, S2, i, j, v_B, v_RX1, v_a, v_RY1, v_b, v_M, v_X, v_Y, tb):
    """Fill cell (i, j) of the `b`-state Viterbi matrix and of its traceback matrix.

    The candidate predecessors are `a` and RY1, both read at cell (i, j - 1).
    The best candidate, multiplied by ``eXY[S2[0]]``, is stored in
    ``v_b[i][j]``.  Its slot in the eight-entry candidate vector (slots
    ordered B, RX1, a, RY1, b, M, X, Y) is stored in ``tb[i][j]``.

    Reads the module-level tables ``a`` (transitions) and ``eXY``
    (emissions).  S1, v_B, v_RX1, v_M, v_X and v_Y are accepted but not used.
    Returns None; ``v_b`` and ``tb`` are updated in place.
    """
    # NOTE(review): the emission is always looked up for the first symbol of
    # S2, not for the symbol at the current position - confirm this is intended.
    emission = eXY[S2[0]]

    candidates = np.zeros(8)
    candidates[2] = a.loc['a', 'b'] * v_a[i][j - 1]
    candidates[3] = a.loc['RY1', 'b'] * v_RY1[i][j - 1]

    best = np.argmax(candidates)

    v_b[i][j] = emission * candidates[best]
    tb[i][j] = best

def max_vM(S1, S2, i, j, v_B, v_RX1, v_a, v_RY1, v_b, v_M, v_X, v_Y, tb):
    """Fill cell (i, j) of the match-state Viterbi matrix and of its traceback matrix.

    The candidate predecessors are b, M, X and Y, all read at the diagonal
    cell (i - 1, j - 1).  The best candidate, multiplied by the pair emission
    ``eM.loc[S1[i - 1], S2[j - 1]]``, is stored in ``v_M[i][j]``.  Its slot
    in the eight-entry candidate vector (slots ordered B, RX1, a, RY1, b, M,
    X, Y) is stored in ``tb[i][j]``.

    Reads the module-level tables ``a`` (transitions) and ``eM`` (match
    emissions).  v_B, v_RX1, v_a and v_RY1 are accepted but not used.
    Returns None; ``v_M`` and ``tb`` are updated in place.
    """
    pair_emission = eM.loc[S1[i - 1], S2[j - 1]]

    candidates = np.zeros(8)
    candidates[4] = a.loc['b', 'M'] * v_b[i - 1][j - 1]
    candidates[5] = a.loc['M', 'M'] * v_M[i - 1][j - 1]
    candidates[6] = a.loc['X', 'M'] * v_X[i - 1][j - 1]
    candidates[7] = a.loc['Y', 'M'] * v_Y[i - 1][j - 1]

    best = np.argmax(candidates)

    v_M[i][j] = pair_emission * candidates[best]
    tb[i][j] = best

def max_vX(S1, S2, i, j, v_B, v_RX1, v_a, v_RY1, v_b, v_M, v_X, v_Y, tb):
    """Fill cell (i, j) of the X-state Viterbi matrix and of its traceback matrix.

    The candidate predecessors are b, M and X, all read at cell (i - 1, j),
    so the step into (i, j) consumes ``S1[i - 1]``.  The best candidate,
    multiplied by ``eXY[S1[i - 1]]``, is stored in ``v_X[i][j]``.  Its slot
    in the eight-entry candidate vector (slots ordered B, RX1, a, RY1, b, M,
    X, Y) is stored in ``tb[i][j]``.

    Reads the module-level tables ``a`` (transitions) and ``eXY``
    (emissions).  S2, v_B, v_RX1, v_a, v_RY1 and v_Y are accepted but not
    used.  Returns None; ``v_X`` and ``tb`` are updated in place.

    ``i`` is expected to be >= 1; with ``i == 0`` the ``i - 1`` index would
    wrap around to the last row / last symbol.
    """
    # Emit the symbol consumed by this step (previously always S1[0]).
    emission = eXY[S1[i - 1]]

    candidates = np.zeros(8)
    candidates[4] = a.loc['b', 'X'] * v_b[i - 1][j]
    candidates[5] = a.loc['M', 'X'] * v_M[i - 1][j]
    candidates[6] = a.loc['X', 'X'] * v_X[i - 1][j]

    best = np.argmax(candidates)

    v_X[i][j] = emission * candidates[best]
    tb[i][j] = best

def max_vY(S1, S2, i, j, v_B, v_RX1, v_a, v_RY1, v_b, v_M, v_X, v_Y, tb):
    """Fill cell (i, j) of the Y-state Viterbi matrix and of its traceback matrix.

    The candidate predecessors are b, M and Y, all read at cell (i, j - 1),
    so the step into (i, j) consumes ``S2[j - 1]``.  The best candidate,
    multiplied by ``eXY[S2[j - 1]]``, is stored in ``v_Y[i][j]``.  Its slot
    in the eight-entry candidate vector (slots ordered B, RX1, a, RY1, b, M,
    X, Y) is stored in ``tb[i][j]``.

    Reads the module-level tables ``a`` (transitions) and ``eXY``
    (emissions).  S1, v_B, v_RX1, v_a, v_RY1 and v_X are accepted but not
    used.  Returns None; ``v_Y`` and ``tb`` are updated in place.

    ``j`` is expected to be >= 1; with ``j == 0`` the ``j - 1`` index would
    wrap around to the last column / last symbol.
    """
    # Emit the symbol consumed by this step (previously always S2[0]).
    emission = eXY[S2[j - 1]]

    candidates = np.zeros(8)
    candidates[4] = a.loc['b', 'Y'] * v_b[i][j - 1]
    candidates[5] = a.loc['M', 'Y'] * v_M[i][j - 1]
    candidates[7] = a.loc['Y', 'Y'] * v_Y[i][j - 1]

    best = np.argmax(candidates)

    v_Y[i][j] = emission * candidates[best]
    tb[i][j] = best

def traceback(S1, S2, v_B, v_RX1, v_a, v_RY1, v_b, v_M, v_X, v_Y, tb_B, tb_RX1, tb_a, tb_RY1, tb_b, tb_M, tb_X, tb_Y):
    """Walk the traceback matrices from the bottom-right cell and build the alignment.

    The walk starts at cell (len(S1), len(S2)) in the state whose Viterbi
    value there is largest (ties go to the first state in the order B, RX1,
    a, RY1, b, M, X, Y) and stops as soon as either index reaches 0.  At each
    step one alignment column is produced, the next state is read from the
    current state's traceback matrix, and i and/or j are decremented.

    Returns a tuple ``(a1, a2)`` of equal-length strings: the aligned S1 row
    and the aligned S2 row.  ``'-'`` and ``'~'`` are the gap characters
    written for states X/Y and a/RY1 respectively.  Both strings are
    left-padded with spaces to at least ``max(len(S1), len(S2))`` characters.

    Raises ValueError if a traceback matrix yields a state code outside 0..7.

    The columns are collected in a growing list.  A path that mixes X and Y
    steps can have more columns than ``max(len(S1), len(S2))``; a fixed-size
    buffer of that length would be indexed with a negative position and
    silently overwrite its own tail.
    """
    i = len(S1)
    j = len(S2)

    state = np.argmax([
        v_B[i][j], v_RX1[i][j], v_a[i][j], v_RY1[i][j],
        v_b[i][j], v_M[i][j], v_X[i][j], v_Y[i][j]
    ])

    # state code -> (traceback matrix, consumes S1?, consumes S2?, gap character)
    # NOTE(review): for codes 0-4 the step directions below do not match the
    # predecessor cells read by the corresponding max_v* fill functions (e.g.
    # max_vRX1 reads (i, j - 1) while code 1 steps diagonally) - confirm.
    moves = {
        0: (tb_B,   True,  True,  None),
        1: (tb_RX1, True,  True,  None),
        2: (tb_a,   True,  False, '~'),
        3: (tb_RY1, False, True,  '~'),
        4: (tb_b,   True,  True,  None),
        5: (tb_M,   True,  True,  None),
        6: (tb_X,   True,  False, '-'),
        7: (tb_Y,   False, True,  '-'),
    }

    row1 = []  # alignment columns for S1, last column first
    row2 = []  # alignment columns for S2, last column first

    while i > 0 and j > 0:
        if state not in moves:
            raise ValueError('unknown traceback state: {!r}'.format(state))
        tb, uses_s1, uses_s2, gap = moves[int(state)]

        row1.append(S1[i - 1] if uses_s1 else gap)
        row2.append(S2[j - 1] if uses_s2 else gap)

        # Read the predecessor state before moving to the predecessor cell.
        state = tb[i][j]
        if uses_s1:
            i -= 1
        if uses_s2:
            j -= 1

    width = max(len(S1), len(S2))
    a1 = ''.join(reversed(row1)).rjust(width)
    a2 = ''.join(reversed(row2)).rjust(width)

    return a1, a2

In [33]:
# Initialization

## Specify sequences

A = "GTCATGTTAGTTTACG"
B =        "TAGTTAACG"

## Viterbi matrices: one (len(A)+1) x (len(B)+1) grid of zeros per state

vB, vRX1, vas, vRY1, vbs, vM, vX, vY = (
    np.zeros([len(A)+1, len(B)+1]) for _ in range(8)
)

# The only non-zero starting value: state B at cell (0, 0)
vB[0,0] = 1

## Traceback matrices: same shape, one per state

tB, tRX1, tas, tRY1, tbs, tM, tX, tY = (
    np.zeros([len(A)+1, len(B)+1]) for _ in range(8)
)

In [34]:
# Populate Viterbi matrices
#
# Each call fills cell (i, j) of one state's Viterbi matrix and records the
# chosen predecessor in that state's own traceback matrix (last argument).
#
# NOTE(review): the loops start at 1 and only vB[0, 0] is initialised to a
# non-zero value, so the B terms read by max_vRX1 / max_va are always 0 here
# and every matrix appears to stay all-zero - confirm the boundary handling.

for i in range(1, len(A)+1):
    for j in range(1, len(B)+1):
        max_vRX1(A, B, i, j, vB, vRX1, vas, vRY1, vbs, vM, vX, vY, tRX1)
        max_va(A, B, i, j, vB, vRX1, vas, vRY1, vbs, vM, vX, vY, tas)
        max_vRY1(A, B, i, j, vB, vRX1, vas, vRY1, vbs, vM, vX, vY, tRY1)
        max_vb(A, B, i, j, vB, vRX1, vas, vRY1, vbs, vM, vX, vY, tbs)
        max_vM(A, B, i, j, vB, vRX1, vas, vRY1, vbs, vM, vX, vY, tM)
        max_vX(A, B, i, j, vB, vRX1, vas, vRY1, vbs, vM, vX, vY, tX)
        max_vY(A, B, i, j, vB, vRX1, vas, vRY1, vbs, vM, vX, vY, tY)

In [26]:
# Follow traceback matrices

a1, a2 = traceback(
    A, B,
    vB, vRX1, vas, vRY1, vbs, vM, vX, vY,  # Viterbi matrices
    tB, tRX1, tas, tRY1, tbs, tM, tX, tY   # traceback matrices
)

In [27]:
# Print alignment (one row per line so the columns line up)

print(a1, a2, sep='\n')

       TAGTTTACG
       TAGTTAACG


In [28]:
# Display the first input sequence
A

'GTCATGTTAGTTTACG'

In [29]:
# Display the second input sequence
B

'TAGTTAACG'

In [30]:
# Widen the print width so each matrix row fits on one line
np.set_printoptions(linewidth=200)

# Transposed so that rows follow B and columns follow A
print("vB:")
print(np.transpose(vB))

vB:
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
