# Imports

In [1]:
import numpy as np

# Variables

In [2]:
seq1 = "C A G T T G T T A C G"
seq2 = "T A C T G A C A T T T"
seq1 = seq1.split()
seq2 = seq2.split()

match = 1
mismatch = -2
gap = -1

# Init

In [3]:
def init(seq1, seq2):
    matrice = np.zeros((len(seq1), len(seq2)))
    n, m = matrice.shape
    for i in range(1, m):
        matrice[0][i] = -(-gap*i)
    for i in range(1, n):
        matrice[i][0] = --(gap*i)
    return matrice

# Needleman & Wunsch functions

In [4]:
# Renvoie la valeur max pour la matrice de distance
# Renvoie aussi la direction pour le backtrack
def find_max(a, b, c):
    if a > b:
        if a > c:
            return a, "diag"
        else:
            return c, "left"
    else:
        if b > c:
            return b, "uppe"
        else:
            return c, "left"

# Construit la matrice de distance de seq1 seq2
def get_distance_matrix(seq1, seq2, waterman):
    matrice = init(seq1, seq2)
    n, m = matrice.shape
    backtrack = np.empty((n, m), dtype=object)
    for i in range(1, n):
        for j in range(1, m):
            v = 0
            if seq1[i-1] == seq2[j-1]:
                v = match
            else:
                v = mismatch
            test1 = matrice[i-1][j-1] + v
            test2 = matrice[i-1][j]   + gap
            test3 = matrice[i]  [j-1] + gap
            if not waterman:
                matrice[i][j], _ = find_max(test1, test2, test3)
            else:
                matrice[i][j], _ = find_max_waterman(test1, test2, test3)
    return matrice
 
# Construit les 2 sequences ADN suivant la matrice de distance
def get_dna_strings(matrice, waterman):
    n, m = matrice.shape
    str1 = []
    str2 = []
    path = []
    
    # Choix des i j de départ
    i = 0
    j = 0
    if not waterman:
        i = n-1
        j = m-1
    if waterman: # On choisit ici de renvoyer un seul alignement local en partant d'un seul max
        max_v = -np.inf
        for i in range(n):
            for j in range(m):
                if matrice[i][j] > max_v:
                    max_v = matrice[i][j]
                    max_i = i
                    max_j = j
        i = max_i
        j = max_j
    print("---> Position de départ : (",i,",",j,")")
        
    # Backtrack
    while not (i==0 and j==0):
            test1 = matrice[i-1][j-1]
            test2 = matrice[i-1][j]
            test3 = matrice[i]  [j-1]
            _, direction = find_max(test1, test2, test3) # Check la direction pour backtrack
            path.append(direction)
            if direction == "diag":
                str1 += seq1[i-1]
                str2 += seq2[j-1]
                i -= 1                 # decrementation en diagonal
                j -= 1
            elif direction == "left":
                str1 += "_"
                str2 += seq2[j-1]
                j -= 1                 # decrementation des colonnes
            elif direction == "uppe":
                str1 += seq1[i-1]
                str2 += "_"
                i -= 1                 # decrementation des lignes
                
    return str1, str2, path

# Resultats

In [5]:
matrice = get_distance_matrix(seq1,seq2, waterman=False)
print(matrice)
str1, str2, path = get_dna_strings(matrice, waterman=False)
str1.reverse() # Reverse ici car on a parcouru le tableau en partant du bas gauche
str2.reverse()
print(path)
print(str1)
print(str2)

[[  0.  -1.  -2.  -3.  -4.  -5.  -6.  -7.  -8.  -9. -10.]
 [ -1.  -2.  -3.  -1.  -2.  -3.  -4.  -5.  -6.  -7.  -8.]
 [ -2.  -3.  -1.  -2.  -3.  -4.  -2.  -3.  -4.  -5.  -6.]
 [ -3.  -4.  -2.  -3.  -4.  -2.  -3.  -4.  -5.  -6.  -7.]
 [ -4.  -2.  -3.  -4.  -2.  -3.  -4.  -5.  -6.  -4.  -5.]
 [ -5.  -3.  -4.  -5.  -3.  -4.  -5.  -6.  -7.  -5.  -3.]
 [ -6.  -4.  -5.  -6.  -4.  -2.  -3.  -4.  -5.  -6.  -4.]
 [ -7.  -5.  -6.  -7.  -5.  -3.  -4.  -5.  -6.  -4.  -5.]
 [ -8.  -6.  -7.  -8.  -6.  -4.  -5.  -6.  -7.  -5.  -3.]
 [ -9.  -7.  -5.  -6.  -7.  -5.  -3.  -4.  -5.  -6.  -4.]
 [-10.  -8.  -6.  -4.  -5.  -6.  -4.  -2.  -3.  -4.  -5.]]
---> Position de départ : ( 10 , 10 )
['left', 'left', 'left', 'diag', 'diag', 'uppe', 'uppe', 'diag', 'uppe', 'diag', 'diag', 'diag', 'diag']
['C', 'A', 'G', 'T', 'T', 'G', 'T', 'T', 'A', 'C', '_', '_', '_']
['T', 'A', 'C', 'T', '_', 'G', '_', '_', 'A', 'C', 'A', 'T', 'T']


In [6]:
# Tester et robuste aux différentes valeurs de match, mismatch, gap ainsi qu'aux différentes tailles de sequences.

# Smith-Waterman

In [7]:
# Renvoie la valeur max pour la matrice de distance
# Renvoie aussi la direction pour le backtrack
def find_max_waterman(a, b, c):
    if a < 0 and b < 0 and c < 0:
        return 0, "zero"
    if a > b:
        if a > c:
            return a, "diag"
        else:
            return c, "left"
    else:
        if b > c:
            return b, "uppe"
        else:
            return c, "left"

In [8]:
matrice = get_distance_matrix(seq1,seq2, waterman=True)
print(matrice)
str1, str2, path = get_dna_strings(matrice, waterman=True)
str1.reverse() # Reverse ici car on a parcouru le tableau en partant du bas gauche
str2.reverse()
print(path)
print(str1)
print(str2)

[[  0.  -1.  -2.  -3.  -4.  -5.  -6.  -7.  -8.  -9. -10.]
 [ -1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [ -2.   0.   1.   0.   0.   0.   1.   0.   1.   0.   0.]
 [ -3.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.]
 [ -4.   0.   0.   0.   1.   0.   0.   0.   0.   1.   1.]
 [ -5.   0.   0.   0.   1.   0.   0.   0.   0.   1.   2.]
 [ -6.   0.   0.   0.   0.   2.   1.   0.   0.   0.   1.]
 [ -7.   0.   0.   0.   1.   1.   0.   0.   0.   1.   1.]
 [ -8.   0.   0.   0.   1.   0.   0.   0.   0.   1.   2.]
 [ -9.   0.   1.   0.   0.   0.   1.   0.   1.   0.   1.]
 [-10.   0.   0.   2.   1.   0.   0.   2.   1.   0.   0.]]
---> Position de départ : ( 5 , 10 )
['left', 'uppe', 'left', 'left', 'left', 'diag', 'left', 'left', 'diag', 'left', 'uppe', 'diag']
['C', 'A', '_', 'G', '_', '_', 'T', '_', '_', '_', 'T', '_']
['T', '_', 'A', 'C', 'T', 'G', 'A', 'C', 'A', 'T', '_', 'T']


# Utilisation de BLOSUM62

In [9]:
SEQ1 = "MGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNGQGWVPSNYITPVNSLEKHSWYHGPVSRNAAEYLLSSGINGSFLVRESESSPGQRSISLRYEGRVYHYRINTASDGKLYVSSESRFNTLAELVHHHSTVADGLITTLHYPAP"
SEQ2 = "GAMDPSEALQRPVASDFEPQGLSEAARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNGQGWVPSNYITPVNSLEKHSWYHGPVSRNAAEYLLSSGINGSFLVRESESSPGQRSISLRYEGRVYHYRINTASDGKLYVSSESRFNTLAELVHHHSTVADGLITTLHYPAPKRNKPTIYGVSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLKEDTMEVEEFLKEAAVMKEIKHPNLVQLLGVCTREPPFYIITEFMTYGNLLDYLRECNRQEVSAVVLLYMATQISSAMEYLEKKNFIHRNLAARNCLVGENHLVKVADFGLSRLMTGDTYTAHAGAKFPIKWTAPESLAYNKFSIKSDVWAFGVLLWEIATYGMSPYPGIDLSQVYELLEKDYRMERPEGCPEKVYELMRACWQWNPSDRPSFAEIHQAFETMFQESSISDEVEKELGKRGT"

f = open("BLOSUM62.txt", "r")
values = []
n_line = 0
n_col = 0

for idx, line in enumerate(f):
    if idx > 6:
        n_line += 1
        for elem in line.split(" ")[1:-1]:
            if elem != "":
                if idx == 7:
                    n_col += 1
f.close()

In [10]:
f = open("BLOSUM62.txt", "r")

seq_order = []
blosum62 = []

for idx, line in enumerate(f):
    if idx == 6:
        for elem in line.split(" ")[1:-1]:
            if elem != "":
                seq_order += elem
    if idx > 6:
        for elem in line.split(" ")[1:-1]:
            if elem != "":
                blosum62.append(elem)
    
blosum62 = np.array(blosum62).reshape(n_line, n_col)
print(blosum62.shape)
seq_order = np.array(seq_order)
print(seq_order)

(24, 24)
['A' 'R' 'N' 'D' 'C' 'Q' 'E' 'G' 'H' 'I' 'L' 'K' 'M' 'F' 'P' 'S' 'T' 'W'
 'Y' 'V' 'B' 'Z' 'X']


In [11]:
# Redifinition de la fonction pour alignement de sequence proteique avec blosum62
def get_distance_matrix(seq1, seq2, waterman):
    matrice = init(seq1, seq2)
    n, m = matrice.shape
    backtrack = np.empty((n, m), dtype=object)
    for i in range(1, n):
        for j in range(1, m):
            v = 0
            index = np.where(seq_order == seq1[i-1])[0]
            index2 = np.where(seq_order == seq2[i-1])[0]
            mismatch = int(blosum62[index, index2])
            if seq1[i-1] == seq2[j-1]:
                v = match
            else:
                v = mismatch
            test1 = matrice[i-1][j-1] + v
            test2 = matrice[i-1][j]   + gap
            test3 = matrice[i]  [j-1] + gap
            if not waterman:
                matrice[i][j], _ = find_max(test1, test2, test3)
            else:
                matrice[i][j], _ = find_max_waterman(test1, test2, test3)
    return matrice

# Redifinition de la fonction pour alignement de sequence proteique avec blosum62
def get_dna_strings(matrice, waterman):
    n, m = matrice.shape
    str1 = []
    str2 = []
    path = []
    
    # Choix des i j de départ
    i = 0
    j = 0
    if not waterman:
        i = n-1
        j = m-1
    if waterman: # On choisit ici de renvoyer un seul alignement local en partant d'un seul max
        max_v = -np.inf
        for i in range(n):
            for j in range(m):
                if matrice[i][j] > max_v:
                    max_v = matrice[i][j]
                    max_i = i
                    max_j = j
        i = max_i
        j = max_j
    print("---> Position de départ : (",i,",",j,")")
        
    # Backtrack
    while not (i==0 and j==0):
            test1 = matrice[i-1][j-1]
            test2 = matrice[i-1][j]
            test3 = matrice[i]  [j-1]
            _, direction = find_max(test1, test2, test3) # Check la direction pour backtrack
            path.append(direction)
            if direction == "diag":
                str1 += SEQ1[i-1]
                str2 += SEQ2[j-1]
                i -= 1                 # decrementation en diagonal
                j -= 1
            elif direction == "left":
                str1 += "_"
                str2 += SEQ2[j-1]
                j -= 1                 # decrementation des colonnes
            elif direction == "uppe":
                str1 += SEQ1[i-1]
                str2 += "_"
                i -= 1                 # decrementation des lignes
                
    return str1, str2, path

In [12]:
gap = -11
match = 1
# mismatch est definit suivant blosum dans la fonction get_distance_matrix()
matrice = get_distance_matrix(SEQ1,SEQ2, waterman=True).astype(int) # Reconversion en int
print(matrice)
str1, str2, path = get_dna_strings(matrice, waterman=True)
str1.reverse() # Reverse ici car on a parcouru le tableau en partant du bas gauche
str2.reverse()
print(path)
print(str1)
print(str2)

[[    0   -11   -22 ... -5412 -5423 -5434]
 [  -11     0     0 ...     0     0     0]
 [  -22     0     0 ...     0     0     1]
 ...
 [-1760     0     0 ...     0     0     0]
 [-1771     0     0 ...     0     0     0]
 [-1782     0     1 ...     0     0     0]]
---> Position de départ : ( 162 , 197 )
['diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag', 'diag',