In [1]:
# 1. (DNA)
# Pair of DNA sequences to compare; both strings use only the letters A, C, G and T.
# NOTE: the next cell assigns X and Y again, so when the notebook is run top to bottom
# these values are replaced before any edit distance is computed.
X = 'ACTACTAGATTACTTACGGATCAGGTACTTTAGAGGCTTGCAACCA'
Y = 'TACTAGCTTACTTACCCATCAGGTTTTAGAGATGGCAACCA'

In [2]:
# 2. (Proteins)
# Pair of protein sequences to compare (presumably one-letter amino-acid codes).
# NOTE: these assignments reuse the names X and Y from the DNA cell above, so whichever
# of the two cells ran last decides the inputs seen by the distance cells further down.
X = 'AASRPRSGVPAQSDSDPCQNLAATPIPSRPPSSQSCQKCRADARQGRWGP'
Y = 'SGAPGQRGEPGPQGHAGAPGPPGPPGSDG'

In [3]:
# Import libraries  
import numpy as np
import pandas as pd
from time import time

In [4]:
# Using matrix dynamic programming, define a function that, given string 1 and string 2, calculates the edit distance between them

def edit_distance(string1, string2):
    """Compute the unit-cost edit distance between two strings.

    A table with len(string1) + 1 rows and len(string2) + 1 columns is filled
    by dynamic programming: cell [r, c] receives the cheapest of a
    substitution (cost 1 on mismatch, 0 on match), a deletion (cost 1) and an
    insertion (cost 1), each taken from the neighbouring cell it extends.

    Side effects: prints a DataFrame holding the three candidate costs that
    were evaluated for every inner cell (visited row by row), then prints the
    completed table.

    Parameters
    ----------
    string1, string2 : indexable sequences (strings in this notebook)
        The two sequences to compare.

    Returns
    -------
    int
        The bottom-right cell of the table, converted to int.
    """
    rows = len(string1) + 1
    cols = len(string2) + 1

    # Float table of zeros; the first column counts 0..rows-1 and the first row 0..cols-1
    table = np.zeros((rows, cols))
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    # One (substitution, deletion, insertion) triple per inner cell, in visiting order
    candidates = []
    for r, char1 in enumerate(string1, start=1):
        for c, char2 in enumerate(string2, start=1):
            mismatch = 1 if char1 != char2 else 0
            via_substitution = table[r - 1, c - 1] + mismatch
            via_deletion = table[r - 1, c] + 1
            via_insertion = table[r, c - 1] + 1

            candidates.append((via_substitution, via_deletion, via_insertion))
            table[r, c] = min(via_substitution, via_deletion, via_insertion)

    # Assemble the per-cell candidates into a three-column frame
    costs = pd.DataFrame(candidates, columns=['substitution', 'deletion', 'insertion'])

    print(costs)  # candidate costs at each iteration
    print(table)  # the constructed matrix
    return int(table[rows - 1, cols - 1])

In [5]:
# Print the results
# Take a wall-clock timestamp, then compute and print the unit-cost edit distance of X and Y.
st = time()
print('Edit distance: ', edit_distance(X, Y))

# Print time
# The elapsed time is measured from `st`, so it includes the cost table and matrix that
# edit_distance prints itself as well as the print call above.
print('Time: %0.5f seconds' % (time() - st))

      substitution  deletion  insertion
0              1.0       2.0        2.0
1              2.0       3.0        2.0
2              2.0       4.0        3.0
3              4.0       5.0        3.0
4              5.0       6.0        4.0
5              6.0       7.0        5.0
6              7.0       8.0        6.0
7              8.0       9.0        7.0
8              9.0      10.0        8.0
9             10.0      11.0        9.0
10            11.0      12.0       10.0
11            12.0      13.0       11.0
12            13.0      14.0       12.0
13            14.0      15.0       13.0
14            15.0      16.0       14.0
15            15.0      17.0       15.0
16            17.0      18.0       16.0
17            17.0      19.0       17.0
18            19.0      20.0       18.0
19            20.0      21.0       19.0
20            21.0      22.0       20.0
21            22.0      23.0       21.0
22            23.0      24.0       22.0
23            24.0      25.0       23.0


In [6]:
# Import numpy  
import numpy as np

# Using matrix dynamic programming, define a function that, given string 1, string 2,
# the substitution cost, the deletion cost and the insertion cost, calculates the weighted edit distance

def edit_distance_penalty(string1, string2, substitution_cost=1, deletion_cost=1, insertion_cost=1):
    """Compute a weighted edit distance between two strings.

    A table with len(string1) + 1 rows and len(string2) + 1 columns is filled
    by dynamic programming. Cell [r, c] receives the cheapest of:

    * the diagonal neighbour plus ``substitution_cost`` (or 0 when the two
      characters match),
    * the neighbour above plus ``deletion_cost``,
    * the neighbour to the left plus ``insertion_cost``.

    Side effects: prints a DataFrame holding the three candidate costs that
    were evaluated for every inner cell (visited row by row), then prints the
    completed table.

    Parameters
    ----------
    string1, string2 : indexable sequences (strings in this notebook)
        The two sequences to compare.
    substitution_cost, deletion_cost, insertion_cost : int or float, default 1
        Penalty charged for each kind of edit.

    Returns
    -------
    int or float
        The bottom-right cell of the table. Whole-number totals are returned
        as ``int``; a fractional total (possible with non-integer costs) is
        returned as ``float`` instead of being truncated.
    """
    rows = len(string1) + 1
    cols = len(string2) + 1

    # Float table; first column = cost of deleting r characters,
    # first row = cost of inserting c characters
    table = np.zeros((rows, cols))
    table[:, 0] = np.arange(rows) * deletion_cost
    table[0, :] = np.arange(cols) * insertion_cost

    # One (substitution, deletion, insertion) triple per inner cell, in visiting order
    candidates = []
    for r in range(1, rows):
        for c in range(1, cols):
            mismatch = substitution_cost if string1[r - 1] != string2[c - 1] else 0
            via_substitution = table[r - 1, c - 1] + mismatch
            via_deletion = table[r - 1, c] + deletion_cost
            via_insertion = table[r, c - 1] + insertion_cost

            candidates.append((via_substitution, via_deletion, via_insertion))
            table[r, c] = min(via_substitution, via_deletion, via_insertion)

    # Assemble the per-cell candidates into a three-column frame
    costs = pd.DataFrame(candidates, columns=['substitution', 'deletion', 'insertion'])

    print(costs)  # candidate costs at each iteration
    print(table)  # the constructed matrix

    # int() alone would silently drop the fractional part of a non-integer total
    total = float(table[rows - 1, cols - 1])
    return int(total) if total.is_integer() else total

In [7]:
# Print the results
# Take a wall-clock timestamp, then compute and print the weighted edit distance of X and Y
# with deletions and insertions costing 2 each (substitution keeps its default cost of 1).
st = time()
print('Edit distance: ', edit_distance_penalty(X, Y, deletion_cost = 2, insertion_cost = 2))

# Print time
# The elapsed time is measured from `st`, so it includes the cost table and matrix that
# edit_distance_penalty prints itself as well as the print call above.
print('Time: %0.5f seconds' % (time() - st))

      substitution  deletion  insertion
0              1.0       4.0        4.0
1              3.0       6.0        3.0
2              4.0       8.0        5.0
3              7.0      10.0        6.0
4              9.0      12.0        8.0
5             11.0      14.0       10.0
6             13.0      16.0       12.0
7             15.0      18.0       14.0
8             17.0      20.0       16.0
9             19.0      22.0       18.0
10            21.0      24.0       20.0
11            23.0      26.0       22.0
12            25.0      28.0       24.0
13            27.0      30.0       26.0
14            29.0      32.0       28.0
15            30.0      34.0       30.0
16            33.0      36.0       32.0
17            34.0      38.0       34.0
18            37.0      40.0       36.0
19            39.0      42.0       38.0
20            41.0      44.0       40.0
21            43.0      46.0       42.0
22            45.0      48.0       44.0
23            47.0      50.0       46.0
