# Edit Distance, Assembly, and Overlaps



In [36]:
%matplotlib inline

In [79]:
from typing import Optional
import random

import matplotlib.pyplot as plt
import numpy as np

In [80]:
def edit_distance(
    s1: str,
    s2: str,
    dist: Optional[np.ndarray] = None
) -> int:
    """Return the edit distance between two strings.

    Parameters
    ----------
    s1 : str
        First string.
    s2 : str
        Second string.
    dist : np.ndarray, optional
        Precomputed edit distance matrix. When given, it is used as-is and
        ``s1`` / ``s2`` are not consulted; when omitted, the matrix is built
        with ``build_edit_dist_matrix(s1, s2)``.

    Returns
    -------
    int
        The bottom-right entry of the edit distance matrix.

    Examples
    --------
    >>> s1 = "ABCD"
    >>> s2 = "EFG"
    >>> edit_distance(s1, s2)
    4
    """
    if dist is None:
        dist = build_edit_dist_matrix(s1, s2)
    # Convert the NumPy scalar to a built-in int so the result matches the
    # annotated return type and prints as ``4`` rather than ``np.int64(4)``.
    return int(dist[-1][-1])

def build_edit_dist_matrix(
    s1: str,
    s2: str,
) -> np.ndarray:
    """Calculate the edit distance matrix between two strings using equal weights.

    The edit distance is defined as the number of substitutions,
    insertions, and deletions required to align them. Every operation
    costs 1.

    Parameters
    ----------
    s1 : str
        First string
    s2 : str
        Second string

    Returns
    -------
    np.ndarray
        Integer matrix of shape ``(len(short) + 1, len(long) + 1)``, where
        ``short`` is the shorter of the two inputs (``s1`` when the lengths
        are equal). Entry ``[i, j]`` is the edit distance between the first
        ``i`` characters of ``short`` and the first ``j`` characters of
        ``long``; the bottom-right entry is the distance between the full
        strings.

    """
    # The shorter string always indexes the rows, so the matrix orientation
    # does not depend on the argument order.
    if len(s1) > len(s2):
        row_str, col_str = s2, s1
    else:
        row_str, col_str = s1, s2
    n_rows = len(row_str) + 1
    n_cols = len(col_str) + 1

    dist = np.zeros(shape=(n_rows, n_cols), dtype=int)
    # Borders: turning a prefix into the empty string costs its length.
    dist[:, 0] = np.arange(n_rows)
    dist[0, :] = np.arange(n_cols)

    for i, row_char in enumerate(row_str, start=1):
        for j, col_char in enumerate(col_str, start=1):
            mismatch = 0 if row_char == col_char else 1
            # Built-in min on three scalars avoids allocating a temporary
            # array for every cell.
            dist[i, j] = min(
                dist[i, j - 1] + 1,             # step from the left cell
                dist[i - 1, j] + 1,             # step from the cell above
                dist[i - 1, j - 1] + mismatch,  # diagonal: match/substitute
            )
    return dist


# Demo: compare two strings that share no characters.
s1, s2 = "ABCD", "EFG"
edit_distance(s1, s2)

3

In [68]:
# random.seed(42)

4