# Edit Distance, Assembly, and Overlaps



In [1]:
%matplotlib inline

In [2]:
from pathlib import Path
from typing import Literal, Optional, TypeAlias
import unittest

from Bio.SeqIO import parse, SeqRecord
import numpy as np
import numpy.typing as npt

In [3]:
ArrayLikeInt: TypeAlias = npt.ArrayLike
"""NumPy array of data type int"""


def min_distance(
    dist: ArrayLikeInt
) -> int:
    """Calculate the minimum distance between two strings using a distance matrix.

    Parameters
    ----------
    dist : ArrayLikeInt
        Prepopulated distance array. The rows match to the shorter string.
    Returns
    -------
    int
        Edit distance
    """
    return np.min(dist[-1])


def build_dist_matrix(
    s1: str,
    s2: str,
    method: Literal["Edit", "Approx"],
    verbose: bool = False
) -> np.ndarray:
    """Build the dynamic-programming distance matrix between two strings.

    Each cell holds the minimum number of substitutions, insertions, and
    deletions (all with equal weight 1) needed to align a prefix of the
    shorter string (rows) with a prefix of the longer string (columns).

    Parameters
    ----------
    s1 : str
        First string
    s2 : str
        Second string
    method : Literal["Edit", "Approx"]
        The method to use. ``"Edit"`` initializes the first row to
        0, 1, 2, ...; ``"Approx"`` leaves the first row at zero, so an
        alignment of the shorter string may start at any column of the
        longer string at no cost.
    verbose : bool
        Print the progress as whole percentages while the matrix is filled.

    Returns
    -------
    np.ndarray
        Integer matrix of shape ``(len(shorter) + 1, len(longer) + 1)``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"Edit"`` nor ``"Approx"``.
    """
    if method not in ("Edit", "Approx"):
        raise ValueError(f'method "{method}" is not supported')
    # Rows always belong to the shorter string; on equal lengths s1 is rows.
    if len(s1) > len(s2):
        shorter_str, longer_str = s2, s1
    else:
        shorter_str, longer_str = s1, s2
    shape = (len(shorter_str) + 1, len(longer_str) + 1)
    dist = np.zeros(shape=shape, dtype=int)
    # First column: cost of deleting every character of the row prefix.
    dist[:, 0] = np.arange(shape[0])
    # First row: only the full edit distance charges for skipped columns.
    if method == "Edit":
        dist[0, :] = np.arange(shape[1])
    counter = 0
    total_iter = (shape[0] - 1) * (shape[1] - 1)
    last_percent = -1
    for index_i in range(1, shape[0]):
        row_char = shorter_str[index_i - 1]
        for index_j in range(1, shape[1]):
            if verbose:
                # Integer arithmetic: a float test such as
                # ``(counter / total * 100) % 1 == 0`` misses percentages
                # whose quotient is not exactly representable.
                percent = counter * 100 // total_iter
                if percent != last_percent:
                    print(f"{percent}%")
                    last_percent = percent
            counter += 1
            dist_hor = dist[index_i, index_j - 1] + 1
            dist_ver = dist[index_i - 1, index_j] + 1
            dist_diag = dist[index_i - 1, index_j - 1]
            if row_char != longer_str[index_j - 1]:
                dist_diag += 1
            dist[index_i, index_j] = min(dist_hor, dist_ver, dist_diag)
    if verbose:
        print("100%")
    return dist


def build_edit_dist_matrix(
    s1: str,
    s2: str,
) -> np.ndarray:
    """Build the distance matrix of ``s1`` and ``s2`` with the "Edit" method.

    Thin wrapper that forwards both strings to ``build_dist_matrix``.
    """
    return build_dist_matrix(s1=s1, s2=s2, method="Edit")


def build_approx_match_dist_matrix(
    s1: str,
    s2: str,
) -> np.ndarray:
    """Build the distance matrix of ``s1`` and ``s2`` with the "Approx" method.

    Thin wrapper that forwards both strings to ``build_dist_matrix``.
    """
    return build_dist_matrix(s1=s1, s2=s2, method="Approx")


class DistanceMatrixTestCase(unittest.TestCase):
    """Checks ``min_distance`` on matrices from both builder helpers."""

    def test_edit_distance(self):
        """Bottom-row minimum of matrices built by ``build_edit_dist_matrix``."""
        cases = (
            ("EFG", "ABCD", 3),
            ("GCGTATGC", "TATTGGCTATACGGTT", 5),
        )
        for pattern, text, expected in cases:
            matrix = build_edit_dist_matrix(pattern, text)
            self.assertEqual(min_distance(matrix), expected)

    def test_approx_distance(self):
        """Bottom-row minimum of matrices built by ``build_approx_match_dist_matrix``."""
        cases = (
            ("EFG", "ABCD", 3),
            ("GCGTATGC", "TATTGGCTATACGGTT", 2),
        )
        for pattern, text, expected in cases:
            matrix = build_approx_match_dist_matrix(pattern, text)
            self.assertEqual(min_distance(matrix), expected)

# Run the test cases visible in this session; exit=False keeps the
# interpreter alive after the run.
res = unittest.main(argv=[''], verbosity=3, exit=False)
# wasSuccessful() also covers tests that raised unexpected exceptions
# (recorded under ``errors``), which a check of ``failures`` alone misses.
assert res.result.wasSuccessful()

test_approx_distance (__main__.DistanceMatrixTestCase) ... ok
test_edit_distance (__main__.DistanceMatrixTestCase) ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.020s

OK


In [4]:
# Download the chromosome 1 excerpt into the working directory, then move it
# into the week3hw/ folder (created if missing).
!wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/chr1.GRCh38.excerpt.fasta
!mkdir -p week3hw
!mv chr1.GRCh38.excerpt.fasta week3hw

--2022-12-02 14:27:04--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/chr1.GRCh38.excerpt.fasta
Resolving d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 108.156.200.104, 108.156.200.204, 108.156.200.29, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|108.156.200.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 810105 (791K) [application/octet-stream]
Saving to: ‘chr1.GRCh38.excerpt.fasta’


2022-12-02 14:27:04 (13.3 MB/s) - ‘chr1.GRCh38.excerpt.fasta’ saved [810105/810105]



In [5]:
human_genome_file = Path("week3hw/chr1.GRCh38.excerpt.fasta")
with human_genome_file.open("r") as fh:
    # The parser format name is the file extension without its dot ("fasta").
    # Only the first record is used, so take it directly from the parser
    # instead of materializing every record into a list first.
    human_genome_seg: SeqRecord = next(parse(fh, human_genome_file.suffix.strip(".")))

In [6]:
human_genome_seg, len(human_genome_seg.seq)

(SeqRecord(seq=Seq('TTGAATGCTGAAATCAGCAGGTAATATATGATAATAGAGAAAGCTATCCCGAAG...AGG'), id='CM000663.2_excerpt', name='CM000663.2_excerpt', description='CM000663.2_excerpt EXCERPT FROM CM000663.2 Homo sapiens chromosome 1, GRCh38 reference primary assembly', dbxrefs=[]),
 800000)

# Quiz

## Q1 and Q2 Preamble

We saw how to adapt dynamic programming to find approximate occurrences of a pattern in a text. Recall that:

 1. Rows of the dynamic programming matrix are labeled with bases from P and columns with bases from T
 2. Elements in the first row are set to 0
 3. Elements in the first column are set to 0, 1, 2, ..., as for edit distance
 4. Other elements are set in the same way as elements of a standard edit distance matrix
 5. The minimal value in the bottom row is the edit distance of the closest match between P and T


First, download the provided excerpt of human chromosome 1

https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/chr1.GRCh38.excerpt.fasta

Second, parse it using the readGenome function we wrote before.

Third, adapt the editDistance function we saw in practical (copied below) to answer questions 1 and 2 below. Your function should take arguments p (pattern), t (text) and should return the edit distance of the match between P and T with the fewest edits.

## Q1

What is the edit distance of the best match between pattern GCTGATCGATCGTACG and the excerpt of human chromosome 1?  (Don't consider reverse complements.)

In [7]:
p_1 = "GCTGATCGATCGTACG"
t_1 = str(human_genome_seg.seq)

# Reuse the matrix if an earlier run of this cell already built it. An
# explicit lookup replaces the assert-inside-try idiom, which raised an
# uncaught AssertionError when the name was bound to None and stops working
# when asserts are disabled.
# NOTE(review): the cached matrix is not invalidated if p_1 or t_1 change.
if globals().get("edit_mat_1") is None:
    edit_mat_1 = build_dist_matrix(p_1, t_1, "Approx", verbose=True)

0%
1%
2%
3%
4%
5%
6%
8%
9%
10%
11%
12%
13%
15%
16%
17%
18%
19%
20%
21%
22%
23%
24%
25%
26%
27%
30%
31%
32%
33%
34%
35%
36%
37%
38%
39%
40%
41%
42%
43%
44%
45%
46%
47%
48%
49%
50%
51%
52%
53%
54%
59%
60%
61%
62%
63%
64%
65%
66%
67%
68%
69%
70%
71%
72%
73%
74%
75%
76%
77%
78%
79%
80%
81%
82%
83%
84%
85%
86%
87%
88%
89%
90%
91%
92%
93%
94%
95%
96%
97%
98%
99%
100%


In [8]:
# Smallest value in the bottom row of the "Approx" matrix built for p_1.
edit_dist_1 = min_distance(edit_mat_1)

In [9]:
edit_dist_1

3

The correct answer is __3__!

## Q2

What is the edit distance of the best match between pattern GATTTACCAGATTGAG and the excerpt of human chromosome 1?  (Don't consider reverse complements.)

In [10]:
p_2 = "GATTTACCAGATTGAG"
t_2 = t_1

# Reuse the matrix if an earlier run of this cell already built it. An
# explicit lookup replaces the assert-inside-try idiom, which raised an
# uncaught AssertionError when the name was bound to None and stops working
# when asserts are disabled.
# NOTE(review): the cached matrix is not invalidated if p_2 or t_2 change.
if globals().get("edit_mat_2") is None:
    edit_mat_2 = build_dist_matrix(p_2, t_2, "Approx", verbose=True)


0%
1%
2%
3%
4%
5%
6%
8%
9%
10%
11%
12%
13%
15%
16%
17%
18%
19%
20%
21%
22%
23%
24%
25%
26%
27%
30%
31%
32%
33%
34%
35%
36%
37%
38%
39%
40%
41%
42%
43%
44%
45%
46%
47%
48%
49%
50%
51%
52%
53%
54%
59%
60%
61%
62%
63%
64%
65%
66%
67%
68%
69%
70%
71%
72%
73%
74%
75%
76%
77%
78%
79%
80%
81%
82%
83%
84%
85%
86%
87%
88%
89%
90%
91%
92%
93%
94%
95%
96%
97%
98%
99%
100%


In [11]:
# Smallest value in the bottom row of the "Approx" matrix built for p_2.
edit_dist_2 = min_distance(edit_mat_2)

In [12]:
edit_dist_2

2

The correct answer is __2__!