Reading notes and partial solutions to [Data Structures and Algorithms in Python](https://blackwells.co.uk/bookshop/product/9781118290279?gC=f177369a3b&gclid=Cj0KCQjwhJrqBRDZARIsALhp1WTBIyoxeQGXedlVy80vsglvFbNkVf7jTP0Z0zXEIP87lfqbtb4_diYaAr8dEALw_wcB).

# Text Processing

## Pattern Matching

### Brute Force

In [1]:
def find_brute(T, P):
    """Locate pattern P in text T by trying every alignment in turn.

    Returns the lowest index of T at which P begins, or -1 when P does not
    occur in T. An empty P matches at index 0.
    """
    text_len, pat_len = len(T), len(P)
    for start in range(text_len - pat_len + 1): # every alignment where P still fits in T
        # all() compares left to right and stops at the first mismatch
        if all(T[start + offset] == P[offset] for offset in range(pat_len)):
            return start
    return -1

### Boyer-Moore

In [2]:
def find_boyer_moore(T, P):
    """Search text T for pattern P, comparing each alignment from right to left.

    Returns the lowest index of T at which P begins, or -1 when P does not
    occur in T. An empty P matches at index 0.
    """
    text_len, pat_len = len(T), len(P)
    if pat_len == 0:
        return 0
    # rightmost index of each character of P (later indices overwrite earlier ones)
    rightmost = {ch: pos for pos, ch in enumerate(P)}
    t = p = pat_len - 1 # text cursor and pattern cursor both start at the end of P
    while t < text_len:
        if T[t] == P[p]:
            if p == 0:
                return t # every character matched; P begins at t
            t, p = t - 1, p - 1 # step left and compare the preceding characters
            continue
        # mismatch: look up where T[t] last occurs in P (-1 if it never does)
        seen_at = rightmost.get(T[t], -1)
        # if that occurrence lies left of p, line it up with T[t];
        # otherwise just slide P one place to the right
        t += pat_len - min(p, seen_at + 1)
        p = pat_len - 1 # restart at the end of pattern P
    return -1

### Knuth-Morris-Pratt (KMP)

In [None]:
def compute_kmp_fail(P):
    """Build the KMP failure list for pattern P.

    Entry j of the returned list is the length of the longest proper prefix
    of P that is also a suffix of P[0:j+1], i.e. how many leading pattern
    characters can be reused after a mismatch just past index j.
    """
    size = len(P)
    table = [0] * size
    probe = 1 # position whose entry is being computed
    matched = 0 # length of the prefix currently matched before probe
    while probe < size:
        if P[probe] == P[matched]:
            matched += 1
            table[probe] = matched # prefix of this length ends at probe
            probe += 1
        elif matched == 0:
            probe += 1 # nothing to fall back to; entry stays 0
        else:
            matched = table[matched - 1] # fall back to the next shorter matching prefix
    return table

In [None]:
def find_kmp(T, P):
    """Search text T for pattern P with Knuth-Morris-Pratt, never moving backwards in T.

    Returns the lowest index of T at which P begins, or -1 when P does not
    occur in T. An empty P matches at index 0.
    """
    text_len, pat_len = len(T), len(P)
    if pat_len == 0:
        return 0
    fallback = compute_kmp_fail(P)
    final = pat_len - 1 # index of the last character of P
    t = p = 0 # text cursor and pattern cursor
    while t < text_len:
        if T[t] == P[p]:
            if p == final:
                return t - final # match is complete; P begins here
            t += 1
            p += 1
        elif p == 0:
            t += 1 # no partial match to reuse; move on in T
        else:
            p = fallback[p - 1] # reuse the longest prefix that still matches
    return -1