# Project 2: Measuring Boyer Moore's Benefit

Implement versions of the naive exact matching and Boyer-Moore algorithms that additionally count and return 

(a) the number of character comparisons performed and 

(b) the number of alignments tried. 

In [6]:
%reset

In [69]:
import modules.boyer_moore
from modules.boyer_moore import BoyerMoore

import modules.all_functions
from modules.all_functions import * 

import bisect

import modules.index
from modules.index import *

## Prolog

In [70]:
def naive_count(p, t):
    """Naive exact matching that also reports how much work was done.

    Every offset of t at which p could fit is tried in turn; at each offset
    the characters of p are compared left to right against t and the
    comparison stops at the first mismatch.

    Args:
        p: pattern string.
        t: text string.

    Returns:
        A tuple (occurrences, alignments, comparisons) where
        occurrences is the list of offsets in t at which p matches exactly,
        alignments is the number of offsets tried, and
        comparisons is the total number of character comparisons performed.
        An empty pattern matches at every offset with zero comparisons.
    """
    occurrences = []
    alignments = 0
    comparisons = 0
    # i runs over every offset at which p still fits inside t.
    for i in range(len(t) - len(p) + 1):
        alignments += 1
        # Reset once per alignment (not per character) so the flag is always
        # defined, even when p is empty and the inner loop never executes.
        match = True
        # Compare p[0..len(p)-1] with t[i..i+len(p)-1]; stop at the first
        # differing character and move on to the next offset.
        for j in range(len(p)):
            comparisons += 1
            if p[j] != t[i + j]:
                match = False
                break
        # No differing character was found: record the offset.
        if match:
            occurrences.append(i)
    return occurrences, alignments, comparisons

In [71]:
def boyer_moore_count(p, p_bm, t):
    """Boyer-Moore exact matching that also counts alignments and comparisons.

    Args:
        p: pattern string.
        p_bm: preprocessing object for p; must provide
            bad_character_rule(j, c), good_suffix_rule(j) and match_skip().
        t: text string.

    Returns:
        A tuple (occurrences, alignments, comparisons): the offsets in t where
        p matches, the number of alignments tried, and the number of character
        comparisons performed.
    """
    found = []
    n_alignments = 0
    n_comparisons = 0

    pattern_len = len(p)
    last_offset = len(t) - pattern_len
    offset = 0

    while offset <= last_offset:
        n_alignments += 1
        advance = 1  # always move forward by at least one position

        # Compare right to left, as Boyer-Moore requires.
        for k in reversed(range(pattern_len)):
            n_comparisons += 1
            text_char = t[offset + k]
            if p[k] != text_char:
                # Mismatch: take the larger of the two heuristic skips.
                advance = max(
                    advance,
                    p_bm.bad_character_rule(k, text_char),
                    p_bm.good_suffix_rule(k),
                )
                break
        else:
            # Loop finished without a mismatch: full match at this offset.
            found.append(offset)
            advance = max(advance, p_bm.match_skip())

        offset += advance

    return found, n_alignments, n_comparisons

In [72]:
# Smoke test on a tiny example: run both counters on the same pattern/text
# and print (occurrences, alignments, comparisons) for each.
p = 'AA'
t = 'ACAA'

occurences, alignments, comparisons = naive_count(p, t)
print(occurences, alignments, comparisons)

# The Boyer-Moore preprocessing object is built from the pattern alone here.
p_bm = BoyerMoore(p)
o, a, c = boyer_moore_count(p, p_bm, t)
print(o, a, c)

[2] 3 5
[2] 2 3


In [73]:
# Naive matching on a short English sentence; prints
# (occurrences, alignments, comparisons).
p = 'word'
t = 'there would have been a time for such a word'
occurrences, num_alignments, num_character_comparisons = naive_count(p, t)
print(occurrences, num_alignments, num_character_comparisons)

[40] 41 46


In [74]:
# Naive matching where the pattern occurs twice in the text; prints
# (occurrences, alignments, comparisons).
p = 'needle'
t = 'needle need noodle needle'
occurrences, num_alignments, num_character_comparisons = naive_count(p, t)
print(occurrences, num_alignments, num_character_comparisons)

[0, 19] 20 35


In [75]:
# Boyer-Moore on the same sentence as the naive 'word' example.
p = 'word'
t = 'there would have been a time for such a word'
# Alphabet passed to BoyerMoore: the lowercase letters plus the space character.
# This name is reused by the following cell.
lowercase_alphabet = 'abcdefghijklmnopqrstuvwxyz '
p_bm = BoyerMoore(p, lowercase_alphabet)
occurrences, num_alignments, num_character_comparisons = boyer_moore_count(p, p_bm, t)
print(occurrences, num_alignments, num_character_comparisons)

[40] 12 15


In [76]:
# Boyer-Moore on the 'needle' example.
# Relies on lowercase_alphabet having been defined by the previous cell.
p = 'needle'
t = 'needle need noodle needle'
p_bm = BoyerMoore(p, lowercase_alphabet)
occurrences, num_alignments, num_character_comparisons = boyer_moore_count(p, p_bm, t)
print(occurrences, num_alignments, num_character_comparisons)

[0, 19] 5 18


## Actual Questions

In [78]:
# Load the chromosome 1 excerpt into t; read_genome_fasta is not defined in this
# notebook, so it presumably comes from one of the star imports at the top.
# NOTE(review): the path is an absolute, machine-specific location.
t = read_genome_fasta('/Users/arshmeetkaur/Genomic_Data_Science/course 3 /files/chr1.GRCh38.excerpt.fasta')
# NOTE(review): this prints the entire sequence into the cell output.
print(t)

TTGAATGCTGAAATCAGCAGGTAATATATGATAATAGAGAAAGCTATCCCGAAGGTGCATAGGTCAACAATACTTGAGCCTAACTCAGTAGATCCTAAAAGAAAGCAATTTTTGCTGCTAACCTAACATTTCACAATGTCTGGAGACATTTACAGTTCCCACAACCTATGGCAGTTACTGGCATCTACTAGAGGTCAGAGATGCTGGTAAATACTCTGTAATGAACAAGAAGCCCCCCATAGCAAATAAATACCCAGCCCAAGATGGCAATAGTGCCCAGATTGAGAAACTTCACCTTAACCTGATATCATGCAAATATATCTGAAGAAAGACACAAACATAACTAAAGAAAGATGATTACCAGAAGAGATATTCATAAATCTTAGAAGCATAGAAAAAGAAACACAAGGCAATGTTTTCAGTGTCCAGGCAATTATCTTCCTGGGAAAAGCTAGCCTACCAGACCAACATGACTTTTGCACCTTGCTGGCAACCATTCTACTCTTCTGAAGAAGGAGACATCATTTGGACTCTAAAATCCCTTTTTCTGATTTCATACTCATCAAGAAATCTATCCATTTGGCTTAGTTTGTAGCTTATGCTGAAAAACGTGACTTGAGATTTCCTTCACTTGGAAATTGAGATTGCTTAATGTAGATTGACATTCTCAACATTTGGACAATAGTGGGATCAATTATCTTAACTTGCAAAGCTGAAGATTATACCTCTGGGCAACAGTCAAATTACCAAGGTAAATGCTTAGTTGTAGTCAGCATGGGATGGTGTTGAACCACTAATTCCATTTTTTAAAGAGATATAGGGCTTTTCAGGTTCTCTTTTTCTTCTTGAGTGAGCTTAAGTAGTTTGTTTCTTTCAAGGAATTAAACTATTTCATATAAGGTGTCACATTTATTGGCATAAGCTTGTTCAAAATATTTCTTATTATCCTAATATCTGTAGATTTTGTAATGATATCACCTCTCACATTCCTATTTTAATA


## Question 1 and 2

1: How many alignments does the naive exact matching algorithm try when matching the string 
GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG (derived from human Alu sequences) to the excerpt of human chromosome 1?  (Don't consider reverse complements.)

2: How many character comparisons? 


In [81]:
# Questions 1 and 2: naive matching of the Alu-derived pattern against the
# excerpt held in t; prints (occurrences, alignments, comparisons).
p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
occurrences, num_alignments, num_character_comparisons = naive_count(p, t)
print(occurrences, num_alignments, num_character_comparisons)
# Show the text at the reported offset so it can be compared with p by eye.
print(t[56922:56922+len(p)])

[56922] 799954 984143
GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG


## Question 3

How many alignments does Boyer-Moore try?

In [85]:
# Question 3: Boyer-Moore on the same pattern and text as Questions 1 and 2;
# prints (occurrences, alignments, comparisons).
p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
# No alphabet argument is passed here, unlike the lowercase-text examples above.
p_bm = BoyerMoore(p)
occurrences, num_alignments, num_character_comparisons = boyer_moore_count(p, p_bm, t)
print(occurrences, num_alignments, num_character_comparisons)

[56922] 127974 165191


## Question 4 and Question 5



Index-assisted approximate matching: 
- We built the `Index` class to implement the ordered-list version of the k-mer index. (Imported at the beginning of this homework file.)
- We built a pigeonhole-principle function: split P into k+1 parts and run an exact matching algorithm on each part. If even one part matches, compare the rest of P against the corresponding stretch of T to determine whether P occurs there within the mismatch threshold.
  - Previously we used Boyer-Moore for the exact-matching step; now we'll use index lookups instead.

4: Implement a version of the approximate matching algorithm from the practical which uses index matching instead of Boyer-Moore. P always has length 24 and we will build 8-mers.

Look for occurrences of GGCGCGGTGGCTCACGCCTGTAAT within the chromosome that occur with up to 2 substitutions.

Use naive_hamming to confirm your results. 

p = 'GGCGCGGTGGCTCACGCCTGTAAT'
t = chromosome
approximate_match_index(p, t, k) # k is the maximum number of mismatches allowed

5: Count the number of index hits when searching for occurrences of P in T with up to 2 mismatches.
- Accomplished this by using the `query()` function to look up substrings of P, which returned all candidate occurrences. (These are matches of a substring of P, not of all of P, against the whole genome. We prune them later in the algorithm, but these are the initial hits.)
- Put all candidate occurrences in one set.
- Added the length of each list of candidate occurrences to a counter keeping track of all hits.
- Did not use the `query_index` function: it doesn't make sense here because each piece of P passed to the index is exactly the size of an 8-mer, so there is nothing beyond the 8-mer left to be "compared" with T.

- how to check this: used the Boyer Moore approximate_match_bm function to get the number of hits

Got possible matches from approximate_match_bm like this: 

Boyer-Moore returns the exact matches of a substring of P; later in the algorithm we check whether all of P matches T at that position within the allowed number of mismatches.
Apply the Boyer-Moore algorithm to each substring of P:
        p_bm = BoyerMoore(p[start:end])
        possible_matches = boyer_moore(p[start:end], p_bm, t)
        index_hits += len(possible_matches)

Found 90 both ways, so that's the answer.

In [54]:
def approximate_match_index(p, t, k):
    """Find occurrences of p in t with at most k substitutions, seeded by a k-mer index.

    Pigeonhole strategy:
    1. Cut p into k+1 partitions of len(p) // (k+1) characters (the last one
       absorbs any leftover characters).
    2. Build one k-mer index over t with that partition length.
    3. Query each partition against the index to get candidate positions.
    4. For every candidate, compare all of p with t at the implied offset and
       keep the offset if the number of mismatches is at most k.

    Args:
        p: pattern string.
        t: text string.
        k: maximum number of mismatches (substitutions) allowed.

    Returns:
        A tuple (matches, hits, num_total_hits) where
        matches is the sorted list of offsets in t at which p occurs with at
        most k mismatches,
        hits is the set of distinct positions returned by the index queries, and
        num_total_hits is the sum of the hit counts reported by each query.
    """
    length_of_partition = len(p) // (k + 1)  # 8 for a 24-character p with k = 2

    # The index depends only on t and the partition length, so build it once
    # instead of once per partition.
    # NOTE(review): assumes constructing Index has no side effects - confirm.
    kmer_index = Index(t, length_of_partition)

    num_total_hits = 0      # summed over queries, so it may count duplicates
    hits = set()            # distinct positions returned by the index
    actual_matches = set()  # a set, because several partitions can seed the same offset

    for i in range(k + 1):
        # Work with indices into p rather than materialising substrings.
        start = i * length_of_partition
        end = start + length_of_partition
        if len(p) - end < length_of_partition:
            end = len(p)

        # query is called with True and is unpacked as (positions, count).
        index_hits, num_hits = kmer_index.query(p[start:end], True)
        hits.update(index_hits)
        num_total_hits += num_hits

        for m in index_hits:
            offset = m - start  # where p[0] would sit in t for this candidate

            # Skip candidates for which p would start before or run past t.
            if offset < 0 or offset + len(p) > len(t):
                continue

            # Verify every position of p. The index lookup may cover only the
            # first length_of_partition characters of the slice (the last slice
            # can be longer), so nothing is assumed to have matched already.
            mismatches = 0
            for j in range(len(p)):
                if p[j] != t[offset + j]:
                    mismatches += 1
                    if mismatches > k:
                        break

            if mismatches <= k:
                actual_matches.add(offset)

    return sorted(actual_matches), hits, num_total_hits

In [88]:
# Questions 4 and 5: compare the index-based approximate matcher with three
# reference implementations on the same pattern and text.
# NOTE(review): the path is an absolute, machine-specific location, and this
# reloads the same file that an earlier cell already read into t.
t = read_genome_fasta('/Users/arshmeetkaur/Genomic_Data_Science/course 3 /files/chr1.GRCh38.excerpt.fasta')
p = 'GGCGCGGTGGCTCACGCCTGTAAT'

# Reference result from naive_hamming, for confirmation.
print(naive_hamming(p, t))

# Reference results from the Boyer-Moore based approximate matchers.
print(approximate_match_bm(p, t, 2, True)) # with True it also reports the number of hits
print(approximate_match_official(p, t, 2))

# Our implementation using index matching.
list_actual_matches, hits, num_total_hits = approximate_match_index(p, t, 2)
print(list_actual_matches) # same offsets as the reference results above
print(len(list_actual_matches), len(hits), num_total_hits) # recorded output: 19 occurrences, 90 distinct hits, 90 total hits
print(hits)

[56922, 84641, 147558, 160162, 160729, 191452, 262042, 273669, 364263, 421221, 429299, 465647, 551134, 635931, 657496, 681737, 717706, 724927, 747359]
([56922, 84641, 147558, 160162, 160729, 191452, 262042, 273669, 364263, 421221, 429299, 465647, 551134, 635931, 657496, 681737, 717706, 724927, 747359], 90)
[56922, 84641, 147558, 160162, 160729, 191452, 262042, 273669, 364263, 421221, 429299, 465647, 551134, 635931, 657496, 681737, 717706, 724927, 747359]
[56922, 84641, 147558, 160162, 160729, 191452, 262042, 273669, 364263, 421221, 429299, 465647, 551134, 635931, 657496, 681737, 717706, 724927, 747359]
19 90 90
{632321, 22548, 635947, 472642, 725061, 719434, 205397, 657496, 56922, 108126, 707167, 657504, 56930, 613475, 595557, 147558, 657512, 56938, 23154, 147574, 43143, 746636, 186012, 84641, 84657, 760505, 322751, 454348, 588494, 175326, 57056, 251106, 551142, 364263, 19182, 364271, 551150, 364279, 465655, 429307, 465663, 429315, 480517, 83720, 681737, 273677, 651539, 273685, 187671,

## Question 6: Index of Subsequences

Index of Subsequences: 

Instructions: 
- they gave us a class that uses subsequence of T instead of substrings to build the kmer table. 
- we get the subsequences by taking every nth character of T (the class calls this spacing `ival`) and building subsequences of length k

Goal: we need to write a function which builds a subsequence table with 
k = 8 
and 
ival = 3 

and then finds all occurences of P within T up to 2 mismatches. 

In [56]:
class SubseqIndex(object):
    """Sorted index over the spaced subsequences of a text T."""

    def __init__(self, t, k, ival):
        """Index every subsequence of k characters taken ival positions apart.

        Example: SubseqIndex("ATAT", 2, 2) stores ("AA", 0) and ("TT", 1).
        """
        self.k = k        # characters per extracted subsequence
        self.ival = ival  # spacing between them: 1 = adjacent, 2 = every other, ...
        # Number of text positions covered by one subsequence, first to last character.
        self.span = 1 + ival * (k - 1)
        start_count = len(t) - self.span + 1
        # (subsequence, offset) pairs, ordered alphabetically by subsequence.
        self.index = sorted(
            (t[offset:offset + self.span:ival], offset)
            for offset in range(start_count)
        )

    def query(self, p):
        """Return the offsets whose subsequence equals the first subsequence of p."""
        wanted = p[:self.span:self.ival]
        # Binary search for the first entry that could carry this subsequence.
        pos = bisect.bisect_left(self.index, (wanted, -1))
        total = len(self.index)
        offsets = []
        # Equal subsequences sit next to each other in the sorted index.
        while pos < total and self.index[pos][0] == wanted:
            offsets.append(self.index[pos][1])
            pos += 1
        return offsets
    
def approximate_match_subseq(p, t, k):
    """Find occurrences of p in t with at most k mismatches, seeded by a subsequence index.

    Steps:
    1. Build a SubseqIndex over t whose subsequences have len(p) // (k+1)
       characters spaced k+1 positions apart.
    2. For each of the k+1 starting offsets into p, query the index with the
       suffix of p that begins there.
    3. For every hit, compare all of p with t at the implied offset and keep
       the offset if the number of mismatches is at most k.

    Returns:
        A tuple (matches, hits, num_total_hits): the sorted list of matching
        offsets, the set of distinct index hits, and the total number of hits
        summed over all queries.
    """
    n_shifts = k + 1
    pattern_len = len(p)

    subseq_index = SubseqIndex(t, pattern_len // n_shifts, n_shifts)

    seen_hits = set()   # distinct positions returned by the index
    hit_total = 0       # summed per query, so it may count duplicates
    confirmed = set()   # a set, because several shifts can seed the same offset

    for shift in range(n_shifts):
        candidates = subseq_index.query(p[shift:])
        seen_hits.update(candidates)
        hit_total += len(candidates)

        for hit in candidates:
            # The hit aligns with p[shift], so p[0] sits shift positions earlier.
            origin = hit - shift

            # Skip candidates for which p would start before or run past t.
            if not (0 <= origin and origin + pattern_len <= len(t)):
                continue

            # Count mismatches over the whole pattern, stopping once over budget.
            errors = 0
            pos = 0
            while pos < pattern_len and errors <= k:
                if p[pos] != t[origin + pos]:
                    errors += 1
                pos += 1

            if errors <= k:
                confirmed.add(origin)

    return sorted(confirmed), seen_hits, hit_total

In [89]:
# testing our algorithm using index matching 
p = 'GGCGCGGTGGCTCACGCCTGTAAT'
list_actual_matches, hits, num_total_hits = approximate_match_subseq(p, t, 2)
print(list_actual_matches)
print(len(list_actual_matches), len(hits), num_total_hits) # 19 occurences, 90 hits, 90 total hits. 

[56922, 84641, 147558, 160162, 160729, 191452, 262042, 273669, 364263, 421221, 429299, 465647, 551134, 635931, 657496, 681737, 717706, 724927, 747359]
19 79 79
