## Oligo Search Website

Here are the functions used for the search feature in my website.

I optimized them by benchmarking several alternative Python implementations of each step against one another.

In [1]:
import xlrd
# Load the first worksheet of the oligo spreadsheet.
# NOTE(review): xlrd 2.0+ only reads legacy .xls files, so opening this .xlsx
# presumably relies on an older xlrd being installed - confirm the pinned version.
oligo_sheet = xlrd.open_workbook("kh_oligos.xlsx").sheet_by_index(0)

# Read the reference sequence through a context manager so the file handle is
# closed as soon as the text has been loaded (the bare open().read() leaked it).
with open("reference.txt") as reference_file:
    reference = reference_file.read()

In [21]:
def reverse_complement(text):
    """Return the reverse complement of a DNA string.

    The input is reversed, upper-cased and stripped of space characters, then
    A<->T and C<->G are swapped with a translation table. Characters outside
    ACGT (and whitespace other than ' ') pass through unchanged.
    """
    complement_table = str.maketrans('ACGT', 'TGCA')
    flipped = text[::-1]
    normalized = flipped.upper().replace(' ', '')
    return normalized.translate(complement_table)


def oligo_search(reference, oligo_sheet, oligo_column=2):
    """Find every oligo in the sheet that occurs in the reference sequence.

    Each row's sequence is read from ``oligo_column``; the value reported for a
    hit is read from the column immediately to its left. Both the reference and
    the oligo are upper-cased and stripped of spaces before comparison, and the
    oligo is looked for on the reference as well as on its reverse complement.

    Parameters
    ----------
    reference : str
        Reference sequence to search in.
    oligo_sheet : sheet object exposing ``nrows`` and ``cell_value(rowx=, colx=)``
        Worksheet holding one oligo per row.
    oligo_column : int, optional
        Zero-based column index of the oligo sequence (default 2, as before).

    Returns
    -------
    list or str
        List of the matching rows' left-neighbour cell values, or the string
        "No oligos match the reference." when nothing matches.
    """
    name_column = oligo_column - 1
    reference = reference.upper().replace(' ', '')
    rc_reference = reverse_complement(reference)
    matching_oligos = []

    for row in range(oligo_sheet.nrows):
        cell = oligo_sheet.cell_value(rowx=row, colx=oligo_column)
        # Non-text cells (xlrd hands numbers back as floats) cannot be a
        # sequence and would crash on .upper(), so skip them.
        if not isinstance(cell, str):
            continue
        oligo = cell.upper().replace(" ", "")
        # Empty cells are skipped: "" is a substring of every string and would
        # otherwise be reported as a match.
        if oligo and (oligo in reference or oligo in rc_reference):
            matching_oligos.append(
                oligo_sheet.cell_value(rowx=row, colx=name_column))

    if not matching_oligos:
        return "No oligos match the reference."
    return matching_oligos


# Run the search on the loaded reference and sheet; the cell displays the
# returned list of matches or the no-match message.
oligo_search(reference, oligo_sheet)

## Reverse complement function

Of the five approaches I tested, Python's `str.maketrans()` combined with `str.translate()` was the quickest and most readable way to obtain the reverse complement.

In [22]:
from timeit import timeit
from Bio.Seq import Seq

# Sample oligo used by the demo calls below: the cell at row index 6,
# column index 2, upper-cased and with spaces removed.
oligo = oligo_sheet.cell_value(rowx=6, colx=2).upper().replace(" ", "")
print(oligo)

In [5]:
def reverse_complement_one(text):
    """Benchmark candidate 1: walk the string backwards by index arithmetic.

    The input is upper-cased and stripped of spaces. Each base is complemented
    through a dict lookup; any character that is not A, C, G or T is dropped
    from the result.
    """
    cleaned = text.upper().replace(' ', '')
    pair_for = {'G': 'C', 'C': 'G', 'A': 'T', 'T': 'A'}
    last_index = len(cleaned) - 1
    flipped = ''
    for offset in range(len(cleaned)):
        base = cleaned[last_index - offset]
        if base in pair_for:
            flipped += pair_for[base]
    return flipped
# Try candidate 1 on the sample oligo; the cell displays the returned string.
reverse_complement_one(oligo)

'AACGAACTGAGTAAAGTTAGAAGCCAT'

In [6]:
def reverse_complement_two(text):
    """Benchmark candidate 2: reverse with a ``[::-1]`` slice, then map bases.

    The input is upper-cased and stripped of spaces. Each base is complemented
    through a dict lookup; any character that is not A, C, G or T is dropped
    from the result.
    """
    cleaned = text.upper().replace(' ', '')
    pair_for = {'G': 'C', 'C': 'G', 'A': 'T', 'T': 'A'}
    backwards = cleaned[::-1]
    complemented = ''
    for base in backwards:
        if base in pair_for:
            complemented += pair_for[base]
    return complemented
# Try candidate 2 on the sample oligo; the cell displays the returned string.
reverse_complement_two(oligo)

'AACGAACTGAGTAAAGTTAGAAGCCAT'

In [7]:
def reverse_complement_three(text):
    """Benchmark candidate 3: iterate with ``reversed()``, then map bases.

    The input is upper-cased and stripped of spaces. Each base is complemented
    through a dict lookup; any character that is not A, C, G or T is dropped
    from the result.
    """
    cleaned = text.upper().replace(' ', '')
    pair_for = {'G': 'C', 'C': 'G', 'A': 'T', 'T': 'A'}
    complemented = ''
    for base in reversed(cleaned):
        if base in pair_for:
            complemented += pair_for[base]
    return complemented
# Try candidate 3 on the sample oligo; the cell displays the returned string.
reverse_complement_three(oligo)

'AACGAACTGAGTAAAGTTAGAAGCCAT'

In [26]:
def reverse_complement_four(text):
    """Benchmark candidate 4: ``str.translate`` with a ``str.maketrans`` table.

    The input is reversed, upper-cased and stripped of spaces before the
    A<->T / C<->G swap. Unlike candidates 1-3, characters outside ACGT are
    kept unchanged rather than dropped.
    """
    swap_table = str.maketrans('ACGT', 'TGCA')
    backwards = text[::-1]
    return backwards.upper().replace(' ', '').translate(swap_table)
# Try candidate 4 on the sample oligo; the cell displays the returned string.
reverse_complement_four(oligo)

In [27]:
def reverse_complement_biopython(text):
    """Benchmark candidate 5: delegate to Biopython's ``Seq.reverse_complement``.

    ``text`` is wrapped in a ``Seq`` as-is: unlike the other candidates there
    is no upper-casing or space removal here. The value returned is whatever
    ``Seq.reverse_complement`` produces, not a plain ``str``.
    """
    return Seq(text).reverse_complement()
# Try the Biopython candidate on the sample oligo; the cell displays the returned value.
reverse_complement_biopython(oligo)

## Timeit

I used timeit to repeat each function 100,000 times.

In [28]:
# Time every reverse-complement candidate on the same noisy input, 100,000
# calls each. One loop replaces five copy-pasted stanzas so the statement,
# setup code and repeat count cannot drift apart between candidates.
complement_setup = (
    "from __main__ import {name};"
    "text = 'A@GG GTTAG--TGACCAGCT! AG'"
)
for candidate_name in (
        "reverse_complement_one",
        "reverse_complement_two",
        "reverse_complement_three",
        "reverse_complement_four",
        "reverse_complement_biopython"):
    elapsed_seconds = timeit(
        "{}(text)".format(candidate_name),
        complement_setup.format(name=candidate_name),
        number=100000)
    print("{}:{}".format(candidate_name, elapsed_seconds))

## Oligo Search function

I chose to use "if pattern in text: do something" because it was quick and best suited for its role in this code.

In [29]:
def pattern_search_one(text, pattern):
    """Count occurrences of ``pattern`` by comparing a slice at every index.

    Overlapping occurrences are each counted. An empty ``pattern`` matches at
    every index, so the result is then ``len(text)``.
    """
    hits = 0
    for start in range(len(text)):
        window = text[start:start + len(pattern)]
        if window == pattern:
            hits += 1
    return hits
# Count the sample oligo in the reference; the cell displays the returned count.
pattern_search_one(reference, oligo)

In [31]:
def pattern_search_two(text, pattern):
    """Count occurrences of ``pattern`` in ``text`` using ``str.count``.

    ``str.count`` counts non-overlapping occurrences, so for self-overlapping
    patterns the result can be lower than a slice-by-slice scan would give.
    """
    occurrences = text.count(pattern)
    return occurrences
# Exercise the function defined in this cell (the original line called
# pattern_search_one by copy-paste mistake); the cell displays the returned count.
pattern_search_two(reference, oligo)

In [38]:
def pattern_search_three(text, pattern):
    """Report whether ``pattern`` occurs in ``text`` using the ``in`` operator.

    Mirrors the "if pattern in text: do something" shape used by the real
    search: a hit is recorded in a list, and the list decides the message.

    Returns "Pattern found in text." on a hit, otherwise
    "No pattern found in text.".
    """
    match = []
    if pattern in text:
        # Record the argument itself. The earlier code appended the notebook
        # global `oligo`, which made the function depend on hidden state.
        match.append(pattern)
    if not match:
        return "No pattern found in text."
    return "Pattern found in text."
# Check the sample oligo against the reference; the cell displays the returned message.
pattern_search_three(reference, oligo)

In [35]:
def pattern_search_four(text, pattern):
    """Report whether ``pattern`` occurs in ``text`` using ``str.find``.

    Returns "Pattern found in text." when ``find`` yields an index, otherwise
    "Pattern not found in text.".
    """
    position = text.find(pattern)
    # str.find signals "absent" with -1; every other value is a real index.
    if position == -1:
        return "Pattern not found in text."
    return "Pattern found in text."
# Check the sample oligo against the reference; the cell displays the returned message.
pattern_search_four(reference, oligo)

In [46]:
# Time every pattern-search candidate on the same text/pattern pair, 100,000
# calls each. One loop replaces four copy-pasted stanzas so all candidates are
# guaranteed to run with identical setup.
search_setup = (
    "from __main__ import {name};"
    "text='TGATTCCGGCGGGCGTGGAGAAGCGAGATTCATTCAAGCCGGGAGGCGTGGCGTGGCGTGGCGTGCGGATTCAAGCCGGCGGG';"
    "pattern='TTCCGG' "
)
for candidate_name in (
        "pattern_search_one",
        "pattern_search_two",
        "pattern_search_three",
        "pattern_search_four"):
    elapsed_seconds = timeit(
        "{}(text, pattern)".format(candidate_name),
        search_setup.format(name=candidate_name),
        number=100000)
    print("{}:{}".format(candidate_name, elapsed_seconds))

## Oligo Search Locations

In the future I may want to incorporate the location of the oligo in the reference. I found regular expressions to be the best way to do this. 

In [39]:
import re

In [40]:
def pattern_find_one(text, pattern):
    """Return every start index at which ``pattern`` occurs in ``text``.

    Compares a slice of ``text`` against ``pattern`` at each index, so
    overlapping occurrences are all reported. An empty ``pattern`` matches at
    every index of ``text``.

    Returns a list of ints (empty when there is no match).
    """
    # Named `positions` rather than `list` so the builtin is not shadowed.
    positions = []
    width = len(pattern)
    for start in range(len(text)):
        if text[start:start + width] == pattern:
            positions.append(start)
    return positions
# Locate the sample oligo in the reference; the cell displays the returned list of indices.
pattern_find_one(reference, oligo)

In [41]:
def pattern_find_two(text,pattern):
    """Return the start indices of ``pattern`` in ``text`` as one string.

    Uses a zero-width lookahead so overlapping occurrences are all found.
    ``pattern`` is treated as literal text: it is escaped before being placed
    in the regular expression, so characters such as '.' or '(' match
    themselves instead of acting as regex syntax.

    Returns the indices joined by single spaces ('' when there is no match).
    """
    lookahead = '(?={0})'.format(re.escape(pattern))
    positions = [str(match.start()) for match in re.finditer(lookahead, text)]
    return ' '.join(positions)
# Locate the sample oligo in the reference; the cell displays the returned string of indices.
pattern_find_two(reference, oligo)

In [42]:
def pattern_find_three(text,pattern):
    """Return every start index at which ``pattern`` occurs in ``text``.

    Compiles a zero-width lookahead so overlapping occurrences are all found.
    ``pattern`` is treated as literal text: it is escaped before compilation,
    so characters such as '.' or '(' match themselves instead of acting as
    regex syntax.

    Returns a list of ints (empty when there is no match).
    """
    regex_pattern = re.compile('(?={0})'.format(re.escape(pattern)))
    return [match.start() for match in regex_pattern.finditer(text)]
# Locate the sample oligo in the reference; the cell displays the returned list of indices.
pattern_find_three(reference, oligo)

[12, 59]

In [45]:
# Time every pattern-location candidate on the same text/pattern pair, 100,000
# calls each. One loop replaces three copy-pasted stanzas so all candidates
# are guaranteed to run with identical setup.
find_setup = (
    "from __main__ import {name};"
    "text='TGATTCCGGCGGGCGTGGAGAAGCGAGATTCATTCAAGCCGGGAGGCGTGGCGTGGCGTGGCGTGCGGATTCAAGCCGGCGGG';"
    "pattern='TTCCGG' "
)
for candidate_name in (
        "pattern_find_one",
        "pattern_find_two",
        "pattern_find_three"):
    elapsed_seconds = timeit(
        "{}(text, pattern)".format(candidate_name),
        find_setup.format(name=candidate_name),
        number=100000)
    print("{}:{}".format(candidate_name, elapsed_seconds))

pattern_find_one:4.370878521993291
pattern_find_two:1.415826846001437
pattern_find_three:1.4113261079764925
