# TinyDB in action!

### TinyDB helped make my alignment algorithm faster.

---

### An example of an alignment:

### Does the pattern "cac" occur in the following sequence?

### cactaagcacacagagaata

### yes!

### *cac* taag *cacac* agagaata

---

### Here is a fast algorithm to determine if a pattern is found in a reference.

In [1]:
def find_in(reference, pattern):
    """Report whether ``pattern`` occurs anywhere inside ``reference``.

    Returns the string "Yes" on a hit and None otherwise; no position
    information is produced.
    """
    return "Yes" if pattern in reference else None

In [2]:
# Demo inputs: a lowercase nucleotide string to search in, and the short pattern to look for.
reference = 'cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacaggct'
pattern = 'cac'

In [3]:
# Exact-match membership check; the value is shown as the cell output.
find_in(reference, pattern)

'Yes'

### Great! But this does not give us location information or allow us to find fuzzy alignments. 

In [4]:
def mismatch(string1, string2):
    """Count the positions at which ``string1`` and ``string2`` differ.

    The strings are compared pairwise with ``zip``, so only the overlapping
    prefix is inspected when the lengths differ.
    """
    return sum(1 for left, right in zip(string1, string2) if left != right)

In [13]:
def approximate_patterns(text, pattern, max_mismatches):
    """Find every place where ``pattern`` aligns to ``text`` approximately.

    Args:
        text: The sequence to scan.
        pattern: The sequence to look for.
        max_mismatches: Largest number of differing positions still
            accepted as a match (0 means exact matches only).

    Returns:
        A list of 0-based start indices, in ascending order. An empty
        pattern, or a pattern longer than the text, yields an empty list.
    """
    width = len(pattern)
    if not width:
        # An empty pattern has no meaningful alignment position.
        return []

    pattern_location = []
    # Stop at the last full-width window. Shorter tail slices would be
    # compared only over their truncated length by zip and could be
    # reported as matches although the pattern does not fit there.
    for start in range(len(text) - width + 1):
        window = text[start:start + width]
        differing = sum(a != b for a, b in zip(window, pattern))
        if differing <= max_mismatches:
            pattern_location.append(start)
    return pattern_location

In [14]:
# max_mismatches=0 restricts the search to exact occurrences; the result lists 0-based start indices.
approximate_patterns(reference, pattern, 0)

[0, 7, 9, 72, 74]

In [None]:
def approximate_patterns(text, pattern_list, max_mismatches):
    """Locate approximate occurrences of several patterns in ``text``.

    Both the text and each pattern are upper-cased and stripped of spaces
    before comparison, so matching is case-insensitive and ignores spacing.

    Args:
        text: The sequence to scan.
        pattern_list: Iterable of pattern strings to look for.
        max_mismatches: Largest number of differing positions still
            accepted as a match (0 means exact matches only).

    Returns:
        One string made of the 1-based start position of every hit, each
        followed by a comma and a tab. Hits are ordered by pattern and then
        by position. The string is empty when nothing matches.
    """
    text = text.upper().replace(" ", "")
    hits = []
    for pattern in pattern_list:
        # Normalise the pattern the same way as the text; otherwise a
        # lowercase pattern can never equal the upper-cased text.
        pattern = pattern.upper().replace(" ", "")
        width = len(pattern)
        if not width:
            continue

        # Only full-width windows are compared. A shorter tail slice would
        # be checked over its truncated length and could be a false hit.
        for start in range(len(text) - width + 1):
            window = text[start:start + width]
            mismatches = sum(a != b for a, b in zip(pattern, window))
            if mismatches <= max_mismatches:
                # +1 converts the 0-based offset to a 1-based position.
                hits.append(str(start + 1) + ',' + '\t')

    return ''.join(hits)

In [None]:
from tinydb import TinyDB, Query
import re
import json
import re

In [17]:
# Inputs for the TinyDB experiments below.
reference = 'cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacaggct'
pattern_list = ['acac','cag']  # patterns searched by the cells below
# Not used by any cell visible here; its entries carry mixed case, spaces and a '*BIOTIN*-' prefix.
pattern_list_2 = ['*BIOTIN*-ACAC', 'TGTC', 'aaa', 'act gag act ctt gc']
pattern_length_list = []  # pattern lengths whose reference chunks were already inserted into the database

In [18]:
# Open the TinyDB database stored in a JSON file at a hard-coded absolute path.
# NOTE(review): the path is specific to one machine — adjust it before running elsewhere.
db = TinyDB('/Users/ksindy/PycharmProjects/oligo_search_website/pratice_db.json')

In [19]:
# Index the reference lazily, then look each pattern up by exact equality.
# The first time a pattern length is seen, every window of that length is
# stored with its 1-based start position; later patterns of the same length
# reuse those documents.
sequence_match = Query()
for pattern in pattern_list:
    pattern_length = len(pattern)
    if pattern_length not in pattern_length_list:
        pattern_length_list.append(pattern_length)
        # Store full-length windows only. Truncated tail chunks would later
        # collide with the chunks of a shorter pattern length and show up
        # as duplicate hits. One batched insert replaces one write per chunk.
        db.insert_multiple([
            # +1 because of 0 indexing: 'index' is the actual start site.
            {'sequence': reference[i:i + pattern_length], 'index': i + 1}
            for i in range(len(reference) - pattern_length + 1)
        ])
    print(db.search(sequence_match.sequence == pattern))

[{'index': 9, 'sequence': 'acac'}, {'index': 74, 'sequence': 'acac'}]
[{'index': 12, 'sequence': 'cag'}, {'index': 69, 'sequence': 'cag'}, {'index': 77, 'sequence': 'cag'}]


In [20]:
def pattern_find_tinydb(pattern_list, reference):
    """Look up exact occurrences of each pattern via the TinyDB chunk index.

    Uses the module-level ``db`` and ``pattern_length_list``. The first time
    a pattern length is seen, every full-length window of ``reference`` is
    inserted into ``db`` with its 1-based start position. Patterns are then
    found by an equality query on the stored chunks.

    NOTE(review): the index is keyed by pattern length only, so calling this
    with a different ``reference`` after a length has been indexed reuses the
    chunks of the earlier reference.

    Args:
        pattern_list: Iterable of pattern strings to look up.
        reference: Sequence to index and search.

    Returns:
        A string with one line per pattern, in the form
        "<pattern> is found at [<1-based positions>]". The string is empty
        when ``pattern_list`` is empty.
    """
    sequence_match = Query()
    report_lines = []
    for pattern in pattern_list:
        pattern_length = len(pattern)
        if pattern_length not in pattern_length_list:
            pattern_length_list.append(pattern_length)
            # Full-length windows only: truncated tail chunks would collide
            # with the chunks of a shorter pattern length and produce
            # duplicate hits. One batched insert avoids a write per chunk.
            db.insert_multiple([
                # +1 because of 0 indexing: 'index' is the actual start site.
                {'sequence': reference[i:i + pattern_length], 'index': i + 1}
                for i in range(len(reference) - pattern_length + 1)
            ])
        # Read the positions from the matched documents directly rather
        # than scraping digits out of a JSON dump.
        locations = [
            match['index']
            for match in db.search(sequence_match.sequence == pattern)
        ]
        report_lines.append("{} is found at {}".format(pattern, locations))
    return "\n".join(report_lines)
# Run the TinyDB-backed lookup once over the demo patterns and print whatever it returns.
print(pattern_find_tinydb(pattern_list, reference))




In [23]:
def pattern_find_in(pattern_list, refernece):
    """Return the patterns that occur verbatim in the given reference.

    Args:
        pattern_list: Iterable of pattern strings to test.
        refernece: Sequence to search. The misspelled name is kept so that
            existing keyword callers keep working.

    Returns:
        A list of the patterns found as substrings, in input order.
    """
    # Search the argument that was passed in, not the module-level
    # ``reference`` variable.
    return [pattern for pattern in pattern_list if pattern in refernece]

In [24]:
from timeit import timeit

In [25]:
# Benchmark the three search strategies on the same reference and pattern list.
# Each section prints the function's result once, then the time timeit reports
# for 100000 executions of the call. The setup string imports the function
# from __main__ and re-creates reference and pattern_list for the timed statement.

# 1) TinyDB-backed lookup.
# NOTE(review): pattern_find_tinydb reads the module-level db and pattern_length_list,
# so the timing depends on what earlier cells already inserted.
print(pattern_find_tinydb(pattern_list, reference))
print ("pattern_find_tinydb:{}".format(timeit(
                                    "pattern_find_tinydb(pattern_list, reference)",
                                    "from __main__ import pattern_find_tinydb;"
                                    "reference='cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacaggct';" 
                                    "pattern_list=['acac','cag'] "
                                    , number=100000)))

# 2) Plain substring membership test.
print(pattern_find_in(pattern_list, reference))
print ("pattern_find_in:{}".format(timeit(
                                    "pattern_find_in(pattern_list, reference)",
                                    "from __main__ import pattern_find_in;"
                                    "reference='cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacaggct';" 
                                    "pattern_list=['acac','cag'] "
                                    , number=100000)))

# 3) Mismatch-tolerant scan with zero mismatches allowed.
# A list of patterns is passed, so this presumably targets the pattern_list
# version of approximate_patterns — confirm that definition was executed.
print(approximate_patterns(reference, pattern_list, 0))
print ("approximate_patterns:{}".format(timeit(
                                    "approximate_patterns(reference, pattern_list, 0)",
                                    "from __main__ import approximate_patterns;"
                                    "reference='cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacaggct';" 
                                    "pattern_list=['acac','cag'] "
                                    , number=100000)))




In [34]:
# Inspect the cache state: which pattern lengths have been indexed, and every document stored in the database.
print(pattern_length_list)
print(db.all())

[4]
[{'index': 1, 'sequence': 'cact'}, {'index': 2, 'sequence': 'acta'}, {'index': 3, 'sequence': 'ctaa'}, {'index': 4, 'sequence': 'taag'}, {'index': 5, 'sequence': 'aagc'}, {'index': 6, 'sequence': 'agca'}, {'index': 7, 'sequence': 'gcac'}, {'index': 8, 'sequence': 'caca'}, {'index': 9, 'sequence': 'acac'}, {'index': 10, 'sequence': 'caca'}, {'index': 11, 'sequence': 'acag'}, {'index': 12, 'sequence': 'caga'}, {'index': 13, 'sequence': 'agag'}, {'index': 14, 'sequence': 'gaga'}, {'index': 15, 'sequence': 'agaa'}, {'index': 16, 'sequence': 'gaat'}, {'index': 17, 'sequence': 'aata'}, {'index': 18, 'sequence': 'ataa'}, {'index': 19, 'sequence': 'taat'}, {'index': 20, 'sequence': 'aatg'}, {'index': 21, 'sequence': 'atgt'}, {'index': 22, 'sequence': 'tgtc'}, {'index': 23, 'sequence': 'gtct'}, {'index': 24, 'sequence': 'tcta'}, {'index': 25, 'sequence': 'ctag'}, {'index': 26, 'sequence': 'taga'}, {'index': 27, 'sequence': 'agaa'}, {'index': 28, 'sequence': 'gaat'}, {'index': 29, 'sequence'

In [15]:
# Reset the experiment: empty the database and forget which pattern lengths were indexed.
# NOTE(review): purge() was renamed to truncate() in TinyDB 4.x — confirm against the installed version.
db.purge()
pattern_length_list = []
print(db.all())  # the recorded cell output is []

[]


In [41]:
# def reference_dict(pattern_length, reference):
#     for i, nucleotide in enumerate(reference):
#         query = reference[i:i+pattern_length]
#         query_number = pattern_to_number(query)
#         if len(query) == pattern_length and query not in query_dict:
#             query_dict[query]=str(i+1)+','
#             #+1 because of 0 indexing. This will give the actual start site of the pattern.
#         elif len(query) == pattern_length:
#             query_dict[query]+=str(i+1)+','
            
#     db.insert(query_dict)
#     return (db.all())

In [42]:
def symbol_to_number(symbol):
    """Map one uppercase nucleotide letter to its base-4 digit.

    A, C, G and T map to 0, 1, 2 and 3. U shares T's digit: the clean-up in
    pattern_to_number lets U through, and without an entry here such input
    raised KeyError.

    Args:
        symbol: A single character, one of 'A', 'C', 'G', 'T' or 'U'.

    Returns:
        The integer digit for the symbol.

    Raises:
        KeyError: If ``symbol`` is not one of the letters above.
    """
    digit_by_symbol = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'U': 3}
    return digit_by_symbol[symbol]

def pattern_to_number(pattern):
    """Encode a nucleotide pattern as a base-4 integer.

    The pattern is upper-cased, spaces are removed, and every character
    other than A, G, C, T or U is dropped. The remaining symbols are read
    left to right as base-4 digits via symbol_to_number, so the first symbol
    is the most significant digit.

    Args:
        pattern: The pattern string. It may contain spaces, mixed case and
            non-nucleotide characters.

    Returns:
        The integer encoding, or 0 when no nucleotide symbols remain.

    Raises:
        KeyError: Propagated from symbol_to_number if it rejects a symbol
            that survived the clean-up.
    """
    # Clean the input once up front rather than at every recursion level.
    cleaned = re.sub('[^agctuAGCTU]', '', pattern.upper().replace(" ", ""))

    # Horner's scheme gives the same value as the recursive definition
    # 4 * number(prefix) + digit(last), without one stack frame per symbol.
    number = 0
    for symbol in cleaned:
        number = 4 * number + symbol_to_number(symbol)
    return number