# The Challenge:

### Find alignments. 

### Include locations and allow for fuzzy searches. 

### All this at a speed that does not piss off the user. 

---

### An example of an alignment:

### Does the pattern "cac" occur in the following sequence?

### cactaagcacacagagaata

### yes!

### *cac* taag *cacac* agagaata

---

### Here is a fast algroithm to determine if a pattern is found in a reference.

In [1]:
import datetime

In [2]:
def find_in(reference, pattern):
    start = datetime.datetime.now()
    
    if pattern in reference:
        stop = datetime.datetime.now()
        
        print(stop-start)
        return "Yes"

In [3]:
reference = 'cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacag'
pattern = 'cac'

In [4]:
find_in(reference, pattern)

0:00:00.000013


'Yes'

### Great! But this does not give us location information or allow us to perform fuzzy alignments.

### O(1) - constant 

### This function iterates through the reference using a 'window' the same size as the pattern.

In [5]:
def pattern_find(reference, pattern):
    start = datetime.datetime.now()
    pattern_location = []  # Empty list to store location of alignments.
    
    for i, base in enumerate(reference):
        query_pattern = reference[i:i+len(pattern)]  # slice "Window"
        
        if query_pattern == pattern:
            pattern_location.append(i+1) 
            
    stop = datetime.datetime.now()
    print(stop-start)
    return(pattern_location)

In [6]:
reference = 'cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacag'
pattern = 'cac'

In [7]:
pattern_find(reference, pattern)

0:00:00.000058


[1, 8, 10, 73, 75]

#### 00.000014 simple find

#### 00.000057 window find

### Looping takes 4 times longer but the location is now available. O(n) - linear 

### Now let's try it on a list.

In [8]:
pattern_list = ['acac','cag','ttat','agaa','ttatcaaatt',]

In [9]:
start = datetime.datetime.now()

for pattern in pattern_list:
    print('{} {}'.format(pattern, pattern_find(reference, pattern)))

stop = datetime.datetime.now()
print('{} final speed'.format(stop-start))

0:00:00.000094
acac [9, 74]
0:00:00.000081
cag [12, 69, 77]
0:00:00.000097
ttat [44]
0:00:00.000053
agaa [15, 27]
0:00:00.000036
ttatcaaatt [44]
0:00:00.003474 final speed


#### 00.000057 window find

#### 00.001089 window find with list

### As you would expect each item takes 4 times as long as the simple find algorithm so looking for 5 items take 19-20 times longer. Two O(n) linear or O(n^2) quadratic

### So what will this mean when we work with a large amount of ugly data....real data.

### The list: ~160 oligos (patterns) from one excel sheet (using xlrd) 

### Reference: 20,000 basepair portion of chromsome12 from human genom (UCSC DAS database).

#### http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=chr12:53960000,53980000

![title](ucsc.png)

In [39]:
pattern_list = ['CATGACCCCCCCCAAGAAGAAGCGCAAGGTGGAGGACGGAATGGACGCACAAACACGACG', 'TGGGGCCGCCGCCCACGGCGGGGGCGCCGCCGCCCAACTTGTTTGCAGCTTTCCATTGAGCTT', 'CATGACCCCCCCCAAGAAGAAGCGCAAGGTGGAGGACGGAATGGCTTCTAACTTTACTCAGTTCGTTC', 'TGGGGCCGCCGCCCACGGCGGGGGCGCCGCCGCCCAACTTGTAGATGCCGGAGTTTGCTG', 'ATGGACGCACAAACACGAC', 'ATGGCTTCTAACTTTACTCAGTTCGTT', 'TACTTGCATTTTTTCTAAACAC', 'ACTATAGGGAGACCCAAGCTGG', 'GAGGATTCTGACAGTGAAATATCAG', 'ATGTTTGGTGAGCCAAAAC', 'CATACTTACTTGGCTTGTTTGGGATAT', 'GCCTATGGCATTATTGTACGGA', 'TTACGCATAAACGATGACGTCA', 'AAGGCACAGTCGAGGC', 'GATCTGGGCCCTGAAGAAGGGCCCG', 'GATCCGGGCCCTTCTTCAGGGCCCA', 'ATGTTTGGTGAGCCAAAAC', 'CATGAGGGCCCTGAAGAAGGGCCCC', 'GATCCGGGCCCTTCTTCAGGGCCCA', 'GATCTGGGCCCTGAAGAAGGGCCCG', 'CTATCAGTGATAGAGAACGTATAAG', 'CAGAGGAGGGAAGAGAG', 'CTTCTTATCATCTCCATCTTTATGATG', 'CGTTCAGTGTCAGAAAATG', 'CATCTTGAGACACATGGG', 'GTGACATAATTGGACAAACTACC', 'GGGAGCTTGTATATCCATTTTCGGATCTGATCAGCACGTGATGACCGAGTACAAGCCCACG', 'CATAGAAGGCGGCGGTGGAATCGAAATCTCGTAGCACGTGTCAGGCACCGGGCTTGCGGG', 'TGCCTCTGAGCTATTCC', 'GCGCTGGAGGATCATC', 'AGCGGTTCCCGG', 'CCGCGCTGGAGGATCATCCA', 'GGGTCTGGGCAGCGCCGTCG', 'GGCGAAGAACTCCAGCATGAG', 'CATGGCGATGCCTGCTTGCCGA', 'GCAAGGAACGCCCGTCGTGGC', 'TCTCCGGGCCTTTCGACCTGCAGCCAATATGGGATCGGCCATGACCGAGTACAAGCCCAC', 'CAGTCGAGGCTGATCAGCGAGCTCTAGAGAATTGATCCCCTCAGGCACCGGGCTTGCGGG', 'TCCTCTTCCTCATCTCCGGGCCTTTCGACCTGCAGCCAATATGACCGAGTACAAGCCCAC', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTACGTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCTATTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTGATTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTTGCTTTACGCAGACTATCTTTCT', 'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'CGTGCCTTTTCCCGCGAGGTTG', 'GCCTGCTGGGGAGCCTGGGGAC', 'CCTCTAGAAATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGAGCGGCCGCGGCAAAGG', 'CAACTCAGCTTCCTTTCGGGCTTTGTTAGCAGCCGGATCCTTAACCGCCAAAACCATACAGG', 'CTCTAGAAATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGGGCCATCACCATCACCATCACC', 'GTCTGTGTAGAAGACCACACACG', 'GCTCAAGCAGAGGCGGCCTCGGCC', 'GCTTCAAGTGGGAGCGCGTGATG', 'GGGGATCAATTCTCTAGAGCTCGC', 'ATTGGCTGCAGGTCGAAAGGC', 'CCGCTCTTCCGATCTGTTATGAAGG', 'GCAGGAAACGAAGATAAATCATGTCG', 'CGTAACAACTCCGCCCCATTGACG', 'CGCGGTCTCGGCATTCCTGCTG', 'AGGCCCGGCATTCTGCACGC', 'CCATTCTCCGCCCCATGGCTGAC', 'CTTGTCTGTAAGCGGATGCCG', 'GATCAGTTGGGTGCACGAGTGG', 'CCGCTGTTGAGATCCAGTTCG', 'CCTCCTCACTACTTCTGGAATAGC', 'AGGCCGAGGCCGCCTCTGC', 'CCTGACGGGCTTGTCTGCTCC', 'CCAAAATGTCGTAACAACTCCGCCC', 'GCATATTTGAGAAGATGCGGCCAGC', 'CAGCGGCCAATAGCAGCTTTGC', 'CACAAGTGGCCTCTGGCCTCGCACACATTCCACATCCAACGCGTGGGTTGCGCCTTTTCCAAGGC', 'CGAGGCTGATCAGCGAGCTCTAGAGAATTGATCCCCCGTCGACGTCAGGCACCGGGCTTGCGGG', 'CGTCGACGGGGGATCAATTCTCTAGAGCTCGCTGATCAGC', 'GGTTCCTGGCCTTTTGCTGG', 'GTGAAATACCGCACAGAGCAAAAGG', 'TGTTCTGCAGCGTGTCGAGC', 'TTACAGCGTGATGGAGCAGATGAAG', 'ACGCGTTGGATGTGGAATGTGTGCGAGGCC', 'ACTGCCCGCTTTCCAGTCG', 'CGACGATATGATCCTGATGCAGCTAG', 'TTTACGCAGACTATCTTTCT', 'CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'CAAGAATGCATGCGTCAATTTTACGCAGACTATCTTTCTAG', 'GGTGCCTGACGTCGACGGGGGATCAATTCTCTAGAGCTCGCTGATC', 'CAACCCACGCGTTGGATGTGGAATGTGTGCGA', 'GATCCGTTGTAAAACGACGGCCAGTCA', 'TATGACTGGCCGTCGTTTTACAAC', 'GGAGGACGGGCAGACTCGC', 'CAACCCACGCGTTGGATGTGGAATGTGTGCGAGGCCAGAG', 'GAGTTGGTAGCTCTTGATCCGGC', 'GCAACTGCCCGGCTACTACTAC', 'CGTTGGCCGATTCATTAATGCAGC', 'CCGCACCGCTGTCATTAATCTGC', 'CCACTTGTGTAGCGCCAAGTG', 'GGATAATACCGCGCCACATAGC', 'GATCGGGCCCTGAAGAAGGGCCCGGGCCCTGAAGAAGGGCCCGGGCCCTGAAGAAGGGCCCCGG', 'GATCCCGGGGCCCTTCTTCAGGGCCCGGGCCCTTCTTCAGGGCCCGGGCCCTTCTTCAGGGCCC', 'GATCCCCGGGCCCTGAAGAAGGGCCCGGGCCCTGAAGAAGGGCCCGGGCCCTGAAGAAGGGCCCGGA', 'GATCTCCGGGCCCTTCTTCAGGGCCCGGGCCCTTCTTCAGGGCCCGGGCCCTTCTTCAGGGCCCGGG', 'CCACTCCTCCACCTTTGAC', 'ACCCTGTTGCTGTAGCCA', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTTGCAAGAATGCATGCGTCAATTTTACGCAGACTATCTTTCTAG', 'ACCATTCCCAATGCCTGAA', 'TGCATACCTACCCAATGTATGG', 'GGCTTAGACCCTCAGGT', 'GCTCCCTCTCTCCACTC', 'GGACTCATGACCACAGTCCATGC', 'GGAAGGCCATGCCAGTGAG', 'CCAAATTCGTTGTCATACCAGG', 'ACACCCAAGCTCGTTGGG', 'CCTTGCCTGCATTTCTCTGC', 'CAGAAAGGTCCTGCTCCGC', 'CCCCTCCTTCCTCTCGCC', 'GTCCCTAATATCCCGGAGGT', 'GCAGGCTTCTAAATCCGTTC', 'GATCGGAAGAGCGGTTCAGCAGGAATGCCG', 'AGAAAGATAGTCTGCGTAAA', 'CTAGAAAGATAGTCTGCGTAAAATTGACGCATGCATTCTTG', 'GTTGACATTGCGAAGAGCGACAAAG', 'GATCTGTTGTAAAACGACGGCCAGTC', 'TTAAGACTGGCCGTCGTTTTACAACA', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTACTTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCAGTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTGTCTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTTGATTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTACGTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCTATTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTGATTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTTGCTTTACGCAGACTATCTTTCT', 'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'CGGATTTCCTTGAAGAGAGTGAG', 'CGGATCTGGAAGTTCTGTTCC', 'CGCTGTGCAGAAGCAGAGAGG', 'GTTACCAGGTCCGCGCTCTC', 'ACGGATTCGCGCTATTTAGA', 'GTGCTTGTCAATGCGGTAAG', 'CACGCGGTCGTTATAGTTCA', 'GACGCATGATTATCTTTTACGTG', 'CGCAATTAATGTGAGTTAGC', 'GGTGTAAACCTTAAACTGCC', 'CAGGCAGACATCTGTGAATCG', 'GTTCTGCCCAAGGGTTGGTTTG', 'TTAATCTAGCTGCATCAGGATCATATCGTCGGGTC', 'GGTAGAAAAAGCAACCACGAAGC', 'ACATAAACCTCTGTCTGTGAGTGCC', 'GGCAGCACAGAGCAACTCTA', 'GAGTGCAAAGTCCCGTTTG', 'AGCCTTTGGAAGCTCTTGAA', 'GTGTCTTGGAGAGGCGTGTA', 'AGAAGAGTTAGTTGACTATACAGC', 'ATGTTTGAATGTGATAACCGTCCT', 'AATTACCGATCCAATGCGAAGCTTTAAGAC', 'AATTGTCTTAAAGCTTCGCATTGGATCGGT', 'AATTACGTAAGCTTAATGCCGATCCAAGAC', 'AATTGTCTTGGATCGGCATTAAGCTTACGT', 'TTGAGAAGAGTTAGTTGACTATACAGC', 'CAAGCAGAAGACGGCATACGAGAT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTATCACGTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCGATGTTTTACGCAGACTATCTTTCT', 'ACAGTAGCTGTATATAAAACCAGTGATGTTTGAATGTGATAACCGTCCT', 'TGCTGTATAAAAAACCAGTGGTTATATGTACAGTAGCTGTATATAAAACCAGTGGTTATATGTACAGTAGCTGTATATAAAACCAGTGATGTTTGAATGTGATAACCGTCCT', 'ATGTTTGGTGAGCCAAAAC', 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG', 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'CAAGCAGAAGACGGCATACGAGATATCACGGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'CAAGCAGAAGACGGCATACGAGATCGATGTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'AGCCATCAAGGAGGCTGTAA', 'CCAGGAAGTCCGTAGAGACG', 'GAAGTTCAGGCATTGGGAAT', 'CAAACGGGACTTTGCACTCT', 'GTGAACCGTCAGATCGCCTGG', 'TGTCCAATTATGTCACACCA', 'CCATTCGCCATTCAGGCTGC']

In [40]:
reference = 'CATGACCCCCCCCAAGAAGAAGCGCAAGGTGGAGGACGGAATGCACGCACAAACACGACG aaactatcagtcctgagaagactgttgtcaattaatgccttgtataactg gattgctcaggagaagggagagtaaggcagtgaaagggaaggggcaatag gtttgaggaagacattattggtttcttaacccaacccccatttttcaacc attttctcttgcttgcctcactataaaggctgaaaagcaagatattcctt tttcctatgcagctggtattaaacctgtcacacagttcaggccaataaga caatgaaatccaccgtggtgggggtggaggatggcacttctgatgtaagg cttttgtctccttataaaagggaaagccagaacaagagagttcctgatgc tacctgccccttatttctttcccatcctctgatagtgaggcaacaggaat gaggattaaaaggtaaaagctaaaaatggtacagcagagagaaagaacct aggtccttgataaacttgcttagctgctgaaacagtcctaattctggaca ccttgtgttataaatgttgacacagtttttttttgtttttgtttttgctt tgttttgagatgcagtcttgctctcttgcccaggctgcagtgcaatggca tgatcttggctcactgcaacctctgcctcccgggttcaagcaattctcct gcctcagcctcctgagtagctgggattacagacacacgccaccatgccca gctaatttttgtatttttagtacagatggggtttcaccatcttggtcagg ctagtctcgaactcctgaccttgtgatccgcccgcctctgcctccctaag tgctgggattacaggcgtgagccattgcgcccagcctgtgttcacacagt tttaagctactgttaggtatttgcatctgaaaccattcctaaccaatata cgattttcaatatgtatagattctacttcattttctatctaaggaactta gaaatggaatatacaagaaattattttattctacatcttcatgaatattt atcagttgtatcctaatgctgaattctgttttctttttcacataatatac caaataaattccatctcatcaccacctattgcctcaagcttccttagaaa caatggctatacactgggcttttaggaagaacaagatggtaagagagtgg caccttacaatgctgctttggaaactggggcagttaaagatcactcctca ctaaagaatggagttttagactggagcagcatgttttcttatttctcgct gcccctcttcctcaatacccagtgtttggctttttatcaaactccagggc actcttgggaactacgcttaagaattaacaaggatagatccattatcttt ccttagatggcagctacttacttggatgggttaaatgctcaacaaacttt aaaggaccagtgcctaggaggcagagaaatcccagcagcaattcagccag gcctggtcctatgtcagttacatggctaattgctttctccacagctcttg cctaatggtagaccttttagtattctcctatccagatgtagccacttagc tggggtaccattaaagtccccaaagggataactgtagttcttaacaaagc aagaaaaggttattttaaaatcaaaataattatcacacagccctattaag ttcttgttgacattatacatttctatttttcaatctcatctctctcccca aattcttattaggatgttatgaggtacaaaatccttgaaaaacgtatcaa atacttttatcttctaatttgctacacattaagtgttacaatttctggat ctcacaaatgggtgggaaggacaaagttacagagtgaaaagtatgaacct actacattgtggtttggcttgagctgctgaaaatttcactgtggaaaaac tatactgtaaatacttcctgataatacaggagttttaaaattttgaatgc ttacaatgcattcagagtagagcacatacaagtcagtactgagaccacaa taacaaaaaccctgccctgatctttaaggccttatgcagtttagtactgt aacagtcaagccaaaagaaaatatataactggaaggtgaggagggggaga gaaccaatgagcgtgctcacatgcactcgtctctaaaaccgtctctaaac ttggttcaagtttacaaggagattacgcacacacactctctctctctctt tctctctctctccttccctcactttaaaataataccaaaatacagtatta agcttttgttttaaacaagctatcaaagcttacaggcccagggactcgga aaccacagaagaggttgcagcctggtaaagaaataggcaacgataaaggc tcaaatagtttggcaggtcttctatataaaccataaactggtcaagtgct ctaattctgccctgctatcttaatttaatgcagcaaaaaattcttaccat aatttgttgacatattcatctgattatctttgtgtctaaccaggaaagct ctgataagctatctttagttttacaaagtcaggcaaatggaaaacttcaa ttttggatgtttactttctaaccataagtgatattttttagaatgctggc cacagactagaggtctacactcaaagaaggcaggcaggcaggtaggggca aagaatgaaattgcagctaagcactgtggctcatgctggtaatcccagct acttgggaggctgaggcaggaggatcacctgaggaatttgagacaagcct gggcaatatatattgaaatcctgtctcttaaaaaatttaaagagtagcca ggcatggtggcacatgcctgtagtcccagctacttgggaggctgagaaag agagagagagaaaggaggttcccttgagcccaggtgtttgaagctgcagt aagctatgattgtgccactgcactccggcctgggcgacagaggagggctc catgtctcttaaaaaaaataaaaaataaaaaaggaatctacactgcctat cagtaacacttatccacatacttctagacagtcatagtgatccaacacag aagaacagcagcagctggggaaattgtcttgttcctgtctcatctcctgc tgctctccatcacagtactaaactacttgcattttttctaaacacatcat gctgtttcattctcttgcatctgctattctcttatatgtggaatgttctt ctctactttctctgggaaacttctatttacctttcaaaacctgttaaagc atcatttcctccaggaagccttctctgatgtttaccagaattctcagtac ttgcaaatatttacattattatatttatcatattagaatgtaattacatt ttcatgtgtctgactcccctataaggcagtgaatactttcatgtcttata catctctctgtcactagtacctagcatggtgtttggcacattgtagatgt tcaaatttttttttaatttttttttttgagatggagtctcgctctgtcgc ccaggctggagtgcagtggtgcaatctcagcttactgcaacctacctccg cctccctcgttcaaaggattctcctgcctcagcctcctcagtagctagga ccacaagcatgtgccaccatgcccggctaacttttgtatttttggtagag acggggtttcaccatattggccaggctggtctggaactcctgaccttgtg atctgcctgccttggcctcccaaagtgcagggattacaggtgtgagccac cgcgcctggccattcaaatattttttattgagctgaacccacagaggctt atgaaggtctgattgaattgagaaaattattcatggactgatttctggaa acctgacctcaggcttcctcaaggttagagataccaggcagatacataaa gaataaaatcaaatgccttgatcttatttttacaaaggatttgtgtgcac taagtctctcaccagagacatttgggaaaacccatgttcaatttttgatt acggtctaaaatgataacaaattgttccaattatatagaacctgccactc cattatatcataatgtagacaaaccaggaagtgctaatgggatcctatta accttcatatcacattacaacagcctaggggaagagaggagagaaaggaa aatacaaaggccattaaaaatactgatgggtgtcaggcaggggctaagag gttggaacttccagtcctcacctctaggtaacaactcactacagtatagc aaatgcagatttcaaccatagtatagtaggccggtaatgcctccagtata gacctgtcctgccaaaatacaacataccaggtggcttaaacagaacttta ttatctctcagttctggaggccagaggtgttaccagggttggtttcttct gaaggatgtcagcaagaatgtgtttcatgtctctcctagcttctggcttg ctggcagtctttggcattccttgacttgtggcagcataaatccaaccttc acatagtgttctcctgtgtgcgtgtgtgtgtgtgtgtgtgtgtgtgtgtg tgtgtgtgtgtatgtccaatttcctctttttataggacaccagtcatatt ggagtaggggcccaccttacttcagtatgacctcatattaactaatgcca tctgcaatgaccctgtttccaaataaggtcacattttaaggtacctgggg ttaagacttcaccataaaaattcaattcaatccataacagctcacaaaga tatatcaatgttctaattcttagaagctgtgaatgttaccttatatggtg attaaattaagggctttgagatgaggggattatccagatgggccctcaat gctatcacaagcgtccttctaagacagcagcaaagggagattagaaacac acagagaggagagggcaatgtgaagaaagacatagagattgaagtgatgt aaccacaagccaagaaatgccggcagcaactagaagctggaagaggcaag gaaaggattctcctggagagccttctcaggaggatacggtcctctttaat atcttggtgggcttttggtctcccaaactgtgagataataaatttctttt ctttcaagccacctagtttgtggtaatttgttataacgcctacaggaaac taatatacatattatcttttcataatttaaacataattatctccattcta agagcagcagaggagcaggcagataacagacaataggtgacaaactccaa gcaattatacccataaccacaaaccactgtaaccagcatttttttaaaag ggaaagccaggcacagtggtgtgcacctgtagtcccagctactcagaaga ctgagaagggaggatcatttgagcccacaagtttgagtccagcctgggca acacagtgagactccatctctttaaaaaaaaaaaaaaagggagcagggag acaaatccgcactgaaatatcaccaccacaacaaaagaaactacagagta catcacactagataagggtgaatactattacatggctctctaatttctat tatataaaaaatgtgttttggttgggcacggtggctcacgcctgtaatcc cagcactttgggaggctgaggcagccagatcacctgaagtcaggagttaa aaaccagcctggccaacatggcgaaacccggtctctactaaacatacaaa aattagctgggtgtggtggcgcacacctgtaatcacagacactcggcagg ccgaggcaggagaatcgcttgaacctgagagacggaggttgcagtgagcc gagacagtgccactgcactccagcctgggatacagagccagactccatct caaaaaaaaaaaagaaaaagtgttttaagttgcaatgtaaaatgtatttc ttactcgattcataaaccaaaaggactgagacttctggtttttggcttgg catataaagagcttagaataacagttgtcactctatcctaacaacaaata aaaagctgaactgaaaaatcttaacaactcttcatagatctatcagagaa gtgagttcacagggcaaactgctgttccccaaattggcagatatagagaa tcacaacctaaaggagcagaaacccatgagctgaaacctgcatgacaatc aagtgccaggggaggaaaacctgaactataattgatgaagctagaggcca agtgtggacaagtctgagagttaaaaatgccagggagacccagtcacaga ggagcccccacaccgttgtgagatttacctccaggttctcccagctctac taggttcttacggtgaatatcaggggaaatttcttgtgcttttggcgggg ggaaagggaaaagtacagtagcctccccttacccgtggtttcactttcca tggtttcagttacccatagtcaactgcagttcaaaaatagatgagtactg gtcgagtaaggtggctcacacctataattccagcattttgggaggctgac gtgggaggaccgcttgagcccaggagttcgagactagcctgggtgataga gtgagaccctgcctctatttaattttttttttttttttgagagagaatct cactctgtcacccaggctggagtgcagtggcacaatctcagctcactgca acctccgcctcctgggttcaagcaattcttgtgcctcagcctcctgagta gctgggattaaaggtgtgcaccaccacacccagctaattttgtgttttta gtagagacggggtttcaccatgttggccaggctggtctcgaactcttggc ctcaagtgatctgcccgcctcggcctcccaaagtgttgagattacaggcg tgagccactgtacccggccaaaaaaaaaaaaaaaagatgagtacagtaca ataagatattttgagagagagaacacatttatataacttttattacagta tatttttataactgttctattttattattagttgttaatctcttactgtg ctgaatttataaattaaactttttcataggtatgtacatatagggaaaca tagtatggtcatccctccatatccatgggggactggtttcaagacccatg aggataccaaaatccacagatactcaagtccctgatataaaatggtgtag tatttgcatataaaatgtgcatatcctcctatatactttaaatcatctct agattattataatacctaatacaatgcctacatatcacttcattcgtgtg gattcaatgtagtgcttggcaggcagcaaatttaagttttgcctttttga attctgcggaattttttttttctgagtattttttacctatgattgtttta atccatggatgagaaacccatggatacagagggctgactgtatatagggt ttggtactatctggtttcatgcctccactgggaatcttggaatgtatcct tgtgaataaggggggactactggaaccattttgaaataccccagagcatt ctgttcttctttacaaggcctgccctcaggagaaaatactttaccagagc ctaacctgctgcggttttatcatagactaactaacctgggagaagggtaa tacccaattctacccccgtctagccatcctgtcccatccaagtggggaaa agaaaacaaaactgagaagcactggtgaagtttatagtccagggcatagg ctcaccaaaatattgagacctaatcatacgacagtagaacacttccagtt cccctccctgtcatacctcatcacaactttactagggacgtatttaccaa ttccttttacccagtatatcatgtccacctttcaacagaaaattgtaagt catattaaaaggcaaaaatacactctgaagaaactgaataagcaccagaa ccagtgagatatgacagaactgtcagaattaacagtccaggaattccaaa acactatgattaacacaacaacatgtaagaacagatggataatataagaa gagagatgaaaattctaagaaagaataaaaaagagaaacttttaggctcc tgaacacatggaggtgcaggagtgtgacaggcccaggaagggcatggaag cattgcaccccttcccaccaatgcatctccttatctggttttcatccgta tccttcataatatcctttataataaactggtaaacatcagctttccacag gctttctttctccatggtgagcaggacaatgtcctaggaccccagagagc caggagagagcaagatcaaagccagaaatccaactctgtccctactgccc actggtcttttccagtcaacacttcctcagcaaatacatgccacatagcc gtgctgctcagaggtctcctggcttatatgcagaaaaggccctgtggccc aggatcacagtagcaacaatattctcatcaaagcagccagagtgatacag cagaaggtcaagagacagaaaaaagcactaaatcctggtttataagagac aggcagacttcaagggttgtctcctaagacaataagtaagctcaagggga agcttataaccagtaagcttcagtggtgaaaaaagagcctagccaggccc aaatgggaaatcctaaagaaatagacatatattaaaagaaatagtaaatt caagatggggagcattcaagtgtgcagagggtgagcaagactttagtcag aaaataaagttactctttcttgccattgagcttaagccaaaaaaaaaaaa aaattaaagaaaagaaagctacttggctgggcgtggtggctcatgcctgt aatcccagcactttgggaggccgaggcgggtggataacctgaggtcagga gttcaacaccagcctgaccaacatggtgaaaccccatctctactaaaaat acaaaaactaggctgggtgtggtggctcatgcctgtaatcccagcacttt gggaggccgaggcaggcggatcacctgaggttgagagttcgagaccagcc tgaccaacatggagaaaccctgtctctactaaaaatacaaaattagtagg gtgtggtggtgcacgcctgtaatcccagctgctggggaggctgaggcagg agaatcgcttgaacccaggaggcggaggctgcggtgagccgagattgtgc cattgcactccagcctgggcaacaagagcaaaactctgtcttaaaaaaaa aaaaaaaaaaaattagctggtttggtggtacacgcctgtaatcccagcta ctcaggaggccaaggcagaagaatcacttgaacctgtgaggtggaggttg cagtgagccaagatggcaccactgcactccagcctgggcaacagcgagat gcagtctccaaaaaaaaaaaaaaaatctactcatacacaaaaaagcacac tccaagcattagtttttcatatgcagggagtgatgagtcagcattactct tgcaccagaatatacatgcaggagggaagtcctatgtgtgcaataagtga gggcggggctttagaaataagtcacacttcacataccagaggacacactc agggaagaaggctttcttgtgaaaggagtgtgggtaagacttcttataga aggcaattctcactgcatatcagaaaacccactcagggaagaagtctttt gtatgcaaggagtgtaggtaagactttacccaaaagtcaactgtgcatga gagaacacacttggcagagaagccttatagatgccaggagttgtggacaa aggattaggtataaatcatcctatgataaacactggaaggcacactcagg ggagaaggctttgttgtgcagggagtatgggcaagacttcagcttgaaga catctcttaccagacaccagaggacacacccagggaagtaactctgcttc tttaaaaagctgtagccaagactgtgcaacttgtttatggatatatcacg aaggcagaagggatccagagggctaaaagaacctgaacttggcccaagtt attcccagagattctgagaaaagaggcactgagagcctcaccaacctcct tagacggcacagcagtctaggacccctccaccaaaccttccctccctctg ttcttccctctggcagctctcccaaccttccttagtgctatggtttgaac aatgttgtcccttgcaaaattcatgttgaaacctaatcaccaatgtaaca ttattaagagaatgtttaggatgtggttaagtcatgaggatggagccttc atgaatggcattaggtacccttataaaaggatttggcaaaagatttggca tcactttaccctgtcctccaactgccatgtgaggacacagagctcctctc ctctggaggatgctatgtcaagataccatgttggaagcagagagcagccc ttgtgagataaccaaatctgctggcaccttgatcttgaacttcctagctt ccacaactgtgagaaaataaatttctgttctttataaaacaaacaaacaa acaaacaaacaagaaacgctacagataaaaaacactgtaacagaaatgaa gaatcccgaccaggcacggtggctcatgactgtaaccccggcattttggg aggctgaggtgggtgaatcacttgaggtcaggagttaaagaccagcctgg ccaacatggtgaaacccccgtctctactaaaaatacaaaagttagtcggg catggttgcgcaagactgtagtcccaactactcaggaggctgaggcatga gaattgcttaaacccaggaggtggaggttgcagggacctgagatcgtgcc actgcaatccagcctgggtgaccaagagagaatctgtctcaaaaaacaaa aaagaaatgaagaatgcctatgatgggctcagtagtagactggacatagc tgagcaaagaatctttgaacttgagctgggcatggttgcatgtgtctgta gttccggctacttgggaggctgaaattggaggatgacttgagcccacgag tttgagaccagcctaggcaacatagtgacatagtaagacactgtctctaa aaacaaagaatctctgaacttgaggatatgacaatagaaacttccaaaac tggaaagctaagaaaagaagaactaaaaaaaaaataacaacaacaaaata tccaagaactgtgggacaaatacaaaaggtataatatgtgcataatagga gaaagaaagaaacacacacggccgggcgcggtggctcattcctgtaatcc cagcactttgggaggccgaggcgggtggatcacgaggtcaggagatcgag accatcttggctaacacagtgaaaccccgtctctactaaaaatagaaaaa attagctgggcgcagtggcgggcgcctgtaatcccagctactcgggaggc tgaggcagaagaatggcgtgaacccgggaggcggagcttgcagtgagccg agatagcgccactgcagtccggcctgggagaaagagcgagactccgtctc tgaaaaaaaaaaaaaagaaagaaagaaagaaagaaagaaacacacacaat atttggtattgctaaacaataactaaagcgataactaaaaatgtcccccc aaatcaatgtcagacaccacaccataagaacaataagcaggctaaatgac aaacaaacaaacaaacaaaaaactacacctagatatatcatatccaaact tcagaaaatcaaagagaaagagaaacatcttgaaagaaggcagagggaaa atcaccttacctacagaggagcaaaaacaagaattacatctaacctctca gaaaccatgcaagcagagagtacagtcaaagagagaaaaccctatcagcc tagaattctgtatcctgcacaattcttcagaagaaaggccacatgtggtg tggctcacgtctataatcccagcacggcagatggattgcttgtgtccagg agtttgagactagcctgggcaacacagtgagatctcatctcaatgaaaaa taaacaaaattagccaagtgtcctggcgtgtacctatagtcctagctatt tgggaggctgaggtgggaggatcgcttaagcccaggaggttgaggtgagc tgagatcacaccactgcactctagcctgggcaacacagtgagatcttctc tcaaaaaaaaaaaaaaaaaaaagtgaaggaaaaataaagactttctcaag caaaccaaaatacggagaaattgttgcccacagacttgccttgccagaaa tcttaaaagaagttctttagagagaaggaaaatgagctaggcacggtggc tcatgcctgtaatcccaacactctgggaggctgaggcgggaggactgctt tagcccgggagtttgagatcagcctgggcaatatcagcagaccctatttc taacaaaaataaaaataaattagctgggcatggtggtgcttatctgctgt tccagctactccagtggctgagatgggaggattgctggagccagggatgt tgaggctgtagtgagctacaaccttgccactgcaccccagcctacgtgac acagtgagatgctgtctcaaaaaaaaaaaaaaaaaaaatatctgggattc gaatcagccctctccagctccaagcacactgggatccttgccgttggagg ttgctgttggcttctgcccgtgctgtgtattgagatctgcaaccaagcat gggagcttacgaaggcacctggtggtttctaggtaggaaaacagtgtgga tgccagatgtggtgcctcacacacaacagcactttgggagcctgaggcag gcagatcacctgaggtcaggaattcgagaccagcctggccaacagagaga aaccctgtctctactaaaaaaatacaaaaaaaattagtcaggcatggtgg tacctgtagtccctgctacttgggaggctgaggcacaagaatcacttgaa cctgggaggcagaggttacagtgagctgagatggtgccgctgcactccag cctgggagacagagcaagactccatcttaaaaaaaagggccgggtgcagt ggctcacgcctgcaatcccagcactttgggaggccaaggcgggcagttca caaggtcaggagttcaagaccagcctggccaaaatagtgaaatgccatca aaatagtgaaatgccatctctactaaaaatacaaaaattagccgggcatg gtggtatacacctgtaatcccagctactcaggaggctgaagcaggagaat tgcttgaacccaggaggcagaggctgctgtgagctgagatcgtgccactg cactccaccctgggagacagtgcgaaactccgtctcaaaaaaaaaaaaaa aaaagcgtggagtgtgcgtaattgtgtttgaggaagagatggtcactgtc tctatggtagtgtctttagggaactatctgtgcctggatcctggggaatt atgacaaagcagtctgcaggtttggcatgaactctgaactttctccctct gcaggatcttcaatagaaaggtcctgtcctgaggaaacactcagaggcac ttctcttgcttgtgaagcctgccagcatgtatgttttggtcagcactgag atgcacatgacaaattgtgcgcaatgcctcagcacagcccaacggggttt tttctgtttacacttgactagccttggacagtaggaggaaaccttaaata tttccttctactctcacaaaccctttatctttttttttgagacggagtct cactctgtcggccaggctggagtggcacgatctcagttcactgccacctc cacctcctaggttcaagcgatcctcctgcctcagcctcttgagtagttga gactacaagcgcacactgccacacctggctaatttttgtatttttagtag agacgaggtttcaccatgttggccaagctggtctcaaactcctgacctca ggtgatccacctgccttggcctcccaaagtgctggaattacaggcgtgag ctaccacacccggcctcacaaaccttttatctaataaaacaaaattactg ccaaaaaaaaaaaagagacaatgtcagagtcaatcaaaaaacaagaccta actgtatgttgtatatgagaaaacccctttaatacaaataaacatataga ttaaaagtaaaaagataggccgggtgcggtggctcacgcctgtaatccca gcactttgggaggccaaggcgggtagatcacaaggtcaggagatcgagac catcctggctaacacagtgaaaccccatctctactaaaaatacaaaaaat tagctgggcgtggtggcgggtgcctgtagtcccagctactcagaaggctg aggcaggagaatggtgtgaacccgggaggtggagcttgcagtgagcggag atcacgccactgaactctagcctgggtgacagagcgagactccatctcaa aaaaaaaaaaaaaaaaaaagtaaaaagataaagatataccctaatccaaa aaaggtgagaatagctatattaattttagaaagaaagatatcaggaataa agacgagcattgcataatgataaaatggtcaatactccaagatgacatac aaattcttaatgcaggccaggcatggtggctcacacctgtaaccccagca ctttgggaggccaaggtgggtggatcacttgaggtcaggagttcgagacc agcctggtcaacatggtgaaaccctgtctctactaaaaatacaaaaatta gccaggcatggtagcatgcacctgtaatcccagctactcaggaggctgag gcaggagaaccatttgaacctgggaggtagagcttgcagtgagccaagat tgtgccactgcactccagcctgggcgacagagcaagattccatctcaaaa aaaaaaataaataaaaaataaaaataaaaataattaatgtgtatgtagct aacaacagagcatcaaaatatgtgaaggaaaaactgacagaactgcaaag agacagatagatgaacctgctcttatattaacagttggagactttaacac ccctctatcagaaatggacgtatccagcaggcagaaaatcagtaaggaca cagttgaactcaacagcaccatcaatcaactggacataattaacatctat agattaattcatccaacaacagattatacattcttcttagctcacatgga acatttaccaagactgaccacattctgggccatataacacaccttaacaa gtttaaaagaaatcacacaatgtctgccctcaaactgcaatggaattaaa gtagaaatcagtgacagaaagatagctggaaaatatgaaaggacttagat taaataacatacttctaaataatacatgaagcaaaaaagaaatctccaga gaaattaaaaagtattttgaactacatgaaaatgcaacttataaaaattt gtaggatgcagcagaaacagtgattagtgggaaactgatagcagtgaata aacatactagaaaagaaaaagatctaaaatcaataatctaagcttccacc ttagaaaactagggggaaaaagagcaaattaaatccaaagcaagcagaag aaaagaaataataaaaattagtgtaggtcaggggtggtgccccatgccta taatcccggcactttgggaggctaaagcaggaggattgcttgaagccagg agtttgagaccagcctggtcaacactgcaagaccatgtctctataaaaaa taaaaaaattagccgggtatggtggtgctcaggaggctgaggcgggaaaa tcatttgaactgaggagtcggaggctatagtgagctaagatcacatcact gcactctagcctgggcaacaaagtgacaccctgtcaaatgaatgaatgaa caaaaaaagagtggccaacatggtgaaaccccatctctactaaatacaaa aagaaataaaaaaattagctgagcatggtggcgcgcgcctgtaatcccag ctactcaggaggctgaggcaggagaatctcttgaacctgggaggtgtagg ttgcagtgagccgagatcatgccactacactccagcctggcgacatagca agactctgtctaaaaaaaaaaaaaaaaaacctactataggcctggcatag tgcctcatgcctgtaatcccagcactatgggaggccaaagtgggaggatt gcttgagactaggagtttgagaccagcctggggaacataacgtgaccctg tttctaccaacaccccccgccccaaaaagaaaaaaaacctactagaaagc tacagtagcaaacatagtgtggtactagaaaaagaatagacacataaatt actatatatattttatttatttatttattttgagacggagtctcgttctg tcgcccaggctggagtgcagtggcgagaccttggttcactgcaagctccg cctcctgggttgacgccattctcctgcctcagcctccggagtagctggga ctacaagtccccgccaccacgcccggctaattttttgtattagccgggat tagtagagacgggatttcaccatgttggccaggatggtctcatctcctga cctcgtgatccgcccacctcggcctcccaaagtgctgggattacaggcgt gagccaccgcacctggccatcaattactatatttatgtgaataaacagag ggtccagaaatagacccacataaatatagtcactgatctttgacaaagaa gcaaaggcaatacaatgaagcaaagataatcttttcaacaaatggtgctg gaacaactggacatccacatgcaaaaaaatgaatctagacacatgcttta tagccttcacaaaacttaaaatggatcacagacctaaatgtaaaatacaa aactataaaatttctagaagataatataggggaacacctacaccaggggc cacacagcagggggtgagcagcaggcaagtgagtgaagcttcatctgtat ttacagccactccccatcacttgcattacagccagaactctgccttctgt cagatcagtcgcagcattagattctcaaaggagtcccaaccctactgtga actgtgtatgtgagggatctaggttgtatgccccttatgagaatctaatg cctgatgatctgtcactatctcccatcacccccagatgggaccatctagt tgcaggaaaacaagctcagggctcccactgattctacattatggtgagtt gtataattatttcactattatattataatttaataatactataaataaag tacacaataaatgtaatttgcttgaatcccccccacccccagtccatgaa aaaaattcttccaggaaaccagtccctggtgtcaaaaaggtgggggacca ctgacctagatgacttttgtatggcgatgactttttaaatacaacaccaa agtcatgatccatgaactaagtaattgataagctggactgtgctaaaatt agaaatatctgctctgtgaaagacaatgtcaagaaaatgtaaagacaagc tacagaatgggagagaatgtaaaagatacacctgataaaggactgtcata caaaatacacaaagaactcttaaaactgaacagtaagaaaatgaacaacc tgattaaaaaatgggcaacggactttagtcagacacttcaccaaagaaaa atacagatggcaaataagcatgctcaacatcatgccatcagggaaataca aatacagataattgttattagtattattttttgagacagactctcactct gtcacccaggctggagtacagtggcgtgatcttggctcactgcaacctcc gcctcccaagttcaagcgattctcctggcttcagcctcctgagtagctgg gactacaggcacttgccaccacgcccagctaatttttgtatttttaggag agacgggggtttcatcatattggccaggctcttctcgaactcctgacctc aggtgatccacccgccttggccttccaaaatgcagggattacaggtgtga gccacccggcaggcattaaattgttgtttaatgcaaattaaaacaacaaa atccattacacacctattagagtggccaaaatccaaaacactgacaacat taaatgttgataaggatgtggagcaacaagaactctcattcatttttggt gggaatgctaaataatatagccactttggaagacagtttgacagtttctt acaaaactaaacatagtcttaccataaacatagtattaccatatgatcca gtgatcacactcattgatatttacccaaatgaacagaaaacttatgtcca cacaaaagcctgcacatggatatttatatagcagctttctttactcataa ttgccaaatctcggaagcaatgaagatgtccttcagcaggtgaatggata aattgtggtacattcagacaatggaatattactcagcactaaaaagaaat aaggtatcaagccatgaaaagataaggaagaaacttaaatgcatattact aaatgaaagaagctagtatgaaacggccttatattgtatgattccaacta aacaacattctgtagacggcaaaactacggagacagtagttttggcaaga tcagtgcttgccagcggtcagtgggaagtgagggatgaacaggcagagca cagaggatatttaggacagtgaaactattctgtatgatactacaatggtg gatacgtgtcattttatatattgatttgacaggagttcactctgttgccc aggctggtgtgcagtggcacaatcatggctcactgcagccttgacctccc gggctcaagcagtcctcccacctcagcctcctgaatagctgggactacag gtgcacattaccacacctggctaatttttaattttttttttgtaaggatg gggtctcactatgttgcctaggctggtcttgaactcctgggctccagcaa tcctcctgcctcggcttcccaaagtgttgagattacaggcatgaaccact gtgcccggcctcattatatatttgttaaaacctacagaatttgtaatacc aagagtgaattctaatataaactatggacttgggtgataatatgtcaatg caggttcattgaatgtaacaaattagcattgtggttcagtatgttgacag tgggggaggctagtgtgtgtgaggacaggaggtatatgagaatcctctgc acttttcactcaattttgctgtgaatctaaaactgctctaaaaaaaatag tttcttaatttaaaaaaaagcaaaggaagagcataaatgaaaaaagaaaa gactgaaaaccactaagtaaatgtgaatagcttcttgcatattcattctc atacagtgctcagaaaagacagaaaaccaaagcatagcagaaagtccagc ttccaagcaggaactggtgtctccttgacagagcttcaatccaattatgc cccagacactgatttttctatcactacctcccctttgtttcagagacagt atccatgaccttctgttcatgcggtggaggaagaaattgcttcccagaaa ggcttagcccactccattctactagataaaggaaaaacaaaaagccactc cactctgacaagccgattactgtctatggagaagcaaaatgcctccctta ggactatgggaatagccaacaaggtacagtatgttcaaacctttaaccat gactctgccaagagcagctatgtatgtgaatacaatctcctcatctgggc agtcaaatatgcataactgaaagtacaaacctccataaggtcctgtctag ttatagtttcaaatgcacagaaagtgaaagaaggctgaaataaagtctcc cctatccactattgcaattatctaggtaaagcctagaaggaaagaacagg aaagactgaagagggaaagttaaagtgttgaatataatatgatattttat a'

In [12]:
import re

In [13]:
def pattern_find(reference, pattern):
    start = datetime.datetime.now()
    pattern_location = []  # Empty list to store location of alignments. 
    reference = reference.upper().replace(' ','')  # caps and removes spaces
    reference = re.sub('[^A-Z]+', '', reference)  # regex removes not an A-Z
    
    for pattern in pattern_list:
        pattern = pattern.upper().replace(' ','')  
        pattern = re.sub('[^A-Z]+', '', pattern)  
    
        for i, base in enumerate(reference):
            query_pattern = reference[i:i+len(pattern)]  # Slice "window"
            
            if query_pattern == pattern:
                pattern_location.append(pattern)
                pattern_location.append(i+1) 
    
    stop = datetime.datetime.now()
    print(stop - start)
    return(pattern_location)

In [14]:
print(pattern_find(reference, pattern))

0:00:01.253246
['TACTTGCATTTTTTCTAAACAC', 3184]


#### 00.000057 window find
 
#### 00.001089 window find with list

#### 01.550950 window find with real data

### Production data means longer compute time. Speed is still within reason.

### Now let's incorporate a fuzzy search feature. 

### We want to be able to allow the user to find alignments with n mismatches. 

In [15]:
def mismatch(string1, string2):
    mismatches = 0
    for (nucleotide1, nucleotide2) in zip(string1, string2):
        if nucleotide1 != nucleotide2:
            mismatches += 1
    return(mismatches)

In [16]:
def approximate_patterns(text, pattern_list, max_mismatches):
    start = datetime.datetime.now()
    reference_upper = reference.upper().replace(' ','')  # caps and removes spaces
    reference_clean = re.sub('[^A-Z]+', '', reference_upper)  # removes not A-Z
    
    pattern_matches = []
    
    for pattern in pattern_list:
        
        string_upper = pattern.upper().replace(' ','')  
        string_clean = re.sub('[^A-Z]+', '', string_upper)  
        
        for i in range(len(reference_clean)-len(pattern)+1):
            query_pattern = reference_clean[i:i+len(pattern)]
            if mismatch(string_clean, query_pattern) <= max_mismatches:
                pattern_matches.append(i+1)
                pattern_matches.append(pattern)
    stop = datetime.datetime.now()
    
    print(stop - start)
    return(pattern_matches)

In [17]:
print(approximate_patterns(reference, pattern_list, 1))

0:00:15.928740
[1, 'CATGACCCCCCCCAAGAAGAAGCGCAAGGTGGAGGACGGAATGGACGCACAAACACGACG', 41, 'ATGGACGCACAAACACGAC', 3184, 'TACTTGCATTTTTTCTAAACAC']


#### 00.000057 window find

#### 00.001089 window find with list

#### 01.550950 window find with real data

#### 18.236138 window find with real fuzzy data


### How can we speed this up?

In [18]:
final_list = []
working_list = []

def neighbors(pattern, mismatch_num):
    nucleotides = 'ACGT'
    current_list = []
    final_list = [] 
    pattern_whole = ''
    final_list.append(pattern)
    
    for index, nucleotide in enumerate(pattern):
        rest = pattern[(index+1)::]

        for i, nuc in enumerate(nucleotides):
            working_pattern = ''
            working_pattern += pattern_whole
            working_pattern += nuc
            working_pattern += rest

            if working_pattern not in final_list:
                final_list.append(working_pattern)
                current_list.append(working_pattern)
        pattern_whole += nucleotide

    if mismatch_num > 1:
        for item in current_list:
            working_list.append(item)
        return neighbors(working_list, mismatch_num-1)
    else:
        return final_list
final_list = []
working_list = []

In [19]:
pattern_list = ['acac','cag','ttat','agaa','ttatcaaatt',]

In [20]:
start = datetime.datetime.now()

for pattern in pattern_list:
    print(neighbors(pattern, 1))

stop = datetime.datetime.now()
print(stop-start)

['acac', 'Acac', 'Ccac', 'Gcac', 'Tcac', 'aAac', 'aCac', 'aGac', 'aTac', 'acAc', 'acCc', 'acGc', 'acTc', 'acaA', 'acaC', 'acaG', 'acaT']
['cag', 'Aag', 'Cag', 'Gag', 'Tag', 'cAg', 'cCg', 'cGg', 'cTg', 'caA', 'caC', 'caG', 'caT']
['ttat', 'Atat', 'Ctat', 'Gtat', 'Ttat', 'tAat', 'tCat', 'tGat', 'tTat', 'ttAt', 'ttCt', 'ttGt', 'ttTt', 'ttaA', 'ttaC', 'ttaG', 'ttaT']
['agaa', 'Agaa', 'Cgaa', 'Ggaa', 'Tgaa', 'aAaa', 'aCaa', 'aGaa', 'aTaa', 'agAa', 'agCa', 'agGa', 'agTa', 'agaA', 'agaC', 'agaG', 'agaT']
['ttatcaaatt', 'Atatcaaatt', 'Ctatcaaatt', 'Gtatcaaatt', 'Ttatcaaatt', 'tAatcaaatt', 'tCatcaaatt', 'tGatcaaatt', 'tTatcaaatt', 'ttAtcaaatt', 'ttCtcaaatt', 'ttGtcaaatt', 'ttTtcaaatt', 'ttaAcaaatt', 'ttaCcaaatt', 'ttaGcaaatt', 'ttaTcaaatt', 'ttatAaaatt', 'ttatCaaatt', 'ttatGaaatt', 'ttatTaaatt', 'ttatcAaatt', 'ttatcCaatt', 'ttatcGaatt', 'ttatcTaatt', 'ttatcaAatt', 'ttatcaCatt', 'ttatcaGatt', 'ttatcaTatt', 'ttatcaaAtt', 'ttatcaaCtt', 'ttatcaaGtt', 'ttatcaaTtt', 'ttatcaaaAt', 'ttatcaaaCt', 'ttatc

In [21]:
pattern_list = ['CATGACCCCCCCCAAGAAGAAGCGCAAGGTGGAGGACGGAATGGACGCACAAACACGACG', 'TGGGGCCGCCGCCCACGGCGGGGGCGCCGCCGCCCAACTTGTTTGCAGCTTTCCATTGAGCTT', 'CATGACCCCCCCCAAGAAGAAGCGCAAGGTGGAGGACGGAATGGCTTCTAACTTTACTCAGTTCGTTC', 'TGGGGCCGCCGCCCACGGCGGGGGCGCCGCCGCCCAACTTGTAGATGCCGGAGTTTGCTG', 'ATGGACGCACAAACACGAC', 'ATGGCTTCTAACTTTACTCAGTTCGTT', 'TACTTGCATTTTTTCTAAACAC', 'ACTATAGGGAGACCCAAGCTGG', 'GAGGATTCTGACAGTGAAATATCAG', 'ATGTTTGGTGAGCCAAAAC', 'CATACTTACTTGGCTTGTTTGGGATAT', 'GCCTATGGCATTATTGTACGGA', 'TTACGCATAAACGATGACGTCA', 'AAGGCACAGTCGAGGC', 'GATCTGGGCCCTGAAGAAGGGCCCG', 'GATCCGGGCCCTTCTTCAGGGCCCA', 'ATGTTTGGTGAGCCAAAAC', 'CATGAGGGCCCTGAAGAAGGGCCCC', 'GATCCGGGCCCTTCTTCAGGGCCCA', 'GATCTGGGCCCTGAAGAAGGGCCCG', 'CTATCAGTGATAGAGAACGTATAAG', 'CAGAGGAGGGAAGAGAG', 'CTTCTTATCATCTCCATCTTTATGATG', 'CGTTCAGTGTCAGAAAATG', 'CATCTTGAGACACATGGG', 'GTGACATAATTGGACAAACTACC', 'GGGAGCTTGTATATCCATTTTCGGATCTGATCAGCACGTGATGACCGAGTACAAGCCCACG', 'CATAGAAGGCGGCGGTGGAATCGAAATCTCGTAGCACGTGTCAGGCACCGGGCTTGCGGG', 'TGCCTCTGAGCTATTCC', 'GCGCTGGAGGATCATC', 'AGCGGTTCCCGG', 'CCGCGCTGGAGGATCATCCA', 'GGGTCTGGGCAGCGCCGTCG', 'GGCGAAGAACTCCAGCATGAG', 'CATGGCGATGCCTGCTTGCCGA', 'GCAAGGAACGCCCGTCGTGGC', 'TCTCCGGGCCTTTCGACCTGCAGCCAATATGGGATCGGCCATGACCGAGTACAAGCCCAC', 'CAGTCGAGGCTGATCAGCGAGCTCTAGAGAATTGATCCCCTCAGGCACCGGGCTTGCGGG', 'TCCTCTTCCTCATCTCCGGGCCTTTCGACCTGCAGCCAATATGACCGAGTACAAGCCCAC', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTACGTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCTATTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTGATTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTTGCTTTACGCAGACTATCTTTCT', 'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'CGTGCCTTTTCCCGCGAGGTTG', 'GCCTGCTGGGGAGCCTGGGGAC', 'CCTCTAGAAATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGAGCGGCCGCGGCAAAGG', 'CAACTCAGCTTCCTTTCGGGCTTTGTTAGCAGCCGGATCCTTAACCGCCAAAACCATACAGG', 'CTCTAGAAATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGGGCCATCACCATCACCATCACC', 'GTCTGTGTAGAAGACCACACACG', 'GCTCAAGCAGAGGCGGCCTCGGCC', 'GCTTCAAGTGGGAGCGCGTGATG', 'GGGGATCAATTCTCTAGAGCTCGC', 'ATTGGCTGCAGGTCGAAAGGC', 'CCGCTCTTCCGATCTGTTATGAAGG', 'GCAGGAAACGAAGATAAATCATGTCG', 'CGTAACAACTCCGCCCCATTGACG', 'CGCGGTCTCGGCATTCCTGCTG', 'AGGCCCGGCATTCTGCACGC', 'CCATTCTCCGCCCCATGGCTGAC', 'CTTGTCTGTAAGCGGATGCCG', 'GATCAGTTGGGTGCACGAGTGG', 'CCGCTGTTGAGATCCAGTTCG', 'CCTCCTCACTACTTCTGGAATAGC', 'AGGCCGAGGCCGCCTCTGC', 'CCTGACGGGCTTGTCTGCTCC', 'CCAAAATGTCGTAACAACTCCGCCC', 'GCATATTTGAGAAGATGCGGCCAGC', 'CAGCGGCCAATAGCAGCTTTGC', 'CACAAGTGGCCTCTGGCCTCGCACACATTCCACATCCAACGCGTGGGTTGCGCCTTTTCCAAGGC', 'CGAGGCTGATCAGCGAGCTCTAGAGAATTGATCCCCCGTCGACGTCAGGCACCGGGCTTGCGGG', 'CGTCGACGGGGGATCAATTCTCTAGAGCTCGCTGATCAGC', 'GGTTCCTGGCCTTTTGCTGG', 'GTGAAATACCGCACAGAGCAAAAGG', 'TGTTCTGCAGCGTGTCGAGC', 'TTACAGCGTGATGGAGCAGATGAAG', 'ACGCGTTGGATGTGGAATGTGTGCGAGGCC', 'ACTGCCCGCTTTCCAGTCG', 'CGACGATATGATCCTGATGCAGCTAG', 'TTTACGCAGACTATCTTTCT', 'CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'CAAGAATGCATGCGTCAATTTTACGCAGACTATCTTTCTAG', 'GGTGCCTGACGTCGACGGGGGATCAATTCTCTAGAGCTCGCTGATC', 'CAACCCACGCGTTGGATGTGGAATGTGTGCGA', 'GATCCGTTGTAAAACGACGGCCAGTCA', 'TATGACTGGCCGTCGTTTTACAAC', 'GGAGGACGGGCAGACTCGC', 'CAACCCACGCGTTGGATGTGGAATGTGTGCGAGGCCAGAG', 'GAGTTGGTAGCTCTTGATCCGGC', 'GCAACTGCCCGGCTACTACTAC', 'CGTTGGCCGATTCATTAATGCAGC', 'CCGCACCGCTGTCATTAATCTGC', 'CCACTTGTGTAGCGCCAAGTG', 'GGATAATACCGCGCCACATAGC', 'GATCGGGCCCTGAAGAAGGGCCCGGGCCCTGAAGAAGGGCCCGGGCCCTGAAGAAGGGCCCCGG', 'GATCCCGGGGCCCTTCTTCAGGGCCCGGGCCCTTCTTCAGGGCCCGGGCCCTTCTTCAGGGCCC', 'GATCCCCGGGCCCTGAAGAAGGGCCCGGGCCCTGAAGAAGGGCCCGGGCCCTGAAGAAGGGCCCGGA', 'GATCTCCGGGCCCTTCTTCAGGGCCCGGGCCCTTCTTCAGGGCCCGGGCCCTTCTTCAGGGCCCGGG', 'CCACTCCTCCACCTTTGAC', 'ACCCTGTTGCTGTAGCCA', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTTGCAAGAATGCATGCGTCAATTTTACGCAGACTATCTTTCTAG', 'ACCATTCCCAATGCCTGAA', 'TGCATACCTACCCAATGTATGG', 'GGCTTAGACCCTCAGGT', 'GCTCCCTCTCTCCACTC', 'GGACTCATGACCACAGTCCATGC', 'GGAAGGCCATGCCAGTGAG', 'CCAAATTCGTTGTCATACCAGG', 'ACACCCAAGCTCGTTGGG', 'CCTTGCCTGCATTTCTCTGC', 'CAGAAAGGTCCTGCTCCGC', 'CCCCTCCTTCCTCTCGCC', 'GTCCCTAATATCCCGGAGGT', 'GCAGGCTTCTAAATCCGTTC', 'GATCGGAAGAGCGGTTCAGCAGGAATGCCG', 'AGAAAGATAGTCTGCGTAAA', 'CTAGAAAGATAGTCTGCGTAAAATTGACGCATGCATTCTTG', 'GTTGACATTGCGAAGAGCGACAAAG', 'GATCTGTTGTAAAACGACGGCCAGTC', 'TTAAGACTGGCCGTCGTTTTACAACA', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTACTTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCAGTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTGTCTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTTGATTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTACGTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCTATTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTGATTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTTGCTTTACGCAGACTATCTTTCT', 'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'CGGATTTCCTTGAAGAGAGTGAG', 'CGGATCTGGAAGTTCTGTTCC', 'CGCTGTGCAGAAGCAGAGAGG', 'GTTACCAGGTCCGCGCTCTC', 'ACGGATTCGCGCTATTTAGA', 'GTGCTTGTCAATGCGGTAAG', 'CACGCGGTCGTTATAGTTCA', 'GACGCATGATTATCTTTTACGTG', 'CGCAATTAATGTGAGTTAGC', 'GGTGTAAACCTTAAACTGCC', 'CAGGCAGACATCTGTGAATCG', 'GTTCTGCCCAAGGGTTGGTTTG', 'TTAATCTAGCTGCATCAGGATCATATCGTCGGGTC', 'GGTAGAAAAAGCAACCACGAAGC', 'ACATAAACCTCTGTCTGTGAGTGCC', 'GGCAGCACAGAGCAACTCTA', 'GAGTGCAAAGTCCCGTTTG', 'AGCCTTTGGAAGCTCTTGAA', 'GTGTCTTGGAGAGGCGTGTA', 'AGAAGAGTTAGTTGACTATACAGC', 'ATGTTTGAATGTGATAACCGTCCT', 'AATTACCGATCCAATGCGAAGCTTTAAGAC', 'AATTGTCTTAAAGCTTCGCATTGGATCGGT', 'AATTACGTAAGCTTAATGCCGATCCAAGAC', 'AATTGTCTTGGATCGGCATTAAGCTTACGT', 'TTGAGAAGAGTTAGTTGACTATACAGC', 'CAAGCAGAAGACGGCATACGAGAT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTATCACGTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCGATGTTTTACGCAGACTATCTTTCT', 'ACAGTAGCTGTATATAAAACCAGTGATGTTTGAATGTGATAACCGTCCT', 'TGCTGTATAAAAAACCAGTGGTTATATGTACAGTAGCTGTATATAAAACCAGTGGTTATATGTACAGTAGCTGTATATAAAACCAGTGATGTTTGAATGTGATAACCGTCCT', 'ATGTTTGGTGAGCCAAAAC', 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG', 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'CAAGCAGAAGACGGCATACGAGATATCACGGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'CAAGCAGAAGACGGCATACGAGATCGATGTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'AGCCATCAAGGAGGCTGTAA', 'CCAGGAAGTCCGTAGAGACG', 'GAAGTTCAGGCATTGGGAAT', 'CAAACGGGACTTTGCACTCT', 'GTGAACCGTCAGATCGCCTGG', 'TGTCCAATTATGTCACACCA', 'CCATTCGCCATTCAGGCTGC']

real data

In [22]:
def neighbor_patterns(text, pattern_list, max_mismatches):
    start = datetime.datetime.now()
    reference_upper = reference.upper().replace(' ','')  # caps and removes spaces
    reference_clean = re.sub('[^A-Z]+', '', reference_upper)  # removes not A-Z
    
    pattern_matches = []
    
    for pattern in pattern_list:
        string_upper = pattern.upper().replace(' ','')  
        string_clean = re.sub('[^A-Z]+', '', string_upper)  

        for neighbor in neighbors(string_clean, max_mismatches):
            
            if neighbor in reference_clean:
                
                for i in range(len(reference_clean)-len(pattern)+1):
                    query_pattern = reference_clean[i:i+len(pattern)]
                    
                    if mismatch(neighbor, query_pattern) <= max_mismatches:
                        pattern_matches.append(i+1)
                        pattern_matches.append(pattern)
                        
    stop = datetime.datetime.now()
    print(stop - start)
    return(pattern_matches)

In [23]:
print(neighbor_patterns(reference, pattern_list, 1))

0:00:02.107689
[1, 'CATGACCCCCCCCAAGAAGAAGCGCAAGGTGGAGGACGGAATGGACGCACAAACACGACG', 41, 'ATGGACGCACAAACACGAC', 3184, 'TACTTGCATTTTTTCTAAACAC']


0:00:00.000057  loop find

0:00:00.001089 loop find with list

0:00:01.550950 loop find with real data

0:00:18.236138 loop find with real fuzzy data

0:00:02.250438 loop find with real data and fuzzy neighbors

In [25]:
import plotly
plotly.tools.set_credentials_file(username='ksindy', api_key='djyAuoTyMMoveuDBFrJ2')
import plotly.graph_objs as go

data = [go.Bar(
            x=['win', 'win list', 'win real', 'win real fuzz', 'win real fuzz neigh'],
            y=[00.000057, 00.001089, 01.550950, 18.236138, 02.250438])]

py.iplot(data, filename='basic-bar')

High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~ksindy/0 or inside your plot.ly account where it is named 'basic-bar'


In [26]:
from tinydb import TinyDB, Query
import json

In [44]:
reference = 'cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacag'
pattern = 'cac'

In [45]:
pattern_list = ['acac','cag','ttat','agaa','ttatcaaatt',]

In [46]:
db = TinyDB('/Users/ksindy/PycharmProjects/oligo_search_website/pratice_db.json')

In [47]:
def pattern_find_tinydb(pattern_list, reference):
    start = datetime.datetime.now()
    aligns = []  # Creates empty list to store alignments
    pattern_length_list = []  # Creates empty list to store pattern lengths

    for pattern in pattern_list:  
        
        '''
        DETERMINE LENGTH OF PATTERN
        Do not need to add to db if pattern length already seen
        '''
        
        if len(pattern) not in pattern_length_list:  
            pattern_length_list.append(len(pattern))
            
        '''
        ADD REFERENCE TO DATABASE
        Splits reference into pattern length chunks and adds to the db 
        along with location (i).
        [{'sequence': 'cact', 'index': 1}]
        [{'sequence': 'cact', 'index': 1}, {'sequence': 'acta', 'index': 2}]
        ...
        [{'sequence': 'cac', 'index': 1}]
        [{'sequence': 'cac', 'index': 1}, {'sequence': 'act', 'index': 2}]

        '''

        for i, nucleotide in enumerate(reference):  
            ref_chunk = reference[i:i+len(pattern)]  
            db.insert({'sequence': ref_chunk, 'index':i+1}) 

            '''
            FIND ALIGNMENTS
            [{"sequence": "acac", "index": 9}, {"sequence": "acac", "index": 74}]
            [{"sequence": "cag", "index": 12}, {"sequence": "cag", "index": 69}...
            [{"sequence": "ttat", "index": 44}]
            [{"sequence": "agaa", "index": 15}, {"sequence": "agaa", "index": 27}]
            [{"sequence": "ttatcaaatt", "index": 44}]
            '''
        sequence_match = Query()
        matches = db.search(sequence_match.sequence == pattern)  # bit-like object
        matches = json.dumps(matches)  # Dumps json to python dict, in this case to string
        locations = re.findall(r'\d+', matches)
        print(matches)
        aligns.append('{} {}'.format(pattern, locations))
        stop = datetime.datetime.now()
    print(stop-start)
    return(aligns)

In [48]:
db.purge()

In [49]:
pattern_find_tinydb(pattern_list, reference)

[{"sequence": "acac", "index": 9}, {"sequence": "acac", "index": 74}]
[{"sequence": "cag", "index": 77}, {"sequence": "cag", "index": 12}, {"sequence": "cag", "index": 69}, {"sequence": "cag", "index": 77}]
[{"sequence": "ttat", "index": 44}, {"sequence": "ttat", "index": 44}]
[{"sequence": "agaa", "index": 15}, {"sequence": "agaa", "index": 27}, {"sequence": "agaa", "index": 15}, {"sequence": "agaa", "index": 27}, {"sequence": "agaa", "index": 15}, {"sequence": "agaa", "index": 27}]
[{"sequence": "ttatcaaatt", "index": 44}]
0:00:00.821750


["acac ['9', '74']",
 "cag ['77', '12', '69', '77']",
 "ttat ['44', '44']",
 "agaa ['15', '27', '15', '27', '15', '27']",
 "ttatcaaatt ['44']"]

In [13]:
def pattern_find_tinydb(pattern_list, reference):
    print(datetime.datetime.time(datetime.datetime.now()))
    aligns = []  # Creates empty list to store alignments
    pattern_length_list = []  # Creates empty list to store pattern lengths
    
    for pattern in pattern_list:  
        if len(pattern) not in pattern_length_list:  
            pattern_length_list.append(len(pattern))
            
        for i, nucleotide in enumerate(reference):
            ref_chunk = reference[i:i+len(pattern)]  
            db.insert({'sequence': ref_chunk, 'index':i+1})  
            #print(ref_chunk)
                
        sequence_match = Query()
        matches = db.search(sequence_match.sequence == pattern)
        matches = json.dumps(matches)
        locations = re.findall(r'\d+', matches)
        aligns.append('{} {}'.format(pattern, locations))
    print(datetime.datetime.time(datetime.datetime.now()))
    return(aligns)

In [14]:
def pattern_find_tinydb(pattern_list, reference, mismatch_num):
    print(datetime.datetime.time(datetime.datetime.now()))
    aligns = []  # Creates empty list to store alignments
    pattern_length_list = []  # Creates empty list to store pattern lengths
    reference_upper = reference.upper().replace(' ','')
    reference_clean = re.sub('[^A-Z]+', '', reference_upper)
    
    for pattern in pattern_list:  
        string_upper = pattern.upper().replace(' ','')  # Removes whitespace, converts to caps.
        string_clean = re.sub('[^A-Z]+', '', string_upper)  # Uses a regex to remove char not a A-Z.
        
        if len(pattern) not in pattern_length_list:  
            pattern_length_list.append(len(pattern))
            
            for i, nucleotide in enumerate(reference_clean):  
                ref_chunk = reference_clean[i:i+len(pattern)]  
                db.insert({'sequence': ref_chunk, 'index':i+1})  
        aligns.append(pattern)     
        all_matches = ''
        if mismatch_num > 0:
            for neighbor in neighbors(string_clean, mismatch_num):
                matches = db.search(Query().sequence == neighbor)
                matches = json.dumps(matches)    
                all_matches += matches
                #print(matches)
        else:
            matches = db.search(Query().sequence == string_clean
            matches = json.dumps(matches)    
            all_matches += matches
        locations = re.findall(r'\d+', all_matches)
        list1 = [int(x) for x in locations]
        list1.sort()
        aligns.append(list1)
    print(datetime.datetime.time(datetime.datetime.now()))
    return(aligns)

SyntaxError: invalid syntax (<ipython-input-14-1f3e218287de>, line 28)