# The Problem:

## How to locate specific patterns in a reference allowing for fuzzy searches...as fast as possible!

---

### An example of an alignment:

### Does the pattern "cac" occur in the following sequence?

### cactaagcacacagagaata

### yes!

### *cac* taag *cacac* agagaata

---

### Here is a fast algroithm to determine if a pattern is found in a reference.

In [244]:
def find_in(reference, pattern):
    if pattern in reference:
        return "Yes"

In [245]:
reference = 'cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacag'
pattern = 'cac'

In [246]:
find_in(reference, pattern)

'Yes'

### Great! But this does not give us location information or allow us to perform fuzzy alignments. 

### This function iterates through the reference using a 'window' the same size as the pattern.

In [247]:
def pattern_find(reference, pattern):
    pattern_location = []  # Creates empty list to store location of alignments. 
    for i, base in enumerate(reference):
        query_pattern = reference[i:i+len(pattern)]  # "Window" used for search.
        if query_pattern == pattern:
            pattern_location.append(i) 
    return(pattern_location)

In [248]:
pattern_find(reference, pattern)

[0, 7, 9, 72, 74]

### Now let's try it on a list and time it.

In [249]:
pattern_list = ['acac','cag','ttat','agaa','ttatcaaatt',]

In [250]:
import datetime

In [251]:
start = datetime.datetime.now()

for pattern in pattern_list:
    print('{} {}'.format(pattern, pattern_find(reference, pattern)))

stop = datetime.datetime.now()
pattern_find_duration = stop - start
print('{} pattern find'.format(pattern_find_duration))


acac [8, 73]
cag [11, 68, 76]
ttat [43]
agaa [14, 26]
ttatcaaatt [43]
0:00:00.001400 pattern find


### Still looking good. How about some real data?

### My oligonucleotides and a portion of the human genome from UCSC DAS database.
#### http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=chr12:53960000,53980000

In [252]:
pattern_list = ['CATGACCCCCCCCAAGAAGAAGCGCAAGGTGGAGGACGGAATGGACGCACAAACACGACG', 'TGGGGCCGCCGCCCACGGCGGGGGCGCCGCCGCCCAACTTGTTTGCAGCTTTCCATTGAGCTT', 'CATGACCCCCCCCAAGAAGAAGCGCAAGGTGGAGGACGGAATGGCTTCTAACTTTACTCAGTTCGTTC', 'TGGGGCCGCCGCCCACGGCGGGGGCGCCGCCGCCCAACTTGTAGATGCCGGAGTTTGCTG', 'ATGGACGCACAAACACGAC', 'ATGGCTTCTAACTTTACTCAGTTCGTT', 'TACTTGCATTTTTTCTAAACAC', 'ACTATAGGGAGACCCAAGCTGG', 'GAGGATTCTGACAGTGAAATATCAG', 'ATGTTTGGTGAGCCAAAAC', 'CATACTTACTTGGCTTGTTTGGGATAT', 'GCCTATGGCATTATTGTACGGA', 'TTACGCATAAACGATGACGTCA', 'AAGGCACAGTCGAGGC', 'GATCTGGGCCCTGAAGAAGGGCCCG', 'GATCCGGGCCCTTCTTCAGGGCCCA', 'ATGTTTGGTGAGCCAAAAC', 'CATGAGGGCCCTGAAGAAGGGCCCC', 'GATCCGGGCCCTTCTTCAGGGCCCA', 'GATCTGGGCCCTGAAGAAGGGCCCG', 'CTATCAGTGATAGAGAACGTATAAG', 'CAGAGGAGGGAAGAGAG', 'CTTCTTATCATCTCCATCTTTATGATG', 'CGTTCAGTGTCAGAAAATG', 'CATCTTGAGACACATGGG', 'GTGACATAATTGGACAAACTACC', 'GGGAGCTTGTATATCCATTTTCGGATCTGATCAGCACGTGATGACCGAGTACAAGCCCACG', 'CATAGAAGGCGGCGGTGGAATCGAAATCTCGTAGCACGTGTCAGGCACCGGGCTTGCGGG', 'TGCCTCTGAGCTATTCC', 'GCGCTGGAGGATCATC', 'AGCGGTTCCCGG', 'CCGCGCTGGAGGATCATCCA', 'GGGTCTGGGCAGCGCCGTCG', 'GGCGAAGAACTCCAGCATGAG', 'CATGGCGATGCCTGCTTGCCGA', 'GCAAGGAACGCCCGTCGTGGC', 'TCTCCGGGCCTTTCGACCTGCAGCCAATATGGGATCGGCCATGACCGAGTACAAGCCCAC', 'CAGTCGAGGCTGATCAGCGAGCTCTAGAGAATTGATCCCCTCAGGCACCGGGCTTGCGGG', 'TCCTCTTCCTCATCTCCGGGCCTTTCGACCTGCAGCCAATATGACCGAGTACAAGCCCAC', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTACGTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCTATTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTGATTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTTGCTTTACGCAGACTATCTTTCT', 'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'CGTGCCTTTTCCCGCGAGGTTG', 'GCCTGCTGGGGAGCCTGGGGAC', 'CCTCTAGAAATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGAGCGGCCGCGGCAAAGG', 'CAACTCAGCTTCCTTTCGGGCTTTGTTAGCAGCCGGATCCTTAACCGCCAAAACCATACAGG', 'CTCTAGAAATAATTTTGTTTAACTTTAAGAAGGAGATATACATATGGGCCATCACCATCACCATCACC', 'GTCTGTGTAGAAGACCACACACG', 'GCTCAAGCAGAGGCGGCCTCGGCC', 'GCTTCAAGTGGGAGCGCGTGATG', 'GGGGATCAATTCTCTAGAGCTCGC', 'ATTGGCTGCAGGTCGAAAGGC', 'CCGCTCTTCCGATCTGTTATGAAGG', 'GCAGGAAACGAAGATAAATCATGTCG', 'CGTAACAACTCCGCCCCATTGACG', 'CGCGGTCTCGGCATTCCTGCTG', 'AGGCCCGGCATTCTGCACGC', 'CCATTCTCCGCCCCATGGCTGAC', 'CTTGTCTGTAAGCGGATGCCG', 'GATCAGTTGGGTGCACGAGTGG', 'CCGCTGTTGAGATCCAGTTCG', 'CCTCCTCACTACTTCTGGAATAGC', 'AGGCCGAGGCCGCCTCTGC', 'CCTGACGGGCTTGTCTGCTCC', 'CCAAAATGTCGTAACAACTCCGCCC', 'GCATATTTGAGAAGATGCGGCCAGC', 'CAGCGGCCAATAGCAGCTTTGC', 'CACAAGTGGCCTCTGGCCTCGCACACATTCCACATCCAACGCGTGGGTTGCGCCTTTTCCAAGGC', 'CGAGGCTGATCAGCGAGCTCTAGAGAATTGATCCCCCGTCGACGTCAGGCACCGGGCTTGCGGG', 'CGTCGACGGGGGATCAATTCTCTAGAGCTCGCTGATCAGC', 'GGTTCCTGGCCTTTTGCTGG', 'GTGAAATACCGCACAGAGCAAAAGG', 'TGTTCTGCAGCGTGTCGAGC', 'TTACAGCGTGATGGAGCAGATGAAG', 'ACGCGTTGGATGTGGAATGTGTGCGAGGCC', 'ACTGCCCGCTTTCCAGTCG', 'CGACGATATGATCCTGATGCAGCTAG', 'TTTACGCAGACTATCTTTCT', 'CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'CAAGAATGCATGCGTCAATTTTACGCAGACTATCTTTCTAG', 'GGTGCCTGACGTCGACGGGGGATCAATTCTCTAGAGCTCGCTGATC', 'CAACCCACGCGTTGGATGTGGAATGTGTGCGA', 'GATCCGTTGTAAAACGACGGCCAGTCA', 'TATGACTGGCCGTCGTTTTACAAC', 'GGAGGACGGGCAGACTCGC', 'CAACCCACGCGTTGGATGTGGAATGTGTGCGAGGCCAGAG', 'GAGTTGGTAGCTCTTGATCCGGC', 'GCAACTGCCCGGCTACTACTAC', 'CGTTGGCCGATTCATTAATGCAGC', 'CCGCACCGCTGTCATTAATCTGC', 'CCACTTGTGTAGCGCCAAGTG', 'GGATAATACCGCGCCACATAGC', 'GATCGGGCCCTGAAGAAGGGCCCGGGCCCTGAAGAAGGGCCCGGGCCCTGAAGAAGGGCCCCGG', 'GATCCCGGGGCCCTTCTTCAGGGCCCGGGCCCTTCTTCAGGGCCCGGGCCCTTCTTCAGGGCCC', 'GATCCCCGGGCCCTGAAGAAGGGCCCGGGCCCTGAAGAAGGGCCCGGGCCCTGAAGAAGGGCCCGGA', 'GATCTCCGGGCCCTTCTTCAGGGCCCGGGCCCTTCTTCAGGGCCCGGGCCCTTCTTCAGGGCCCGGG', 'CCACTCCTCCACCTTTGAC', 'ACCCTGTTGCTGTAGCCA', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTTGCAAGAATGCATGCGTCAATTTTACGCAGACTATCTTTCTAG', 'ACCATTCCCAATGCCTGAA', 'TGCATACCTACCCAATGTATGG', 'GGCTTAGACCCTCAGGT', 'GCTCCCTCTCTCCACTC', 'GGACTCATGACCACAGTCCATGC', 'GGAAGGCCATGCCAGTGAG', 'CCAAATTCGTTGTCATACCAGG', 'ACACCCAAGCTCGTTGGG', 'CCTTGCCTGCATTTCTCTGC', 'CAGAAAGGTCCTGCTCCGC', 'CCCCTCCTTCCTCTCGCC', 'GTCCCTAATATCCCGGAGGT', 'GCAGGCTTCTAAATCCGTTC', 'GATCGGAAGAGCGGTTCAGCAGGAATGCCG', 'AGAAAGATAGTCTGCGTAAA', 'CTAGAAAGATAGTCTGCGTAAAATTGACGCATGCATTCTTG', 'GTTGACATTGCGAAGAGCGACAAAG', 'GATCTGTTGTAAAACGACGGCCAGTC', 'TTAAGACTGGCCGTCGTTTTACAACA', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTACTTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCAGTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTGTCTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTTGATTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTACGTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCTATTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTGATTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTTGCTTTACGCAGACTATCTTTCT', 'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'CGGATTTCCTTGAAGAGAGTGAG', 'CGGATCTGGAAGTTCTGTTCC', 'CGCTGTGCAGAAGCAGAGAGG', 'GTTACCAGGTCCGCGCTCTC', 'ACGGATTCGCGCTATTTAGA', 'GTGCTTGTCAATGCGGTAAG', 'CACGCGGTCGTTATAGTTCA', 'GACGCATGATTATCTTTTACGTG', 'CGCAATTAATGTGAGTTAGC', 'GGTGTAAACCTTAAACTGCC', 'CAGGCAGACATCTGTGAATCG', 'GTTCTGCCCAAGGGTTGGTTTG', 'TTAATCTAGCTGCATCAGGATCATATCGTCGGGTC', 'GGTAGAAAAAGCAACCACGAAGC', 'ACATAAACCTCTGTCTGTGAGTGCC', 'GGCAGCACAGAGCAACTCTA', 'GAGTGCAAAGTCCCGTTTG', 'AGCCTTTGGAAGCTCTTGAA', 'GTGTCTTGGAGAGGCGTGTA', 'AGAAGAGTTAGTTGACTATACAGC', 'ATGTTTGAATGTGATAACCGTCCT', 'AATTACCGATCCAATGCGAAGCTTTAAGAC', 'AATTGTCTTAAAGCTTCGCATTGGATCGGT', 'AATTACGTAAGCTTAATGCCGATCCAAGAC', 'AATTGTCTTGGATCGGCATTAAGCTTACGT', 'TTGAGAAGAGTTAGTTGACTATACAGC', 'CAAGCAGAAGACGGCATACGAGAT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTATCACGTTTACGCAGACTATCTTTCT', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCGATGTTTTACGCAGACTATCTTTCT', 'ACAGTAGCTGTATATAAAACCAGTGATGTTTGAATGTGATAACCGTCCT', 'TGCTGTATAAAAAACCAGTGGTTATATGTACAGTAGCTGTATATAAAACCAGTGGTTATATGTACAGTAGCTGTATATAAAACCAGTGATGTTTGAATGTGATAACCGTCCT', 'ATGTTTGGTGAGCCAAAAC', 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG', 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG', 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'CAAGCAGAAGACGGCATACGAGATATCACGGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'CAAGCAGAAGACGGCATACGAGATCGATGTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC', 'AGCCATCAAGGAGGCTGTAA', 'CCAGGAAGTCCGTAGAGACG', 'GAAGTTCAGGCATTGGGAAT', 'CAAACGGGACTTTGCACTCT', 'GTGAACCGTCAGATCGCCTGG', 'TGTCCAATTATGTCACACCA', 'CCATTCGCCATTCAGGCTGC']

In [253]:
reference = 'CATGACCCCCCCCAAGAAGAAGCGCAAGGTGGAGGACGGAATGCACGCACAAACACGACG aaactatcagtcctgagaagactgttgtcaattaatgccttgtataactg gattgctcaggagaagggagagtaaggcagtgaaagggaaggggcaatag gtttgaggaagacattattggtttcttaacccaacccccatttttcaacc attttctcttgcttgcctcactataaaggctgaaaagcaagatattcctt tttcctatgcagctggtattaaacctgtcacacagttcaggccaataaga caatgaaatccaccgtggtgggggtggaggatggcacttctgatgtaagg cttttgtctccttataaaagggaaagccagaacaagagagttcctgatgc tacctgccccttatttctttcccatcctctgatagtgaggcaacaggaat gaggattaaaaggtaaaagctaaaaatggtacagcagagagaaagaacct aggtccttgataaacttgcttagctgctgaaacagtcctaattctggaca ccttgtgttataaatgttgacacagtttttttttgtttttgtttttgctt tgttttgagatgcagtcttgctctcttgcccaggctgcagtgcaatggca tgatcttggctcactgcaacctctgcctcccgggttcaagcaattctcct gcctcagcctcctgagtagctgggattacagacacacgccaccatgccca gctaatttttgtatttttagtacagatggggtttcaccatcttggtcagg ctagtctcgaactcctgaccttgtgatccgcccgcctctgcctccctaag tgctgggattacaggcgtgagccattgcgcccagcctgtgttcacacagt tttaagctactgttaggtatttgcatctgaaaccattcctaaccaatata cgattttcaatatgtatagattctacttcattttctatctaaggaactta gaaatggaatatacaagaaattattttattctacatcttcatgaatattt atcagttgtatcctaatgctgaattctgttttctttttcacataatatac caaataaattccatctcatcaccacctattgcctcaagcttccttagaaa caatggctatacactgggcttttaggaagaacaagatggtaagagagtgg caccttacaatgctgctttggaaactggggcagttaaagatcactcctca ctaaagaatggagttttagactggagcagcatgttttcttatttctcgct gcccctcttcctcaatacccagtgtttggctttttatcaaactccagggc actcttgggaactacgcttaagaattaacaaggatagatccattatcttt ccttagatggcagctacttacttggatgggttaaatgctcaacaaacttt aaaggaccagtgcctaggaggcagagaaatcccagcagcaattcagccag gcctggtcctatgtcagttacatggctaattgctttctccacagctcttg cctaatggtagaccttttagtattctcctatccagatgtagccacttagc tggggtaccattaaagtccccaaagggataactgtagttcttaacaaagc aagaaaaggttattttaaaatcaaaataattatcacacagccctattaag ttcttgttgacattatacatttctatttttcaatctcatctctctcccca aattcttattaggatgttatgaggtacaaaatccttgaaaaacgtatcaa atacttttatcttctaatttgctacacattaagtgttacaatttctggat ctcacaaatgggtgggaaggacaaagttacagagtgaaaagtatgaacct actacattgtggtttggcttgagctgctgaaaatttcactgtggaaaaac tatactgtaaatacttcctgataatacaggagttttaaaattttgaatgc ttacaatgcattcagagtagagcacatacaagtcagtactgagaccacaa taacaaaaaccctgccctgatctttaaggccttatgcagtttagtactgt aacagtcaagccaaaagaaaatatataactggaaggtgaggagggggaga gaaccaatgagcgtgctcacatgcactcgtctctaaaaccgtctctaaac ttggttcaagtttacaaggagattacgcacacacactctctctctctctt tctctctctctccttccctcactttaaaataataccaaaatacagtatta agcttttgttttaaacaagctatcaaagcttacaggcccagggactcgga aaccacagaagaggttgcagcctggtaaagaaataggcaacgataaaggc tcaaatagtttggcaggtcttctatataaaccataaactggtcaagtgct ctaattctgccctgctatcttaatttaatgcagcaaaaaattcttaccat aatttgttgacatattcatctgattatctttgtgtctaaccaggaaagct ctgataagctatctttagttttacaaagtcaggcaaatggaaaacttcaa ttttggatgtttactttctaaccataagtgatattttttagaatgctggc cacagactagaggtctacactcaaagaaggcaggcaggcaggtaggggca aagaatgaaattgcagctaagcactgtggctcatgctggtaatcccagct acttgggaggctgaggcaggaggatcacctgaggaatttgagacaagcct gggcaatatatattgaaatcctgtctcttaaaaaatttaaagagtagcca ggcatggtggcacatgcctgtagtcccagctacttgggaggctgagaaag agagagagagaaaggaggttcccttgagcccaggtgtttgaagctgcagt aagctatgattgtgccactgcactccggcctgggcgacagaggagggctc catgtctcttaaaaaaaataaaaaataaaaaaggaatctacactgcctat cagtaacacttatccacatacttctagacagtcatagtgatccaacacag aagaacagcagcagctggggaaattgtcttgttcctgtctcatctcctgc tgctctccatcacagtactaaactacttgcattttttctaaacacatcat gctgtttcattctcttgcatctgctattctcttatatgtggaatgttctt ctctactttctctgggaaacttctatttacctttcaaaacctgttaaagc atcatttcctccaggaagccttctctgatgtttaccagaattctcagtac ttgcaaatatttacattattatatttatcatattagaatgtaattacatt ttcatgtgtctgactcccctataaggcagtgaatactttcatgtcttata catctctctgtcactagtacctagcatggtgtttggcacattgtagatgt tcaaatttttttttaatttttttttttgagatggagtctcgctctgtcgc ccaggctggagtgcagtggtgcaatctcagcttactgcaacctacctccg cctccctcgttcaaaggattctcctgcctcagcctcctcagtagctagga ccacaagcatgtgccaccatgcccggctaacttttgtatttttggtagag acggggtttcaccatattggccaggctggtctggaactcctgaccttgtg atctgcctgccttggcctcccaaagtgcagggattacaggtgtgagccac cgcgcctggccattcaaatattttttattgagctgaacccacagaggctt atgaaggtctgattgaattgagaaaattattcatggactgatttctggaa acctgacctcaggcttcctcaaggttagagataccaggcagatacataaa gaataaaatcaaatgccttgatcttatttttacaaaggatttgtgtgcac taagtctctcaccagagacatttgggaaaacccatgttcaatttttgatt acggtctaaaatgataacaaattgttccaattatatagaacctgccactc cattatatcataatgtagacaaaccaggaagtgctaatgggatcctatta accttcatatcacattacaacagcctaggggaagagaggagagaaaggaa aatacaaaggccattaaaaatactgatgggtgtcaggcaggggctaagag gttggaacttccagtcctcacctctaggtaacaactcactacagtatagc aaatgcagatttcaaccatagtatagtaggccggtaatgcctccagtata gacctgtcctgccaaaatacaacataccaggtggcttaaacagaacttta ttatctctcagttctggaggccagaggtgttaccagggttggtttcttct gaaggatgtcagcaagaatgtgtttcatgtctctcctagcttctggcttg ctggcagtctttggcattccttgacttgtggcagcataaatccaaccttc acatagtgttctcctgtgtgcgtgtgtgtgtgtgtgtgtgtgtgtgtgtg tgtgtgtgtgtatgtccaatttcctctttttataggacaccagtcatatt ggagtaggggcccaccttacttcagtatgacctcatattaactaatgcca tctgcaatgaccctgtttccaaataaggtcacattttaaggtacctgggg ttaagacttcaccataaaaattcaattcaatccataacagctcacaaaga tatatcaatgttctaattcttagaagctgtgaatgttaccttatatggtg attaaattaagggctttgagatgaggggattatccagatgggccctcaat gctatcacaagcgtccttctaagacagcagcaaagggagattagaaacac acagagaggagagggcaatgtgaagaaagacatagagattgaagtgatgt aaccacaagccaagaaatgccggcagcaactagaagctggaagaggcaag gaaaggattctcctggagagccttctcaggaggatacggtcctctttaat atcttggtgggcttttggtctcccaaactgtgagataataaatttctttt ctttcaagccacctagtttgtggtaatttgttataacgcctacaggaaac taatatacatattatcttttcataatttaaacataattatctccattcta agagcagcagaggagcaggcagataacagacaataggtgacaaactccaa gcaattatacccataaccacaaaccactgtaaccagcatttttttaaaag ggaaagccaggcacagtggtgtgcacctgtagtcccagctactcagaaga ctgagaagggaggatcatttgagcccacaagtttgagtccagcctgggca acacagtgagactccatctctttaaaaaaaaaaaaaaagggagcagggag acaaatccgcactgaaatatcaccaccacaacaaaagaaactacagagta catcacactagataagggtgaatactattacatggctctctaatttctat tatataaaaaatgtgttttggttgggcacggtggctcacgcctgtaatcc cagcactttgggaggctgaggcagccagatcacctgaagtcaggagttaa aaaccagcctggccaacatggcgaaacccggtctctactaaacatacaaa aattagctgggtgtggtggcgcacacctgtaatcacagacactcggcagg ccgaggcaggagaatcgcttgaacctgagagacggaggttgcagtgagcc gagacagtgccactgcactccagcctgggatacagagccagactccatct caaaaaaaaaaaagaaaaagtgttttaagttgcaatgtaaaatgtatttc ttactcgattcataaaccaaaaggactgagacttctggtttttggcttgg catataaagagcttagaataacagttgtcactctatcctaacaacaaata aaaagctgaactgaaaaatcttaacaactcttcatagatctatcagagaa gtgagttcacagggcaaactgctgttccccaaattggcagatatagagaa tcacaacctaaaggagcagaaacccatgagctgaaacctgcatgacaatc aagtgccaggggaggaaaacctgaactataattgatgaagctagaggcca agtgtggacaagtctgagagttaaaaatgccagggagacccagtcacaga ggagcccccacaccgttgtgagatttacctccaggttctcccagctctac taggttcttacggtgaatatcaggggaaatttcttgtgcttttggcgggg ggaaagggaaaagtacagtagcctccccttacccgtggtttcactttcca tggtttcagttacccatagtcaactgcagttcaaaaatagatgagtactg gtcgagtaaggtggctcacacctataattccagcattttgggaggctgac gtgggaggaccgcttgagcccaggagttcgagactagcctgggtgataga gtgagaccctgcctctatttaattttttttttttttttgagagagaatct cactctgtcacccaggctggagtgcagtggcacaatctcagctcactgca acctccgcctcctgggttcaagcaattcttgtgcctcagcctcctgagta gctgggattaaaggtgtgcaccaccacacccagctaattttgtgttttta gtagagacggggtttcaccatgttggccaggctggtctcgaactcttggc ctcaagtgatctgcccgcctcggcctcccaaagtgttgagattacaggcg tgagccactgtacccggccaaaaaaaaaaaaaaaagatgagtacagtaca ataagatattttgagagagagaacacatttatataacttttattacagta tatttttataactgttctattttattattagttgttaatctcttactgtg ctgaatttataaattaaactttttcataggtatgtacatatagggaaaca tagtatggtcatccctccatatccatgggggactggtttcaagacccatg aggataccaaaatccacagatactcaagtccctgatataaaatggtgtag tatttgcatataaaatgtgcatatcctcctatatactttaaatcatctct agattattataatacctaatacaatgcctacatatcacttcattcgtgtg gattcaatgtagtgcttggcaggcagcaaatttaagttttgcctttttga attctgcggaattttttttttctgagtattttttacctatgattgtttta atccatggatgagaaacccatggatacagagggctgactgtatatagggt ttggtactatctggtttcatgcctccactgggaatcttggaatgtatcct tgtgaataaggggggactactggaaccattttgaaataccccagagcatt ctgttcttctttacaaggcctgccctcaggagaaaatactttaccagagc ctaacctgctgcggttttatcatagactaactaacctgggagaagggtaa tacccaattctacccccgtctagccatcctgtcccatccaagtggggaaa agaaaacaaaactgagaagcactggtgaagtttatagtccagggcatagg ctcaccaaaatattgagacctaatcatacgacagtagaacacttccagtt cccctccctgtcatacctcatcacaactttactagggacgtatttaccaa ttccttttacccagtatatcatgtccacctttcaacagaaaattgtaagt catattaaaaggcaaaaatacactctgaagaaactgaataagcaccagaa ccagtgagatatgacagaactgtcagaattaacagtccaggaattccaaa acactatgattaacacaacaacatgtaagaacagatggataatataagaa gagagatgaaaattctaagaaagaataaaaaagagaaacttttaggctcc tgaacacatggaggtgcaggagtgtgacaggcccaggaagggcatggaag cattgcaccccttcccaccaatgcatctccttatctggttttcatccgta tccttcataatatcctttataataaactggtaaacatcagctttccacag gctttctttctccatggtgagcaggacaatgtcctaggaccccagagagc caggagagagcaagatcaaagccagaaatccaactctgtccctactgccc actggtcttttccagtcaacacttcctcagcaaatacatgccacatagcc gtgctgctcagaggtctcctggcttatatgcagaaaaggccctgtggccc aggatcacagtagcaacaatattctcatcaaagcagccagagtgatacag cagaaggtcaagagacagaaaaaagcactaaatcctggtttataagagac aggcagacttcaagggttgtctcctaagacaataagtaagctcaagggga agcttataaccagtaagcttcagtggtgaaaaaagagcctagccaggccc aaatgggaaatcctaaagaaatagacatatattaaaagaaatagtaaatt caagatggggagcattcaagtgtgcagagggtgagcaagactttagtcag aaaataaagttactctttcttgccattgagcttaagccaaaaaaaaaaaa aaattaaagaaaagaaagctacttggctgggcgtggtggctcatgcctgt aatcccagcactttgggaggccgaggcgggtggataacctgaggtcagga gttcaacaccagcctgaccaacatggtgaaaccccatctctactaaaaat acaaaaactaggctgggtgtggtggctcatgcctgtaatcccagcacttt gggaggccgaggcaggcggatcacctgaggttgagagttcgagaccagcc tgaccaacatggagaaaccctgtctctactaaaaatacaaaattagtagg gtgtggtggtgcacgcctgtaatcccagctgctggggaggctgaggcagg agaatcgcttgaacccaggaggcggaggctgcggtgagccgagattgtgc cattgcactccagcctgggcaacaagagcaaaactctgtcttaaaaaaaa aaaaaaaaaaaattagctggtttggtggtacacgcctgtaatcccagcta ctcaggaggccaaggcagaagaatcacttgaacctgtgaggtggaggttg cagtgagccaagatggcaccactgcactccagcctgggcaacagcgagat gcagtctccaaaaaaaaaaaaaaaatctactcatacacaaaaaagcacac tccaagcattagtttttcatatgcagggagtgatgagtcagcattactct tgcaccagaatatacatgcaggagggaagtcctatgtgtgcaataagtga gggcggggctttagaaataagtcacacttcacataccagaggacacactc agggaagaaggctttcttgtgaaaggagtgtgggtaagacttcttataga aggcaattctcactgcatatcagaaaacccactcagggaagaagtctttt gtatgcaaggagtgtaggtaagactttacccaaaagtcaactgtgcatga gagaacacacttggcagagaagccttatagatgccaggagttgtggacaa aggattaggtataaatcatcctatgataaacactggaaggcacactcagg ggagaaggctttgttgtgcagggagtatgggcaagacttcagcttgaaga catctcttaccagacaccagaggacacacccagggaagtaactctgcttc tttaaaaagctgtagccaagactgtgcaacttgtttatggatatatcacg aaggcagaagggatccagagggctaaaagaacctgaacttggcccaagtt attcccagagattctgagaaaagaggcactgagagcctcaccaacctcct tagacggcacagcagtctaggacccctccaccaaaccttccctccctctg ttcttccctctggcagctctcccaaccttccttagtgctatggtttgaac aatgttgtcccttgcaaaattcatgttgaaacctaatcaccaatgtaaca ttattaagagaatgtttaggatgtggttaagtcatgaggatggagccttc atgaatggcattaggtacccttataaaaggatttggcaaaagatttggca tcactttaccctgtcctccaactgccatgtgaggacacagagctcctctc ctctggaggatgctatgtcaagataccatgttggaagcagagagcagccc ttgtgagataaccaaatctgctggcaccttgatcttgaacttcctagctt ccacaactgtgagaaaataaatttctgttctttataaaacaaacaaacaa acaaacaaacaagaaacgctacagataaaaaacactgtaacagaaatgaa gaatcccgaccaggcacggtggctcatgactgtaaccccggcattttggg aggctgaggtgggtgaatcacttgaggtcaggagttaaagaccagcctgg ccaacatggtgaaacccccgtctctactaaaaatacaaaagttagtcggg catggttgcgcaagactgtagtcccaactactcaggaggctgaggcatga gaattgcttaaacccaggaggtggaggttgcagggacctgagatcgtgcc actgcaatccagcctgggtgaccaagagagaatctgtctcaaaaaacaaa aaagaaatgaagaatgcctatgatgggctcagtagtagactggacatagc tgagcaaagaatctttgaacttgagctgggcatggttgcatgtgtctgta gttccggctacttgggaggctgaaattggaggatgacttgagcccacgag tttgagaccagcctaggcaacatagtgacatagtaagacactgtctctaa aaacaaagaatctctgaacttgaggatatgacaatagaaacttccaaaac tggaaagctaagaaaagaagaactaaaaaaaaaataacaacaacaaaata tccaagaactgtgggacaaatacaaaaggtataatatgtgcataatagga gaaagaaagaaacacacacggccgggcgcggtggctcattcctgtaatcc cagcactttgggaggccgaggcgggtggatcacgaggtcaggagatcgag accatcttggctaacacagtgaaaccccgtctctactaaaaatagaaaaa attagctgggcgcagtggcgggcgcctgtaatcccagctactcgggaggc tgaggcagaagaatggcgtgaacccgggaggcggagcttgcagtgagccg agatagcgccactgcagtccggcctgggagaaagagcgagactccgtctc tgaaaaaaaaaaaaaagaaagaaagaaagaaagaaagaaacacacacaat atttggtattgctaaacaataactaaagcgataactaaaaatgtcccccc aaatcaatgtcagacaccacaccataagaacaataagcaggctaaatgac aaacaaacaaacaaacaaaaaactacacctagatatatcatatccaaact tcagaaaatcaaagagaaagagaaacatcttgaaagaaggcagagggaaa atcaccttacctacagaggagcaaaaacaagaattacatctaacctctca gaaaccatgcaagcagagagtacagtcaaagagagaaaaccctatcagcc tagaattctgtatcctgcacaattcttcagaagaaaggccacatgtggtg tggctcacgtctataatcccagcacggcagatggattgcttgtgtccagg agtttgagactagcctgggcaacacagtgagatctcatctcaatgaaaaa taaacaaaattagccaagtgtcctggcgtgtacctatagtcctagctatt tgggaggctgaggtgggaggatcgcttaagcccaggaggttgaggtgagc tgagatcacaccactgcactctagcctgggcaacacagtgagatcttctc tcaaaaaaaaaaaaaaaaaaaagtgaaggaaaaataaagactttctcaag caaaccaaaatacggagaaattgttgcccacagacttgccttgccagaaa tcttaaaagaagttctttagagagaaggaaaatgagctaggcacggtggc tcatgcctgtaatcccaacactctgggaggctgaggcgggaggactgctt tagcccgggagtttgagatcagcctgggcaatatcagcagaccctatttc taacaaaaataaaaataaattagctgggcatggtggtgcttatctgctgt tccagctactccagtggctgagatgggaggattgctggagccagggatgt tgaggctgtagtgagctacaaccttgccactgcaccccagcctacgtgac acagtgagatgctgtctcaaaaaaaaaaaaaaaaaaaatatctgggattc gaatcagccctctccagctccaagcacactgggatccttgccgttggagg ttgctgttggcttctgcccgtgctgtgtattgagatctgcaaccaagcat gggagcttacgaaggcacctggtggtttctaggtaggaaaacagtgtgga tgccagatgtggtgcctcacacacaacagcactttgggagcctgaggcag gcagatcacctgaggtcaggaattcgagaccagcctggccaacagagaga aaccctgtctctactaaaaaaatacaaaaaaaattagtcaggcatggtgg tacctgtagtccctgctacttgggaggctgaggcacaagaatcacttgaa cctgggaggcagaggttacagtgagctgagatggtgccgctgcactccag cctgggagacagagcaagactccatcttaaaaaaaagggccgggtgcagt ggctcacgcctgcaatcccagcactttgggaggccaaggcgggcagttca caaggtcaggagttcaagaccagcctggccaaaatagtgaaatgccatca aaatagtgaaatgccatctctactaaaaatacaaaaattagccgggcatg gtggtatacacctgtaatcccagctactcaggaggctgaagcaggagaat tgcttgaacccaggaggcagaggctgctgtgagctgagatcgtgccactg cactccaccctgggagacagtgcgaaactccgtctcaaaaaaaaaaaaaa aaaagcgtggagtgtgcgtaattgtgtttgaggaagagatggtcactgtc tctatggtagtgtctttagggaactatctgtgcctggatcctggggaatt atgacaaagcagtctgcaggtttggcatgaactctgaactttctccctct gcaggatcttcaatagaaaggtcctgtcctgaggaaacactcagaggcac ttctcttgcttgtgaagcctgccagcatgtatgttttggtcagcactgag atgcacatgacaaattgtgcgcaatgcctcagcacagcccaacggggttt tttctgtttacacttgactagccttggacagtaggaggaaaccttaaata tttccttctactctcacaaaccctttatctttttttttgagacggagtct cactctgtcggccaggctggagtggcacgatctcagttcactgccacctc cacctcctaggttcaagcgatcctcctgcctcagcctcttgagtagttga gactacaagcgcacactgccacacctggctaatttttgtatttttagtag agacgaggtttcaccatgttggccaagctggtctcaaactcctgacctca ggtgatccacctgccttggcctcccaaagtgctggaattacaggcgtgag ctaccacacccggcctcacaaaccttttatctaataaaacaaaattactg ccaaaaaaaaaaaagagacaatgtcagagtcaatcaaaaaacaagaccta actgtatgttgtatatgagaaaacccctttaatacaaataaacatataga ttaaaagtaaaaagataggccgggtgcggtggctcacgcctgtaatccca gcactttgggaggccaaggcgggtagatcacaaggtcaggagatcgagac catcctggctaacacagtgaaaccccatctctactaaaaatacaaaaaat tagctgggcgtggtggcgggtgcctgtagtcccagctactcagaaggctg aggcaggagaatggtgtgaacccgggaggtggagcttgcagtgagcggag atcacgccactgaactctagcctgggtgacagagcgagactccatctcaa aaaaaaaaaaaaaaaaaaagtaaaaagataaagatataccctaatccaaa aaaggtgagaatagctatattaattttagaaagaaagatatcaggaataa agacgagcattgcataatgataaaatggtcaatactccaagatgacatac aaattcttaatgcaggccaggcatggtggctcacacctgtaaccccagca ctttgggaggccaaggtgggtggatcacttgaggtcaggagttcgagacc agcctggtcaacatggtgaaaccctgtctctactaaaaatacaaaaatta gccaggcatggtagcatgcacctgtaatcccagctactcaggaggctgag gcaggagaaccatttgaacctgggaggtagagcttgcagtgagccaagat tgtgccactgcactccagcctgggcgacagagcaagattccatctcaaaa aaaaaaataaataaaaaataaaaataaaaataattaatgtgtatgtagct aacaacagagcatcaaaatatgtgaaggaaaaactgacagaactgcaaag agacagatagatgaacctgctcttatattaacagttggagactttaacac ccctctatcagaaatggacgtatccagcaggcagaaaatcagtaaggaca cagttgaactcaacagcaccatcaatcaactggacataattaacatctat agattaattcatccaacaacagattatacattcttcttagctcacatgga acatttaccaagactgaccacattctgggccatataacacaccttaacaa gtttaaaagaaatcacacaatgtctgccctcaaactgcaatggaattaaa gtagaaatcagtgacagaaagatagctggaaaatatgaaaggacttagat taaataacatacttctaaataatacatgaagcaaaaaagaaatctccaga gaaattaaaaagtattttgaactacatgaaaatgcaacttataaaaattt gtaggatgcagcagaaacagtgattagtgggaaactgatagcagtgaata aacatactagaaaagaaaaagatctaaaatcaataatctaagcttccacc ttagaaaactagggggaaaaagagcaaattaaatccaaagcaagcagaag aaaagaaataataaaaattagtgtaggtcaggggtggtgccccatgccta taatcccggcactttgggaggctaaagcaggaggattgcttgaagccagg agtttgagaccagcctggtcaacactgcaagaccatgtctctataaaaaa taaaaaaattagccgggtatggtggtgctcaggaggctgaggcgggaaaa tcatttgaactgaggagtcggaggctatagtgagctaagatcacatcact gcactctagcctgggcaacaaagtgacaccctgtcaaatgaatgaatgaa caaaaaaagagtggccaacatggtgaaaccccatctctactaaatacaaa aagaaataaaaaaattagctgagcatggtggcgcgcgcctgtaatcccag ctactcaggaggctgaggcaggagaatctcttgaacctgggaggtgtagg ttgcagtgagccgagatcatgccactacactccagcctggcgacatagca agactctgtctaaaaaaaaaaaaaaaaaacctactataggcctggcatag tgcctcatgcctgtaatcccagcactatgggaggccaaagtgggaggatt gcttgagactaggagtttgagaccagcctggggaacataacgtgaccctg tttctaccaacaccccccgccccaaaaagaaaaaaaacctactagaaagc tacagtagcaaacatagtgtggtactagaaaaagaatagacacataaatt actatatatattttatttatttatttattttgagacggagtctcgttctg tcgcccaggctggagtgcagtggcgagaccttggttcactgcaagctccg cctcctgggttgacgccattctcctgcctcagcctccggagtagctggga ctacaagtccccgccaccacgcccggctaattttttgtattagccgggat tagtagagacgggatttcaccatgttggccaggatggtctcatctcctga cctcgtgatccgcccacctcggcctcccaaagtgctgggattacaggcgt gagccaccgcacctggccatcaattactatatttatgtgaataaacagag ggtccagaaatagacccacataaatatagtcactgatctttgacaaagaa gcaaaggcaatacaatgaagcaaagataatcttttcaacaaatggtgctg gaacaactggacatccacatgcaaaaaaatgaatctagacacatgcttta tagccttcacaaaacttaaaatggatcacagacctaaatgtaaaatacaa aactataaaatttctagaagataatataggggaacacctacaccaggggc cacacagcagggggtgagcagcaggcaagtgagtgaagcttcatctgtat ttacagccactccccatcacttgcattacagccagaactctgccttctgt cagatcagtcgcagcattagattctcaaaggagtcccaaccctactgtga actgtgtatgtgagggatctaggttgtatgccccttatgagaatctaatg cctgatgatctgtcactatctcccatcacccccagatgggaccatctagt tgcaggaaaacaagctcagggctcccactgattctacattatggtgagtt gtataattatttcactattatattataatttaataatactataaataaag tacacaataaatgtaatttgcttgaatcccccccacccccagtccatgaa aaaaattcttccaggaaaccagtccctggtgtcaaaaaggtgggggacca ctgacctagatgacttttgtatggcgatgactttttaaatacaacaccaa agtcatgatccatgaactaagtaattgataagctggactgtgctaaaatt agaaatatctgctctgtgaaagacaatgtcaagaaaatgtaaagacaagc tacagaatgggagagaatgtaaaagatacacctgataaaggactgtcata caaaatacacaaagaactcttaaaactgaacagtaagaaaatgaacaacc tgattaaaaaatgggcaacggactttagtcagacacttcaccaaagaaaa atacagatggcaaataagcatgctcaacatcatgccatcagggaaataca aatacagataattgttattagtattattttttgagacagactctcactct gtcacccaggctggagtacagtggcgtgatcttggctcactgcaacctcc gcctcccaagttcaagcgattctcctggcttcagcctcctgagtagctgg gactacaggcacttgccaccacgcccagctaatttttgtatttttaggag agacgggggtttcatcatattggccaggctcttctcgaactcctgacctc aggtgatccacccgccttggccttccaaaatgcagggattacaggtgtga gccacccggcaggcattaaattgttgtttaatgcaaattaaaacaacaaa atccattacacacctattagagtggccaaaatccaaaacactgacaacat taaatgttgataaggatgtggagcaacaagaactctcattcatttttggt gggaatgctaaataatatagccactttggaagacagtttgacagtttctt acaaaactaaacatagtcttaccataaacatagtattaccatatgatcca gtgatcacactcattgatatttacccaaatgaacagaaaacttatgtcca cacaaaagcctgcacatggatatttatatagcagctttctttactcataa ttgccaaatctcggaagcaatgaagatgtccttcagcaggtgaatggata aattgtggtacattcagacaatggaatattactcagcactaaaaagaaat aaggtatcaagccatgaaaagataaggaagaaacttaaatgcatattact aaatgaaagaagctagtatgaaacggccttatattgtatgattccaacta aacaacattctgtagacggcaaaactacggagacagtagttttggcaaga tcagtgcttgccagcggtcagtgggaagtgagggatgaacaggcagagca cagaggatatttaggacagtgaaactattctgtatgatactacaatggtg gatacgtgtcattttatatattgatttgacaggagttcactctgttgccc aggctggtgtgcagtggcacaatcatggctcactgcagccttgacctccc gggctcaagcagtcctcccacctcagcctcctgaatagctgggactacag gtgcacattaccacacctggctaatttttaattttttttttgtaaggatg gggtctcactatgttgcctaggctggtcttgaactcctgggctccagcaa tcctcctgcctcggcttcccaaagtgttgagattacaggcatgaaccact gtgcccggcctcattatatatttgttaaaacctacagaatttgtaatacc aagagtgaattctaatataaactatggacttgggtgataatatgtcaatg caggttcattgaatgtaacaaattagcattgtggttcagtatgttgacag tgggggaggctagtgtgtgtgaggacaggaggtatatgagaatcctctgc acttttcactcaattttgctgtgaatctaaaactgctctaaaaaaaatag tttcttaatttaaaaaaaagcaaaggaagagcataaatgaaaaaagaaaa gactgaaaaccactaagtaaatgtgaatagcttcttgcatattcattctc atacagtgctcagaaaagacagaaaaccaaagcatagcagaaagtccagc ttccaagcaggaactggtgtctccttgacagagcttcaatccaattatgc cccagacactgatttttctatcactacctcccctttgtttcagagacagt atccatgaccttctgttcatgcggtggaggaagaaattgcttcccagaaa ggcttagcccactccattctactagataaaggaaaaacaaaaagccactc cactctgacaagccgattactgtctatggagaagcaaaatgcctccctta ggactatgggaatagccaacaaggtacagtatgttcaaacctttaaccat gactctgccaagagcagctatgtatgtgaatacaatctcctcatctgggc agtcaaatatgcataactgaaagtacaaacctccataaggtcctgtctag ttatagtttcaaatgcacagaaagtgaaagaaggctgaaataaagtctcc cctatccactattgcaattatctaggtaaagcctagaaggaaagaacagg aaagactgaagagggaaagttaaagtgttgaatataatatgatattttat a'

In [254]:
def pattern_find(reference, pattern):
    start = datetime.datetime.now()
    pattern_location = []  # Creates empty list to store location of alignments. 
    reference = reference.upper().replace(' ','')  # Removes whitespace, converts to caps.
    reference = re.sub('[^A-Z]+', '', reference)  # Uses a regex to remove char not a A-Z.
    
    for pattern in pattern_list:
        pattern = pattern.upper().replace(' ','')  # Removes whitespace, converts to caps.
        pattern = re.sub('[^A-Z]+', '', pattern)  # Uses a regex to remove char not a A-Z.
    
        for i, base in enumerate(reference):
            query_pattern = reference[i:i+len(pattern)]  # "Window" used for search.
            
            if query_pattern == pattern:
                pattern_location.append(pattern)
                pattern_location.append(i+1) 
    
    stop = datetime.datetime.now()
    pattern_find_duration_real_data = stop - start
    print('{} pattern find'.format(pattern_find_duration))
    print('{} pattern find with real data'.format(pattern_find_duration_real_data))
    return(pattern_location)


In [255]:
print(pattern_find(reference, pattern))

0:00:00.001400 pattern find
0:00:01.280562 pattern find with real data
['TACTTGCATTTTTTCTAAACAC', 3184]


### Fuzzy?

In [256]:
def mismatch(string1, string2):
    mismatches = 0
    for (nucleotide1, nucleotide2) in zip(string1, string2):
        if nucleotide1 != nucleotide2:
            mismatches += 1
    return(mismatches)

In [257]:
def approximate_patterns(text, pattern_list, max_mismatches):
    start = datetime.datetime.now()
    reference_upper = reference.upper().replace(' ','')
    reference_clean = re.sub('[^A-Z]+', '', reference_upper)
    
    pattern_matches = []
    
    for pattern in pattern_list:
        
        string_upper = pattern.upper().replace(' ','')  # Removes whitespace, converts to caps.
        string_clean = re.sub('[^A-Z]+', '', string_upper)  # Uses a regex to remove char not a A-Z.
        
#        for i, base in enumerate(reference_clean):
        for i in range(len(reference_clean)-len(pattern)+1):
            query_pattern = reference_clean[i:i+len(pattern)]
            if mismatch(string_clean, query_pattern) <= max_mismatches:
                pattern_matches.append(i+1)
                pattern_matches.append(pattern)
    stop = datetime.datetime.now()
    pattern_find_duration_fuzzy = stop - start
    
    print('{} pattern find'.format(pattern_find_duration))
#    print('{} pattern find with real data'.format(pattern_find_duration_real_data))
    print('{} pattern find with real fuzzy data'.format(pattern_find_duration_fuzzy))
    return(pattern_matches)
#print(approximate_patterns(text, pattern, max_mismatch))


In [258]:
print(approximate_patterns(reference, pattern_list, 1))

0:00:00.001400 pattern find


NameError: name 'pattern_find_duration_real_data' is not defined

In [114]:
from tinydb import TinyDB, Query
import re
import json

In [115]:
db = TinyDB('/Users/ksindy/PycharmProjects/oligo_search_website/pratice_db.json')

In [116]:
def pattern_find_tinydb(pattern_list, reference):
    print(datetime.datetime.time(datetime.datetime.now()))
    aligns = []  # Creates empty list to store alignments
    pattern_length_list = []  # Creates empty list to store pattern lengths
    
    for pattern in pattern_list:  
        if len(pattern) not in pattern_length_list:  
            pattern_length_list.append(len(pattern))
            
        for i, nucleotide in enumerate(reference):
            ref_chunk = reference[i:i+len(pattern)]  
            db.insert({'sequence': ref_chunk, 'index':i+1})  
            #print(ref_chunk)
                
        sequence_match = Query()
        matches = db.search(sequence_match.sequence == pattern)
        matches = json.dumps(matches)
        locations = re.findall(r'\d+', matches)
        aligns.append('{} {}'.format(pattern, locations))
    print(datetime.datetime.time(datetime.datetime.now()))
    return(aligns)

In [117]:
db.purge()

In [118]:
pattern_find_tinydb(pattern_list, reference)

21:21:14.863340


KeyboardInterrupt: 

In [105]:
def pattern_find_tinydb(pattern_list, reference):
    
    aligns = []  # Creates empty list to store alignments
    pattern_length_list = []  # Creates empty list to store pattern lengths

    for pattern in pattern_list:  
        
        '''
        DETERMINE LENGTH OF PATTERN
        Do not need to add to db if pattern length already seen
        '''
        
        if len(pattern) not in pattern_length_list:  
            pattern_length_list.append(len(pattern))
            
        '''
        ADD REFERENCE TO DATABASE
        Splits reference into pattern length chunks and adds to the db 
        along with location (i).
        [{'sequence': 'cact', 'index': 1}]
        [{'sequence': 'cact', 'index': 1}, {'sequence': 'acta', 'index': 2}]
        ...
        [{'sequence': 'cac', 'index': 1}]
        [{'sequence': 'cac', 'index': 1}, {'sequence': 'act', 'index': 2}]

        '''

            for i, nucleotide in enumerate(reference):  
                ref_chunk = reference[i:i+len(pattern)]  
                db.insert({'sequence': ref_chunk, 'index':i+1}) 

            '''
            FIND ALIGNMENTS
            [{"sequence": "acac", "index": 9}, {"sequence": "acac", "index": 74}]
            [{"sequence": "cag", "index": 12}, {"sequence": "cag", "index": 69}...
            [{"sequence": "ttat", "index": 44}]
            [{"sequence": "agaa", "index": 15}, {"sequence": "agaa", "index": 27}]
            [{"sequence": "ttatcaaatt", "index": 44}]
            '''
        sequence_match = Query()
        matches = db.search(sequence_match.sequence == pattern)  # bit-like object
        matches = json.dumps(matches)  # Dumps json to python dict, in this case to string
        locations = re.findall(r'\d+', matches)
        print(matches)
        aligns.append('{} {}'.format(pattern, locations))
    return(aligns)

IndentationError: unexpected indent (<ipython-input-105-55f715084728>, line 28)

In [442]:
final_list = []
working_list = []

def neighbors(pattern, mismatch_num):
    nucleotides = 'ACGT'
    current_list = []
    final_list = [] # this will mess up the recursive algorithm if more than 1 mismatch

#     for pattern in list:
    pattern_whole = ''
    final_list.append(pattern)
    for index, nucleotide in enumerate(pattern):
        rest = pattern[(index+1)::]

        for i, nuc in enumerate(nucleotides):
            working_pattern = ''
            working_pattern += pattern_whole
            working_pattern += nuc
            working_pattern += rest

            if working_pattern not in final_list:
                final_list.append(working_pattern)
                current_list.append(working_pattern)
        pattern_whole += nucleotide

    if mismatch_num > 1:
        for item in current_list:
            working_list.append(item)
        return neighbors(working_list, mismatch_num-1)
    else:
        #return 'this is result{}'.format("\n".join(final_list))
        return final_list
final_list = []
working_list = []

In [382]:
print(neighbors('CACT', 1))

['CACT', 'AACT', 'GACT', 'TACT', 'CCCT', 'CGCT', 'CTCT', 'CAAT', 'CAGT', 'CATT', 'CACA', 'CACC', 'CACG']


In [505]:
def pattern_find_tinydb(pattern_list, reference, mismatch_num):
    print(datetime.datetime.time(datetime.datetime.now()))
    aligns = []  # Creates empty list to store alignments
    pattern_length_list = []  # Creates empty list to store pattern lengths
    reference_upper = reference.upper().replace(' ','')
    reference_clean = re.sub('[^A-Z]+', '', reference_upper)
    
    for pattern in pattern_list:  
        string_upper = pattern.upper().replace(' ','')  # Removes whitespace, converts to caps.
        string_clean = re.sub('[^A-Z]+', '', string_upper)  # Uses a regex to remove char not a A-Z.
        
        if len(pattern) not in pattern_length_list:  
            pattern_length_list.append(len(pattern))
            
            for i, nucleotide in enumerate(reference_clean):  
                ref_chunk = reference_clean[i:i+len(pattern)]  
                db.insert({'sequence': ref_chunk, 'index':i+1})  
        aligns.append(pattern)     
        all_matches = ''
        if mismatch_num > 0:
            for neighbor in neighbors(string_clean, mismatch_num):
                matches = db.search(Query().sequence == neighbor)
                matches = json.dumps(matches)    
                all_matches += matches
                #print(matches)
        else:
            matches = db.search(Query().sequence == string_clean
            matches = json.dumps(matches)    
            all_matches += matches
        locations = re.findall(r'\d+', all_matches)
        list1 = [int(x) for x in locations]
        list1.sort()
        aligns.append(list1)
    print(datetime.datetime.time(datetime.datetime.now()))
    return(aligns)

In [506]:
db.purge()
final_list = []
working_list = []


In [507]:
print(pattern_find_tinydb(pattern_list, reference, 0))
#cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacaggct

15:30:35.708530


KeyboardInterrupt: 

In [413]:
def pattern_fast(pattern_list, reference):
    for pattern in pattern_list:
        if pattern in reference:
            print('yes')
            return 'yes'

In [414]:
print(pattern_fast(pattern_list, reference))

None


In [None]:
from tinydb import TinyDB, Query
import re
import json
import re

In [17]:
reference = 'cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacaggct'
pattern_list = ['acac','cag', 'gtctagaat', '']
pattern_list_2 = ['*BIOTIN*-ACAC', 'TGTC', 'aaa', 'act gag act ctt gc']
pattern_length_list = []

In [19]:
for pattern in pattern_list:
    pattern_length = len(pattern)
    if pattern_length not in pattern_length_list:
        pattern_length_list.append(len(pattern))
        for i, nucleotide in enumerate(reference):
            ref_chunk = reference[i:i+pattern_length]
            db.insert({'sequence': ref_chunk, 'index':i+1})
            #+1 because of 0 indexing. This will give teh actual start site of the pattern.
    sequence_match = Query()
    print(db.search(sequence_match.sequence==pattern))

[{'index': 9, 'sequence': 'acac'}, {'index': 74, 'sequence': 'acac'}]
[{'index': 12, 'sequence': 'cag'}, {'index': 69, 'sequence': 'cag'}, {'index': 77, 'sequence': 'cag'}]


In [55]:
def pattern_find_tinydb(pattern_list, reference):
    matching = ''
    #matching_dict = {}
    for pattern in pattern_list:
        pattern_length = len(pattern)
        if pattern_length not in pattern_length_list:
            pattern_length_list.append(len(pattern))
            for i, nucleotide in enumerate(reference):
                ref_chunk = reference[i:i+pattern_length]
                db.insert({'sequence': ref_chunk, 'index':i+1})
                #+1 because of 0 indexing. This will give teh actual start site of the pattern.
        sequence_match = Query()
        matches = db.search(sequence_match.sequence==pattern)
        matches = json.dumps(matches)
        #print(matches)
        locations = re.findall(r'\d+', matches)
        #print(type(locations))
        #print([int(loc) for loc in matches.split(" ") if loc.isdigit()])
        #print("{} is found at {}".format(pattern, locations))
        #matching += str((db.search(sequence_match.sequence==pattern)))
        #matching_dict.add(db.search(sequence_match.sequence==pattern))
    return(matching)
print(pattern_find_tinydb(pattern_list, reference))

NameError: name 'pattern_length_list' is not defined

In [23]:
def pattern_find_in(pattern_list, refernece):
    pattern_match = []
    for pattern in pattern_list:
        if pattern in reference:
            pattern_match.append(pattern)
    return(pattern_match)

In [24]:
from timeit import timeit

In [25]:
print(pattern_find_tinydb(pattern_list, reference))
print ("pattern_find_tinydb:{}".format(timeit(
                                    "pattern_find_tinydb(pattern_list, reference)",
                                    "from __main__ import pattern_find_tinydb;"
                                    "reference='cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacaggct';" 
                                    "pattern_list=['acac','cag'] "
                                    , number=100000)))

print(pattern_find_in(pattern_list, reference))
print ("pattern_find_in:{}".format(timeit(
                                    "pattern_find_in(pattern_list, reference)",
                                    "from __main__ import pattern_find_in;"
                                    "reference='cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacaggct';" 
                                    "pattern_list=['acac','cag'] "
                                    , number=100000)))

print(approximate_patterns(reference, pattern_list, 0))
print ("approximate_patterns:{}".format(timeit(
                                    "approximate_patterns(reference, pattern_list, 0)",
                                    "from __main__ import approximate_patterns;"
                                    "reference='cactaagcacacagagaataatgtctagaatctgagtgccatgttatcaaattgtactgagactcttgcagtcacacaggct';" 
                                    "pattern_list=['acac','cag'] "
                                    , number=100000)))




In [34]:
print(pattern_length_list)
print(db.all())

[4]
[{'index': 1, 'sequence': 'cact'}, {'index': 2, 'sequence': 'acta'}, {'index': 3, 'sequence': 'ctaa'}, {'index': 4, 'sequence': 'taag'}, {'index': 5, 'sequence': 'aagc'}, {'index': 6, 'sequence': 'agca'}, {'index': 7, 'sequence': 'gcac'}, {'index': 8, 'sequence': 'caca'}, {'index': 9, 'sequence': 'acac'}, {'index': 10, 'sequence': 'caca'}, {'index': 11, 'sequence': 'acag'}, {'index': 12, 'sequence': 'caga'}, {'index': 13, 'sequence': 'agag'}, {'index': 14, 'sequence': 'gaga'}, {'index': 15, 'sequence': 'agaa'}, {'index': 16, 'sequence': 'gaat'}, {'index': 17, 'sequence': 'aata'}, {'index': 18, 'sequence': 'ataa'}, {'index': 19, 'sequence': 'taat'}, {'index': 20, 'sequence': 'aatg'}, {'index': 21, 'sequence': 'atgt'}, {'index': 22, 'sequence': 'tgtc'}, {'index': 23, 'sequence': 'gtct'}, {'index': 24, 'sequence': 'tcta'}, {'index': 25, 'sequence': 'ctag'}, {'index': 26, 'sequence': 'taga'}, {'index': 27, 'sequence': 'agaa'}, {'index': 28, 'sequence': 'gaat'}, {'index': 29, 'sequence'

In [15]:
db.purge()
pattern_length_list = []
print(db.all())

[]


In [41]:
# def reference_dict(pattern_length, reference):
#     for i, nucleotide in enumerate(reference):
#         query = reference[i:i+pattern_length]
#         query_number = pattern_to_number(query)
#         if len(query) == pattern_length and query not in query_dict:
#             query_dict[query]=str(i+1)+','
#             #+1 because of 0 indexing. This will give teh actual start site of the pattern.
#         elif len(query) == pattern_length:
#             query_dict[query]+=str(i+1)+','
            
#     db.insert(query_dict)
#     return (db.all())

In [42]:
def symbol_to_number(symbol):
    dict_symbol = {'A':0, 'C':1, 'G':2, 'T':3}
    return dict_symbol[symbol]

def pattern_to_number(pattern):
    pattern = pattern.upper().replace(" ","")
    regex = re.compile('[^agctuAGCTU]')
    pattern = regex.sub('', pattern)
    if not pattern:
        return 0
    symbol = pattern[-1]
    prefix = pattern[0:-1]
    return 4*pattern_to_number(prefix) + symbol_to_number(symbol)