In [1]:
import collections
import functools
import itertools

# Row index of each nucleotide in a profile matrix (A=0, C=1, G=2, T=3).
SYMBOL2NUMBER = {symbol: row for row, symbol in enumerate('ACGT')}

def find_profile_most_probable_k_mer(text, profile):
    """Return the k-mer of ``text`` with the highest probability under ``profile``.

    Args:
        text: String whose symbols are keys of ``SYMBOL2NUMBER``.
        profile: Matrix indexed as ``profile[row][position]``, where ``row`` is
            ``SYMBOL2NUMBER[symbol]`` and ``position`` is the offset inside the
            k-mer. k is taken from ``len(profile[0])``.

    Returns:
        The k-mer with the largest product of per-position profile entries.
        On ties the leftmost k-mer in ``text`` wins.

    Raises:
        ValueError: If ``text`` is shorter than k and so contains no k-mer.
        KeyError: If a k-mer contains a symbol that is not in ``SYMBOL2NUMBER``.
    """
    k = len(profile[0])
    if len(text) < k:
        # Without this guard the loop below never runs and the function used to
        # fail with an UnboundLocalError at the return statement.
        raise ValueError(
            "text of length {} contains no k-mer of length {}".format(len(text), k)
        )

    most_probable_k_mer = None
    # -inf guarantees that the first k-mer is always accepted.
    max_prob = float("-inf")
    for start in range(len(text) - k + 1):
        k_mer = text[start:start + k]
        # Start from 1 so that the empty product (k == 0) is well defined.
        prob = 1
        for index, symbol in enumerate(k_mer):
            prob *= profile[SYMBOL2NUMBER[symbol]][index]
        # Strict comparison keeps the earliest k-mer when probabilities tie.
        if prob > max_prob:
            max_prob = prob
            most_probable_k_mer = k_mer
    return most_probable_k_mer

def score(motifs):
    """Return the number of symbols that differ from their column's majority.

    The motifs are read column by column; for each column the count of the
    most frequent symbol is subtracted from the number of motifs, and the
    differences are added up. An empty motif list scores 0.
    """
    total = 0
    for column in zip(*motifs):
        majority_count = max(collections.Counter(column).values())
        total += len(motifs) - majority_count
    return total

def build_profile(motifs):
    """Build the frequency profile of a non-empty list of motifs.

    Args:
        motifs: List of strings over the symbols in ``SYMBOL2NUMBER``; k is
            taken from the length of the first motif.

    Returns:
        A list of four rows with k entries each. ``profile[row][i]`` is the
        fraction of motifs whose i-th symbol maps to ``row`` in
        ``SYMBOL2NUMBER``; symbols absent from a column keep the value 0.

    Raises:
        IndexError: If ``motifs`` is empty.
        KeyError: If a motif contains a symbol that is not in ``SYMBOL2NUMBER``.
    """
    k = len(motifs[0])
    # Transpose so that columns[i] holds the i-th symbol of every motif.
    columns = list(zip(*motifs))

    profile = [[0] * k for _ in range(4)]
    for i in range(k):
        # A single Counter is sufficient: wrapping it in a second Counter only
        # copied it, and most_common() sorted entries whose order is irrelevant.
        for symbol, count in collections.Counter(columns[i]).items():
            profile[SYMBOL2NUMBER[symbol]][i] = count / len(motifs)
    return profile

def greedy_motif_search(dna, k, t):
    """Greedily pick one k-mer from each of the first ``t`` strings of ``dna``.

    Every k-mer of ``dna[0]`` is tried as a seed. For each seed, the motif list
    is extended one string at a time with the k-mer that is most probable under
    the profile of the motifs chosen so far. The candidate list with the lowest
    ``score`` is returned; the leading k-mers of each string are the baseline.

    Args:
        dna: List of strings to search.
        k: Motif length.
        t: Number of strings from ``dna`` to use (normally ``len(dna)``).

    Returns:
        The best-scoring list of motifs, one per string used.

    Raises:
        IndexError: If ``dna`` is empty; may also be raised when ``t`` exceeds
            ``len(dna)``.
    """
    # Seed the baseline from the same t strings the candidates are drawn from.
    # Previously it was built from all len(dna) strings, so the two scores were
    # computed over lists of different sizes whenever t != len(dna).
    best_motifs = [text[:k] for text in dna[:t]]
    # Cache the best score instead of rescoring best_motifs on every iteration.
    best_score = score(best_motifs)
    for i in range(len(dna[0]) - k + 1):
        motifs = [dna[0][i:i + k]]
        for j in range(1, t):
            profile = build_profile(motifs)
            motifs.append(find_profile_most_probable_k_mer(dna[j], profile))

        candidate_score = score(motifs)
        # Strict comparison keeps the earliest candidate when scores tie.
        if candidate_score < best_score:
            best_motifs = motifs
            best_score = candidate_score
    return best_motifs

# Sample dataset: five strings searched for 3-mers.
dna = [
    "GGCGTTCAGGCA",
    "AAGAATCAGTCA",
    "CAAGGAGTTCGC",
    "CACGTCAATCAC",
    "CAATAATATTCG",
]
res = greedy_motif_search(dna, k=3, t=5)
print(" ".join(res))

CAG CAG CAA CAA CAA


In [2]:
dna = ["GATGGACCGGGCCATACATGGTGACACGCATCAGAAAGCTGTCCCCGTGCCTTATGCGCTGTCTGTTAGTACATCTCTCTCAATGGCCGTATTTTCAGAACAAGTATCACTTGGATCATCATCTACTCGACGGAGGGCGCGCAAGTGGGTATCTCG",
"TAAAAAGGTATAAGGGAGTCATATCCGCAGTCCTAGTGACCTTTCCCGGCCCTAGCAGTGCTCCGATAGCCCATGGATGAGACGTAACTCGGCTACTGTTTGTGACTCAAGATAGTTGCCGTCGATATCTCGGATTCTGCTTATCGTGTTACGAGC",
"AACCACGAGTACCTCTGTCGTGGTCCTTCACCAGGACTCGAAATTTGGCTCACGCCCAACGCCAAGATTACGTCGATCGTTCCTGTTGATATCTCGCCGCATATCAGGTTTATACTGATCGGCTCAGTGATTGTTAATCATCGGCGCGGGTTGTCA",
"TGTCATGTGCGGATACAGTGGTGACACCAGGTCACCTCCGACCTAAAAGCGTTCAAGGTATGGCCCGAAGAGGTAGGTATCTCTTGTGCCCGCTGCTGCTATCACTCCCTGTGAGCCCCGACACGAATGTTAAACCAGTTTATATTCGCTCGTCAT",
"AACCTAAACCCTTGCTTCCCACCATCCCTGCGAAGCAACCTTATCCGTAGTTCAGTCGCGCTGAACTCGGAAAGTGCGCTCACGAGATTCTAACTTACACTTGTTTAAGTACCACGTCCGGTGGATATCGCTGGCGTTGAAGGATAGTTCTGGTTA",
"GTGCCCGATATGCCCGTCAGGGTTGAAGTCTGGGAAGTCGTTATCCCAAACGAAATACCCCGTTCGCAGGCGTGATGTATATCCTGATTAGTACCGCGTATAATATTAGTTTCGCACAGGAGCGTGCTTGTTTTGGGTCATGGATTGGGTACAGCA",
"TAGTCTTCGAGGCATCTACCTGCGACCGAGCTTGCGATCCAAAAGCATACCCATGAAGTTTGCAAATCTTTCTTAGAGCTACAGGTAGATATCCCTGGGCCGGTAACTAGTAATATGTACAGTTTAGTTGTCCTGTACAATACTGATTGGAGTCAG",
"AGGAGACGTGTTTGGTAGGCGCTCGACCCTTACCCCCCTATCTCGACTGGCATTGGACGTCCTCAGACTGGTGGATTTGACCTAGTGGTTATCACGCACATGGGAGAACCCGGTCAGAATACATCCTGTTACAGTCAGACGCCGGGTCATTCGCAA",
"TGTGGGGATCGTCTGATTCATCGACGTCATGGAAACGGGGACGGGCCAGTGGTTATCCCATGCTGCCTTTTAGGAATTCTAGGAGGCGTTCCTAAATAGCTCGCCCGACGATATCCTACTTGATTGTGGCGGTATACCTTGCTGAACCTCGCAGTA",
"CGCAGTGCACTAGTGGCTATCGCCTTTCTATCCCTGAGGCGTTTGCGTTATAAGATCACTCTGTCGGTGTGAAACCACTGAGTCACACTGACCGGTCGACTGGCCCGTCATATGCAAGTTGAAACGCTTACTGCCGGGTATCGCTCTAAGCTCGAC",
"TACTCGTAACTTCAAAATCTCGTAGGGTCTGCAGAGTAAGAATCCCGGATACTTCAACATTATCGATTCGTCAAGACGTGCGGAGTGGATATCCCTTATTCCTATACGTCACAAGCCGCGGTCAAGTCGCTTACGCACGGTAGAGCGGGAAACCCT",
"GCTCTAGTACGCCACAGTGCCAGTACATGACCTCACGAGCCGCCACTTACGTTGATATGTTATAAATCACTAGTTTCGTTGGTACAAACAATAAGTGAGAAGCAATGAGCAACTTATCTCATAAAAAGTGTGGTCGTTATCACACATGACATAAAC",
"GAGACGGTCGTAGAAATTTGCGCTTGCTCGTATCTCAGTATCCTTCAAAGATTGAACCGGATCGCGGCGGCTAATATTGAAATCCTTAGACTTAACGTTGGTATCACTTTAGAATTTCTGACTTGAGGGTAGTGACTAGACAATCATGATGGAAGA",
"GATCGGTGGCAGTTGAATTAAGACTAGTTATCCCCTGCTTACACTTTTTCCGCCCGGACACGTGTGACGGTAGTTGATATCTCTCAAGTATCCCGACTTCACGTACGATGCCACCCATCTCCGGCAAACACACTTCTATAATATCGTGGAGCCGAA",
"GTAGGTATCACCAGAGTTCTCTCGGTATGTGGCGCTAAAACCTCTTAGAGTATGAAGGGTGAAGACCAAGCTCATACCACCCCTATATAGGGCTAATTAATTCCACATCCAGGCAAGATGTCACCCTACAGGTCGCTCACTTGTGGAGAACATCAC",
"GTGGCTATCGCTGTGATCCGTCACACAAAATCAACTTTGTAGTATTTTGGGTAGTCGGGATAACGCGTGGTCTAAGGTGAGCTGCCTTTGATCCGTTGGGTGGCTACCTTGCAAGTTACCGGCTTGTCACTAGATATAACCGGAAGTCTCTGCAAA",
"TGAGCAGACCCGACAAAGGCCATGACTTCAAAACCGGTTGTGCAGCGACAGGTACTTTAAGTACGGCACCATTAATATCGCTATACTTACGAGTTAGTGGGTATCTCATGCAAACAAACTGCTACTAGGAACTTAGACGAACTTACCAGGAGGATT",
"AGTGATCTGAGCATAGTATACTGAGAACGTGTGTGTGACCCCTCCAGCGTCCCCGGCTACTGTTCCAGATCCTAACTAATTACTGCTAGCGATCTGGTCGATATCGCTACGGAACGAAGCCGTACAATCGCCCAGAAGCGGTTAGTCAAGGGCGTT",
"CAAAATGGGAGTACTTCTCGTAGAATAATTCGTCTGTAATTCCTAGGTTCCATAGATAGGCCTCGATAGTGAAATTCCTTCATGCCCCGAGGTGGCGTTGATATCTCCTTTCTAAGCGGTACCCTTAAGAGTACTCGCGATGGGCTTATCCTCCTC",
"GTTGGTATCACCCCCAATCCTCTTAGTTTACACTGTAAGATTAACGTCAGGGTGTTGTGGATGGCTTTTCTAATTTAGGTCCTCGGGATGCTCAGGTGTTACTATCGGACTGAGTGAATGTAAGTCCGGGTATCAGCCATGAAACCACTGGAGATC",
"CCTCGGAAAACGTCGTAAGTGGTATCACAATCCACTAGTCACGGAGGGCGGTGAGTGTCTCGATGCTCAACCCCCAACCCTCAGATGAGGCTATCTGTAGGTATCACTTACGTCACTACGGGGAACAGCTCGCCTTAAGTGTAGGCTAGGTATATG",
"GGCGGCTCCATCCGATGTGACGGATTTCATGAGGCACAAGCGCTTCACTCCCTATTTGGCTCGTGACAAAGTTCAACGGCTTTGCAGATATCCATGGTCGTTATCCCGGTGGTGAACCTACCGTCAAGTCTCTACAGTGCGCGAAGTGTCCCGGGC",
"TACTAGTATAAGTTCCGAGCCAAATGGATGGCCCAGGCGCACTAGTGTTAGAAGGATTCGGGCTGTAGGTACATCAAGCTCGAATTTGTCCCACGTCATTCTGGGCCACCCGGACCACAGAAGACCTCTCTCTGTGCCGACGGTGTGGTTATCACG",
"GTGGTTATCACCGGGATCGAATACAGATACGTCTGGTACATGCCTTGTCATTATTCATACGCCCCCTGGGCCAACAATCCTTTGTCAACCGCGGTCAAAAAGGTAACTAGGATCGGCTTGATCTCTAATTCCGGACTGTTCACCACGGGTCGACCG",
"CATCACGCAATGCGAACGACTGAAGAAGGCAAGGACAGTTACGCAACCTATCATGCGTAGATCAAGGTAATCGGGACCGGTCTGGAATTTAGGAGTGTTGTTATCGCAACCTGCGATCATAACATCCTCTTATTGCCTATAAACCGACCCTGACCG"]
# Search the 25 strings above for 12-mers and print the motifs on one line.
res = greedy_motif_search(dna, k=12, t=25)
print(" ".join(res))

AGTGGGTATCTC TAAAAAGGTATA AACCACGAGTAC TGTCATGTGCGG AACCTAAACCCT AGTCGTTATCCC AGTAATATGTAC AGTGGTTATCAC AGTGGTTATCCC AGTGGCTATCGC AGTGGATATCCC AGTGAGAAGCAA AGTGACTAGACA TAAGACTAGTTA TATGAAGGGTGA AGTCGGGATAAC AGTGGGTATCTC AGCGGTTAGTCA AGTGAAATTCCT TGTGGATGGCTT TGTAGGTATCAC TGCAGATATCCA TGTGGTTATCAC TGTCATTATTCA TGCGTAGATCAA


In [3]:
dna = ["TGGTCAGGTCGGGCGAAGTTTATACGAGATCGTCCAACGGCAACGCCTTATAACACCACCTTTTACCGTTCAGATCTTAATCGTAAACTACGTAGCTGGCTGGGCTTACATCTCTCACCAAAGGCACGGAATAACTGAAATGCTCGAGTAGTGACC",
"GCAACGCCAGGTAATATAGTTAGCTTACGTATCCGCATGAATTATGGGGCTCTCACGGGAGTCGCAGCACCGTAACGCTGGCCTTACGTAGATCCGTTACTTCGGGATATTTATACTCTGAGACAACTCCCACGAAGCGACTCGCACGGGGTGGCA",
"AAAACCCGGATCGCAACATTGCCGATGTAACCCCCTTAGTCTCCCTTACTCAGATCTGAAATCGCTCATTGCGACCCAGTAAAGCTAGGGCCAGCCGAGGCAATTTGGAAGGTAACTAGCGTACGGATCATACCAGGCTATATACACCTTAATGGA",
"TTCTCCGCTGCTTCCATTTCTACTTCCCCCCTGAATGAGTTTGCACGCAACGTAAATAGCGTTGAGGTGATGCTCCACAAGGAAGCTACAAGGGTGGAGCACATAATTTGAGACCATCCCGGCTCAACCGTCCTCAGACATTCCGACGGACAGAGG",
"AGAAGCTGGATGAAGATACCTAGCGTCCTTGAGCGTACTTACGTGAGAGAAATATTCTTCGACGATCCGTGCAAGAATGAATGTGCAGCTCGGCGCATTAAATGTCCTATGACTTGAAGGCGATGAATGTTTGGAACGGCGAAACTCTGTAAGTCT",
"GCTATTGGGGCTGTGATGTCAGGATCGACTGTTGTCAAACAACGAATCCAGTTATCCATATAATCTATGTTAGTAGTTATGGTTAAATTTGCGTCGATGATAGGGTCTCAAGACAGACCTAGACGCCGACCAAATATAACTAGCAGTTTGAGTAAT",
"TTCCGGTTTGAAAAATTAAATAGCTGGGAGAACAACTCGGTTTTGGATCTGATCTGGCATCATTCAGTTACACCATGGTGAAACGACAGCACGTTCGATCTGTTTTTCTTGGAACCGTTACGTGCAAGAACCCTCTATCGGTAGGTCTGCATGTGC",
"TTACTTGGTCCTATATATGGCAATACTAGTTGCAGTAGGCGGCGACAGGCGGCTGTAGTCCCTGCTGATTTCCCGATAAGTTTCCAACAACAGTTTAATGTATCTAGCACACGTAATTACGGGGTTCTGGGCGTAGAGCAAAAATTCTTATCCGGC",
"TATACCTCCTGAGTGATTCCCTCCGTGGGCGCACGTAATGGTTCCGCTAATGTAAATAGCATTTACAATTAGACCTTAGGCCGTGATTTGATTAATTTCAGAAAACCAACCCTCTTTACCGTCTCAATTGCTGAGTTATCCGTATACGGACTTAAC",
"AAGGTAACTAGCGCCAATCAGCAAAATCACACCACCGCGCCTTACATAGCCGTCGGAGCCGTTCGTTTTACTCGCGTCCCAGTTCCGCGGGGTCGTACGTAGCCTACGTAAGAGCAGTCATACGCAAGGGACACTGGTTAATTTTGTACCCCGAAC",
"AAACGCTAGTACCCCCGATTTATGTTCGTCGCGGTAAATCTATGTAGCGACCATCTTAGTTCCGATGGAAACCCACTACATCCCTTTACCCAATATTCGAAATTTCCCCTATTCTGCGCCTGCCTGACACACACAGAGTTTTCCACGCTAGTCGTA",
"GTGTAACGAACACGAGCGAGTATGCAGTCAACTCCTCGTGACTACTCCAAGGTAACTAGCCACACCCAATCCTGCTGCCTATGTCGTGATAGTACATACTAGATTGGTAACGTCCACTCGGCTTTCATGAAATTACAAACTGTAAGAGAGAGTTTC",
"CAGCAGTCTTCCCAAGCGAGGTAGGTATGGTCTATTTCCTAGATCACGAGAGGGAGTCTTAAATTAGCTAGCCCTGATGTACAGGATACTCTTCGGGACGAGCTCTGTTAGCAGGCTCTGGTCTACAGTAGACGTGGGAGTATCGAAGCGCATAGT",
"CGACCTGAGTTGGCTAGCGATGTCTAACTCGAATACGAAGCTATTCAATGGCCACCGGGCAATAGCATGGATCGCTTTTAGCGGACTAGGACTGTGAACCTAACTAGCTGTTATTGGCGCAGCATCCACGACTCATGGAGGGGCCCGGCGCTAATG",
"CATGCCTTCTGGATCGTAGGCTAGTGCGGCCTTTACCTCGTATAGCTAGCGCAACATACGGCTACGACACGTGGAGCTGATGCACATTTTGGAGCAAAAATATCTAGCCTTGCACCGGATCCGTTAGCACTCAAGACAAACTGGCCACACTTAAAA",
"AATCTACTTAGCAGAGAGACGAACTTCCTCCTCAGGCTTGGAACTAATTGAACCGCCTGATACTAACGCACATTCGGCGCTTCGCCCAACGCTTTAACCTATGTTTGAGGTAAGTTGGATAAACCGTTTTCAACCTACCTACCCAATAATTATACG",
"TTCCACTAAACGCATAAGTGAACTCACCGGAATCCATAAAATGAATTTTATGGTGGTGTCCGACCGGTTCTATGGGAATATACCCCCCGCGCATAGCTATTGAACAACATCCAGCGTGGCAAGATAGGTAGCCCTCTTATGCATCACATCCGCCAC",
"TTCCCAGAGTTGCGAACCTCGCCGTAGAAACGGATTTGCGTCTCGATTCTGACCGATGGTAAGATAATTAGCACTGGTTGACTTCCGAATTTTTAGCTCCGCTGCCGACTGCCATGCTTTAGATCGCCTAATCACTGCTATCAGGATACCGGGCGT",
"CATTTCTATCGAGATTGAAAAGTGTGGTCCACCGTAATAGACAACGATCAGTCGAATATGCCTCTTTAGAACTTCTGGCGTTACACCTGATGCGACCACGGGATTAGTTAACCACCGTAACCCGACATAAAGAATCTAGTTAGCTAGCCCTCCATA",
"AACACCCCGAGCTATTCTTCACCCACGGCAACTCAATATCTGGTGTTTGTGACGCCACTTACATTTCGCAGACATCGAACCGTTATTCAGTGTGCGTAAGTATAATAAACGCCTCCTGAGAACGTAGGTAGCAATCTTTGAACTAGTACCCTGCAG",
"ATACTCGTTATGTTCTTAATTTTCAGTTCTTAGTCTATGAGGAGACAGAACGCGCACTACTATATACGTACGCCTCTGCACACGTGTAGACTCTATCACCGGACCGTGGATCGGTGGGTAAAACTAATTAGCCGAAGTAATACGTCGATGCCTCAT",
"AGAACCAATGGAGACCTCGAGAACCGCCAATCGTTGATATTGAGGTCATTCAATTGTGGGTGGTCGTAATAACGACTACAGTGGTGGGGAGGGAGAGTTTTCAAATAGTGCGCCTGATGTTCGAACTTTACTTGATCGATATTTAACTTAAGTAGC",
"CAAGGACGGACAAAGATAGATAGCTTCCTGAGAGACCAGGAGTTTTGGATGCGATCTCACAATCGACTTAACTGGCCGGTCCCTGTCCGGTATCGGCAGGCTACTCTTGGATTTTCCCTAAATGAGTGCACATGATAGAGCCAGGATACTAGGCAA",
"ATTCACGCTATTTAAGCGCTGGGGCGCGCGCGAACGGTTTGGTCGAAAGCCGTGGTGATGGATGGGCAGATAAATGTACGTAGCACCCAACCGGACTGCTAGTACCAAACCATGTTATTCCACTACGAGAAAGGGTCACAAATCCGAACAGAAATT",
"GTGATATAATAAGGCGACTGGCCACTCCTCGTGATGAGGCTATTACAACTATGGAGGAGCAAGTTATTTAGCCATACCTAGTTGAACCGAGGGTCGGCGGTTCTTTCGTCATGTCTTCAAGTCAACCGTTGTACCAAGCCCTAGGGAACCAGAACA"]
# Same search (12-mers, 25 strings) on the dataset defined just above.
res = greedy_motif_search(dna, k=12, t=25)
print(" ".join(res))

GTTTATACGAGA GCAACGCCAGGT AAAACCCGGATC TTCTCCGCTGCT TCTTCGACGATC GCTATTGGGGCT GAAACGACAGCA GATTTCCCGATA GCATTTACAATT AAGGTAACTAGC ATTTCCCCTATT ACGAACACGAGC TAGATCACGAGA GCATCCACGACT ACGGCTACGACA ACCTACCCAATA TCTATGGGAATA GCTATCAGGATA GCGACCACGGGA AACACCCCGAGC TCTATGAGGAGA ATAACGACTACA GAGACCAGGAGT TCCACTACGAGA GCTATTACAACT


In [4]:
import collections
import functools
import itertools

# Maps each nucleotide to its row in a profile matrix.
# NOTE(review): this name is also assigned in the first cell of the notebook.
SYMBOL2NUMBER = dict(zip('ACGT', range(4)))

# NOTE(review): a function with this name is also defined in the first cell;
# consider defining it once.
def find_profile_most_probable_k_mer(text, profile):
    """Return the k-mer of ``text`` with the highest probability under ``profile``.

    Args:
        text: String whose symbols are keys of ``SYMBOL2NUMBER``.
        profile: Matrix indexed as ``profile[row][position]``, where ``row`` is
            ``SYMBOL2NUMBER[symbol]`` and ``position`` is the offset inside the
            k-mer. k is taken from ``len(profile[0])``.

    Returns:
        The k-mer with the largest product of per-position profile entries.
        On ties the leftmost k-mer in ``text`` wins.

    Raises:
        ValueError: If ``text`` is shorter than k and so contains no k-mer.
        KeyError: If a k-mer contains a symbol that is not in ``SYMBOL2NUMBER``.
    """
    k = len(profile[0])
    if len(text) < k:
        # Without this guard the loop below never runs and the function used to
        # fail with an UnboundLocalError at the return statement.
        raise ValueError(
            "text of length {} contains no k-mer of length {}".format(len(text), k)
        )

    most_probable_k_mer = None
    # -inf guarantees that the first k-mer is always accepted.
    max_prob = float("-inf")
    for start in range(len(text) - k + 1):
        k_mer = text[start:start + k]
        # Start from 1 so that the empty product (k == 0) is well defined.
        prob = 1
        for index, symbol in enumerate(k_mer):
            prob *= profile[SYMBOL2NUMBER[symbol]][index]
        # Strict comparison keeps the earliest k-mer when probabilities tie.
        if prob > max_prob:
            max_prob = prob
            most_probable_k_mer = k_mer
    return most_probable_k_mer

# NOTE(review): a function with this name is also defined in the first cell.
def score(motifs):
    """Sum, over all columns, the motifs that disagree with the column majority.

    For each column of the stacked motifs, the contribution is the number of
    motifs minus the count of that column's most frequent symbol.
    """
    motif_count = len(motifs)
    return sum(
        motif_count - max(collections.Counter(column).values())
        for column in zip(*motifs)
    )

def build_profile_with_pseudocounts(motifs):
    """Build a profile of the motifs with one pseudocount added to every cell.

    Args:
        motifs: Non-empty list of strings over the symbols in
            ``SYMBOL2NUMBER``; k is taken from the length of the first motif.

    Returns:
        A list of four rows with k entries each. With ``t = len(motifs)``,
        ``profile[row][i]`` equals ``(count + 1) / (t + 4)``, where ``count`` is
        how many motifs have the symbol for ``row`` at position ``i``. Symbols
        that never occur in a column therefore get ``1 / (t + 4)``, not zero.

    Raises:
        IndexError: If ``motifs`` is empty.
        KeyError: If a motif contains a symbol that is not in ``SYMBOL2NUMBER``.
    """
    k = len(motifs[0])
    t = len(motifs)
    # Transpose so that columns[i] holds the i-th symbol of every motif.
    columns = list(zip(*motifs))

    # Start every cell at the value for a symbol that was never observed.
    profile = [[1 / (t + 4)] * k for _ in range(4)]
    for i in range(k):
        # A single Counter is sufficient: wrapping it in a second Counter only
        # copied it, and most_common() sorted entries whose order is irrelevant.
        for symbol, count in collections.Counter(columns[i]).items():
            profile[SYMBOL2NUMBER[symbol]][i] = (count + 1) / (t + 4)
    return profile

def greedy_motif_search_with_pseudocounts(dna, k, t):
    """Greedy motif search that builds its profiles with pseudocounts.

    Every k-mer of ``dna[0]`` is tried as a seed. For each seed, the motif list
    is extended one string at a time with the k-mer that is most probable under
    ``build_profile_with_pseudocounts`` of the motifs chosen so far. The
    candidate list with the lowest ``score`` is returned; the leading k-mers of
    each string are the baseline.

    Args:
        dna: List of strings to search.
        k: Motif length.
        t: Number of strings from ``dna`` to use (normally ``len(dna)``).

    Returns:
        The best-scoring list of motifs, one per string used.

    Raises:
        IndexError: If ``dna`` is empty; may also be raised when ``t`` exceeds
            ``len(dna)``.
    """
    # Seed the baseline from the same t strings the candidates are drawn from.
    # Previously it was built from all len(dna) strings, so the two scores were
    # computed over lists of different sizes whenever t != len(dna).
    best_motifs = [text[:k] for text in dna[:t]]
    # Cache the best score instead of rescoring best_motifs on every iteration.
    best_score = score(best_motifs)
    for i in range(len(dna[0]) - k + 1):
        motifs = [dna[0][i:i + k]]
        for j in range(1, t):
            profile = build_profile_with_pseudocounts(motifs)
            motifs.append(find_profile_most_probable_k_mer(dna[j], profile))

        candidate_score = score(motifs)
        # Strict comparison keeps the earliest candidate when scores tie.
        if candidate_score < best_score:
            best_motifs = motifs
            best_score = candidate_score
    return best_motifs

# Sample dataset: five strings searched for 3-mers, this time with pseudocounts.
dna = [
    "GGCGTTCAGGCA",
    "AAGAATCAGTCA",
    "CAAGGAGTTCGC",
    "CACGTCAATCAC",
    "CAATAATATTCG",
]
res = greedy_motif_search_with_pseudocounts(dna, k=3, t=5)
print(" ".join(res))

TTC ATC TTC ATC TTC


In [5]:
dna = ["ACGAGAACTGTAATGGAGACCAATCGGGTCGTATGTACGACACGGATCTTCTGTATCGATCATCGCTTAACTTATACGATCTCATTCTCACGACGATCCTCAACCCCGGATACCCGCACTGCCTCAATCCGAAGTACTGCGTAGTACTTTACCCTT",
"ACCTGTGATCACTCAGAGAACAGAGGATCCGGGTTGGATGTCAGTGTTATGCCAAGAAACGAGACCTAAGGTGCCGTCCCCGGCGGAATGCTTCTCGCTTCCCCTTCTAAAGGGTCCTGGCAAAATGCTTGTGACTTTGAATGCCCTCACTGAACT",
"AAATGTGAAACCTATATCAGTCATTATACCGGCGCGATGTTAACTCGCACCTGATTGCAAGGTCACTGATCGCGTCACTACACTAGAGTTTATTCATACCTGCATGGGGGGCATGATGGATGAATTTTAACTAGGTGATCGTGACCAATGTTCACC",
"TTTAACCGAATGAGAAGGGTTTCGTTTTAGGCCGTCGATCCGCCGCTTCTTCTCTCGACTGAATCGGAGGTTTTAATGTGTGTGCTAAAGTGAACTGCCTAATAACCCGCTAGGGGGATGATTTATTGTATCTGCAATGTACGGTGGTACATCCCG",
"GCTTTCCCGTGTTCTTCGTGCGCTCGGGACCAGAGATCAGAACTAGTGCTAAAGCCATGGAAGCATTTAGCCGGCCGATGTAGTAAAGTTGCCCATATTTCTCCCTAAACCAGCGTATACGTCGAAAACTTTCTTCACTCCACTATGTATAGATGG",
"CGCTTAGCGCATCCTCGCTAAACTAATTTGTAGGAGCAATCGCATGTCGACACCGAGGAAGACAACAAGTACATTAGTTACACGCTTCTTACTGGGGATAAAAATCTAGGATCGCGTGATCGGCGTGCTCCGGCGTCAGGTGACTTGATGGCCCCA",
"AGTCATGACAATTCGCACGAGGTGACTTTCAATCGACTTACGTACTGCGCGTCGATGTCGCTTCACTCCACTAATACCCATATTCCTAACACGCCAGTACGGTTCAGAGTCGGCGGCTGAGGGGCCCTAGAAACGAGACACCCTAGAGGCTCTGGG",
"ATAATAAACGAGATTTAGTGCTCCAGGTCCCTCTAACTTCGCTAGACTGTTCACGGTACTTAGAGTAGCTCAGAAATCGCCTGTCTTCGGGTTGGTTTTTTCAGGAGGTGCTCTGTGCGTTAACATACCAAGCCATAGCTGCTTTTCCTCTACAAA",
"CCTACCGGAGGACTTCACTGAACTAAGTACTGGGGTGCTAAGGTCGAACGAGATGACTGGACCCTACTCTCTACGGGACCGACGCCCCAGGGCTTAATTCATATTGACTAGATTTATGATAATAATAGACTCGGCGGTTTGTAGCTTTCCCCTGAA",
"CCGTTACTCCGCCGCTGCTGATCCTACTCCACGTGGGGCCCCCCAATATGCACATCTTATCGTCCCTGGACTGGCAGTTGATGCGAAATAATATTCGTGGGTATGATAACGCGCTATACTGATAGAACCACGGGGACTCCTGTATTCGTCTCGCCA",
"ATTCGTGGGTTGAAGCCTTTAAACGGGATGGCCAAGTTGATTGGGTCTAATTGATATTAATTCTGGTGTACTGTACGAGACCGGTGCAGCACGGACGGGCGGTTTCAACAATACTCGTGCGCCTGGAGGGTACCTCGCTGAACTCGCATATCAGGT",
"TGGTTACCCTCTCTTCACTTAACTCTTGTATCAAGACGTTTCTGTGAGACAAAGCAATGGCCGGACTTTGGGGCGCGTGCTCTGGAGATCCAAGCACTCGAGGTCAGCGGTATAATATAACGCATCCAACATGCAGACTGTGCGTGGGGGCCCAAA",
"CCTTAGCGGTGTCGGCCATCATTTATCGAGCTGGAACTCCGGTGGGTAACGTGGATCCGCAGCAGGCCTTACCGTCACTTAACTTGTTACTAGAACATACGTGGAACCTATGCATGATCGAGATAGAGTGCGCTCCGCGGTACACCGCGCTCTATA",
"AGTTACAACACGACAGGAACTATTTGCTAGGCGTTACATCTCTTTACTTTTAGCCCCTGGATTTTGAACGCATGTCAACACGTTCCACCATGGGTATAAGAATGCATGGACAGGGTTAATGAATGTGTCTCGGTCCGTTAGCTGTTACAATATACC",
"CTTAGCAGCCCACGTCGCTGGACTGTGTTATATTACGAGGTCGAATAGTGAGGTTAACAGTCTCCGTTGTAACTTAATCCCGATATCACGCAGTGTATATGGTCGCTGTAGCTTTCTGGGCAGCTCGCACACCGCCAATTCGCAGAGGCGACCAGA",
"TGGTAATAGGCTTCGAAATAACTCTTGGATTGCAACGAAGGTCCGAGCCTTCTCTGCACTTGGATACATTTTGGACATATGAAAGGATGGGTGCTCGGGATGGGACTTTGGTTGCCTGCAAGACGGCGAGACCACCTTACTGAAACCAACATCTTA",
"TTACCGGTAATCTCTGATCGCCCATGCCGTCAGGTGCCTTAATTTAAGCGAGAGCTAAATAGAAAGCTGCGCGGTTTTAGCAAAATGAAGTATCAGGAATAACATGGGTTAATGTCACAAACCTGAGGGTTACCTCTCTGCACTCCAGGTCCAGCA",
"AATTCAGCTGGTCAGTCACCACAACGTCTCTAGACTCGCACACCCAACTATTATATCACGTACAAGCCGCCCCACAACCGGCATGATAATGTCTGCACGGCCCAACTAACACGCCAGATGACGTACTTTTCGCGGCAGAGCAGTATTCGAACTCAA",
"CGTACCCGTTATACAAGCACCATCTACAAAACGTTAGGTGTCAACGATCGTGGGCCGGACTTAGGGGTGAGACCTTAAAGCACACATTCCCTCCCACATCACTTCACTTGTCAAAAATAAAGTCGAATGATGACTACCTCAATTCCTCGCGAAAGC",
"GCTGACACGTATTACGAACCGAGACCAGGCGCGACCCCAACCTGGACTAACAGCTATCCCTTTGTTACTTGGCACGTACGGTCAAATCCCGGTGGAGTTATTTACCGAGGGTGCGCCATGCATCGCTCAACTCCTAATGGTCGCCTGTACTTGGTC",
"CGGCCTTGGTCCAATTCATCGTAACATTCTGTGAATCTACGGGTACTACTCGACCACGCTTGCTTCGCTGAGCTGTACCGCAAAATCGAATGGACCCAATAATCTGAATCCTTCGGTATACATCACTAGACTCACGATTCAGTATGCCCTCAATCA",
"ATCCGAAACATTCAGTTCCGATGGCAACGACGACCACCCAGCACGCATCATCACTCGACTGACCCTGCTCGAATACAGGCGTATCTAACAACCAGCGGATCCAGGGCCCATGCTGAGGCTATTGTCACTCCCGCCCACCCTGTATGTATTCGGATT",
"CCCGGATCGCCCGGTCAAGCACTCCAGGGCTTCAGCGCTGTGTACCTCTCTGCACACTCCGGTGGTGGGCCCCGTCCCTACACTGCGTATTCAAGTGATAGCTCGTACGATCCGCATCGAAGGCTGACTGCCCCTCTACAACAGTGCGCTCGCACT",
"CCCGTATGTAGGTGATTAGACCCACCAAGCAATCCGCATGTTGCGTCACCGTAGATATATGCAGCGGTCATTCTTCGCTTGACTCTCTGACTTGCCCATTTAGAAACTATCCACTTAATTCCCCTTGGACTTGGGGCCTGTTTTCGGTCGCTTTGT",
"CATTTCTATAAAGCTACAATAATAATCCGCGCTGTCGGCAGACGTGGTACCGACCCTACTCCTACCGTTTGAGAGATGGAGGGTCTTCCCTGAACTAACGGCATGCATGAGAGGGGTACGACCCTGGTACTTCTGAAACCAGCATCCGCGGCGACG"]
# Search the 25 strings above for 12-mers using pseudocount profiles.
res = greedy_motif_search_with_pseudocounts(dna, k=12, t=25)
print(" ".join(res))

CATCGCTTAACT CCTCACTGAACT CGTCACTACACT CTTCTCTCGACT CTTCACTCCACT CCTCGCTAAACT CTTCACTCCACT CTTCGCTAGACT CTTCACTGAACT CGTCCCTGGACT CCTCGCTGAACT CTTCACTTAACT CGTCACTTAACT CATCTCTTTACT CGTCGCTGGACT CTTCTCTGCACT CCTCTCTGCACT CGTCTCTAGACT CATCACTTCACT CATCGCTCAACT CATCACTAGACT CATCACTCGACT CGTCCCTACACT CTTCGCTTGACT CTTCCCTGAACT


In [6]:
dna = ["GACACCCTACCTTTCGACCAGATGGAAAGCTTCCTTCAATTAATTCTTTTACTGATCATCGCATCTCGTTTTGTTTGTTGATATGTCCTAAGTGCAGTGCGCAATGATGTTCCATAGTTAAGCGTGTTCGACCCCCAGAGTGCACGATTGTGCAAA",
"CCCCTCAATGACCCTAAAGTCGGCCTCACGAATCTGCTGTAACAGCCACGCTTACTGTTTCCCCACGGTGCCAGCAGACCTGCAGGGAAACAAGTAGGAACTAACATCACGCGCCGTTTCGAATGATCACGCAGTAGGAACGGGAAGCCAAACGGG",
"TTTCTCATTCTGCTCCAGTGTGCACCCGGTCCTGAGGCTGTCGAATACAAGTGCCTGCTTCCTTCTCTAGCGTGCAAAAACGCTGCTGGCCGCACTTCGGCGGCGGTGTTAGTCTGAGGTGAAGGCGCGGTCAAAATGGCGTCGACCAAGCAGACG",
"TAGCCGATCTCCGTGCAGAGCCGTCCCCATTGTGCCAGCTTAAGGTCCGTGCAGGCGTAGGCTGCACGGTTTGGACGGGATCAGGACAAATATCCACAGATGATACTTGCCGACGTGATCAGGGTGGTAGACACATTAGACTCGAGGTAAAGCGCA",
"GAACTCGTGCGGGTGTAAGTGTGTCAATGCATTCGACTTAGCGCAACAAGTCGTATCAGAAGAAGACCGTCGACTGGACCTAACCACCAATGTGCTAACAGGTGGCGGCATTTCTTCTCAACCTCATGCCCGAATATATCCGTAGCATAAGCTAGG",
"TTCCTTGAAACGTAAACTGTACGTCTAAGGACTCCTTTGCTCAGCGAGCACCACCGTGCAGCGATTAATCTGTGGTTATAGCACCCTCCACTTGCGACCTCACACTCTCCAACGATCCTATCCGCTCTCTCATGGAACGCAGGGCATTGCGGTCCT",
"TGAATTGCCCAGCGGCAGAAGAACCACCAAGGTGCGTATCTTAATCCGGTTGTGTCAGTGCTTCAGGCAAACCATCTTGTTGGAATGCCGCCAAAGCATTTCGATGTCTCTTCTTTTTCGGTGTTTCCTACGGGTATGGGCTGACGTCTCATCCGA",
"ATTCTGGGCATTTCGCAACAATACGCGCAGTCCCTTCGGGAGATGGGCCCAATGGACGTAACGATCGGGGTCCACCAGGGTGCCGGTCTGACGCGACTACAATCGCCGAACAATGATGGTATCCATAGTCTTCCAGGCACTCCCTCGAGACGATGT",
"CTGGAGTGAGCAGTCCTAGTGGGTCTCCAACGTGCTAATACAGGGTTTCCTCGGTCATCATCCCCGCATTTCGCGTGGTACCCTCATGGCGTGTATTCGGCATCAACATCTTTCGCAGGGAAGCTAAGAGACGCGCGGGCCTGTGGGCTTCGAATA",
"AGAGTAACAGTGACAAAAACAGGTATAGCACTGCTACTAGGTATGGTATATAAACAGGATGTATTATTAAGACGCCATGGTGCGAAGGCGACCGCGTCCTAGACGCTTTGATCCAATGAGCACGAAGTTGGACTTCTTCGCTGCAACTTTGCTTTA",
"GTCATAACAAAGCCGCCAAGGGTCTCTATTGTGGTGGCCAGCGGCAAAGACTCCTAGGCCTGCTCCGTTCCACGCCAGCGTGCAAGCATGGACGCGTGGCGAATTGAAATTCATTCTTCATGGCCTACTTCTTCTATCCTCAAAATCACTAACCCA",
"TGGCTATCTTGCTCCCACTCGCCCAGTAAAATGCTGTTTTCAGAAATTCAGGCCCGGATTTATTGAATAGGCCACCAGTGTGCTGGGACATGCCGCAGCAAGGACCAACGACCGAGCAAGTGACCAGCTTCCGCCGTATATAGTCGCTTCAAGCCT",
"CCAGGTTACTTCATAGTACCCCTTGGTCCATTTTTTTCACTGCACGTGCCGCCGGTCAAACCGACCAAGGCTCACAAAGAAACCCACCAGAGTGCGAGTTAGCGAGCCACTTGACGAAGGCGAGACCCCGCGATGTAGAAATGTCCTGGCATTGTG",
"AAAGTACTGTTGTCAATAGATACAGAACTACACCTTGCTGACTAACACTTCGTGAGATTTTAGAAAGTCATGAACGTGTTTTTACGCCAGGGTGCGTATGAGATCGACATTGGGCGATGTGACCTATTATTGAAAACGATAGGTCCCGAGATACGG",
"ATCTTGCGATTAGATCATGAGCGGTGTCAGATGGGTCCAATTACTGTACGAGCATAACATCTCCACAGTGCGTCCCTTAACGGTTCACGGTGGCCAGTCAAAGCTTTCGAGGCAGGGTAGGGCTGCGTTCTTGGATAATCTGCGCATCGGAAGCTC",
"GTAGGGAGCGGCCATTCTGTTTTTGGTCTTGGATCCCCTTTACACAAGCATTAATATTGGCTGATGGACGAATGGCTGGCGTTACAACTCATTTGCCGGATAAATCGTTGTTAGCCTCCCCTATAGACTTCGAGGGCAGTCGGCCCCCATCGTGCC",
"TCTCTAGACAGCATACGAGATACGTACCACTCTGATGCGAACACATAACTCCAACGTGCTTGAATCTCAGGCGACGATTGGGCATAGGCGTCGGCATACCAATTTCGTTCTATCCAATGGTCAGCTACCCGAAAGGCTATTGGCCGTCAAATGCTA",
"AGTGATCTACGCTTGCAGGAAAGGGGTTTATATCCTGTCAAGGCGCATTGGAGCATAAGACTAAGTAAACTATTAAAACGGCGTCGCCAGAGTGCATGCTGGGAAGCATTCCTGACATTAGAGGGGCAAATGAGAACTACTACTCGTGCTTACGCT",
"TCGACCCGTTCGCAGAAACGCAGTCTCCACCGTGCCCTATCCTCGTTAGACGGGATAACGAATCACGGCCTTCGCGTGTAGGTAATAGGCTTTTGATAGTAGGCTGGTTTCCTCAGAGCTACGGCGGAATGCACAACGAGGATTTGCCTAACGGGG",
"TGCCATTGGACGACTTCCCACATCCCTTCGTACCTCTTAATATGACGACGCCACAGTGCGGACATAACAAGACACAACCTTTACAGTGTTTCTGGACAGACCGGATCCGTGGTCAAATAGAAGACCCCCTAAATCAAGAGGCGCGAGCACGTGCAT",
"GAAGCAAAGGGGTGAGAAGGCCGTTCACTCCCGGCACCCCACAGTGCGCCACTTAGGCCTGACTGCGAGTATCTGCCTTATTCTGTATCTCACTCAGTACGCACTTGAAGGCAACTACAGTAGCTATGAGCGCGCATCTTTTGTCTTACCCTCTTT",
"TGATTCTAACAAGAGCTACGTTTAGGTGGGACCCGCTGAAGCGCATCCCGCCAGGGTGCACTGGGTTGTCCTACGTGCCCGTGGCAACTTCCCCTCCTACATTTTCCCCACCGTCGAGTTCAAAAGGTTCTCCTCTTACAGATTATCCAGGTGACG",
"TATAGTATTATTCCCCACTGTGCCAGACTATAATCGGTCTGGGGACCATGACGTAACTTGGCCCAGTGCTGAAAATCTTGTAGACAGCCGCTTTGAGAACGACAGCGACCTGCACTTCTGTCCATAAGTAAAGCAAATTCTGCGTGGTAACTTGAG",
"AGAAACTCCACTCACCTAAAGAGCTGCGGGTCGTCCCCACTAAATAGAAGGTTCGAGCTTCGCCATCGTGCATTGAGTCCACGCCCATGTCACATGGTGCGCACTCAGTATTAAGGTCTGTTCAGGCTCTTTCTCCGGATTGGCCCTTTGCCCTTA",
"ATGAGCCCCAACTGTCTAGGTGGCAAATAGGATTGGTCCGTTACCCGGAGACTAAGACCGGAGACCAGACTTGAGATCTATTGATACATAGACGTGCCCCAATGTGCAGAGGGGCAAGACAGTTATTACGATTGGTCCAATCTTTTGGCAGGGATC"]
# Same pseudocount search (12-mers, 25 strings) on the dataset defined just above.
res = greedy_motif_search_with_pseudocounts(dna, k=12, t=25)
print(" ".join(res))

CCCCCAGAGTGC TCCCCACGGTGC GCTCCAGTGTGC TCCCCATTGTGC CCACCAATGTGC GCACCACCGTGC CCACCAAGGTGC CCACCAGGGTGC TCTCCAACGTGC ACGCCATGGTGC ACGCCAGCGTGC CCACCAGTGTGC CCACCAGAGTGC ACGCCAGGGTGC TCTCCACAGTGC CCCCCATCGTGC ACTCCAACGTGC TCGCCAGAGTGC TCTCCACCGTGC ACGCCACAGTGC ACCCCACAGTGC CCGCCAGGGTGC TCCCCACTGTGC TCGCCATCGTGC GCCCCAATGTGC
