In [None]:
import numpy as np

In [51]:
def Random_Motif_Matrix(Dna,k):
    """Pick one random k-mer from every string in ``Dna``.

    Parameters
    ----------
    Dna : sequence of str
        The input strings.
    k : int
        Length of the k-mer to draw from each string.

    Returns
    -------
    numpy.ndarray of str
        One k-mer per input string, in the same order as ``Dna``.

    Raises
    ------
    ValueError
        If a string in ``Dna`` is shorter than ``k``.
    """
    motifs = []
    for text in Dna:
        n_starts = len(text) - k + 1
        if n_starts <= 0:
            raise ValueError(
                "every string in Dna must have at least k=%d characters" % k)
        # One randint call per string, exactly like before, so seeded runs
        # draw the same start positions.
        init_pos = np.random.randint(n_starts)
        motifs.append(text[init_pos:init_pos+k])
    # Build the array from the collected k-mers so the string width is sized
    # to the data. The previous np.zeros(...).astype('str') buffer had a fixed
    # width of 32 characters and silently truncated longer k-mers.
    return np.array(motifs, dtype=str)

In [2]:
def form_Profile(Motifs,k,pseudocount=1):
    """Build a per-position base-frequency profile from a set of motifs.

    Parameters
    ----------
    Motifs : iterable of str
        Motifs over the alphabet A/C/G/T, each at most ``k`` characters long.
    k : int
        Number of profile columns.
    pseudocount : float, optional
        Value added to every base count in every column before normalising
        (default 1). With ``pseudocount=0`` a column that no motif covers has
        a zero total and its entries come out as NaN.

    Returns
    -------
    dict
        Maps each of 'A', 'C', 'G', 'T' to a length-``k`` numpy array holding
        that base's relative frequency at every position. Each column sums
        to 1 across the four bases.

    Raises
    ------
    KeyError
        If a motif contains a character other than A, C, G or T.
    IndexError
        If a motif is longer than ``k``.
    """
    bases = ['A','C','G','T']
    counts = {base:np.full(k, float(pseudocount)) for base in bases}
    for motif in Motifs:
        for i in range(len(motif)):
            counts[motif[i]][i] += 1

    # Normalise by the real column total (observed bases + 4 pseudocounts).
    # The earlier denominator started at 1 instead of 4, so the columns did
    # not sum to 1.
    totals = sum(counts.values())
    return {base:counts[base]/totals for base in bases}

In [3]:
def Profile_most_probable_kmer(Text,k,Profile):
    """Return the k-mer of ``Text`` with the highest probability under ``Profile``.

    ``Profile`` maps each base to an indexable of per-position probabilities.
    A window's probability is the product of ``Profile[base][position]`` over
    its characters. When several windows tie, the leftmost one is returned.
    If ``Text`` has no window of length ``k``, the empty string is returned.
    """
    best_kmer = ''
    best_prob = -1  # below any product of non-negative probabilities
    for start in range(len(Text) - k + 1):
        window = Text[start:start + k]
        likelihood = 1
        for offset, nucleotide in enumerate(window):
            likelihood *= Profile[nucleotide][offset]
        # Strict comparison keeps the earliest window on ties.
        if likelihood > best_prob:
            best_kmer, best_prob = window, likelihood
    return best_kmer

In [15]:
def Most_Probable_Motifs(Dna,k,Profile):
    """Collect the profile-most-probable k-mer of every string in ``Dna``.

    Parameters
    ----------
    Dna : sequence of str
        The input strings.
    k : int
        Motif length.
    Profile : dict
        Per-base position probabilities, as consumed by
        ``Profile_most_probable_kmer``.

    Returns
    -------
    numpy.ndarray of str
        One k-mer per input string, in the same order as ``Dna``.
    """
    # Build the array from the finished list so the string width matches the
    # data. The previous np.zeros(t).astype('str') buffer was limited to 32
    # characters per entry and silently truncated longer k-mers.
    return np.array(
        [Profile_most_probable_kmer(text,k,Profile) for text in Dna],
        dtype=str)

In [5]:
def Score(Motifs,k):
    """Count the characters that disagree with the per-column majority base.

    For each of the ``k`` columns, the occurrences of A/C/G/T across all
    motifs are tallied. The column contributes its total character count
    minus the count of its most frequent base. The sum over all columns
    is returned.
    """
    alphabet = ['A','C','G','T']
    # Keyed by column index (a dict, not a list) so out-of-range positions
    # fail the same way as before.
    tallies = {col:dict.fromkeys(alphabet, 0) for col in range(k)}
    column_sizes = np.zeros(k)
    for row in Motifs:
        for col, nucleotide in enumerate(row):
            tallies[col][nucleotide] += 1
            column_sizes[col] += 1
    mismatches = 0
    for col in range(k):
        majority_count = max(tallies[col].values())
        mismatches += column_sizes[col] - majority_count
    return mismatches

In [49]:
def RandomizedMotifSearch(Dna, k, t):
    """Run one randomized motif search from a random starting motif set.

    Starting from one random k-mer per string, repeatedly builds a profile
    from the current best motifs and picks the profile-most-probable k-mer
    of every string. It stops and returns the best motifs as soon as a round
    fails to lower the score. ``t`` is accepted but not used by the body.
    """
    current_best = Random_Motif_Matrix(Dna,k)
    improved = True
    while improved:
        candidate = Most_Probable_Motifs(Dna,k,form_Profile(current_best,k))
        improved = Score(candidate,k) < Score(current_best,k)
        if improved:
            current_best = candidate.copy()
    return current_best

In [24]:
# Sample input for the search: five 32-character DNA strings.
Dna = [
    'CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA',
    'GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG',
    'TAGTACCGAGACCGAAAGAAGTATACAGGCGT',
    'TAGATCAAGTTTCAGGTGCACGTCGGTGAACC',
    'AATCCACCAGCTCCACGTGCAATGTTGGCCTA',
]
k = 8         # motif length
t = len(Dna)  # number of strings (5)

In [56]:
# Random restarts: run the randomized search once, then 1000 more times,
# and keep the motif set with the lowest Score. The best score is cached
# so it is only recomputed when a better motif set is found.
BestMotifs = RandomizedMotifSearch(Dna, k, t)
BestScore = Score(BestMotifs,k)
for i in range(1000):
    Motifs = RandomizedMotifSearch(Dna, k, t)
    MotifsScore = Score(Motifs,k)
    if MotifsScore < BestScore:
        BestMotifs = Motifs.copy()
        BestScore = MotifsScore
for kmer in BestMotifs:
    print(kmer)
    

TCTCGGGG
CCAAGGTG
TACAGGCG
TTCAGGTG
TCCACGTG


In [59]:
# Load the data set: the first line holds "k t", and every following
# non-empty line is one DNA string.
Dna = []
k = 0
t = 0
# The context manager closes the file even if parsing fails.
with open('dataset_369255_5.txt', 'r') as file:
    for i, line in enumerate(file):
        # strip() also removes a trailing '\r' from Windows line endings.
        line = line.strip()
        if i == 0:
            # split() without arguments tolerates repeated spaces or tabs.
            (k,t) = (int(field) for field in line.split())
        elif line:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file) so no empty string ends up in Dna.
            Dna.append(line)

In [61]:
# Random restarts on the loaded data set: run the randomized search once,
# then 1000 more times, and keep the motif set with the lowest Score.
# The best score is cached so it is only recomputed on improvement.
BestMotifs = RandomizedMotifSearch(Dna, k, t)
BestScore = Score(BestMotifs,k)
for i in range(1000):
    Motifs = RandomizedMotifSearch(Dna, k, t)
    MotifsScore = Score(Motifs,k)
    if MotifsScore < BestScore:
        BestMotifs = Motifs.copy()
        BestScore = MotifsScore
for kmer in BestMotifs:
    print(kmer)

CCGTCGCACGGCAAA
GCGGAGCTCAGAATA
GCGTCTCTTGTAATA
GCGTCTTGTAGAATA
GCGTCAGACAGAATA
GCGTCTCTCAGCGGA
GCCGATCTCAGAATA
CGGTCTCTCAGAATC
CCGTCTCTCAGAAAG
GCGTTCATCAGAATA
GCGTTGTTCAGAATA
GGAGCTCTCAGAATA
TTCTCTCTCAGAATA
GCGTCGAACAGAATA
GCGTCTCTCCCCATA
GCGTCTTAGAGAATA
GCGTCTCTCAACCTA
GCGAAGCTCAGAATA
GCGTCTCTCAGATCG
GCGTCTCGGGGAATA
