In [1]:
import numpy as np

In [2]:
def Profile_most_probable_kmer(Text,k,Profile):
    max_prob = 0
    most_probable_kmer = ''
    for i in range(len(Text)-k+1):
        kmer = Text[i:i+k]
        prob = 1
        for j in range(len(kmer)):
            base = kmer[j]
            prob *= Profile[base][j]
        if(max_prob < prob):
            max_prob = prob
            most_probable_kmer = kmer
    return most_probable_kmer

In [3]:
def Motif_Matrix(Dna,k):
    motif_matrix = np.zeros(len(Dna)).astype('str')
    for i in range(len(Dna)):
        motif_matrix[i] = Dna[i][0:k]
    return motif_matrix
        

In [4]:
def form_Profile(Motifs,k):
    ####Version1
    #Frequence = {base:np.zeros(k) for base in ['A','C','G','T']}
    #Sum = np.zeros(k)
    ####Version 2 with pseudocounts
    Frequence = {base:np.ones(k) for base in ['A','C','G','T']}
    Sum = np.ones(k)      
    for motif in Motifs:
        for i in range(len(motif)):
            Frequence[motif[i]][i] +=1
            Sum[i] += 1
    
    Profile = {base:np.zeros(k) for base in ['A','C','G','T']}
    for base,val in Profile.items():
        for i in range(len(val)):
            val[i] = Frequence[base][i]/Sum[i]
    return Profile
        

In [5]:
def Score(Motifs,k):
    Frequence = {pos:{base:0 for base in ['A','C','G','T']} for pos in range(k)}
    Sum = np.zeros(k)
    for motif in Motifs:
        for i in range(len(motif)):
            Frequence[i][motif[i]] +=1
            Sum[i] += 1
    Score = 0
    for i in range(k):
        max_base = max(Frequence[i], key=Frequence[i].get)
        Score += Sum[i] - Frequence[i][max_base]
    return Score

In [6]:
def GreedyMotifSearch(Dna, k, t):
    BestMotifs =  Motif_Matrix(Dna,k)
    Motifs = np.zeros(t).astype('str')
    for i in range(len(Dna[0])-k+1):
        Motifs[0] = Dna[0][i:i+k]
        for j in range(1,t):
            Profile = form_Profile(Motifs[0:j],k)
            Motifs[j] = Profile_most_probable_kmer(Dna[j],k,Profile)
        if (Score(Motifs,k) < Score(BestMotifs,k)):
            BestMotifs = Motifs.copy()
    return BestMotifs

In [7]:
Dna = ['GGCGTTCAGGCA',
'AAGAATCAGTCA',
'CAAGGAGTTCGC',
'CACGTCAATCAC',
'CAATAATATTCG']
k=3
t=5

In [8]:
GreedyMotifSearch(Dna, k, t)

array(['TTC', 'ATC', 'TTC', 'ATC', 'TTC'], dtype='<U32')

In [9]:
Dna = []
k = 0
t = 0
file = open('dataset_369253_5.txt', 'r') 
for i, line in enumerate(file):
    line=line.rstrip('\n')
    if(i==0):
        (k,t)= line.split(' ')
        k = int(k)
        t = int(t)
    else:
        Dna.append(line)

In [10]:
for kmer in GreedyMotifSearch(Dna, k, t):
    print(kmer)

TAACGGAATATA
TGACGGTATATA
TAACGGAATCGA
TTACGGCATGTA
TCACGGTATATA
TCACGGGATCCA
TCACGGCATCAA
TTACGGGATCAA
TCACGGGATCCA
TCACGGAATGCA
TGACGGAATTAA
TCACGGTATATA
TGACGGCATCCA
TTACGGCATTCA
TCACGGGATGAA
TTACGGTATTGA
TGACGGGATTAA
TAACGGCATGCA
TCACGGTATGCA
TGACGGTATAGA
TAACGGTATCGA
TCACGGCATCAA
TTACGGGATATA
TCACGGAATTTA
TCACGGCATGCA


In [11]:
Dna = ['GGCGTTCAGGCA',
'AAGAATCAGTCA',
'CAAGGAGTTCGC',
'CACGTCAATCAC',
'CAATAATATTCG']
k = 3
t =5

In [12]:
GreedyMotifSearch(Dna, k, t)

array(['TTC', 'ATC', 'TTC', 'ATC', 'TTC'], dtype='<U32')

In [13]:
Dna = []
k = 0
t = 0
file = open('dataset_369254_9.txt', 'r') 
for i, line in enumerate(file):
    line=line.rstrip('\n')
    if(i==0):
        (k,t)= line.split(' ')
        k = int(k)
        t = int(t)
    else:
        Dna.append(line)

In [14]:
for kmer in GreedyMotifSearch(Dna, k, t):
    print(kmer)

CTACGGGTGCCC
CTCCCGGTGGCC
ATCCCGGTGCCC
GTGCGGGTGACC
CTGCAGGTGACC
GTCCCGGTGTCC
TTGCAGGTGTCC
CTACGGGTGACC
TTGCCGGTGTCC
CTCCAGGTGGCC
ATTCGGGTGCCC
CTCCCGGTGCCC
TTCCAGGTGGCC
GTTCAGGTGACC
TTGCGGGTGACC
CTCCCGGTGACC
ATACCGGTGCCC
ATGCAGGTGACC
GTCCTGGTGTCC
ATCCAGGTGGCC
TTGCTGGTGTCC
ATTCAGGTGCCC
CTGCGGGTGGCC
ATACTGGTGGCC
CTCCGGGTGACC


In [15]:
Dna=[
'ACTG',
'AGTC',
'GCTG',
'ACGT',
'AGCA',
'CCAG',
'TGTC',
'GATG',
'ATGT',
'AGAA'
]

In [18]:
Motif = Motif_Matrix(Dna,4)

In [19]:
for i,j in form_Profile(Motif,4)

{'A': array([0.63636364, 0.18181818, 0.27272727, 0.27272727]),
 'C': array([0.18181818, 0.45454545, 0.18181818, 0.27272727]),
 'G': array([0.27272727, 0.45454545, 0.27272727, 0.45454545]),
 'T': array([0.18181818, 0.18181818, 0.54545455, 0.27272727])}