In [1]:
import random
import numpy as np
from collections import Counter

# Week 1

In [2]:
'''
Code Challenge: Implement PatternCount (reproduced below).
Input: Strings Text and Pattern.
Output: Count(Text, Pattern).
'''

def PatternCount(text, pattern):
    count = 0
    for start in range(len(text) - len(pattern) + 1):
        end = start + len(pattern)
        if pattern == text[start: end]:
            count = count + 1
    return(count)

# Test
text    = 'GCGCG'
pattern = 'GCG'

print(PatternCount(text, pattern))

2


In [3]:
'''
Code Challenge: Solve the Frequent Words Problem.
Input: A string Text and an integer k.
Output: All most frequent k-mers in Text.
'''

def FrequentWords(text, k):
    sub_texts = set()
    for start in range(len(text) - k + 1):
        end = start + k
        sub_texts.add(text[start: end])
        
    counts = {}
    for sub_text in sub_texts:
        counts [sub_text] = PatternCount(text, sub_text)
        
    max_count = max(counts.values())
    
    for count in counts:
        if counts[count] == max_count:
            print(count)

# Test      
text = 'ACGTTGCATGTCGCATGATGCATGAGAGCT'
k    = 4

FrequentWords(text, k)

GCAT
CATG


In [4]:
'''
Reverse Complement Problem: Find the reverse complement of a DNA string.
Input: A DNA string Pattern.
Output: Pattern_rc , the reverse complement of Pattern.
'''

def ReverseComplement(text):
    complementry_rule = {
        'A' : 'T',
        'T' : 'A',
        'C' : 'G',
        'G' : 'C' 
    }
    
    com_text = ''
    for base in text:
        com_text = com_text + complementry_rule[base]
    
    com_text = com_text[::-1]
    
    return (com_text)

# Test
text = 'AAAACCCGGT'

print(ReverseComplement(text))

ACCGGGTTTT


In [5]:
'''
Code Challenge: Solve the Pattern Matching Problem.
Input: Two strings, Pattern and Genome.
Output: A collection of space-separated integers specifying all starting positions where Pattern appears as a substring of Genome.
'''

def PatternMatching(pattern, genome):
    positions = []
    for start in range(len(genome) - len(pattern) + 1):
        end = start + len(pattern)
        if pattern == genome[start: end]:
            positions.append(start)
    return(positions)

# Test
pattern = 'ATAT'
genome  = 'GATATATGCATATACTT'

print(PatternMatching(pattern, genome))

[1, 3, 9]


In [6]:
'''
Code Challenge: Solve the Clump Finding Problem (restated below). You will need to make sure that your algorithm is efficient enough to handle a large dataset.

Clump Finding Problem: Find patterns forming clumps in a string.
Input: A string Genome, and integers k, L, and t.
Output: All distinct k-mers forming (L, t)-clumps in Genome.
'''

def ClumpFinding(genome, k, L, t):
    patterns_positions={}
    
    for start in range(len(genome) - k +1):
        end = start + k
        pattern = genome[start: end]
        if pattern in patterns_positions:
            patterns_positions[pattern].append(start)
        else:
            patterns_positions[pattern] = [start]
            
    qualified_patterns = []        
    for pattern, positions in patterns_positions.items():
        if len(positions) >= t:
            for position in positions:
                count = sum((np.array(positions) >= position) &
                      (np.array(positions) <= position + L))
                if count == t:
                    qualified_patterns.append(pattern)
                    break
    return(qualified_patterns)

# Test
genome = 'CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA'
k      = 5 
L      = 50
t      = 4

print(ClumpFinding(genome, k, L, t))

['CGACA', 'GAAGA']


In [7]:
'''
Code Challenge: Implement PatternToNumber.
Input: A DNA string Pattern.
Output: The integer PatternToNumber(Pattern).
'''

def PatternToNumber(pattern):
    if   pattern == 'A': return(0)
    elif pattern == 'C': return(1)
    elif pattern == 'G': return(2)
    elif pattern == 'T': return(3)
    else:
        number = PatternToNumber(pattern[:-1])*4 + PatternToNumber(pattern[-1])
        return(number)

# Test
pattern = 'AGT'

print(PatternToNumber(pattern))

11


In [8]:
'''
CODE CHALLENGE: Implement NumberToPattern.
Input: Integers index and k.
Output: The string NumberToPattern(index, k).
'''

def NumberToBase(number):
    if   number == 0: 
        return('A')
    elif number == 1: 
        return('C')
    elif number == 2: 
        return('G')
    elif number == 3:
        return('T')

def NumberToPattern(number, digits):
    pattern = ''
    for i in range(digits):
        base    = NumberToBase(number % 4)
        pattern = base + pattern
        number  = number // 4
    return(pattern)
        
# Test
number = 45
digits = 4

print(NumberToPattern(number, digits))

AGTC


In [9]:
'''
Code Challenge: Implement ComputingFrequencies to generate a frequency array.
Input: A DNA string Text followed by an integer k.
Output: FrequencyArray(Text).
'''

def ComputingFrequencies(text, k):
    FrequencyArray = [0] * (4**k)
    
    for start in range(len(text) - k +1):
        end               = start + k
        j                 = PatternToNumber(text[start:end])
        FrequencyArray[j] = FrequencyArray[j] + 1
    
    return (FrequencyArray)

# Test
text = 'ACGCGGCTCTGAAA'
k    = 2

print(ComputingFrequencies(text, k))

[2, 1, 0, 0, 0, 0, 2, 2, 1, 2, 1, 0, 0, 1, 1, 0]


In [10]:
def FasterFrequentWords(text, k):
    FrequentPatterns = []
    FrequencyArray   = ComputingFrequencies(text, k)
    maxCount         = max(FrequencyArray)
    
    for i, count in enumerate(FrequencyArray):
        if count == maxCount:
            pattern = NumberToPattern(i, k)
            FrequentPatterns.append(pattern)
            
    return (FrequentPatterns)

In [11]:
def BetterClumpFinding(genome, k, t, L):
    FrequentPatterns   = []
    FrequencyArray_all = [0] * (4 ** k)
    text               = genome[0: L]
    FrequencyArray     = ComputingFrequencies(text, k)
    for i in range(4 ** k):
        if FrequencyArray[i] >= t:
            FrequencyArray_all = 1
    for i in range(0, len(genome) - L +1):
        FirstPattern          = genome[i: k + i]
        index                 = PatternToNumber(FirstPattern)
        FrequencyArray[index] = FrequencyArray[index] - 1
        LastPattern           = genome[i + L - k: i + L]
        index                 = PatternToNumber(LastPattern)
        FrequencyArray[index] = FrequencyArray[index] + 1
        if FrequencyArray[index] >= t:
            FrequencyArray_all[index] =1
    for i in range(4 ** k):
        if FrequencyArray_all[i] == 1:
            pattern = NumberToPattern(i, k)
            FrequentPatterns.append(pattern)
    return (FrequentPatterns)

# Week 2

In [12]:
'''
Minimum Skew Problem: Find a position in a genome where the skew diagram attains a minimum.
Input: A DNA string Genome.
Output: All integer(s) i minimizing Skewi (Genome) among all values of i (from 0 to |Genome|).
'''

def MinimumSkew(Genome):
    
    scores = [0]
    for base in Genome:
        if   base == 'C': scores.append(scores[-1] - 1)
        elif base == 'G': scores.append(scores[-1] + 1)
        else :            scores.append(scores[-1])
            
    scores    = np.array(scores)
    min_index = list(np.where(scores == min(scores))[0])
    return(min_index)

# Test
Genome = 'TAAAGACTGCCGAGAGGCCAACACGAGTGCTAGAACGAGGGGCGTAAACGCGGGTCCGAT'

print(MinimumSkew(Genome))

[11, 24]


In [13]:
'''
Hamming Distance Problem: Compute the Hamming distance between two strings.
Input: Two strings of equal length.
Output: The Hamming distance between these strings.
'''

def HammingDistance(string1, string2):
    mismatch = 0
    for base1, base2 in zip (string1, string2):
        if base1 != base2: 
            mismatch = mismatch + 1
    return (mismatch)

# Test
string1 = 'GGGCCGTTGGT'
string2 = 'GGACCGTTGAC'

print(HammingDistance(string1, string2))

3


In [14]:
'''
Approximate Pattern Matching Problem: Find all approximate occurrences of a pattern in a string.
Input: Strings Pattern and Text along with an integer d.
Output: All starting positions where Pattern appears as a substring of Text with at most d mismatches.
'''

def ApproximatePatternMatching(Text, Pattern, d):
    positions   = []
    pattern_len = len(Pattern)
    
    for start in range(len(Text) - pattern_len + 1):
        end = start + pattern_len
        
        if HammingDistance(Pattern, Text[start: end]) <= d:
            positions.append(start)
            
    return (positions)

# Test
Pattern = 'ATTCTGGA'
Text    = 'CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT'
d       = 3

print(ApproximatePatternMatching(Text, Pattern, d))

[6, 7, 26, 27]


In [15]:
'''
Code Challenge: Implement ApproximatePatternCount.
Input: Strings Pattern and Text as well as an integer d.
Output: Countd(Text, Pattern).
'''

def ApproximatePatternCount(text, pattern, d):
    count = 0
    for start in range(len(text) - len(pattern) + 1):
        end = start + len(pattern)
        if HammingDistance(pattern, text[start: end]) <= d:
            count = count + 1
            
    return(count)

# Test
pattern = 'GAGG'
text    = 'TTTAGAGCCTTCAGAGG'
d       = 2

print(ApproximatePatternCount(text, pattern, d))

4


In [16]:
'''
Code Challenge: Implement Neighbors to find the d-neighborhood of a string.
Input: A string Pattern and an integer d.
Output: The collection of strings Neighbors(Pattern, d). 
'''

def ImmediateNeighbors(pattern):    
    bases     = ['A', 'C', 'G', 'T']
    neighbors = []
    
    for i in range(len(pattern)):
        other_bases = bases.copy()
        other_bases.remove(pattern[i])
        
        for base in other_bases:
            neighbor    = list(pattern)
            neighbor[i] = base
            neighbor    = ''.join(neighbor)
            neighbors.append(neighbor)
    
    return(neighbors)


def Neighbors(pattern, d):
    bases     = ['A', 'C', 'G', 'T']
    
    if d == 0:
        return (pattern)
    if len(pattern) == 1:
        return (bases)

    neighbors       = []
    suffixNeighbors = Neighbors(pattern[1:], d)
    for text in suffixNeighbors:
        if HammingDistance(pattern[1:], text) < d:
            for base in bases:
                neighbors.append(base + text)
        else :
            neighbors.append(pattern[0] + text)
    return(neighbors)

# Test
pattern = 'ACG'
d       = 1

print(Neighbors(pattern, d))

['ACA', 'ACC', 'AAG', 'ACG', 'CCG', 'GCG', 'TCG', 'AGG', 'ATG', 'ACT']


In [17]:
'''
Frequent Words with Mismatches Problem: Find the most frequent k-mers with mismatches in a string.
Input: A string Text as well as integers k and d. (You may assume k ≤ 12 and d ≤ 3.)
Output: All most frequent k-mers with up to d mismatches in Text.
'''

def ComputingFrequenciesWithMismatches(text, k, d):
    FrequencyArray = [0] * (4 ** k)
    neighborhoods  = []
    for start in range(len(text) - k + 1):
        end          = start + k
        pattern      = text[start: end]
        neighborhood = Neighbors(pattern, d)
        if type(neighborhood) == str:
            neighborhoods.append(neighborhood)
        else:
            neighborhoods.extend(neighborhood)

    for string in neighborhoods:
        j                 = PatternToNumber(string)
        FrequencyArray[j] = FrequencyArray[j] + 1
    
    return (FrequencyArray)

def FrequentWordsWithMismatches(text, k, d):
    FrequencyPatterns = []
    FrequencyArray    = np.array(ComputingFrequenciesWithMismatches(text, k, d))
    max_frequency     = max(FrequencyArray)
    max_index         = list(np.where(FrequencyArray == max_frequency)[0])

    for index in max_index:
        FrequencyPatterns.append(NumberToPattern(index, k))
    
    return (FrequencyPatterns)

# Test
text = 'ACGTTGCATGTCGCATGATGCATGAGAGCT'
k    = 4 
d    = 1

print(FrequentWordsWithMismatches(text, k, d))

['ATGC', 'ATGT', 'GATG']


In [18]:
'''
Frequent Words with Mismatches and Reverse Complements Problem: Find the most frequent k-mers (with mismatches and reverse complements) in a string.
Input: A DNA string Text as well as integers k and d.
Output: All k-mers Pattern maximizing the sum Countd(Text, Pattern)+ Countd(Text, Patternrc) over all possible k-mers.
'''

def FrequentWordsWithMismatchesAndReverseComplements(text, k, d):
    FrequencyPatterns = []
    FrequencyArray    = np.array(ComputingFrequenciesWithMismatches(text, k, d))
    R_FrequencyArray  = np.array(ComputingFrequenciesWithMismatches(ReverseComplement(text), k, d))
    FrequencyArray    = FrequencyArray + R_FrequencyArray
    max_frequency     = max(FrequencyArray)
    max_index         = list(np.where(FrequencyArray == max_frequency)[0])

    for index in max_index:
        FrequencyPatterns.append(NumberToPattern(index, k))
        
    return (FrequencyPatterns)

# Test
text = 'ACGTTGCATGTCGCATGATGCATGAGAGCT'
k    = 4 
d    = 1

print(FrequentWordsWithMismatchesAndReverseComplements(text, k, d))

['ACAT', 'ATGT']


# Week 3

In [19]:
'''
Code Challenge: Implement MotifEnumeration.
Input: Integers k and d, followed by a collection of strings Dna.
Output: All (k, d)-motifs in Dna.
'''

def MotifEnumeration(Dnas, k, d):
    Dnas = Dnas.split(" ") 
    patterns     = []
    patterns_all = []
    for text in Dnas:
        patterns_text = []
        for start in range(len(text) - k + 1):
            end     = start + k
            pattern = list(text[start: end])
            patterns_text.extend(Neighbors(pattern, d)) 
            
        patterns_text = list(set(patterns_text))
        patterns_all.extend(patterns_text)
        
    patterns_count = Counter(patterns_all)
    for pattern, count in patterns_count.items():
        if count == len(Dnas):
            patterns.append(pattern)
            
    return (patterns)

# Test
k    = 3
d    = 1
Dnas ='ATTTGGC TGCCTTA CGGTATC GAAAATT'

print(MotifEnumeration(Dnas, k, d))

['GTT', 'TTT', 'ATT', 'ATA']


In [20]:
'''
Code Challenge: Implement DistanceBetweenPatternAndStrings.
Input: A string Pattern followed by a collection of strings Dna.
Output: d(Pattern, Dna).
'''

def DistanceBetweenPatternAndStrings(pattern, dnas):
    if type(dnas) == str:
        dnas = dnas.split(" ")
        
    distance = 0
    k        = len(pattern) 
    
    for text in dnas:
        H_distance = float('Inf')
        for start in range(len(text) - k + 1):
            end    = start + k
            string = text[start: end]
            
            if H_distance > HammingDistance(pattern, string):
                H_distance = HammingDistance(pattern, string)
        distance = distance + H_distance
    
    return (distance)

# Test
pattern = 'AAA'
dnas    = 'TTACCTTAAC GATATCTGTC ACGGCGTTCG CCCTAAAGAG CGTCAGAGGT'

print(DistanceBetweenPatternAndStrings(pattern, dnas))

5


In [21]:
'''
Code Challenge: Implement MedianString.
Input: An integer k, followed by a collection of strings Dna.
Output: A k-mer Pattern that minimizes d(Pattern, Dna) among all possible choices of k-mers. 
'''

def MedianString(dnas, k):
    if type(dnas) == str:
        dnas = dnas.split(" ")
        
    distance = float('Inf')
    
    for i in range(4 ** k):
        pattern = NumberToPattern(i, k)
        
        if distance > DistanceBetweenPatternAndStrings(pattern, dnas):
            distance = DistanceBetweenPatternAndStrings(pattern, dnas)
            median   = pattern
            
    return(median)

# Test
k    = 3
dnas = 'AAATTGACGCAT GACGACCACGTT CGTCAGCGCCTG GCTGAGCACCGG AGTTCGGGACAG'

print(MedianString(dnas, k))

GAC


In [22]:
'''
Profile-most Probable k-mer Problem: Find a Profile-most probable k-mer in a string.
Input: A string Text, an integer k, and a 4 × k matrix Profile.
Output: A Profile-most probable k-mer in Text.
'''

def ProfileMostProbableKmer(text, k, profile):
    if type(profile) == str:
        profile = profile.split(" ")
    profile        = np.array(profile, dtype='f').reshape(4,k)
    prob_high      = 0
    output_pattern = text[0: k]
    
    for start in range(len(text) - k + 1):
        end      = start + k
        pattern  = text[start: end]
        prob_pattern = 1
        for i in range(k):
            if   pattern[i] == 'A': prob_pattern = prob_pattern * profile[0, i]
            elif pattern[i] == 'C': prob_pattern = prob_pattern * profile[1, i]
            elif pattern[i] == 'G': prob_pattern = prob_pattern * profile[2, i]
            elif pattern[i] == 'T': prob_pattern = prob_pattern * profile[3, i]
                
        if prob_pattern > prob_high:
            prob_high      = prob_pattern
            output_pattern = pattern
            
    return (output_pattern)

# Test
text = 'ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT'
k    = 5
profile = '0.2 0.2 0.3 0.2 0.3 0.4 0.3 0.1 0.5 0.1 0.3 0.3 0.5 0.2 0.4 0.1 0.2 0.1 0.1 0.2'

print(ProfileMostProbableKmer(text, k, profile))

CCGAG


In [23]:
'''
Code Challenge: Implement GreedyMotifSearch.
Input: Integers k and t, followed by a collection of strings Dna.
Output: A collection of strings BestMotifs resulting from applying GreedyMotifSearch(Dna, k, t).
'''

def ProfileMatrix(strings):
    strings  = list(strings)
    n_string = len(strings)
    k        = len(strings[0])
    profile  = np.array([0] * 4 * k, dtype='f').reshape(4, k)
    
    for i in range(k):
        base_i = []
        for string in strings:
            base_i.append(string[i])
        count_base    = Counter(base_i)
        profile[0, i] = count_base['A'] / n_string
        profile[1, i] = count_base['C'] / n_string
        profile[2, i] = count_base['G'] / n_string
        profile[3, i] = count_base['T'] / n_string
  
    return (profile)

def GreedyMotifSearch(dnas, k, t):
    if type(dnas) == str:
        dnas = dnas.split(" ")
    
    best_motifs = []
    for string in dnas:
        best_motifs.append (string[0:k])
    base_string   = dnas[0]
    other_strings = dnas[1:]
    
    for start in range(len(base_string) - k + 1):
        end    = start + k 
        motif  = base_string[start: end]
        motifs = [motif]
        for string in other_strings:
            profile   = ProfileMatrix(motifs)
            new_motif = ProfileMostProbableKmer(string, k, profile)
            motifs.append(new_motif)
            
        best_score    = DistanceBetweenPatternAndStrings(best_motifs[0], best_motifs)
        current_score = DistanceBetweenPatternAndStrings(motifs[0], motifs)
        if best_score > current_score:
            best_motifs = motifs
            
    return(best_motifs)

# Test
k    = 3 
t    = 5
dnas = 'GGCGTTCAGGCA AAGAATCAGTCA CAAGGAGTTCGC CACGTCAATCAC CAATAATATTCG'

print(GreedyMotifSearch(dnas, k, t))

['TCA', 'TCA', 'CAA', 'TCA', 'TAA']


In [24]:
'''
Code Challenge: Implement GreedyMotifSearch with pseudocounts.
Input: Integers k and t, followed by a collection of strings Dna.
Output: A collection of strings BestMotifs resulting from applying GreedyMotifSearch(Dna, k, t) with pseudocounts.
'''

def ProfileMatrix_Pseudocounts(strings):
    strings  = list(strings)
    n_string = len(strings)
    k        = len(strings[0])
    profile  = np.array([0] * 4 * k, dtype='f').reshape(4, k)
    
    for i in range(k):
        base_i = []
        for string in strings:
            base_i.append(string[i])
        count_base    = Counter(base_i)
        profile[0, i] = (count_base['A'] + 1) / (n_string + 4)
        profile[1, i] = (count_base['C'] + 1) / (n_string + 4)
        profile[2, i] = (count_base['G'] + 1) / (n_string + 4)
        profile[3, i] = (count_base['T'] + 1) / (n_string + 4)
  
    return (profile)

def GreedyMotifSearch_Pseudocounts(dnas, k, t):
    if type(dnas) == str:
        dnas = dnas.split(" ")
    
    best_motifs = []
    for string in dnas:
        best_motifs.append (string[0:k])
    base_string   = dnas[0]
    other_strings = dnas[1:]
    
    for start in range(len(base_string) - k + 1):
        end    = start + k 
        motif  = base_string[start: end]
        motifs = [motif]
        for string in other_strings:
            profile   = ProfileMatrix_Pseudocounts(motifs)
            new_motif = ProfileMostProbableKmer(string, k, profile)
            motifs.append(new_motif)
            
        best_score    = DistanceBetweenPatternAndStrings(best_motifs[0], best_motifs)
        current_score = DistanceBetweenPatternAndStrings(motifs[0], motifs)
        if best_score > current_score:
            best_motifs = motifs
            
    return(best_motifs)

# Test
k    = 3 
t    = 5
dnas = 'GGCGTTCAGGCA AAGAATCAGTCA CAAGGAGTTCGC CACGTCAATCAC CAATAATATTCG'

print(GreedyMotifSearch_Pseudocounts(dnas, k, t))

['TTC', 'ATC', 'TTC', 'ATC', 'TTC']


# Week 4

In [25]:
'''
Code Challenge: Implement RandomizedMotifSearch.
Input: Integers k and t, followed by a collection of strings Dna.
Output: A collection BestMotifs resulting from running RandomizedMotifSearch(Dna, k, t) 1,000 times. 
'''

def Score(motifs):
    if type(motifs) == str:
        motifs = motifs.split(" ")
    
    All_base = []
    for motif in motifs:
        All_base.extend(list(motif))
    All_base = np.array(All_base).reshape(len(motifs), len(motifs[0]))
    
    score = 0
    for i in range(k):
        count     = Counter(All_base[:,i])
        score = score + len(motifs) - count.most_common(1)[0][1]

    return (score)

def RandomizedMotifSearch(dnas, k, t):
    if type(dnas) == str:
        dnas = dnas.split("\n")
    
    motifs = []
    for dna in dnas:
        start = random.randrange(len(dna) - k + 1)
        end   = start + k
        motif = dna[start: end]
        motifs.append(motif)
    best_motifs = motifs
    
    while True: 
        profile = ProfileMatrix_Pseudocounts(motifs)
        motifs  = []
        for dna in dnas:
            motif = ProfileMostProbableKmer(dna, k, profile)
            motifs.append(motif)
            
        score_best    = Score(best_motifs)
        score_current = Score(motifs)
        
        if score_best > score_current:
            best_motifs = motifs
        else:
            return (best_motifs)
    

k = 8
t = 5
dnas='''CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG
TAGTACCGAGACCGAAAGAAGTATACAGGCGT
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC
AATCCACCAGCTCCACGTGCAATGTTGGCCTA'''

# Run the RandomizedMotifSearch 1000 times
motifs      = RandomizedMotifSearch(dnas, k, t)
best_motifs = motifs
loop   = 1000
for i in range(loop):
    
    motifs        = RandomizedMotifSearch(dnas, k, t)
    profile       = ProfileMatrix_Pseudocounts(motifs)
    
    score_best    = Score(best_motifs)
    score_current = Score(motifs)

    if score_best > score_current:
        best_motifs = motifs
        
print(best_motifs)

['TCTCGGGG', 'TGTAAGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG']


In [26]:
'''
Code Challenge: Implement GibbsSampler.
Input: Integers k, t, and N, followed by a collection of strings Dna.
Output: The strings BestMotifs resulting from running GibbsSampler(Dna, k, t, N) with 20 random starts. 
'''


def Random(motifs, profile):
    #if type(profile) == str:
        #profile = profile.split(" ")
    #profile  = np.array(profile, dtype='f').reshape(4, int(len(profile) / 4))
    prob_all = []
    
    for motif in motifs:
        prob_motif = 1
        for i in range(len(motif)):
            if   motif[i] == 'A': prob_motif = prob_motif * profile[0, i]
            elif motif[i] == 'C': prob_motif = prob_motif * profile[1, i]
            elif motif[i] == 'G': prob_motif = prob_motif * profile[2, i]
            elif motif[i] == 'T': prob_motif = prob_motif * profile[3, i]
        prob_all.append(prob_motif)
    
    sum_prob = sum(prob_all)
    for i in range(len(motifs)):
        prob_all[i] = prob_all[i]/sum_prob
    
    choose_index = random.choices(range(len(motifs)), prob_all)[0]
    
    return(choose_index)

def GibbsSampler(dnas, k, t, N):
    if type(dnas) == str:
        dnas = dnas.split("\n")
        
    '''
    motifs      = RandomizedMotifSearch(dnas, k, t)
    best_motifs = motifs
    loop        = 30
    for i in range(loop):

        motifs        = RandomizedMotifSearch(dnas, k, t)
        profile       = ProfileMatrix_Pseudocounts(motifs)

        score_best    = Score(best_motifs)
        score_current = Score(motifs)

        if score_best > score_current:
            best_motifs = motifs
    
    motifs = best_motifs
    '''
    
    motifs = []
    for dna in dnas:
        start = random.randrange(len(dna) - k + 1)
        end   = start + k
        motif = dna[start: end]
        motifs.append(motif)
        
    
    profile = ProfileMatrix_Pseudocounts(motifs)
    best_motifs = motifs
    
    for loop in range(N):
        i          = Random(motifs, profile)
        motifs_tmp = motifs[:i] + motifs[(i+1):]
        profile    = ProfileMatrix_Pseudocounts(motifs_tmp)
        motifs[i]  = ProfileMostProbableKmer(dnas[i], k, profile)
        
        score_best    = Score(best_motifs)
        score_current = Score(motifs)

        if score_best > score_current:
            best_motifs = motifs
    return(best_motifs)

k = 8
t = 5
N = 100
dnas = '''CGCCCCTCTCGGGGGTGTTCAGTAACCGGCCA
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG
TAGTACCGAGACCGAAAGAAGTATACAGGCGT
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC
AATCCACCAGCTCCACGTGCAATGTTGGCCTA'''

# Run the RandomizedMotifSearch 1000 times
motifs      = GibbsSampler(dnas, k, t, N)
best_motifs = motifs
loop   = 1000
for i in range(loop):   
    
    motifs        = GibbsSampler(dnas, k, t, N)    
    score_best    = Score(best_motifs)
    score_current = Score(motifs)

    if score_best > score_current:
        best_motifs = motifs

print(best_motifs)

['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG']
