## Problem: BA2G
Implement GibbsSampler
#### link: https://rosalind.info/problems/ba2g/

In [8]:
import random

In [9]:
# Score calculation function
def score(motif_set):
    """Return the total number of non-consensus symbols over all columns.

    Column positions run over the length of the first motif. For each column,
    the largest count among 'A', 'C', 'G' and 'T' is subtracted from the
    number of motifs, and these per-column differences are summed. Symbols
    other than those four never count towards the consensus.
    """
    width = len(motif_set[0])
    height = len(motif_set)
    mismatches = 0
    for pos in range(width):
        column = [row[pos] for row in motif_set]
        consensus_count = max(column.count(base) for base in "ACGT")
        mismatches += height - consensus_count
    return mismatches

In [10]:
# Build profile with pseudocounts
def build_profile(motif_set):
    """Build a 4-row profile matrix with add-one pseudocounts.

    Rows are ordered A, C, G, T and there is one column per position of the
    first motif. Entry [r][i] is (occurrences of base r in column i + 1)
    divided by (number of motifs + 4).
    """
    width = len(motif_set[0])
    denominator = len(motif_set) + 4
    # Materialise each column once, then count every base against it.
    columns = [[motif[pos] for motif in motif_set] for pos in range(width)]
    return [
        [(column.count(base) + 1) / denominator for column in columns]
        for base in "ACGT"
    ]

In [11]:
# Probability of a kmer under a profile matrix
def probability(kmer, matrix):
    """Return the product of the profile entries selected by ``kmer``.

    ``matrix`` is read as matrix[row][position], with rows 0-3 standing for
    A, C, G and T. Symbols other than those four leave the product unchanged,
    and an empty ``kmer`` yields 1.
    """
    row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    product = 1
    for pos, base in enumerate(kmer):
        row = row_of.get(base)
        if row is not None:
            product *= matrix[row][pos]
    return product


# Profile most probable kmer
def most_probable_kmer(dna, k, matrix):
    """Return the earliest length-``k`` window of ``dna`` with the highest probability.

    Windows are scanned left to right and only a strictly higher probability
    replaces the current best, so ties keep the earliest window. Returns ""
    when no window scores above the -1e9 starting threshold, which includes
    the case where ``dna`` is shorter than ``k``.
    """
    best_kmer = ""
    best_prob = -1e9
    for start in range(len(dna) - k + 1):
        window = dna[start:start + k]
        window_prob = probability(window, matrix)
        if window_prob > best_prob:
            best_kmer, best_prob = window, window_prob
    return best_kmer

In [14]:
def GibbsSampler(Dna, k, t, N):
    """Search for a low-scoring set of k-mer motifs by Gibbs sampling.

    Starts from one uniformly random k-mer per string in ``Dna``. On each of
    the ``N`` iterations one of the first ``t`` strings is picked at random,
    a pseudocount profile is built from the motifs of all other strings, and
    the picked string's motif is replaced by a k-mer drawn at random with
    probability proportional to its profile probability.

    Args:
        Dna: sequence of DNA strings, each at least ``k`` characters long.
        k: motif length.
        t: number of strings to resample; expected to equal ``len(Dna)``.
        N: number of sampling iterations.

    Returns:
        The list of motifs (one per string) with the lowest score seen.

    Raises:
        ValueError: from ``random.randint`` when a string is shorter than ``k``.
    """
    # Step 1: pick a uniformly random starting k-mer in every string.
    motifs = []
    for dna in Dna:
        start = random.randint(0, len(dna) - k)
        motifs.append(dna[start:start + k])
    best_motifs = motifs[:]
    best_score = score(best_motifs)

    for _ in range(N):
        i = random.randint(0, t - 1)
        # Build the profile from every motif except the i-th one.
        profile = build_profile(motifs[:i] + motifs[i + 1:])
        # Draw the replacement at random, weighted by profile probability.
        # Always taking the single most probable k-mer would make this a
        # deterministic greedy step that cannot escape a poor local optimum.
        candidates = [Dna[i][j:j + k] for j in range(len(Dna[i]) - k + 1)]
        weights = [probability(kmer, profile) for kmer in candidates]
        motifs[i] = random.choices(candidates, weights=weights)[0]
        current_score = score(motifs)
        if current_score < best_score:
            best_score = current_score
            # Copy, so later in-place replacements do not alter the best set.
            best_motifs = motifs[:]

    return best_motifs


In [None]:
# Motif length and iteration count are fixed for this run.
k, N = 8, 100
Dna = input("enter DNA: ").split()
# Derive t from the strings actually entered instead of hard-coding 5, so
# GibbsSampler neither indexes past the end of Dna nor ignores extra strings.
t = len(Dna)
result = GibbsSampler(Dna, k, t, N)
# Use a real loop name: assigning to `_` shadows IPython's last-output variable.
for motif in result:
    print(motif)