In [None]:
from itertools import product

def hamming_distance(str1: str, str2: str) -> int:
    """Count the positions at which *str1* and *str2* differ.

    The strings are compared pairwise with ``zip``, so when their lengths
    differ only the overlapping prefix is compared and the extra tail of
    the longer string is ignored.
    """
    mismatches = 0
    for left, right in zip(str1, str2):
        if left != right:
            mismatches += 1
    return mismatches

def d(pattern: str, dna: list[str]) -> int:
    """Return the summed minimum Hamming distance from *pattern* to each string.

    For every string in *dna*, the pattern is compared against each window
    of ``len(pattern)`` characters and the smallest distance is kept; the
    per-string minima are then added together.

    A string shorter than the pattern has no window and contributes
    ``float('inf')``, which makes the total infinite. An empty *dna*
    yields 0.
    """
    width = len(pattern)

    def closest(sequence: str):
        # Every length-`width` window of the sequence, left to right.
        windows = (
            sequence[start:start + width]
            for start in range(len(sequence) - width + 1)
        )
        return min(
            (hamming_distance(pattern, window) for window in windows),
            default=float('inf'),
        )

    return sum(closest(sequence) for sequence in dna)

def median_string(dna: list[str], k: int) -> str:
    """Find a k-mer over ``ACGT`` minimising ``d(pattern, dna)``.

    All ``4**k`` candidates are tried in the order produced by
    ``product("ACGT", repeat=k)``; a candidate replaces the current winner
    only when its distance is strictly smaller, so the earliest candidate
    wins ties.

    Returns ``None`` if no candidate has a distance below infinity.
    """
    winner = None
    winner_distance = float('inf')

    # Exhaustive search: each tuple from product() is joined into a string.
    for candidate in map("".join, product("ACGT", repeat=k)):
        candidate_distance = d(candidate, dna)
        if candidate_distance < winner_distance:
            winner, winner_distance = candidate, candidate_distance

    return winner


# Sample input: five DNA strings and the motif length to search for.
k = 3
dna_strings = [
    "AAATTGACGCAT",
    "GACGACCACGTT",
    "CGTCAGCGCCTG",
    "GCTGAGCACCGG",
    "AGTACGGGACAG",
]

# Print the median k-mer found for the sample input.
print(median_string(dna_strings, k))


ACG


In [2]:
from collections import Counter

def compute_profile(motifs, k, t):
    """Build a per-position nucleotide frequency table for *motifs*.

    Args:
        motifs: strings indexed at positions ``0 .. k-1``.
        k: number of positions (columns) to tabulate.
        t: divisor applied to every count; the caller in this file passes
            the number of motifs.

    Returns:
        A dict with keys ``"A"``, ``"C"``, ``"G"``, ``"T"``, each mapping to
        a list of *k* values: the count of that nucleotide in the column
        divided by *t*.
    """
    profile = {base: [] for base in "ACGT"}

    for position in range(k):
        tally = Counter(motif[position] for motif in motifs)
        # A Counter reports 0 for nucleotides absent from the column.
        for base, row in profile.items():
            row.append(tally[base] / t)

    return profile

def most_probable_kmer(text, k, profile):
    """Return the length-*k* window of *text* with the highest profile probability.

    A window's probability is the product of ``profile[nucleotide][j]`` over
    its positions ``j``. Windows are scanned left to right and a window only
    replaces the current best when its probability is strictly greater, so
    the leftmost window wins ties. If *text* has no window of length *k*,
    ``text[:k]`` is returned.
    """
    def likelihood(window):
        value = 1
        for column, base in enumerate(window):
            value *= profile[base][column]
        return value

    # Start below any product of non-negative entries so the first window
    # always becomes the initial best.
    winner, winner_prob = text[:k], -1

    for start in range(len(text) - k + 1):
        window = text[start:start + k]
        window_prob = likelihood(window)
        if window_prob > winner_prob:
            winner, winner_prob = window, window_prob

    return winner

def score(motifs, k):
    """Count how many characters in *motifs* disagree with the column consensus.

    For each of the first *k* positions, the most common character in that
    column is taken as the consensus; every motif character that differs
    from it adds one to the score. Lower scores mean more similar motifs.
    """
    mismatches = 0
    for position in range(k):
        letters = Counter(motif[position] for motif in motifs)
        # Everything in the column except the most common letter is a mismatch.
        _, top_count = letters.most_common(1)[0]
        mismatches += sum(letters.values()) - top_count
    return mismatches

def greedy_motif_search(dna, k, t):
    """Greedily pick one k-mer from each of the first *t* strings of *dna*.

    Each k-mer of ``dna[0]`` seeds a candidate motif set. For every further
    string, a profile is built from the motifs chosen so far and the most
    probable k-mer of that string under the profile is appended. The
    candidate set with the lowest ``score`` wins; the set made of each
    string's first k-mer is the initial best, and a candidate must score
    strictly lower to replace it.

    Args:
        dna: list of DNA strings.
        k: motif length.
        t: number of strings from *dna* to use.

    Returns:
        A list of motifs, one per string used. An empty list is returned
        when *dna* is empty or *t* is not positive.

    Raises:
        IndexError: if *t* exceeds ``len(dna)`` and ``dna[0]`` has at least
            one k-mer.
    """
    if not dna or t <= 0:
        return []

    # Only the first t strings take part, so the initial best set has the
    # same length as every candidate set built below.
    best_motifs = [seq[:k] for seq in dna[:t]]

    if len(dna[0]) < k:
        # The first string has no k-mer to seed a candidate set with.
        return best_motifs

    # Track the best score instead of rescoring best_motifs every iteration.
    best_score = score(best_motifs, k)

    for i in range(len(dna[0]) - k + 1):
        motifs = [dna[0][i:i + k]]

        for j in range(1, t):
            profile = compute_profile(motifs, k, len(motifs))
            motifs.append(most_probable_kmer(dna[j], k, profile))

        candidate_score = score(motifs, k)
        if candidate_score < best_score:
            best_motifs, best_score = motifs, candidate_score

    return best_motifs


# Sample input: motif length, number of strings, and the strings themselves.
k = 3
t = 5
dna = [
    "GGCGTTCAGGCA",
    "AAGAATCAGTCA",
    "CAAGGAGTTCGC",
    "CACGTCAATCAC",
    "CAATAATATTCG",
]

# Run the greedy search and print the chosen motifs separated by spaces.
best_motifs = greedy_motif_search(dna, k, t)
print(*best_motifs)


CAG CAG CAA CAA CAA
