In [None]:
def hamming_distance(p: str, q: str) -> int:
    """Count the positions at which ``p`` and ``q`` hold different symbols.

    Only the first ``len(p)`` positions are inspected: trailing characters of a
    longer ``q`` are ignored, while a ``q`` shorter than ``p`` raises
    ``IndexError``.

    Args:
        p: First string; its length determines how many positions are compared.
        q: Second string, indexed at every position of ``p``.

    Returns:
        The number of indices ``i`` with ``p[i] != q[i]``.
    """
    return sum(1 for index, symbol in enumerate(p) if symbol != q[index])

def neighbors(s: str, d: int) -> set[str]:
    """Return the d-neighborhood of ``s`` over the alphabet A, T, C, G.

    The neighborhood is built recursively from the neighborhood of the suffix
    ``s[1:]``: a suffix neighbor that still has mismatches to spare may be
    prefixed with any nucleotide, otherwise it must keep the first symbol of
    ``s``.

    Args:
        s: The pattern whose neighbors are generated.
        d: Maximum number of mismatches allowed.

    Returns:
        A set of strings of the same length as ``s``. For ``d == 0`` or an
        empty ``s`` the set contains only ``s`` itself.
    """
    # An empty pattern has no positions to mutate; returning early also stops
    # the recursion below from slicing an empty string forever.
    if d == 0 or not s:
        return {s}
    if len(s) == 1:
        return {"A", "T", "C", "G"}

    first, suffix = s[0], s[1:]
    neighborhood = set()
    for candidate in neighbors(suffix, d):
        # Suffix neighbors always have the same length as the suffix, so a
        # pairwise comparison yields their Hamming distance.
        mismatches = sum(a != b for a, b in zip(suffix, candidate))
        if mismatches < d:
            # One mismatch is still available: any first symbol is allowed.
            neighborhood.update(nuc + candidate for nuc in "ATCG")
        else:
            neighborhood.add(first + candidate)
    return neighborhood

def motif_enumeration(dna: list[str], k: int, d: int) -> list[str]:
    """Find every k-mer that occurs in each string of ``dna`` with at most ``d`` mismatches.

    Args:
        dna: The collection of strings to search.
        k: Length of the motifs.
        d: Maximum number of mismatches allowed per occurrence.

    Returns:
        The distinct motifs in sorted order (empty when ``dna`` is empty).
    """
    if not dna:
        return []

    def occurs_in(text: str, candidate: str) -> bool:
        """Report whether some k-mer of ``text`` is within ``d`` mismatches of ``candidate``."""
        return any(
            hamming_distance(text[j:j + k], candidate) <= d
            for j in range(len(text) - k + 1)
        )

    # A motif has to appear (with mismatches) in every string, the first one
    # included, so it is always a neighbor of some k-mer of the first string.
    # Generating candidates from that string alone is therefore sufficient.
    first = dna[0]
    candidates = set()
    for i in range(len(first) - k + 1):
        candidates.update(neighbors(first[i:i + k], d))

    # Sorting makes the result independent of set iteration order.
    return sorted(
        candidate
        for candidate in candidates
        if all(occurs_in(text, candidate) for text in dna)
    )

In [None]:
def hamming_distance(p: str, q: str) -> int:
    """Count the positions at which ``p`` and ``q`` hold different symbols.

    Only the first ``len(p)`` positions are inspected: trailing characters of a
    longer ``q`` are ignored, while a ``q`` shorter than ``p`` raises
    ``IndexError``.

    Args:
        p: First string; its length determines how many positions are compared.
        q: Second string, indexed at every position of ``p``.

    Returns:
        The number of indices ``i`` with ``p[i] != q[i]``.
    """
    return sum(1 for index, symbol in enumerate(p) if symbol != q[index])

def DistanceBetweenPatternAndStrings(pattern: str, dna: list[str]) -> int:
    """Sum, over all strings in ``dna``, the distance from ``pattern`` to its closest k-mer.

    For each string the minimum Hamming distance between ``pattern`` and any
    window of length ``len(pattern)`` is taken; these minima are added up.

    Args:
        pattern: The k-mer being evaluated.
        dna: The strings to compare against.

    Returns:
        The total distance. A string shorter than ``pattern`` has no window and
        contributes ``float('inf')``, which makes the total infinite.
    """
    k = len(pattern)
    distance = 0
    for text in dna:
        # Each window has exactly k symbols, so a pairwise comparison with the
        # pattern yields the Hamming distance; it is computed once per window.
        distance += min(
            (
                sum(a != b for a, b in zip(pattern, text[i:i + k]))
                for i in range(len(text) - k + 1)
            ),
            default=float('inf'),
        )
    return distance

def AllStrings(k: int) -> list[str]:
    """Enumerate every string of length ``k`` over the alphabet A, T, C, G.

    The strings are ordered with the first symbol varying slowest, following
    the alphabet order A, T, C, G.

    Args:
        k: Length of the strings to generate.

    Returns:
        A list of ``4 ** k`` strings; ``[""]`` when ``k`` is 0.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")
    kmers = [""]
    # Grow the strings one symbol at a time by prefixing every nucleotide.
    for _ in range(k):
        kmers = [nuc + tail for nuc in "ATCG" for tail in kmers]
    return kmers

def median_string(dna: list[str], k: int) -> str:
    """Find a k-mer minimising the total distance to the strings in ``dna``.

    Every k-mer produced by ``AllStrings(k)`` is scored with
    ``DistanceBetweenPatternAndStrings``; the first one reaching the smallest
    distance is returned.

    Args:
        dna: The strings to search.
        k: Length of the median string.

    Returns:
        A k-mer with minimal total distance.

    Raises:
        ValueError: If no k-mer has a finite distance.
    """
    best_distance = float('inf')
    median = None
    for pattern in AllStrings(k):
        # Score each candidate once and reuse the value for the update.
        current = DistanceBetweenPatternAndStrings(pattern, dna)
        if current < best_distance:
            best_distance = current
            median = pattern
    if median is None:
        raise ValueError("no k-mer has a finite distance to the given strings")
    return median

In [None]:
def profile_most_probable_kmer(text: str, k: int,
                               profile: list[dict[str, float]]) -> str:
    """Identifies the most probable k-mer according to a given profile matrix.

    The profile matrix is represented as a list of columns, where the i-th element is a map
    whose keys are strings ("A", "C", "G", and "T") and whose values represent the probability
    associated with this symbol in the i-th column of the profile matrix.

    Args:
        text: The string whose k-mers are scored.
        k: Length of the k-mers.
        profile: The profile matrix, one map per column.

    Returns:
        The k-mer of ``text`` with the highest probability; on ties the one
        occurring first in ``text``.

    Raises:
        ValueError: If ``text`` is shorter than ``k``.
    """
    if len(text) < k:
        raise ValueError("text is shorter than k")
    best_kmer = None
    best_prob = None
    for start in range(len(text) - k + 1):
        kmer = text[start:start + k]
        prob = 1
        for position, symbol in enumerate(kmer):
            prob *= profile[position][symbol]
        # Strict comparison keeps the earliest k-mer among equally probable ones.
        if best_prob is None or prob > best_prob:
            best_prob = prob
            best_kmer = kmer
    return best_kmer

In [None]:
def generate_profile(motifs: list[str]) -> list[dict[str, float]]:
    """Build a profile matrix with pseudocounts from a list of equally long motifs.

    Each nucleotide count in a column is increased by one (Laplace's rule)
    before the column is normalised, so every entry equals
    ``(count + 1) / (t + 4)`` and each column sums to 1.

    Args:
        motifs: The motifs; the length of the first one sets the number of columns.

    Returns:
        One map per column with the keys "A", "T", "C", "G" (in that order)
        and the corresponding probabilities as values.

    Raises:
        IndexError: If ``motifs`` is empty or a motif is shorter than the first one.
        KeyError: If a motif contains a symbol other than A, T, C, G.
    """
    t = len(motifs)
    k = len(motifs[0])
    # The four pseudocounts add to the t real observations of every column.
    total = t + 4
    profile = []
    for i in range(k):
        counts = {"A": 1, "T": 1, "C": 1, "G": 1}
        for motif in motifs:
            counts[motif[i]] += 1
        profile.append({nuc: count / total for nuc, count in counts.items()})
    return profile

def score(motifs: list[str], profile: list[dict[str, float]]) -> int:
    """Count how many motif symbols disagree with the consensus of ``profile``.

    The consensus symbol of a column is the first key, in the column's
    iteration order, that carries the column's largest value.

    Args:
        motifs: The motifs to score.
        profile: One map per column from nucleotide to its value.

    Returns:
        The total number of positions, over all motifs, whose symbol differs
        from the consensus symbol of that column.
    """
    # max() returns the first maximal key, which settles ties by key order.
    consensus = [max(column, key=column.get) for column in profile]
    return sum(
        1
        for motif in motifs
        for position in range(len(motif))
        if motif[position] != consensus[position]
    )

def profile_most_probable_kmer(text: str, k: int, profile: list[dict[str, float]]) -> str:
    """Return the k-mer of ``text`` that is most probable under ``profile``.

    The probability of a k-mer is the product, over its positions, of the value
    stored for its symbol in the matching profile column.

    Args:
        text: The string whose k-mers are scored.
        k: Length of the k-mers.
        profile: One map per column from nucleotide to probability.

    Returns:
        The k-mer with the highest probability; on ties the one occurring
        first in ``text``.

    Raises:
        ValueError: If ``text`` is shorter than ``k``.
    """
    if len(text) < k:
        raise ValueError("text is shorter than k")
    best_kmer = None
    best_prob = None
    for start in range(len(text) - k + 1):
        kmer = text[start:start + k]
        prob = 1
        for position, symbol in enumerate(kmer):
            prob *= profile[position][symbol]
        # Strict comparison keeps the earliest k-mer among equally probable ones.
        if best_prob is None or prob > best_prob:
            best_prob = prob
            best_kmer = kmer
    return best_kmer


# Please do not remove package declarations because these are used by the autograder.
# Insert your greedy_motif_search function here, along with any subroutines you need
def greedy_motif_search_pseudocounts(dna: list[str], k: int, t: int) -> list[str]:
    """Run the greedy motif search, rebuilding the profile after every chosen motif.

    The starting answer is the first k-mer of every string. Then each k-mer of
    the first string seeds a motif list that is extended string by string with
    the profile-most-probable k-mer, and the list with the lowest score wins.

    Args:
        dna: The strings to search.
        k: Length of the motifs.
        t: Number of strings of ``dna`` used when extending a seed.

    Returns:
        The best-scoring motif list found; the initial list is kept unless a
        strictly lower score is reached.
    """
    best_motifs = [text[:k] for text in dna]
    best_score = score(best_motifs, generate_profile(best_motifs))

    first = dna[0]
    for start in range(len(first) - k + 1):
        motifs = [first[start:start + k]]
        profile = generate_profile(motifs)
        for row in range(1, t):
            motifs.append(profile_most_probable_kmer(dna[row], k, profile))
            # The profile always reflects every motif chosen so far.
            profile = generate_profile(motifs)
        candidate_score = score(motifs, profile)
        if candidate_score < best_score:
            best_score, best_motifs = candidate_score, motifs
    return best_motifs

In [None]:
import random

def generate_profile(motifs: list[str]) -> list[dict[str, float]]:
    """Build a profile matrix with pseudocounts from a list of equally long motifs.

    Each nucleotide count in a column is increased by one (Laplace's rule)
    before the column is normalised, so every entry equals
    ``(count + 1) / (t + 4)`` and each column sums to 1.

    Args:
        motifs: The motifs; the length of the first one sets the number of columns.

    Returns:
        One map per column with the keys "A", "T", "C", "G" (in that order)
        and the corresponding probabilities as values.

    Raises:
        IndexError: If ``motifs`` is empty or a motif is shorter than the first one.
        KeyError: If a motif contains a symbol other than A, T, C, G.
    """
    t = len(motifs)
    k = len(motifs[0])
    # The four pseudocounts add to the t real observations of every column.
    total = t + 4
    profile = []
    for i in range(k):
        counts = {"A": 1, "T": 1, "C": 1, "G": 1}
        for motif in motifs:
            counts[motif[i]] += 1
        profile.append({nuc: count / total for nuc, count in counts.items()})
    return profile

def score(motifs: list[str], profile: list[dict[str, float]]) -> int:
    """Count how many motif symbols disagree with the consensus of ``profile``.

    The consensus symbol of a column is the first key, in the column's
    iteration order, that carries the column's largest value.

    Args:
        motifs: The motifs to score.
        profile: One map per column from nucleotide to its value.

    Returns:
        The total number of positions, over all motifs, whose symbol differs
        from the consensus symbol of that column.
    """
    # max() returns the first maximal key, which settles ties by key order.
    consensus = [max(column, key=column.get) for column in profile]
    return sum(
        1
        for motif in motifs
        for position in range(len(motif))
        if motif[position] != consensus[position]
    )

def profile_most_probable_kmer(text: str, k: int, profile: list[dict[str, float]]) -> str:
    """Return the k-mer of ``text`` that is most probable under ``profile``.

    The probability of a k-mer is the product, over its positions, of the value
    stored for its symbol in the matching profile column.

    Args:
        text: The string whose k-mers are scored.
        k: Length of the k-mers.
        profile: One map per column from nucleotide to probability.

    Returns:
        The k-mer with the highest probability; on ties the one occurring
        first in ``text``.

    Raises:
        ValueError: If ``text`` is shorter than ``k``.
    """
    if len(text) < k:
        raise ValueError("text is shorter than k")
    best_kmer = None
    best_prob = None
    for start in range(len(text) - k + 1):
        kmer = text[start:start + k]
        prob = 1
        for position, symbol in enumerate(kmer):
            prob *= profile[position][symbol]
        # Strict comparison keeps the earliest k-mer among equally probable ones.
        if best_prob is None or prob > best_prob:
            best_prob = prob
            best_kmer = kmer
    return best_kmer

def randomized_motif_search(dna: list[str], k: int, t: int) -> list[str]:
    """Run one pass of the randomized motif search from a random starting point.

    A random k-mer is drawn from every string. Then, repeatedly, a profile is
    built from the current best motifs and the profile-most-probable k-mer of
    each string forms the next motif list. The search stops as soon as the
    score no longer improves.

    Args:
        dna: The strings to search.
        k: Length of the motifs.
        t: Number of strings of ``dna`` for which motifs are re-selected.

    Returns:
        The best-scoring motif list reached in this pass.
    """
    best_motifs = []
    for text in dna:
        start = random.randint(0, len(text) - k)
        best_motifs.append(text[start:start + k])
    best_score = score(best_motifs, generate_profile(best_motifs))

    while True:
        # Every string is matched against the same profile, the one built from
        # the current best motifs; the profile is not updated mid-sweep.
        profile = generate_profile(best_motifs)
        motifs = [profile_most_probable_kmer(dna[row], k, profile) for row in range(t)]
        new_score = score(motifs, generate_profile(motifs))
        if new_score < best_score:
            best_score = new_score
            best_motifs = motifs
        else:
            return best_motifs

In [None]:
import sys 
import random

def generate_profile(motifs: list[str]) -> list[dict[str, float]]:
    """Build a profile matrix with pseudocounts from a list of equally long motifs.

    Each nucleotide count in a column is increased by one (Laplace's rule)
    before the column is normalised, so every entry equals
    ``(count + 1) / (t + 4)`` and each column sums to 1.

    Args:
        motifs: The motifs; the length of the first one sets the number of columns.

    Returns:
        One map per column with the keys "A", "T", "C", "G" (in that order)
        and the corresponding probabilities as values.

    Raises:
        IndexError: If ``motifs`` is empty or a motif is shorter than the first one.
        KeyError: If a motif contains a symbol other than A, T, C, G.
    """
    t = len(motifs)
    k = len(motifs[0])
    # The four pseudocounts add to the t real observations of every column.
    total = t + 4
    profile = []
    for i in range(k):
        counts = {"A": 1, "T": 1, "C": 1, "G": 1}
        for motif in motifs:
            counts[motif[i]] += 1
        profile.append({nuc: count / total for nuc, count in counts.items()})
    return profile

def score(motifs: list[str], profile: list[dict[str, float]]) -> int:
    """Count how many motif symbols disagree with the consensus of ``profile``.

    The consensus symbol of a column is the first key, in the column's
    iteration order, that carries the column's largest value.

    Args:
        motifs: The motifs to score.
        profile: One map per column from nucleotide to its value.

    Returns:
        The total number of positions, over all motifs, whose symbol differs
        from the consensus symbol of that column.
    """
    # max() returns the first maximal key, which settles ties by key order.
    consensus = [max(column, key=column.get) for column in profile]
    return sum(
        1
        for motif in motifs
        for position in range(len(motif))
        if motif[position] != consensus[position]
    )

def profile_weighted_random_kmer(text: str, k: int, profile: list[dict[str, float]]) -> str:
    """Draw a k-mer of ``text`` at random, weighted by its probability under ``profile``.

    Every starting position of ``text`` is one outcome whose weight is the
    product of the profile values of the k-mer's symbols. A k-mer occurring at
    several positions is therefore proportionally more likely to be drawn.

    Args:
        text: The string to sample from.
        k: Length of the k-mer.
        profile: One map per column from nucleotide to probability.

    Returns:
        The sampled k-mer.
    """
    kmers = []
    weights = []
    for start in range(len(text) - k + 1):
        kmer = text[start:start + k]
        weight = 1
        for position, symbol in enumerate(kmer):
            weight *= profile[position][symbol]
        # Positions are kept separately (not merged per k-mer) so that repeated
        # k-mers retain the combined weight of all their occurrences.
        kmers.append(kmer)
        weights.append(weight)
    return random.choices(kmers, weights=weights, k=1)[0]

def gibbs_sampler(dna: list[str], k: int, t: int, n: int) -> list[str]:
    """Run one Gibbs-sampling pass of ``n`` steps from a random starting point.

    A random k-mer is drawn from every string. In each step one motif is
    removed, a profile is built from the remaining motifs, and the removed
    motif is replaced by a k-mer sampled from its string according to that
    profile. The chain keeps moving from its current state, while the
    lowest-scoring motif list seen so far is remembered separately.

    Args:
        dna: The strings to search.
        k: Length of the motifs.
        t: Number of strings; the resampled index is drawn from ``0 .. t - 1``.
        n: Number of sampling steps.

    Returns:
        The best-scoring motif list encountered, including the initial random one.
    """
    motifs = []
    for text in dna:
        start = random.randint(0, len(text) - k)
        motifs.append(text[start:start + k])
    best_motifs = list(motifs)
    best_score = score(best_motifs, generate_profile(best_motifs))

    for _ in range(n):
        row = random.randint(0, t - 1)
        # The profile deliberately leaves out the motif that is being resampled.
        others = motifs[:row] + motifs[row + 1:]
        profile = generate_profile(others)
        motifs[row] = profile_weighted_random_kmer(dna[row], k, profile)
        # Score against a profile of the complete motif list, not the reduced one.
        current_score = score(motifs, generate_profile(motifs))
        if current_score < best_score:
            best_score = current_score
            best_motifs = list(motifs)
    return best_motifs
