<a href="https://colab.research.google.com/github/heispv/bioinformatics/blob/master/greedy_motif_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
def most_probable_kmer(seq: str, k: int, profile_matrix: dict) -> str:
    """
    Find the most probable k-mer in the given sequence based on a profile matrix.

    Every window of length ``k`` in ``seq`` is scored as the product of
    ``profile_matrix[letter][position]`` over its letters. The window with
    the highest score is returned; ties go to the leftmost window. In
    particular, when every window scores 0 the first k-mer is returned
    rather than an empty string.

    Args:
        seq (str): The input DNA sequence. Letters are upper-cased before
            the profile lookup, so lower-case input is accepted.
        k (int): The length of the k-mer to search for.
        profile_matrix (dict): Maps each nucleotide letter to a sequence of
            at least ``k`` per-position probabilities.

    Returns:
        str: The most probable k-mer (a slice of ``seq``, original casing
        preserved), or ``''`` if ``seq`` is shorter than ``k``.

    Raises:
        KeyError: If ``seq`` contains a letter missing from ``profile_matrix``.
    """
    best_kmer = ''
    # Start below any attainable probability so that the first window is
    # always accepted, even when its probability is exactly 0.
    best_prob = -1.0

    for start in range(len(seq) - k + 1):
        kmer = seq[start:start + k]

        prob = 1.0
        for position, letter in enumerate(kmer):
            prob *= profile_matrix[letter.upper()][position]

        # Strict comparison keeps the leftmost window on ties.
        if prob > best_prob:
            best_kmer = kmer
            best_prob = prob

    return best_kmer


In [49]:
# Read the data
# The context manager closes the file even if read() raises.
with open('/content/dataset_159_3.txt') as text:
    data = text.read()

In [50]:
# Preprocess the data
# Layout read here: line 1 is the sequence, line 2 is k, and all remaining
# lines are profile rows.
# Blank lines are filtered out instead of unconditionally deleting the last
# element, so the final profile row survives when the file has no trailing
# newline; splitlines()/strip() also remove stray '\r' from CRLF files.
data = [line.strip() for line in data.splitlines() if line.strip()]
seq, k, *profile = data
k = int(k)

In [51]:
# profile (list of row strings) -> profile_matrix (dict)
# Rows are mapped to nucleotides in the fixed order A, C, G, T.
keys = ['A', 'C', 'G', 'T']

# Use the row strings directly (no str()/split round-trip through the list's
# repr) and ignore blank rows.
rows = [row for row in profile if row.strip()]
if len(rows) != len(keys):
    raise ValueError(
        f"expected {len(keys)} profile rows (one per nucleotide), got {len(rows)}"
    )

profile_matrix = {
    key: [float(val) for val in row.split()]
    for key, row in zip(keys, rows)
}

In [52]:
# Evaluate the profile-most-probable k-mer of `seq`; as the last expression
# in the cell, its value is displayed as the cell output.
most_probable_kmer(seq, k, profile_matrix)

'CATCTTTCGGCA'