<a href="https://colab.research.google.com/github/heispv/bioinformatics/blob/master/randomized-motif-search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import random
from collections import defaultdict
import sys


In [7]:
def RandomMotifs(Dna, k):
    """
    Draws one k-mer, at a uniformly random offset, from every string in Dna.

    Parameters:
    - Dna: List of DNA strings.
    - k: Length of the motif.

    Returns:
    - List with one length-k substring per input string, in input order.
    """
    def pick(strand):
        # randint is inclusive on both ends, so the last valid start
        # offset (len(strand) - k) can be drawn as well.
        offset = random.randint(0, len(strand) - k)
        return strand[offset:offset + k]

    return [pick(strand) for strand in Dna]


In [8]:
def ProfileWithPseudocounts(Motifs):
    """
    Builds a per-position nucleotide frequency profile using pseudocounts.

    Every nucleotide starts with a count of 1 at every position, so no
    entry of the resulting profile is zero.

    Parameters:
    - Motifs: List of k-mers (characters must be among 'A', 'C', 'G', 'T').

    Returns:
    - Dictionary mapping 'A', 'C', 'G', 'T' to a list of frequencies,
      one per motif position.
    """
    width = len(Motifs[0])
    # One pseudocount per nucleotide adds 4 to each column total.
    denominator = len(Motifs) + 4

    tallies = {base: [1] * width for base in 'ACGT'}
    for row in Motifs:
        for position, base in enumerate(row):
            tallies[base][position] += 1

    return {
        base: [tally / denominator for tally in tallies[base]]
        for base in 'ACGT'
    }


In [9]:
def MotifsFromProfile(Profile, Dna, k):
    """
    Selects one k-mer from each DNA string in Dna based on the given profile.
    The selection is probabilistic: each k-mer is drawn with probability
    proportional to the product of its per-position profile entries.

    NOTE(review): the textbook Randomized Motif Search picks the
    profile-most-probable k-mer deterministically; this function samples
    instead. Confirm that sampling is the intended behaviour.

    Parameters:
    - Profile: Profile matrix as a dictionary (nucleotide -> list of
      per-position probabilities).
    - Dna: List of DNA strings.
    - k: Length of the motif.

    Returns:
    - motifs: List of selected k-mers, one per DNA string.

    Raises:
    - ValueError: If a DNA string is shorter than k (it contains no k-mer).
    """
    motifs = []
    for seq in Dna:
        n_positions = len(seq) - k + 1
        if n_positions < 1:
            # Previously this produced a ZeroDivisionError or silently
            # returned a too-short "k-mer"; fail loudly instead.
            raise ValueError(
                f"DNA string of length {len(seq)} is shorter than k={k}."
            )

        # Unnormalized probability of every k-mer under the profile.
        weights = []
        for start in range(n_positions):
            weight = 1
            for idx, nucleotide in enumerate(seq[start:start + k]):
                weight *= Profile[nucleotide][idx]
            weights.append(weight)

        # Normalize; fall back to a uniform distribution if every weight
        # is zero (e.g. a profile without pseudocounts, or underflow).
        total = sum(weights)
        if total == 0:
            probabilities = [1 / n_positions] * n_positions
        else:
            probabilities = [w / total for w in weights]

        # Roulette-wheel selection with a single random draw per string.
        r = random.random()
        cumulative = 0
        # If rounding leaves the cumulative sum slightly below r, the draw
        # falls past the end of the wheel, so the last k-mer is the right
        # fallback (not the first one).
        chosen_start = n_positions - 1
        for start, p in enumerate(probabilities):
            cumulative += p
            if r <= cumulative:
                chosen_start = start
                break
        motifs.append(seq[chosen_start:chosen_start + k])
    return motifs


In [10]:
def Score(Motifs):
    """
    Scores a set of motifs; a lower score is better.

    For each position, counts how many motifs differ from the most
    frequent nucleotide at that position, and sums these counts.

    Parameters:
    - Motifs: List of k-mers (characters must be among 'A', 'C', 'G', 'T').

    Returns:
    - Integer total of per-position mismatches against the column majority.
    """
    motif_length = len(Motifs[0])
    motif_count = len(Motifs)

    mismatches = 0
    for column in range(motif_length):
        tally = dict.fromkeys('ACGT', 0)
        for row in Motifs:
            tally[row[column]] += 1
        # Everything that is not the most frequent nucleotide is a mismatch.
        mismatches += motif_count - max(tally.values())
    return mismatches


In [11]:
def RandomizedMotifSearch(Dna, k, t):
    """
    Performs a single run of Randomized Motif Search.

    Starts from random k-mers, then repeatedly rebuilds a profile from the
    current motifs and draws new motifs from it, stopping at the first
    round that fails to improve the score.

    Parameters:
    - Dna: List of DNA strings.
    - k: Length of the motif.
    - t: Number of DNA strings (accepted for interface compatibility;
      not used by the body).

    Returns:
    - The best-scoring set of motifs seen during this run.
    """
    candidate = RandomMotifs(Dna, k)
    best_motifs = list(candidate)
    best_score = Score(best_motifs)

    while True:
        candidate = MotifsFromProfile(ProfileWithPseudocounts(candidate), Dna, k)
        candidate_score = Score(candidate)
        # Stop as soon as a round does not strictly improve the score.
        if candidate_score >= best_score:
            return best_motifs
        best_motifs, best_score = list(candidate), candidate_score


In [12]:
def FindBestMotifs(Dna, k, t, iterations=1000, seed=None):
    """
    Repeats Randomized Motif Search and keeps the best result.

    Parameters:
    - Dna: List of DNA strings.
    - k: Length of the motif.
    - t: Number of DNA strings.
    - iterations: How many independent searches to run.
    - seed: If not None, seeds the `random` module before the first run.

    Returns:
    - Tuple (motifs, score) for the lowest-scoring run. When no run is
      performed (iterations == 0) this is (None, float('inf')).
    """
    if seed is not None:
        random.seed(seed)

    overall_motifs, overall_score = None, float('inf')

    for _ in range(iterations):
        candidate = RandomizedMotifSearch(Dna, k, t)
        candidate_score = Score(candidate)
        # Only a strictly better score replaces the current best.
        if candidate_score >= overall_score:
            continue
        overall_motifs, overall_score = candidate.copy(), candidate_score

    return overall_motifs, overall_score


In [13]:
def read_input(file_path):
    """
    Parses a motif-search input file into k, t and the DNA strings.

    The first non-blank line holds two integers, k and t. All remaining
    non-blank lines hold whitespace-separated DNA strings.

    Parameters:
    - file_path: Path to the input text file.

    Returns:
    - k: Length of the motif (integer).
    - t: Number of DNA strings (integer).
    - Dna: List of DNA strings.

    Raises:
    - ValueError: If the number of DNA strings found differs from t.
    """
    with open(file_path, 'r') as handle:
        stripped = [raw.strip() for raw in handle.readlines()]

    # Blank lines carry no information; drop them before parsing.
    rows = [row for row in stripped if row]

    header, body = rows[0], rows[1:]
    k, t = map(int, header.split())

    # DNA strings may be spread over several lines and/or share a line.
    Dna = [token for row in body for token in row.split()]

    if len(Dna) != t:
        raise ValueError(f"Expected {t} DNA strings, but found {len(Dna)}.")

    return k, t, Dna


In [14]:
# Specify the path to your input file
input_file = '/content/dataset_30307_5 (1).txt'  # Change this if your file is located elsewhere

# Read and parse the input
try:
    k, t, Dna = read_input(input_file)
except Exception as e:
    print(f"Error reading input file: {e}")
    # Re-raise: swallowing the error would leave k, t and Dna undefined
    # (or stale from an earlier run), and the next cell would then fail
    # confusingly or silently run on old data.
    raise
else:
    print(f"Successfully read input from '{input_file}'.")
    print(f"k (Motif Length): {k}")
    print(f"t (Number of DNA Strings): {t}")
    print(f"DNA Strings: {Dna}")


Successfully read input from '/content/dataset_30307_5 (1).txt'.
k (Motif Length): 15
t (Number of DNA Strings): 20
DNA Strings: ['GGTAGGAAGCGGTGAGTGTACCTCTGCTATCGGGATTTTGATTCAGGGTGGTCGGAGGCTGTGAGGTGGCGAAATATGCTGGCCCGACGAGGAGTCGACTTTATGCCAGCCTAGGTAGTTGTGCTTCGCTGACTGTTACTTGAATTACGAGCTCACCATTATTTGACCGCGTCTGCCTATACTTATGCCTCGGGTAGGAAGCGGTGA', 'GTGTACCTCTGCTATCGGGATTTTGATTCAGGGTGGTCGGAGGCTGTGAGGTGGCGAAATATGCTGGCCCGACGAGGAGTCGACTTTTGGCAAGGAAGAGTTATGCCAGCCTAGGTAGTTGTGCTTCGCTGACTGTTACTTGAATTACGAGCTCACCATTATTTGACCGCGTCTGCCTATACTTATGCCTCGGGTAGGAAGCGGTGA', 'AGGTGATCATAGCGGAAATCCGGGCGATTTCATGGGGTGTTCCTCGCTGGAGAACTGTTGGGGAGGGGAAGAGGCCTTATATCTCGTCAAAGAAAGTTCCACCCCTTGACCCTATCCTAGCGTGCAACGGACAGTGACGGAGTGTCTCGTGCCAATTAGGACCTCAGGAGCTCCGATCTGGAAACCGGGGCTGGGGGCGGATAAATC', 'CGGGCTTATAGTTTTTCAACCGACTACAACGGCCATGAAACGGTCACAACATATGCCAACATACAATGGGTAGGTGACTGGAAACAGCCAGTCGGGTTGACCTATGGTTTCGCGTCCATCAACGCATAGGAGGTAAGAGTCCACATCGCATTAGATGTCTGGAGGTTGAGAGTTAGGGTTTCTGCGATGGCAGCTTCAGAATCAGGA', 'ATCGACATTCGCAGCGTTGACTATA

In [15]:
# Search configuration: how many independent searches to run, and the seed
# passed to FindBestMotifs (which seeds Python's `random` module when the
# seed is not None).
iterations = 1000
random_seed = 42  # Set to None to get different results on each run

# Run the repeated Randomized Motif Search. Relies on k, t and Dna having
# been assigned by the input-loading cell.
BestMotifs, BestScore = FindBestMotifs(Dna, k, t, iterations=iterations, seed=random_seed)

# Display the results: the motifs space-separated on one line, then the score
print("Best Motifs Found:")
print(' '.join(BestMotifs))
print("\nScore of Best Motifs:", BestScore)


Best Motifs Found:
AGGAAGCGGTGAGTG TGGCAAGGAAGAGTT GGGAGGGGAAGAGGC TGGAGGTTGAGAGTT TGGAGGGACCGAGTT TGGAGTACAAGAGTT TGGAACAGAAGAGTT TGGAGTAAAAGAGTT TGCTCGGGAAGAGTT TGGTCCGGAAGAGTT TGGAGGGGGTAAGTT AAAAGGGGAAGAGTT TGGAGGGGAAGAACC TGGAGGCCCAGAGTT GCGAGGGGAAGAGTA TGGAGGGGAAGCAAT TAACGGGGAAGAGTT TGGATAAGAAGAGTT TGGAGGGGATAGGTT TGGAGGGGAACTCTT

Score of Best Motifs: 63
