In [2]:
import itertools
from collections import Counter

from Bio import SeqIO
from Bio.Align import Alignment
from Bio.motifs import Motif
from Bio.Seq import Seq

def discover_motifs(fasta_file, motif_length=6, max_motifs=5):
    """
    Find the most frequent fixed-length k-mers in the sequences of a FASTA file.

    Every window of ``motif_length`` characters is counted across all records.
    Windows overlap, and each occurrence is counted, so a k-mer that appears
    several times in one sequence contributes several counts. Sequences are
    used exactly as stored in the file: no symbols (e.g. alignment gap
    characters) are stripped before counting.

    Args:
        fasta_file (str): Path to the FASTA file with protein sequences.
        motif_length (int): Length of motifs to search for. Must be >= 1.
        max_motifs (int): Maximum number of motifs to return. Must be >= 0.

    Returns:
        list: ``(kmer, count, motif)`` tuples ordered by descending count,
        where ``kmer`` is the k-mer string, ``count`` is its number of
        occurrences and ``motif`` is a ``Bio.motifs.Motif`` built from that
        single k-mer. Sequences shorter than ``motif_length`` contribute
        nothing; the list is empty if no window of that length exists.

    Raises:
        ValueError: If ``motif_length`` < 1 or ``max_motifs`` < 0.
    """
    # Reject sizes that would otherwise silently produce nonsense: a
    # non-positive window yields empty/garbage slices, and a negative limit
    # would be interpreted as "drop items from the end" by slicing.
    if motif_length < 1:
        raise ValueError(f"motif_length must be >= 1, got {motif_length!r}")
    if max_motifs < 0:
        raise ValueError(f"max_motifs must be >= 0, got {max_motifs!r}")

    kmer_counts = Counter()
    observed_symbols = set()
    for record in SeqIO.parse(fasta_file, "fasta"):
        seq = str(record.seq)
        observed_symbols.update(seq)
        # range() is empty when the sequence is shorter than the window.
        kmer_counts.update(
            seq[start:start + motif_length]
            for start in range(len(seq) - motif_length + 1)
        )

    # Alphabet for the Motif objects: the 20 standard amino acids plus any
    # other symbol actually present in the input (ambiguity codes, gaps, ...),
    # so every character of every k-mer is covered by the alphabet.
    alphabet = "".join(sorted(set("ACDEFGHIKLMNPQRSTVWY") | observed_symbols))

    discovered_motifs = []
    # most_common(n) ranks by descending count.
    for kmer, count in kmer_counts.most_common(max_motifs):
        alignment = Alignment([Seq(kmer)])
        # Motif's first positional parameter is the alphabet, not the
        # alignment, so both are passed by keyword.
        motif = Motif(alphabet=alphabet, alignment=alignment)
        discovered_motifs.append((kmer, count, motif))

    return discovered_motifs


# Script section: configure the search, run it, and print a ranked report.
fasta_file = "MSA_GH42.fasta"  # FASTA file to scan
motif_length = 50  # window size, in characters, of each candidate motif
max_motifs = 10  # number of top-ranked motifs to report

# Run the k-mer search with the settings above
motifs = discover_motifs(
    fasta_file,
    motif_length=motif_length,
    max_motifs=max_motifs,
)

# One numbered line per motif, most frequent first
print("Discovered Motifs:")
for i, (kmer, count, motif) in enumerate(motifs, start=1):
    print(f"{i}. Motif: {kmer} | Count: {count}")


Discovered Motifs:
1. Motif: MSKRRKHSWPQPLKGAESRLWYGGDYNPDQWPEEVWDDDIRLMKKAGVNL | Count: 20
2. Motif: SKRRKHSWPQPLKGAESRLWYGGDYNPDQWPEEVWDDDIRLMKKAGVNLV | Count: 20
3. Motif: KRRKHSWPQPLKGAESRLWYGGDYNPDQWPEEVWDDDIRLMKKAGVNLVS | Count: 20
4. Motif: RRKHSWPQPLKGAESRLWYGGDYNPDQWPEEVWDDDIRLMKKAGVNLVSV | Count: 20
5. Motif: RKHSWPQPLKGAESRLWYGGDYNPDQWPEEVWDDDIRLMKKAGVNLVSVG | Count: 20
6. Motif: KHSWPQPLKGAESRLWYGGDYNPDQWPEEVWDDDIRLMKKAGVNLVSVGI | Count: 20
7. Motif: HSWPQPLKGAESRLWYGGDYNPDQWPEEVWDDDIRLMKKAGVNLVSVGIF | Count: 20
8. Motif: SWPQPLKGAESRLWYGGDYNPDQWPEEVWDDDIRLMKKAGVNLVSVGIFS | Count: 20
9. Motif: WPQPLKGAESRLWYGGDYNPDQWPEEVWDDDIRLMKKAGVNLVSVGIFSW | Count: 20
10. Motif: PQPLKGAESRLWYGGDYNPDQWPEEVWDDDIRLMKKAGVNLVSVGIFSWA | Count: 20
