In [3]:
import re

def read_fasta_file(filename):
    """
    Read a FASTA file and return a dictionary mapping sequence IDs to sequences
    """
    sequences = {}
    with open(filename, 'r') as f:
        seq_id = None
        seq = []
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if seq_id is not None:
                    sequences[seq_id] = ''.join(seq)
                seq_id = line[1:]
                seq = []
            else:
                seq.append(line)
        if seq_id is not None:
            sequences[seq_id] = ''.join(seq)
    return sequences

def calculate_gc_content(seq):
    """
    Calculate the GC content of a given sequence
    """
    gc_count = 0
    for base in seq:
        if base in 'GCgc':
            gc_count += 1
    return (gc_count / len(seq)) * 100

def calculate_cpg_ratio(seq):
    """
    Calculate the CpG ratio of a given sequence
    """
    c_count = seq.count('C')
    g_count = seq.count('G')
    cg_count = seq.count('CG')
    if c_count == 0 or g_count == 0:
        return 0
    return (cg_count / c_count) * (len(seq) / g_count)

def find_cpg_islands(seq, min_length=200, cpg_ratio=0.6, gc_content=50, min_island_length=200):
    """
    Find CpG islands in a given sequence that are longer than a certain threshold
    """
    # Filter by GC content
    if calculate_gc_content(seq) < gc_content:
        return []

    # Find regions with high CpG ratio
    cpg_regions = []
    for match in re.finditer(r'(CG+)', seq):
        cpg_start, cpg_end = match.span()
        cpg_seq = seq[cpg_start:cpg_end]
        if calculate_cpg_ratio(cpg_seq) > cpg_ratio:
            cpg_regions.append((cpg_start, cpg_end))

    # Combine adjacent CpG-rich regions
    cpg_islands = []
    current_island_start = None
    current_island_end = None
    for start, end in cpg_regions:
        if current_island_start is None:
            current_island_start = start
            current_island_end = end
        elif start - current_island_end <= min_length:
            current_island_end = end
        else:
            if current_island_end - current_island_start >= min_length:
                cpg_islands.append((current_island_start, current_island_end))
            current_island_start = start
            current_island_end = end
    if current_island_end - current_island_start >= min_length:
        cpg_islands.append((current_island_start, current_island_end))

    # Filter by island length
    cpg_islands = [(start, end) for start, end in cpg_islands if end - start >= min_island_length]

    return cpg_islands

def main(fasta_file):
    """
    Main function that reads a FASTA file and finds CpG islands for each sequence
    """
    sequences = read_fasta_file(fasta_file)
    for seq_id, seq in sequences.items():
        cpg_islands = find_cpg_islands(seq)
        print(f'{seq_id}\t{len(seq)}\t{len(cpg_islands)}\t


SyntaxError: EOL while scanning string literal (2415548650.py, line 91)

In [4]:
import re

def read_fasta_file(filename):
    """
    Read a FASTA file and return a dictionary mapping sequence IDs to sequences
    """
    sequences = {}
    with open(filename, 'r') as f:
        seq_id = None
        seq = []
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if seq_id is not None:
                    sequences[seq_id] = ''.join(seq)
                seq_id = line[1:]
                seq = []
            else:
                seq.append(line)
        if seq_id is not None:
            sequences[seq_id] = ''.join(seq)
    return sequences

def calculate_gc_content(seq):
    """
    Calculate the GC content of a given sequence
    """
    gc_count = 0
    for base in seq:
        if base in 'GCgc':
            gc_count += 1
    return (gc_count / len(seq)) * 100

def calculate_cpg_ratio(seq):
    """
    Calculate the CpG ratio of a given sequence
    """
    c_count = seq.count('C')
    g_count = seq.count('G')
    cg_count = seq.count('CG')
    if c_count == 0 or g_count == 0:
        return 0
    return (cg_count / c_count) * (len(seq) / g_count)

def find_cpg_islands(seq, min_length=200, cpg_ratio=0.6, gc_content=50, min_island_length=200):
    """
    Find CpG islands in a given sequence that are longer than a certain threshold
    """
    # Filter by GC content
    if calculate_gc_content(seq) < gc_content:
        return []

    # Find regions with high CpG ratio
    cpg_regions = []
    for match in re.finditer(r'(CG+)', seq):
        cpg_start, cpg_end = match.span()
        cpg_seq = seq[cpg_start:cpg_end]
        if calculate_cpg_ratio(cpg_seq) > cpg_ratio:
            cpg_regions.append((cpg_start, cpg_end))

    # Combine adjacent CpG-rich regions
    cpg_islands = []
    current_island_start = None
    current_island_end = None
    for start, end in cpg_regions:
        if current_island_start is None:
            current_island_start = start
            current_island_end = end
        elif start - current_island_end <= min_length:
            current_island_end = end
        else:
            if current_island_end - current_island_start >= min_length:
                cpg_islands.append((current_island_start, current_island_end))
            current_island_start = start
            current_island_end = end
    if current_island_end - current_island_start >= min_length:
        cpg_islands.append((current_island_start, current_island_end))

    # Filter by island length
    cpg_islands = [(start, end) for start, end in cpg_islands if end - start >= min_island_length]

    return cpg_islands

def main(fasta_file):
    """
    Main function that reads a FASTA file and finds CpG islands for each sequence
    """
    sequences = read_fasta_file(fasta_file)
    for seq_id, seq in sequences.items():
        cpg_islands = find_cpg_islands(seq)
        print(f'{seq_id}\t{len(seq)}\t{len(cpg_islands)}\t{cpg


SyntaxError: EOL while scanning string literal (2799801065.py, line 91)

In [5]:
from Bio import SeqIO

def calculate_cpg_islands(seq):
    cpg_islands = []
    cpg_count = 0
    in_island = False
    
    for i in range(len(seq)-1):
        if seq[i:i+2].upper() == 'CG':
            cpg_count += 1
            if not in_island:
                in_island = True
        else:
            if cpg_count > 0:
                cpg_islands.append(cpg_count)
                cpg_count = 0
                in_island = False
    
    if cpg_count > 0:
        cpg_islands.append(cpg_count)
    
    return cpg_islands

# Open the FASTA file and loop over each record
for record in SeqIO.parse("example.fasta", "fasta"):
    seq_id = record.id
    seq = record.seq
    
    # Calculate the number of CpG islands for this sequence
    cpg_islands = calculate_cpg_islands(seq)
    num_islands = len(cpg_islands)
    
    # Print the results
    print(f'{seq_id}\t{num_islands}')


FileNotFoundError: [Errno 2] No such file or directory: 'example.fasta'