In [1]:
import re

def read_fasta_file(filename):
    """
    Read a FASTA file and return a dictionary mapping sequence IDs to sequences
    """
    sequences = {}
    with open(filename, 'r') as f:
        seq_id = None
        seq = []
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if seq_id is not None:
                    sequences[seq_id] = ''.join(seq)
                seq_id = line[1:]
                seq = []
            else:
                seq.append(line)
        if seq_id is not None:
            sequences[seq_id] = ''.join(seq)
    return sequences

def calculate_gc_content(seq):
    """
    Calculate the GC content of a given sequence
    """
    gc_count = 0
    for base in seq:
        if base in 'GCgc':
            gc_count += 1
    return (gc_count / len(seq)) * 100

def calculate_cpg_ratio(seq):
    """
    Calculate the CpG ratio of a given sequence
    """
    c_count = seq.count('C')
    g_count = seq.count('G')
    cg_count = seq.count('CG')
    if c_count == 0 or g_count == 0:
        return 0
    return (cg_count / c_count) * (len(seq) / g_count)

def find_cpg_islands(seq, min_length=200, cpg_ratio=0.6, gc_content=50, min_island_length=200):
    """
    Find CpG islands in a given sequence that are longer than a certain threshold
    """
    # Filter by GC content
    if calculate_gc_content(seq) < gc_content:
        return []

    # Find regions with high CpG ratio
    cpg_regions = []
    for match in re.finditer(r'(CG+)', seq):
        cpg_start, cpg_end = match.span()
        cpg_seq = seq[cpg_start:cpg_end]
        if calculate_cpg_ratio(cpg_seq) > cpg_ratio:
            cpg_regions.append((cpg_start, cpg_end))

    # Combine adjacent CpG-rich regions
    cpg_islands = []
    current_island_start = None
    current_island_end = None
    for start, end in cpg_regions:
        if current_island_start is None:
            current_island_start = start
            current_island_end = end
        elif start - current_island_end <= min_length:
            current_island_end = end
        else:
            if current_island_end - current_island_start >= min_length:
                cpg_islands.append((current_island_start, current_island_end))
            current_island_start = start
            current_island_end = end
    if current_island_end - current_island_start >= min_length:
        cpg_islands.append((current_island_start, current_island_end))

    # Filter by island length
    cpg_islands = [(start, end) for start, end in cpg_islands if end - start >= min_island_length]

    return cpg_islands

def main(fasta_file):
    """
    Main function that reads a FASTA file and finds CpG islands for each sequence
    """
    sequences = read_fasta_file(fasta_file)
    for seq_id, seq in sequences.items():
        cpg_islands = find_cpg_islands(seq)
        print(f'{seq_id}\t{len(seq)}\t{len(cpg_islands)}\t{cpg


SyntaxError: EOL while scanning string literal (2799801065.py, line 91)

In [2]:
from Bio import SeqIO

def calculate_cpg_islands(seq):
    cpg_islands = []
    cpg_count = 0
    in_island = False
    
    for i in range(len(seq)-1):
        if seq[i:i+2].upper() == 'CG':
            cpg_count += 1
            if not in_island:
                in_island = True
        else:
            if cpg_count > 0:
                cpg_islands.append(cpg_count)
                cpg_count = 0
                in_island = False
    
    if cpg_count > 0:
        cpg_islands.append(cpg_count)
    
    return cpg_islands

# Open the FASTA file and loop over each record
for record in SeqIO.parse("lncRNA.fa", "fasta"):
    seq_id = record.id
    seq = record.seq
    
    # Calculate the number of CpG islands for this sequence
    cpg_islands = calculate_cpg_islands(seq)
    num_islands = len(cpg_islands)
    
    # Print the results
#     print(f'{seq_id}\t{num_islands}')
    print(f'\t{num_islands}')


	23
	32
	11
	20
	9
	14
	30
	3
	5
	12
	9
	16
	4
	40
	2
	29
	17
	6
	34
	0
	6
	17
	14
	22
	7
	11
	10
	11
	13
	10
	9
	4
	19
	13
	14
	17
	12
	11
	7
	11
	10
	13
	10
	5
	2
	2
	3
	1
	15
	10
	189
	36
	2
	52
	47
	46
	43
	41
	38
	35
	40
	37
	31
	36
	39
	40
	36
	23
	20
	28
	4
	5
	3
	2
	6
	9
	9
	11
	11
	10
	6
	14
	5
	23
	32
	14
	49
	78
	9
	51
	46
	169
	109
	211
	48
	100
	153
	36
	78
	152
	75
	113
	24
	60
	70
	29
	64
	70
	69
	55
	27
	74
	58
	59
	70
	99
	87
	96
	61
	53
	57
	57
	30
	64
	98
	60
	59
	25
	53
	17
	48
	25
	64
	39
	21
	19
	4
	24
	82
	13
	134
	47
	22
	52
	7
	18
	52
	41
	30
	35
	45
	43
	56
	37
	19
	15
	33
	18
	82
	39
	163
	51
	15
	18
	8
	59
	59
	101
	96
	94
	95
	52
	52
	59
	28
	66
	67
	65
	16
	33
	60
	28
	21
	22
	33
	30
	23
	70
	92
	81
	132
	17
	209
	103
	41
	94
	44
	37
	23
	42
	52
	64
	37
	11
	25
	134
	33
	21
	11
	193
	280
	23
	5
	14
	31
	109
	83
	55
	21
	136
	35
	136
	26
	54
	12
	19
	7
	4
	19
	11
	25
	15
	26
	34
	33
	55
	118
	7
	27
	261
	16
	50
	26
	51
	53
	12
	108
	78
	40
	78
	1
	66
	84
	5

	11
	6
	5
	6
	3
	95
	38
	100
	37
	25
	100
	19
	26
	1
	6
	5
	5
	123
	41
	21
	36
	4
	26
	6
	11
	2
	4
	31
	46
	11
	11
	5
	11
	17
	24
	3
	3
	28
	32
	16
	18
	7
	60
	44
	20
	33
	19
	46
	19
	35
	39
	43
	15
	17
	18
	17
	16
	37
	12
	8
	10
	14
	12
	5
	6
	9
	7
	12
	5
	0
	3
	2
	5
	6
	9
	6
	5
	6
	11
	11
	14
	13
	15
	6
	8
	7
	6
	5
	5
	6
	11
	12
	6
	5
	5
	6
	9
	13
	0
	14
	4
	25
	54
	12
	31
	10
	87
	0
	2
	19
	14
	8
	3
	11
	13
	5
	6
	26
	25
	39
	48
	36
	24
	35
	3
	31
	3
	5
	6
	5
	101
	55
	49
	10
	41
	9
	4
	2
	5
	1
	5
	5
	14
	37
	9
	24
	12
	93
	12
	21
	51
	47
	23
	60
	41
	168
	43
	54
	36
	61
	25
	16
	19
	13
	4
	4
	10
	37
	29
	14
	31
	5
	1
	28
	4
	19
	35
	32
	23
	18
	18
	7
	3
	12
	27
	3
	14
	16
	16
	16
	17
	16
	16
	4
	24
	20
	21
	39
	44
	37
	40
	50
	30
	36
	27
	64
	34
	30
	23
	16
	16
	8
	32
	4
	13
	7
	4
	4
	9
	11
	2
	56
	9
	3
	7
	24
	10
	4
	7
	5
	2
	9
	9
	17
	14
	14
	46
	31
	17
	45
	43
	43
	62
	31
	120
	9
	10
	16
	13
	10
	19
	21
	18
	25
	20
	39
	4
	4
	9
	4
	16
	13
	16
	13
	36
	46
	85
	84
	28
	60
	37
	35
