In [7]:
!pip install biopython
from Bio import SeqIO
import gzip

# Paths to files
# NOTE(review): the /content/drive/MyDrive prefix looks like a Google Drive
# mount inside Colab — confirm the drive is mounted before running this cell.
# Genome sequence file; read as FASTA by read_genome().
genome_file_path = '/content/drive/MyDrive/Vibrio_cholerae.GFC_11.dna.toplevel.fa'
# Annotation file; scanned line by line for CDS rows by parse_gff3().
gff3_file_path = '/content/drive/MyDrive/Vibrio_cholerae.GFC_11.37.gff3'
# Read genome data
def read_genome(file_path):
    """Lazily yield the sequence of every FASTA record in ``file_path``.

    Args:
        file_path: Path to a text FASTA file.

    Yields:
        Each record's sequence converted to ``str``, in file order.
    """
    with open(file_path, "r") as fasta_handle:
        # The handle stays open until the generator is exhausted or closed.
        yield from (str(entry.seq) for entry in SeqIO.parse(fasta_handle, "fasta"))

# Parse the GFF3 file for CDS information
def parse_gff3(file_path):
    """Collect forward-strand CDS intervals from a GFF3 file.

    Only rows whose type column is ``CDS`` and whose strand column is ``+``
    are kept. The seqid column is not recorded, so intervals from every
    sequence in the file end up in one flat list.

    Args:
        file_path: Path to a tab-separated GFF3 text file.

    Returns:
        A list of ``(start, end)`` tuples in file order, where ``start`` is
        0-based and ``end`` is exclusive, so ``seq[start:end]`` is the CDS.

    Raises:
        ValueError: If the start or end column of a kept row is not an integer.
    """
    cds_regions = []
    with open(file_path, "rt") as file:
        for line in file:
            if line.startswith("#"):
                continue  # directive or comment line
            parts = line.strip().split("\t")
            # The type, start, end and strand columns (indices 2, 3, 4, 6) are
            # read below, so a usable row needs at least 7 fields. Blank or
            # truncated rows are skipped instead of raising IndexError.
            if len(parts) < 7:
                continue
            if parts[2] == "CDS" and parts[6] == "+":
                start = int(parts[3]) - 1  # Convert to 0-based index
                end = int(parts[4])  # 1-based inclusive end == 0-based exclusive end
                cds_regions.append((start, end))
    # Echo of the parsed intervals, kept so existing notebook output is unchanged.
    print(cds_regions)
    return cds_regions

# Calculate intergenic regions
def calculate_intergenic_regions(cds_regions, genome_length):
    """Return the lengths of the gaps that precede each CDS.

    The gap between position 0 and the first CDS is included. The stretch
    after the last CDS is intentionally not reported, which is why
    ``genome_length`` is currently unused; the parameter is kept so existing
    callers keep working.

    Args:
        cds_regions: List of ``(start, end)`` tuples, 0-based start and
            exclusive end. The list is sorted in place.
        genome_length: Total sequence length (unused, see above).

    Returns:
        A list of positive gap lengths in genomic order; empty when
        ``cds_regions`` is empty.
    """
    intergenic_lengths = []
    if not cds_regions:
        return intergenic_lengths

    cds_regions.sort()  # in place, so the caller's list ends up ordered too
    covered_end = 0  # furthest position covered by any CDS seen so far
    for start, end in cds_regions:
        gap = start - covered_end
        if gap > 0:
            intergenic_lengths.append(gap)
        # Use the running maximum: a CDS nested inside an earlier, longer one
        # must not pull the boundary backwards and create a false gap.
        covered_end = max(covered_end, end)
    print(intergenic_lengths)
    return intergenic_lengths


# Calculate average length and nucleotide frequencies
def calculate_statistics(genome, cds_regions):
    """Summarise CDS and intergenic composition of ``genome``.

    Args:
        genome: The sequence as a single string. The ``(start, end)`` pairs
            in ``cds_regions`` are used directly as slice bounds into it.
        cds_regions: List of ``(start, end)`` tuples, 0-based start and
            exclusive end. Passed to ``calculate_intergenic_regions``, which
            sorts it in place.

    Returns:
        A 4-tuple ``(avg_intergenic_length, avg_cds_length, nucleotide_freqs,
        codon_freqs)``:

        * the two averages are ``0`` when there is nothing to average;
        * ``nucleotide_freqs`` maps each of ``A``, ``T``, ``C``, ``G`` to its
          share of the counted intergenic bases (all ``0.0`` when none were
          counted); only these four upper-case letters are counted;
        * ``codon_freqs`` maps each 3-letter chunk found in the CDS slices to
          its share of all chunks (empty when there are none).
    """
    from collections import Counter

    intergenic_lengths = calculate_intergenic_regions(cds_regions, len(genome))
    avg_intergenic_length = (
        sum(intergenic_lengths) / len(intergenic_lengths) if intergenic_lengths else 0
    )
    avg_cds_length = (
        sum(end - start for start, end in cds_regions) / len(cds_regions)
        if cds_regions
        else 0
    )

    # Nucleotide frequency in intergenic regions.
    # Walk the CDS list in sorted order and count the bases in every gap in
    # front of a CDS (the tail after the last CDS is not counted). Sorting
    # locally keeps this independent of what the helper did to the list.
    nucleotide_counts = {nuc: 0 for nuc in "ATCG"}
    covered_end = 0  # furthest position covered by any CDS seen so far
    for start, end in sorted(cds_regions):
        if start > covered_end:
            for nuc in nucleotide_counts:
                # str.count with bounds avoids copying the gap slice.
                nucleotide_counts[nuc] += genome.count(nuc, covered_end, start)
        # Running maximum so a nested CDS cannot reopen an already-covered span.
        covered_end = max(covered_end, end)

    total_intergenic_bases = sum(nucleotide_counts.values())
    # Guard the division: with no CDS, or no gaps, nothing was counted.
    nucleotide_freqs = {
        nuc: (count / total_intergenic_bases if total_intergenic_bases else 0.0)
        for nuc, count in nucleotide_counts.items()
    }

    # Codon frequency in CDS regions.
    # Chunks are read in frame from the slice start; a trailing partial chunk
    # is never produced because the range stops at len - 2.
    codon_counts = Counter()
    for start, end in cds_regions:
        cds_seq = genome[start:end]
        codon_counts.update(
            cds_seq[i:i + 3] for i in range(0, len(cds_seq) - 2, 3)
        )

    total_codons = sum(codon_counts.values())
    codon_freqs = {
        codon: count / total_codons for codon, count in codon_counts.items()
    }

    return avg_intergenic_length, avg_cds_length, nucleotide_freqs, codon_freqs

# Main execution
# Load every FASTA record into one concatenated string, then the CDS list.
# NOTE(review): GFF3 coordinates are normally relative to each sequence, while
# the records are joined here — confirm the FASTA holds a single record or
# that per-sequence offsets are not needed.
genome = "".join(read_genome(genome_file_path))
cds_regions = parse_gff3(gff3_file_path)
(
    avg_intergenic_length,
    avg_cds_length,
    nucleotide_freqs,
    codon_freqs,
) = calculate_statistics(genome, cds_regions)

# Print results: one line per statistic, label first.
for _label, _value in (
    ("Average length of intergenic regions:", avg_intergenic_length),
    ("Average length of CDS regions:", avg_cds_length),
    ("Nucleotide frequency in intergenic regions:", nucleotide_freqs),
    ("Codon frequency in CDS regions:", codon_freqs),
):
    print(_label, _value)


[(653, 2138), (2359, 4096), (8477, 9080), (17646, 18597), (28794, 29145), (29197, 29656), (31868, 32501), (32500, 34396), (34537, 37354), (38982, 39663), (42689, 44636), (46030, 46183), (46330, 47152), (48619, 49648), (49773, 50979), (50981, 52079), (52101, 52857), (53101, 53773), (53886, 54216), (54802, 55996), (59619, 60093), (64315, 65455), (69768, 70875), (82399, 83371), (83621, 84119), (84128, 85658), (92204, 92687), (92683, 92881), (96410, 97115), (97140, 98463), (98675, 99449), (99546, 100467), (104095, 105727), (105838, 107062), (107182, 108061), (115574, 116951), (117273, 117888), (118186, 119440), (119532, 120894), (121059, 122247), (122246, 124205), (124211, 125582), (126409, 128041), (128187, 129588), (131026, 132217), (132532, 134422), (134427, 135129), (137562, 139200), (139218, 141384), (147102, 148386), (150530, 151010), (157071, 157746), (168734, 169634), (173907, 174645), (175093, 177379), (177474, 178608), (178607, 178886), (181028, 181928), (182194, 182707), (182937