<a href="https://colab.research.google.com/github/jb-bioinfo-acc/beginning-bioinformatics/blob/main/Computing_GC_Content.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
# Define a function to calculate GC content
def calculate_gc_content(dna_sequence):
    """Return the GC content of a nucleotide sequence as a percentage.

    Counting is case-insensitive, so lowercase (e.g. soft-masked) bases
    are included. The denominator is the full length of the string, so
    any other characters (such as ``N``) count towards the total but not
    towards the G/C count.

    Args:
        dna_sequence: The sequence string; may be empty.

    Returns:
        The percentage of characters that are G or C, as a float.
        An empty sequence yields 0.0.
    """
    if not dna_sequence:
        return 0.0
    # Count both cases explicitly instead of upper-casing a copy of the
    # whole sequence; FASTA files frequently contain lowercase bases.
    gc_count = sum(dna_sequence.count(base) for base in "GCgc")
    return (gc_count / len(dna_sequence)) * 100

# Read and parse the FASTA file and extract sequences
def parse_fasta(filepath):
    """Read a FASTA file into a dictionary mapping headers to sequences.

    Each header is the text after the leading ``>`` of a header line
    (surrounding whitespace of the line is stripped first). The lines
    that follow, up to the next header, are stripped and concatenated
    to form the sequence. Blank lines are skipped, and sequence lines
    that appear before the first header are discarded. If the same
    header occurs more than once, the last record wins.

    Args:
        filepath: Path of the FASTA file to read.

    Returns:
        A dict of ``{header: sequence}``, or ``None`` if the file does
        not exist (an error message is printed in that case).
    """
    sequences = {}
    current_header = None
    # Collect sequence lines in a list and join once per record; this
    # avoids repeated string concatenation on long sequences.
    current_chunks = []

    try:
        with open(filepath, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # Skip empty lines

                if line.startswith('>'):
                    # Compare against None rather than relying on
                    # truthiness: a bare ">" gives an empty-string
                    # header, which is still a record to keep.
                    if current_header is not None:
                        sequences[current_header] = "".join(current_chunks)
                    current_header = line[1:]  # Remove the '>'
                    current_chunks = []
                else:
                    current_chunks.append(line)

            # Store the final record, which no following header flushes
            if current_header is not None:
                sequences[current_header] = "".join(current_chunks)

    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
        return None

    return sequences

# Sample FASTA records used to demonstrate the workflow end to end
dummy_fasta_content = """>Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
>Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
TGGGAACCTGCGGGCAGTAGGTGGAAT
"""

fasta_filepath = "rosalind_fasta_sample.fasta" # Replace with your file path

# Write the sample records to disk so they can be parsed like a real file
with open(fasta_filepath, "w") as f:
    f.write(dummy_fasta_content)

fasta_data = parse_fasta(fasta_filepath)

# Track the record with the largest GC percentage seen so far;
# -1.0 is below any real percentage, so the first record always wins.
highest_gc_content = -1.0
highest_gc_header = None

# A failed or empty parse (None / {}) results in no iterations
for header, sequence in (fasta_data or {}).items():
    gc_content = calculate_gc_content(sequence)
    if gc_content <= highest_gc_content:
        continue  # Not a new maximum; ties keep the earlier record
    highest_gc_header, highest_gc_content = header, gc_content

# Report the winning header followed by its GC percentage
if highest_gc_header:
    print(highest_gc_header)
    print(f"{highest_gc_content:.6f}") # Print with 6 decimal places as per sample output

RNA sequence: AUGGACGCAGGCAAUGAGAGCGUUAAACCGGGAAUCAGUAUGCGGUUCGAUCAUCUAGACCCUGCCGUGCACAAAUCUAUUGAGACGUAUAGGUCGUUCGUCUAUCAUUUCUUGCGUAAUGAGACAGGGAGGAGUUCCUAUGUUAGCAGCUUCGCUGUCGUGCUGGUGCUGCGCGAUCCUACAAGUGUGUGGGCUAUAUUUGACGGCAUCUCUUAUACUGGAAGUGUGAAGAACUGGCCCACCUCACCAACCGGUCGUGCACCGAUGAGCAUAUACAUUGUUUUACCGUGGACCGGUACGGGAAAUAGCCACUCAGCAGUUUACAAGUGUCUCCACCCAACGCGAACUUGUUCAUCUAGUGACACAACGGGUAAACACAUAGUUAUUAACGCGCUUAGGACUCGACGCUUUCCCAGCGAGGCGAACCUCCGUAUUCCCGAUACCUCAAGGCUGGAAGCGACUUCCGAGUUCCAAGGAGGGGAGUGCUUGCAACUCUGCUCUCACGCUCGCGAGAAGAUGUUGAUGGGAUUUUGGCCCAUAUCACAGUUAUGUGGUUUACCCUGGAGCCUCGUUCUUGGGCCACGCUGGGGCAUGCUUAACGUACCUUGCGUAUACAAACCGAGUGCGGUUAUUCCGGGUCGGGGGCACAACCUCCACGAAUUUAUGGCAGAACCACAGGAGGUUGGUGGAAUGCAUAAUCGUCCCAAGAAGAGGGACCAUGCGUAUGGUUUACAUGGGGCGGCGUAUAGGCCGUCUAUCUCAAGGUUGGCAUGCCUCUGCCAUUCAGGCUGGAGUCGGGCAGAAGGGAGAAUGCCCGUCGACACUCGUGAGAACUCCGCUACAAUGGUAUGCAGAUCGAUCCACUGUCUUCGAAUAAACUACGGAGGGAUCGAAUCACAAGUUGCUAGAAUUUGUUACGGCAUUCCGUACGAUGCUGCCUUUACAUUUUUCAAUGGCGGGUUCAAGCGUGCUGGUGACUGUAGUUC