# In silico translation of DNA sequences to amino acid sequences 

In [15]:
pip install biopython

Defaulting to user installation because normal site-packages is not writeable
Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
Installing collected packages: biopython
Successfully installed biopython-1.84
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Load packages 
from Bio import SeqIO
from Bio.Seq import Seq
import os

# Input and output file names (relative paths, resolved against the notebook's
# working directory).
# input_file:  FASTA file whose nucleotide records are parsed and translated by
#              the cells below.
# output_file: FASTA file the translation cell writes the protein sequences to;
#              it is opened in "w" mode there, so an existing file is overwritten.
input_file = "data/MCF7_ORFs_min18.fasta"
output_file = "data/MCF7_altORFs_proteins.fasta"

In [2]:
# Translate every nucleotide record in `input_file` and write the resulting
# proteins to `output_file` in FASTA layout (one ">id" header line followed by
# one sequence line per record).

# Fail before touching the output: opening `output_file` in "w" mode truncates
# it, so a missing input must not be allowed to wipe a previous result.
if not os.path.isfile(input_file):
    raise FileNotFoundError(f"Input FASTA not found: {input_file}")

# A missing output file is not an error -- open(..., "w") below creates it, so
# no separate "create an empty file" step is needed.
if not os.path.exists(output_file):
    print(f"Note: {output_file} does not exist yet; it will be created.")

# Open the output file for writing translated sequences
with open(output_file, "w") as output_handle:
    for record in SeqIO.parse(input_file, "fasta"):
        # to_stop=True ends the protein at the first in-frame stop codon and
        # leaves the stop symbol out of the written sequence.
        protein_seq = record.seq.translate(to_stop=True)
        output_handle.write(f">{record.id}\n{protein_seq}\n")

print(f"Translation completed. Protein sequences saved in {output_file}")

Error: The file data/MCF7_altORFs_proteins.fasta doesn't exist. Creating the file...
Translation completed. Protein sequences saved in data/MCF7_altORFs_proteins.fasta


In [3]:
# Preview the first few records of the input and output FASTA files so the
# translation can be checked by eye.
## Maximum number of records printed per file
num_sequences_to_display = 5  # Adjust this as needed

## (path, heading, per-record label) for each file, in the order they are printed
preview_targets = (
    (input_file, "Nucleic acid Sequences", "DNA Sequence"),
    (output_file, "Amino acid Sequences", "Protein Sequence"),
)

## Read and print
for fasta_path, heading, seq_label in preview_targets:
    with open(fasta_path, "r") as fasta_handle:
        parsed = SeqIO.parse(fasta_handle, "fasta")
        print(heading)
        for shown, rec in enumerate(parsed):
            # Stop once the requested number of records has been printed.
            if shown >= num_sequences_to_display:
                break
            print(f"Record ID: {rec.id}")
            print(f"{seq_label}: {rec.seq}\n")

Nucleic acid Sequences
Record ID: SRR6730014.4_ORF.1
DNA Sequence: CTGCGCGTGCACGGCGCCACCCTCCCCCCGCCCCAGCCCGGCGCCGTGCGACTTTGCTCCTGCAACACACGCCCCCCCAACCCCCGCCCG

Record ID: SRR6730014.5_ORF.1
DNA Sequence: CTGCGCGTGCACGGCGCCTCCCTCCCCCCGCCCCAGCCCGGCGCCGTGCGACTTTGCTCCTGCACCACTCGCCCCCCCACCCCCCGCCCG

Record ID: SRR6730014.6_ORF.1
DNA Sequence: TTGCGCAGTCATTTTCAACACCGGCAATGCAGCAAAATCATCAGTGGAAATGTAAAAAAATACACATGGCCAGGCCCCAGCCCAAATCAC

Record ID: SRR6730014.6_ORF.2
DNA Sequence: CTGGGGCCTGGCCATGTGTATTTTTTTACATTTCCACTGATGATTTTGCTGCATTGCCGGTGT

Record ID: SRR6730014.6_ORF.3
DNA Sequence: TTGGGCTGGGGCCTGGCCATGTGTATTTTTTTACATTTCCAC

Amino acid Sequences
Record ID: SRR6730014.4_ORF.1
Protein Sequence: LRVHGATLPPPQPGAVRLCSCNTRPPNPRP

Record ID: SRR6730014.5_ORF.1
Protein Sequence: LRVHGASLPPPQPGAVRLCSCTTRPPTPRP

Record ID: SRR6730014.6_ORF.1
Protein Sequence: LRSHFQHRQCSKIISGNVKKYTWPGPSPNH

Record ID: SRR6730014.6_ORF.2
Protein Sequence: LGPGHVYFFTFPLMILLHCRC

Record ID: SRR6730014.6_ORF.3
Protein Se