In [2]:
from Bio.Seq import Seq

# Create a sequence
my_seq = Seq("AGTACACTGGT")
print(my_seq)
# Output: AGTACACTGGT

AGTACACTGGT


In [3]:
from Bio.Seq import Seq

my_seq = Seq("AGTACACTGGT")

# String-like operations work
print(len(my_seq))        # 11
print(my_seq[0])          # A
print(my_seq[0:3])        # AGT

# But you get biological superpowers!
print(my_seq.complement())           # TCATGTGACCA
print(my_seq.reverse_complement())   # ACCAGTGTACT

11
A
AGT
TCATGTGACCA
ACCAGTGTACT


In [4]:
from Bio.Seq import Seq

# DNA sequence
dna_seq = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")

# Transcription (DNA → RNA)
rna_seq = dna_seq.transcribe()
print(rna_seq)
# AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG

# Translation (RNA → Protein)
protein_seq = rna_seq.translate()
print(protein_seq)
# MAIVMGR*KGAR*

AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG
MAIVMGR*KGAR*


In [16]:
from Bio.SeqUtils import gc_fraction
import re
# Mystery sequence from a research paper
mystery_seq = Seq("CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATT")

# Your mission:
print(len(mystery_seq))
# 2. Get the reverse complement
print(mystery_seq.reverse_complement()) 
# 3. Calculate GC content (hint: look up Bio.SeqUtils)
print(gc_fraction(mystery_seq))
# 4. Find all start codons (ATG)
start_codon = "ATG"
positions = []
seq_str = str(mystery_seq)
for match in re.finditer(start_codon, seq_str):
    start_codon.append(match.start())
print(f"Start codons at positions: {start_codon}")

length = len(mystery_seq)
rev_comp = mystery_seq.reverse_complement()
# Your turn for the rest!

41
AATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACG
0.4878048780487805
Start codons at positions: ATG


In [20]:
from Bio import SeqIO

# Parse a FASTA file
for seq_record in SeqIO.parse("/Users/user/Desktop/python/data/ls_orchid.fasta", "fasta"):
    print(f"ID: {seq_record.id}")
    print(f"Length: {len(seq_record)}")
    print(f"Sequence: {seq_record.seq[:50]}...")
    print("---")

ID: gi|2765658|emb|Z78533.1|CIZ78533
Length: 740
Sequence: CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACC...
---
ID: gi|2765657|emb|Z78532.1|CCZ78532
Length: 753
Sequence: CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACA...
---
ID: gi|2765656|emb|Z78531.1|CFZ78531
Length: 748
Sequence: CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACA...
---
ID: gi|2765655|emb|Z78530.1|CMZ78530
Length: 744
Sequence: CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACA...
---
ID: gi|2765654|emb|Z78529.1|CLZ78529
Length: 733
Sequence: ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAG...
---
ID: gi|2765652|emb|Z78527.1|CYZ78527
Length: 718
Sequence: CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACA...
---
ID: gi|2765651|emb|Z78526.1|CGZ78526
Length: 730
Sequence: CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACA...
---
ID: gi|2765650|emb|Z78525.1|CAZ78525
Length: 704
Sequence: TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTA...
---
ID: gi|2765649|emb|Z78524.1|CFZ78524
Length: 740
Sequence: CGTAA

In [22]:
from Bio import SeqIO

# Read just the first record
record = next(SeqIO.parse("/Users/user/Desktop/python/data/ls_orchid.fasta", "fasta"))

print(f"ID: {record.id}")
print(f"Description: {record.description}")
print(f"Sequence type: {type(record.seq)}")
print(f"Full sequence: {record.seq}")

ID: gi|2765658|emb|Z78533.1|CIZ78533
Description: gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA
Sequence type: <class 'Bio.Seq.Seq'>
Full sequence: CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCCCGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCCCAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAACGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCGGCATACAGCCAGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCGGCGGGTCCAAGAGCTGGTGTTTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATGCTGGCAGCAGCTGCCGTGCGAATCCCCCATGTTGTCGTGCTTGTCGGACAGGCAGGAGAACCCTTCCGAACCCCAATGGAGGGCGGTTGACCGCCATTCGGATGTGACCCCAGGTCAGGCGGGGGCACCCGCTGAGTTTACGC
