In [1]:
import Bio

In [2]:
Bio.__version__

'1.85'

SEQ OBJECT = MECHANISM FOR DEALING WITH SEQUENCES

In [3]:
from Bio.Seq import Seq 
# Build a Seq from a plain string; print() shows just the sequence letters.
my_seq = Seq("AGTACATGGT")
print(my_seq)

AGTACATGGT


In [4]:
# complement() pairs each base in place (A<->T, C<->G);
# reverse_complement() gives that same complement read in reverse order.
print(my_seq.complement())
print(my_seq.reverse_complement())

TCATGTACCA
ACCATGTACT


Example: parsing sequence files with SeqIO

In [5]:
from Bio import SeqIO
# Iterate over the records of the FASTA file; for each one show its
# identifier, the repr of its sequence, and the record length (one per line).
for seq_record in SeqIO.parse("ls_orchid.fasta", "fasta"):
    print(seq_record.id, repr(seq_record.seq), len(seq_record), sep="\n")

gi|2765658|emb|Z78533.1|CIZ78533
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC')
740
gi|2765657|emb|Z78532.1|CCZ78532
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC')
753
gi|2765656|emb|Z78531.1|CFZ78531
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA')
748
gi|2765655|emb|Z78530.1|CMZ78530
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT')
744
gi|2765654|emb|Z78529.1|CLZ78529
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA')
733
gi|2765652|emb|Z78527.1|CYZ78527
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC')
718
gi|2765651|emb|Z78526.1|CGZ78526
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT')
730
gi|2765650|emb|Z78525.1|CAZ78525
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GCA')
704
gi|2765649|emb|Z78524.1|CFZ78524
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATAGTAG...AGC')
740
gi|2765648|emb|Z78523.1|CHZ78523
Seq('CGTAACCAGGTTTCCGT

In [6]:
from Bio import SeqIO
# Same walk as for the FASTA file, but reading the GenBank version: only the
# file name and the format string passed to SeqIO.parse differ.
for seq_record in SeqIO.parse("ls_orchid.gbk", "genbank"):
    print(seq_record.id, repr(seq_record.seq), len(seq_record), sep="\n")

Z78533.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC')
740
Z78532.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC')
753
Z78531.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA')
748
Z78530.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT')
744
Z78529.1
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA')
733
Z78527.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC')
718
Z78526.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT')
730
Z78525.1
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GCA')
704
Z78524.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATAGTAG...AGC')
740
Z78523.1
Seq('CGTAACCAGGTTTCCGTAGGTGAACCTGCGGCAGGATCATTGTTGAGACAGCAG...AAG')
709
Z78522.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...GAG')
700
Z78521.1
Seq('GTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAGAATATATGATCGAGT...ACC')
726
Z78520.1
Seq('CGTAACAAGGTTTC

SEQUENCE OBJECTS 

In [7]:
# A Seq can be iterated like a string: enumerate() yields (position, letter)
# pairs, with positions starting at 0.
for index, letter in enumerate(my_seq):
    print("%i %s" % (index,letter))

# len() reports the number of letters in the sequence.
print(len(my_seq))

0 A
1 G
2 T
3 A
4 C
5 A
6 T
7 G
8 G
9 T
10


In [8]:
my_seq[0]

'A'

In [9]:
my_seq.count("GG")

1

In [10]:
# GC content as a percentage: total count of G and C letters, scaled by 100
# and divided by the sequence length.
100 * sum(my_seq.count(base) for base in "GC") / len(my_seq)

40.0

In [11]:
from Bio.SeqUtils import gc_fraction
# gc_fraction() reports GC content as a fraction (0.4 here), not as the
# percentage (40.0) computed by hand in the previous cell.
gc_fraction(my_seq)

0.4

Slicing a sequence

In [12]:
# NOTE: my_seq has only 10 letters, so the stop index 12 is past the end.
# Slicing clamps it silently, which is why only 6 letters (indices 4-9) come back.
my_seq[4:12]

Seq('CATGGT')

Turning Seq objects into strings

In [13]:
str(my_seq)

'AGTACATGGT'

In [14]:
print(my_seq)

AGTACATGGT


In [15]:
# Build a minimal FASTA record by hand: a ">Name" header line followed by the
# sequence letters. The string already ends in "\n", so print() leaves an
# extra blank line after it.
fasta_format_string = ">Name\n%s\n" % my_seq
print(fasta_format_string)

>Name
AGTACATGGT



Concatenating or adding sequences

In [16]:
seq1 = Seq("ACGT")
seq2 = Seq("AACCGG")
seq1 + seq2

Seq('ACGTAACCGG')

In [17]:
# Concatenate several Seq objects by starting from an empty Seq and
# appending each one in turn with +=.
list_of_seqs = [Seq("ACGT"), Seq("AACC"), Seq("GGTT")]
concatenated = Seq("")
for s in list_of_seqs:
    concatenated += s

# Bare last expression so the notebook displays the combined sequence.
concatenated

Seq('ACGTAACCGGTT')

In [18]:
# Seq.join() works like str.join(): the spacer (ten "N" letters) is inserted
# between consecutive contigs, not before the first or after the last.
contigs = [Seq("ATG"), Seq("ATCCCG"), Seq("TTGCA")]
spacer = Seq("N" * 10)
spacer.join(contigs)

Seq('ATGNNNNNNNNNNATCCCGNNNNNNNNNNTTGCA')

Changing case

In [19]:
dna_seq = Seq("acgtACGT")
dna_seq

Seq('acgtACGT')

In [20]:
dna_seq.upper()

Seq('ACGTACGT')

In [21]:
# Membership tests are case-sensitive: dna_seq is "acgtACGT", so the
# upper-case "GTAC" is not found until the sequence is upper-cased (next cell).
"GTAC" in dna_seq

False

In [22]:
"GTAC" in dna_seq.upper()

True

Nucleotide sequences and (reverse) complements

In [23]:
my_seq.complement()

Seq('TCATGTACCA')

In [24]:
my_seq.reverse_complement()

Seq('ACCATGTACT')

In [25]:
# A step of -1 reverses the letters only; no complementing is done
# (compare with reverse_complement() in the previous cell).
my_seq[::-1]

Seq('TGGTACATGA')

Transcription

In [26]:
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
coding_dna

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

In [27]:
# The template strand is obtained here as the reverse complement of the
# coding strand.
template_dna = coding_dna.reverse_complement()
template_dna

Seq('CTATCGGGCACCCTTTCAGCGGCCCATTACAATGGCCAT')

In [28]:
# transcribe() is applied to the coding strand: the result is the same
# letters with every T replaced by U (see the output).
messenger_rna = coding_dna.transcribe()
messenger_rna

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')

In [29]:
# Round trip: back_transcribe() turns the mRNA back into the DNA coding
# strand (every U becomes T). Only the last expression of a cell is
# displayed, so the intermediate bare `messenger_rna` line was a no-op and
# has been dropped.
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
messenger_rna.back_transcribe()

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

Translation

In [30]:
messenger_rna.translate()

Seq('MAIVMGR*KGAR*')

In [31]:
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
coding_dna

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

In [32]:
coding_dna.translate()

Seq('MAIVMGR*KGAR*')

To translate with a non-standard genetic code, specify the codon table to use by passing its NCBI table name or number via the `table` argument.

In [33]:
coding_dna.translate(table="Vertebrate Mitochondrial")

Seq('MAIVMGRWKGAR*')

In [34]:
# Table number 2 is the "Vertebrate Mitochondrial" table, so this gives the
# same protein as selecting the table by name in the previous cell.
coding_dna.translate(table=2)

Seq('MAIVMGRWKGAR*')

In [35]:
coding_dna.translate()

Seq('MAIVMGR*KGAR*')

In [36]:
# to_stop=True ends the translation at the first in-frame stop codon and
# leaves the "*" stop symbol out of the result.
coding_dna.translate(to_stop=True)

Seq('MAIVMGR')

In [37]:
coding_dna.translate(table=2)

Seq('MAIVMGRWKGAR*')

In [38]:
coding_dna.translate(table=2, to_stop=True)

Seq('MAIVMGRWKGAR')

In [39]:
# Example gene used by the translation cells that follow. The adjacent
# string literals are joined by Python into a single sequence string.
gene = Seq(
    "GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGGTCGCTCCCATGGCA"
    "GCACAGGCTGCGGAAATTACGTTAGTCCCGTCAGTAAAATTACAGATAGGCGATCGTGAT"
    "AATCGTGGCTATTACTGGGATGGAGGTCACTGGCGCGACCACGGCTGGTGGAAACAACAT"
    "TATGAATGGCGAGGCAATCGCTGGCACCTACACGGACCGCCGCCACCGCCGCGCCACCAT"
    "AAGAAAGCTCCTCATGATCATCACGGCGGTCATGGTCCAGGCAAACATCACCGCTAA"
)

In [40]:
gene.translate(table="Bacterial")

Seq('VKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDH...HR*')

In [41]:
gene.translate(table="Bacterial", to_stop=True)

Seq('VKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDH...HHR')

In [42]:
# With cds=True the sequence is treated as a complete coding sequence:
# compare the outputs - the leading GTG codon is now reported as M instead
# of V, and the trailing "*" is dropped.
gene.translate(table="Bacterial", cds=True)

Seq('MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDH...HHR')

Translation Tables

In [45]:
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
mito_table

NCBICodonTableDNA(id=2, names=['Vertebrate Mitochondrial', 'SGC1'], ...)

In [46]:
standard_table = CodonTable.unambiguous_dna_by_id[1]
mito_table = CodonTable.unambiguous_dna_by_id[2]

In [47]:
print(standard_table)

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------

In [48]:
print(mito_table)

Table 2 Vertebrate Mitochondrial, SGC1

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA W   | A
T | TTG L   | TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L   | CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I(s)| ACT T   | AAT N   | AGT S   | T
A | ATC I(s)| ACC T   | AAC N   | AGC S   | C
A | ATA M(s)| ACA T   | AAA K   | AGA Stop| A
A | ATG M(s)| ACG T   | AAG K   | AGG Stop| G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V(s)| GCG A   | GAG E   | GGG G   

In [None]:
standard_table.start_codons

['TTG', 'CTG', 'ATG']

In [51]:
standard_table.stop_codons

['TAA', 'TAG', 'TGA']

In [52]:
mito_table.start_codons

['ATT', 'ATC', 'ATA', 'ATG', 'GTG']

In [53]:
mito_table.stop_codons

['TAA', 'TAG', 'AGA', 'AGG']

Comparing Seq objects

In [55]:
seq1 = Seq("ACGT")
seq1

Seq('ACGT')

In [56]:
"ACGT" == seq1

True

Sequences with unknown sequence contents

In [57]:
# A Seq with a known length (10) but no defined contents. The length is
# passed by keyword, matching the other Seq(None, length=...) calls in this
# notebook and making the meaning of the bare number explicit.
unknown_seq = Seq(None, length=10)

In [58]:
# Printing a Seq whose contents are undefined raises UndefinedSequenceError.
# The error is the point of this cell, so catch it and show the message
# instead of letting the exception abort a Restart & Run All.
from Bio.Seq import UndefinedSequenceError

try:
    print(unknown_seq)
except UndefinedSequenceError as err:
    print(f"{type(err).__name__}: {err}")

UndefinedSequenceError: Sequence content is undefined

Sequences with partially defined sequence contents

In [59]:
# Partially defined sequence: the dict maps a start position to the letters
# known from that position onward; `length` is the total sequence length.
# Everything outside the known fragment is undefined.
seq = Seq({117512683: "TTGAAAACCTGAATGTGAGAGTCAGTCAAGGATAGT"}, length=159345973)

In [60]:
seq

Seq({117512683: 'TTGAAAACCTGAATGTGAGAGTCAGTCAAGGATAGT'}, length=159345973)

In [61]:
seq[1000:1020]

Seq(None, length=20)

In [62]:
seq[117512690:117512700]

Seq('CCTGAATGTG')

In [63]:
seq[117512670:117512690]

Seq({13: 'TTGAAAA'}, length=20)

In [64]:
seq[117512700:]

Seq({0: 'AGAGTCAGTCAAGGATAGT'}, length=41833273)

In [65]:
# NOTE: this rebinds `seq`, replacing the partially defined sequence used in
# the cells above.
seq = Seq("ACGT")
undefined_seq = Seq(None, length=10)
# Concatenating defined and undefined pieces yields a partially defined Seq
# whose known fragments are keyed by their start positions (0 and 14 here).
seq + undefined_seq + seq

Seq({0: 'ACGT', 14: 'ACGT'}, length=18)

MutableSeq objects

In [66]:
my_seq = Seq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA")

In [67]:
# Seq objects do not support item assignment: this raises TypeError. The
# error is the point of this cell, so catch it and print it rather than
# letting the exception halt a Restart & Run All.
try:
    my_seq[5] = "G"
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: 'Seq' object does not support item assignment

In [69]:
from Bio.Seq import MutableSeq
mutable_seq = MutableSeq(my_seq)
mutable_seq

MutableSeq('GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA')

In [70]:
mutable_seq[5] = "C"

In [71]:
mutable_seq

MutableSeq('GCCATCGTAATGGGCCGCTGAAAGGGTGCCCGA')

In [72]:
# remove() edits mutable_seq in place and deletes only the first "T"
# (compare the displays before and after). Because it mutates in place,
# re-running this cell would presumably remove a further "T" each time.
mutable_seq.remove("T")


In [73]:
mutable_seq

MutableSeq('GCCACGTAATGGGCCGCTGAAAGGGTGCCCGA')

In [77]:
mutable_seq.reverse()


In [78]:
mutable_seq

MutableSeq('AGCCCGTGGGAAAGTCGCCGGGTAATGCACCG')

In [79]:
new_seq = Seq(mutable_seq)
new_seq

Seq('AGCCCGTGGGAAAGTCGCCGGGTAATGCACCG')

Finding subsequences

In [80]:
# NOTE: rebinds `seq` again for the search examples below.
seq = Seq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA")
# index() returns the 0-based position where the subsequence starts.
seq.index("ATGGGCCGC")

9

In [81]:
seq.index(b"ATGGGCCGC")

9

In [82]:
seq.index(bytearray(b"ATGGGCCGC"))

9

In [83]:
seq.index(Seq("ATGGGCCGC"))

9

In [84]:
seq.index(MutableSeq("ATGGGCCGC"))

9

In [85]:
# index() raises ValueError when the subsequence is absent. The error is the
# point of this cell, so catch it and print it rather than letting the
# exception halt a Restart & Run All.
try:
    seq.index("ACTG")
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: subsection not found

In [86]:
# find() signals "not found" by returning -1, whereas index() raises
# ValueError for the same missing subsequence.
seq.find("ACTG")

-1

In [87]:
# search() takes several subsequences at once and yields (position, match)
# pairs, ordered by position in the output below. "CC" is listed twice in
# the query, but each hit is reported only once.
for index, sub in seq.search(["CC", "GGG", "CC"]):
    print(index, sub)


1 CC
11 GGG
14 CC
23 GGG
28 CC
29 CC
