In [6]:
#Import packages
import os

from Bio import Entrez
from Bio import SeqIO
from Bio.SeqUtils import GC

# Download data and save

In [7]:
# Notebook-wide settings
# Contact e-mail registered on the Bio.Entrez module.
# NOTE(review): this looks like a placeholder address — replace it with your own before running.
Entrez.email = "hoge@example.com"
# Accession (with version) of the goldfish chordin transcript to work on
ID = "XM_026255215.1"

In [None]:
# If the file is not present, download the transcript sequence and save it
# as a GenBank-format file named "<ID>.gb"; otherwise reuse the local copy.
gb_path = ID + ".gb"
if not os.path.isfile(gb_path):
    # Downloading...
    net_handle = Entrez.efetch(db="nucleotide", id=ID, rettype="gb", retmode="text")
    try:
        # Read the whole record before creating the output file, so a failed
        # download cannot leave an empty file that looks like a cached copy.
        gb_text = net_handle.read()
    finally:
        net_handle.close()
    with open(gb_path, "w") as out_handle:
        out_handle.write(gb_text)
    print("Saved")
else:
    # Format the complete file name: in `"..." % ID + ".gb"` the % operator
    # binds before +, which would append ".gb" after the sentence's full stop.
    print("The file %s is present." % gb_path)

# Load the saved file

In [8]:
# Parse the single GenBank record stored in the downloaded file.
print("Parsing...")
record = SeqIO.read("{}.gb".format(ID), format="genbank")

Parsing...


# Briefly check the structure of the object

In [9]:
# Check which class SeqIO.read returned.
type(record)

Bio.SeqRecord.SeqRecord

In [10]:
# Display the record's repr: sequence preview, id, name, description and dbxrefs.
record

SeqRecord(seq=Seq('ACGACTCACACTCGCTGAGACACATCGGGGAGAACCTCACTCTGTTTATTTGGT...TGT', IUPACAmbiguousDNA()), id='XM_026255215.1', name='XM_026255215', description='PREDICTED: Carassius auratus chordin (LOC113085615), mRNA', dbxrefs=['BioProject:PRJNA487739'])

In [7]:
# Print the human-readable summary: identifiers, feature count and annotations.
print(record)

ID: XM_026255215.1
Name: XM_026255215
Description: PREDICTED: Carassius auratus chordin (LOC113085615), mRNA
Database cross-references: BioProject:PRJNA487739
Number of features: 3
/molecule_type=mRNA
/topology=linear
/data_file_division=VRT
/date=04-SEP-2018
/accessions=['XM_026255215']
/sequence_version=1
/keywords=['RefSeq']
/source=Carassius auratus (goldfish)
/organism=Carassius auratus
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Actinopterygii', 'Neopterygii', 'Teleostei', 'Ostariophysi', 'Cypriniformes', 'Cyprinidae', 'Carassius']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NW_020526691.1) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
                               100
                               pipeline
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annot

# Extract CDS sequences

In [11]:
# List the features annotated on the record (source, gene and CDS for this entry).
record.features

[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(3636), strand=1), type='source'),
 SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(3636), strand=1), type='gene'),
 SeqFeature(FeatureLocation(ExactPosition(103), ExactPosition(2935), strand=1), type='CDS')]

In [12]:
# The features are held in a plain Python list.
type(record.features)

list

In [13]:
# Pick the feature at index 2, which is the CDS in the listing above.
# The index is specific to this record; it is not a general way to find a CDS.
record.features[2]

SeqFeature(FeatureLocation(ExactPosition(103), ExactPosition(2935), strand=1), type='CDS')

In [14]:
# Each entry of the feature list is a SeqFeature object.
type(record.features[2])

Bio.SeqFeature.SeqFeature

In [15]:
# extract is a bound method of the feature; the next cell calls it on the parent sequence.
type(record.features[2].extract)

method

In [16]:
# Cut the CDS region out of the full transcript sequence.
# Index 2 is the CDS for this particular record only.
record.features[2].extract(record.seq)

Seq('ATGGAGGCGTCGCGAGCTCTGTGGATTCTGTGCTGCGCGTTCCTCGCGTCGGCT...TGA', IUPACAmbiguousDNA())

In [17]:
# Search the features whose type is 'CDS' and retrieve the coding sequence
# from the parent transcript sequence.
cds_features = [feature for feature in record.features if feature.type == 'CDS']
if not cds_features:
    # Fail loudly instead of leaving seq_cds undefined (or stale from an
    # earlier run of the kernel) when the record carries no CDS annotation.
    raise ValueError("No CDS feature found in record %s" % record.id)
# If several CDS features are annotated, the last one is used.
seq_cds = cds_features[-1].location.extract(record.seq)

In [18]:
# Display the extracted CDS (the repr shows an abbreviated preview).
seq_cds

Seq('ATGGAGGCGTCGCGAGCTCTGTGGATTCTGTGCTGCGCGTTCCTCGCGTCGGCT...TGA', IUPACAmbiguousDNA())

In [19]:
# The extracted CDS is a Seq object.
type(seq_cds)

Bio.Seq.Seq

In [40]:
# Show the alphabet attached to the sequence.
# NOTE(review): Bio.Alphabet and the Seq.alphabet attribute were removed in
# Biopython 1.78 — this cell needs the older release; confirm the installed version.
seq_cds.alphabet

IUPACAmbiguousDNA()

# Basic sequence analysis for the CDS

### Direct translation of cds into protein sequence

In [53]:
# Print the CDS as plain text rather than the abbreviated repr
print(seq_cds)

ATGGAGGCGTCGCGAGCTCTGTGGATTCTGTGCTGCGCGTTCCTCGCGTCGGCTTTGGGCTCGAGACTCAAGACCCCCGCGTTACCCATCCAACCCGAGAGGGAACCCATGATCTCTAAAGGCTTATCCGGTTGCTCCTTCGGTGGCCGCTTTTATTCGCTGGAAGACACGTGGCATCCAGATCTCGGAGAGCCGTTCGGTGTGATGCACTGCGTTATGTGTCACTGCGAGCCGCAGAGGAGCCGGCGAGGGAAGGTGTTTGGGAAGGTGAGCTGCAGGAATATGAAACAGGACTGTCCCGATCCGACCTGCGACGATCCCGTCTTGCTTCCAGGACACTGCTGCAAAACATGCCCAAAAGGCAACTCAGGGAAAAAGGAGGTGGAGTCTCTGTTTGAGTTCTTCCAGGAGAAAGATGACGACCTGCACAAGTCTTACAACGACAGATCCTACATCAGCTCTGAGGAGAACAGCAACCGAGACAGCGCTGCCGATTTTGTGGCTGTACTCACGGGCGTGACAGACTCGTGGCTGCCGAGCTCCAGCGGCGTCGCACGGGCACGATTCACACTCGCTCGAACGAGCCTGACCTTCTCTATCACCTTCCAGAGGATGAACAGGCCGAGCCTCATCACGTTCCTGGACTCTGATGGAAACACAGCGTTTGAGTTCAGAGTACCACTGGCGGATACAGACATGATCTGTGGAGTTTGGAGGAACCTGCCAAAGTCTCACCTGCGTCAGCTGGAGGCGGAGCAGCTGCATGTTTCCATGACAACCGCTGACAACAAGAAGGAGGAGATACAGGGCAAAATCATCAAACACCGAGCGCTGTTCGCAGAAACGTTCAGCGCGATCCTGACGTCTGACGAGGTGCATTCTGGGATGGGAGGAATCGCAATGTTGACGCTCAGTGACACGGAAAACAATCTGCATTTCATCCTGATCCTGCAGGGACTCGTTTCTCACGGGAGCTCTTCTGTAAAGGTGCCAGTCCGAG

In [28]:
# Length of the CDS in nucleotides
print(len(seq_cds))

2832


In [31]:
# Translate the CDS directly; the trailing '*' in the output marks the stop codon
print(seq_cds.translate())

MEASRALWILCCAFLASALGSRLKTPALPIQPEREPMISKGLSGCSFGGRFYSLEDTWHPDLGEPFGVMHCVMCHCEPQRSRRGKVFGKVSCRNMKQDCPDPTCDDPVLLPGHCCKTCPKGNSGKKEVESLFEFFQEKDDDLHKSYNDRSYISSEENSNRDSAADFVAVLTGVTDSWLPSSSGVARARFTLARTSLTFSITFQRMNRPSLITFLDSDGNTAFEFRVPLADTDMICGVWRNLPKSHLRQLEAEQLHVSMTTADNKKEEIQGKIIKHRALFAETFSAILTSDEVHSGMGGIAMLTLSDTENNLHFILILQGLVSHGSSSVKVPVRVKLLYRQHLLREIQANISADDSDLAEVLADLNSRELFWLSRGQLQISVETEGQNSRQVSGFISGKRSCDTLQSVMSSGSALTPGKTGGVGSAVFTLHHNGSLDFQVLVAGLSSAVVGVTIEMKPRRRSKRSVLYDITADFSTAGERGGGRAMGSCGRVEARHIHMLLQNELFINIATAEQQESELRGQIRMLPYNGLDARRNELPVPLAGQFVSPPVRTGAAGHAWVSVDEQCHLHYEIVINGLSNSEDTSVNAHLHGLAEIGEMDDSSTNHKRLLTGFYGQQAQGVLKDISVELLRHLDEGTAYIQVSTKMNPRGEIRGRIHVPNSCELGSRGEVVEEAEFDELVFVRDPAELRKDPHTCFFEGEHHAHGSQWTPQYNTCFTCICQKKTVICDPVICPALSCPHTIQPEDQCCPICDEKKESKQTTAVEKVEEDPEGCYFEGDQKMHAPGTTWHPFVPPFGYIKCAVCTCKGSTGEVHCEKLTCPPLTCSRPIRRNPSDCCKECPAEDTPPLEDDEMMQADGTRHCKFGDNYYQNSEHWHPRVPLVGEMKCITCWCDHGVTKCQKKQCPLLSCSNPIRREGKCCPECIEDFMEKEEMAKMVEKKKNWRH*


In [36]:
# Number of symbols in the translation; this count includes the trailing '*' stop symbol.
len(seq_cds.translate())

944

### Transcription and translation

In [54]:
# Transcribe the CDS to RNA (T is replaced by U in the result)
seq_cds.transcribe()

Seq('AUGGAGGCGUCGCGAGCUCUGUGGAUUCUGUGCUGCGCGUUCCUCGCGUCGGCU...UGA', IUPACAmbiguousRNA())

In [55]:
# Transcribe to RNA, then translate; the preview matches the direct DNA translation above
seq_cds.transcribe().translate()

Seq('MEASRALWILCCAFLASALGSRLKTPALPIQPEREPMISKGLSGCSFGGRFYSL...RH*', HasStopCodon(ExtendedIUPACProtein(), '*'))

### GC content

In [47]:
# Calculate GC content (%) by hand: G and C counts over the sequence length
100 * float(sum(seq_cds.count(base) for base in "GC")) / len(seq_cds)

54.378531073446325

In [49]:
# Calculate GC content (%) with the GC helper from Bio.SeqUtils (imported in
# the top import cell); the value agrees with the manual calculation.
GC(seq_cds)

54.378531073446325

In [44]:
# Print every base of the CDS together with its 0-based position.
for index, letter in enumerate(seq_cds):
    print("%i %s" % (index, letter))

0 A
1 T
2 G
3 G
4 A
5 G
6 G
7 C
8 G
9 T
10 C
11 G
12 C
13 G
14 A
15 G
16 C
17 T
18 C
19 T
20 G
21 T
22 G
23 G
24 A
25 T
26 T
27 C
28 T
29 G
30 T
31 G
32 C
33 T
34 G
35 C
36 G
37 C
38 G
39 T
40 T
41 C
42 C
43 T
44 C
45 G
46 C
47 G
48 T
49 C
50 G
51 G
52 C
53 T
54 T
55 T
56 G
57 G
58 G
59 C
60 T
61 C
62 G
63 A
64 G
65 A
66 C
67 T
68 C
69 A
70 A
71 G
72 A
73 C
74 C
75 C
76 C
77 C
78 G
79 C
80 G
81 T
82 T
83 A
84 C
85 C
86 C
87 A
88 T
89 C
90 C
91 A
92 A
93 C
94 C
95 C
96 G
97 A
98 G
99 A
100 G
101 G
102 G
103 A
104 A
105 C
106 C
107 C
108 A
109 T
110 G
111 A
112 T
113 C
114 T
115 C
116 T
117 A
118 A
119 A
120 G
121 G
122 C
123 T
124 T
125 A
126 T
127 C
128 C
129 G
130 G
131 T
132 T
133 G
134 C
135 T
136 C
137 C
138 T
139 T
140 C
141 G
142 G
143 T
144 G
145 G
146 C
147 C
148 G
149 C
150 T
151 T
152 T
153 T
154 A
155 T
156 T
157 C
158 G
159 C
160 T
161 G
162 G
163 A
164 A
165 G
166 A
167 C
168 A
169 C
170 G
171 T
172 G
173 G
174 C
175 A
176 T
177 C
178 C
179 A
180 G
181 A
182 T
183 C
184 T


1499 G
1500 C
1501 A
1502 G
1503 A
1504 A
1505 C
1506 G
1507 A
1508 A
1509 C
1510 T
1511 G
1512 T
1513 T
1514 C
1515 A
1516 T
1517 T
1518 A
1519 A
1520 C
1521 A
1522 T
1523 C
1524 G
1525 C
1526 C
1527 A
1528 C
1529 G
1530 G
1531 C
1532 C
1533 G
1534 A
1535 G
1536 C
1537 A
1538 G
1539 C
1540 A
1541 G
1542 G
1543 A
1544 G
1545 A
1546 G
1547 C
1548 G
1549 A
1550 A
1551 C
1552 T
1553 G
1554 C
1555 G
1556 T
1557 G
1558 G
1559 A
1560 C
1561 A
1562 G
1563 A
1564 T
1565 A
1566 C
1567 G
1568 A
1569 A
1570 T
1571 G
1572 C
1573 T
1574 G
1575 C
1576 C
1577 T
1578 T
1579 A
1580 C
1581 A
1582 A
1583 C
1584 G
1585 G
1586 A
1587 C
1588 T
1589 G
1590 G
1591 A
1592 C
1593 G
1594 C
1595 A
1596 C
1597 G
1598 C
1599 C
1600 G
1601 A
1602 A
1603 A
1604 C
1605 G
1606 A
1607 G
1608 C
1609 T
1610 T
1611 C
1612 C
1613 G
1614 G
1615 T
1616 T
1617 C
1618 C
1619 T
1620 C
1621 T
1622 G
1623 G
1624 C
1625 G
1626 G
1627 G
1628 T
1629 C
1630 A
1631 G
1632 T
1633 T
1634 T
1635 G
1636 T
1637 G
1638 T
1639 C
1640 T
1641 C