In [6]:
#Import packages
import os
from Bio import SeqIO
from Bio import Entrez

# Download data and save

In [7]:
# Declare variables
# Contact e-mail attached to the Entrez requests made by this notebook.
# NOTE(review): "hoge@example.com" looks like a placeholder -- replace it with
# a real address before downloading.
Entrez.email = "hoge@example.com"
# Accession (with version) of the goldfish chordin transcript; it is also used
# as the base name of the local "<ID>.gb" file.
ID = "XM_026255215.1"

In [None]:
# If the file is not present, the transcript sequence is downloaded from the
# NCBI nucleotide database and saved as a GenBank-format file; otherwise the
# existing local copy is reused.
gb_path = ID + ".gb"
if not os.path.isfile(gb_path):
    # Downloading...
    net_handle = Entrez.efetch(db="nucleotide", id=ID, rettype="gb", retmode="text")
    try:
        genbank_text = net_handle.read()
    finally:
        # Release the network handle even when reading fails.
        net_handle.close()
    # The output file is opened only after the download succeeded, so a failed
    # fetch cannot leave behind an empty file that the next run would mistake
    # for a valid cached copy.
    with open(gb_path, "w") as out_handle:
        out_handle.write(genbank_text)
    print("Saved")
else:
    # Format the complete file name: `%` binds tighter than `+`, so
    # `"... %s ..." % ID + ".gb"` printed "... is present..gb".
    print("The file %s is present." % gb_path)

# Load the saved file

In [8]:
# Parse the local GenBank file named after ID into `record`.
# SeqIO.read expects the file to contain exactly one record.
print("Parsing...")
record = SeqIO.read(ID + ".gb", "genbank")

Parsing...


# Briefly check the structure of the object

In [9]:
# Check which class SeqIO.read returned.
type(record)

Bio.SeqRecord.SeqRecord

In [10]:
# Show the record's repr: sequence preview, id, name, description and dbxrefs.
record

SeqRecord(seq=Seq('ACGACTCACACTCGCTGAGACACATCGGGGAGAACCTCACTCTGTTTATTTGGT...TGT', IUPACAmbiguousDNA()), id='XM_026255215.1', name='XM_026255215', description='PREDICTED: Carassius auratus chordin (LOC113085615), mRNA', dbxrefs=['BioProject:PRJNA487739'])

In [7]:
# Print the human-readable summary of the record, including its annotations.
print(record)

ID: XM_026255215.1
Name: XM_026255215
Description: PREDICTED: Carassius auratus chordin (LOC113085615), mRNA
Database cross-references: BioProject:PRJNA487739
Number of features: 3
/molecule_type=mRNA
/topology=linear
/data_file_division=VRT
/date=04-SEP-2018
/accessions=['XM_026255215']
/sequence_version=1
/keywords=['RefSeq']
/source=Carassius auratus (goldfish)
/organism=Carassius auratus
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Actinopterygii', 'Neopterygii', 'Teleostei', 'Ostariophysi', 'Cypriniformes', 'Cyprinidae', 'Carassius']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NW_020526691.1) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
                               100
                               pipeline
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annot

# Extract CDS sequences

In [11]:
# List the annotated features with their locations and types.
record.features

[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(3636), strand=1), type='source'),
 SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(3636), strand=1), type='gene'),
 SeqFeature(FeatureLocation(ExactPosition(103), ExactPosition(2935), strand=1), type='CDS')]

In [12]:
# Check which container type holds the features.
type(record.features)

list

In [13]:
# Index 2 is the CDS entry in this record's feature list; the position is
# specific to this record and is not guaranteed for other accessions.
record.features[2]

SeqFeature(FeatureLocation(ExactPosition(103), ExactPosition(2935), strand=1), type='CDS')

In [14]:
# Check the class of a single feature entry.
type(record.features[2])

Bio.SeqFeature.SeqFeature

In [15]:
# Confirm that `extract` is a method on the feature object.
type(record.features[2].extract)

method

In [16]:
# Use the feature's extract method to pull its sub-sequence out of the full
# transcript sequence.
record.features[2].extract(record.seq)

Seq('ATGGAGGCGTCGCGAGCTCTGTGGATTCTGTGCTGCGCGTTCCTCGCGTCGGCT...TGA', IUPACAmbiguousDNA())

In [17]:
# Search the features whose type is 'CDS' and retrieve the corresponding
# sequence from the parent DNA sequence.
# Reset first so that a stale value from an earlier kernel run cannot be
# mistaken for the result of this cell.
seq_cds = None
for feature in record.features:
    if feature.type == 'CDS':
        # When several CDS features are annotated, the last one wins.
        seq_cds = feature.location.extract(record.seq)
if seq_cds is None:
    # Fail here with a clear message instead of letting later cells break on
    # a missing or empty seq_cds.
    raise ValueError("No CDS feature found in record %s" % record.id)

In [18]:
# Show the extracted CDS sequence (abbreviated repr).
seq_cds

Seq('ATGGAGGCGTCGCGAGCTCTGTGGATTCTGTGCTGCGCGTTCCTCGCGTCGGCT...TGA', IUPACAmbiguousDNA())

In [19]:
# Check the class of the extracted CDS.
type(seq_cds)

Bio.Seq.Seq

# Basic sequence analysis for the CDS

In [26]:
# Print the CDS nucleotide sequence as plain text.
print(seq_cds)

ATGGAGGCGTCGCGAGCTCTGTGGATTCTGTGCTGCGCGTTCCTCGCGTCGGCTTTGGGCTCGAGACTCAAGACCCCCGCGTTACCCATCCAACCCGAGAGGGAACCCATGATCTCTAAAGGCTTATCCGGTTGCTCCTTCGGTGGCCGCTTTTATTCGCTGGAAGACACGTGGCATCCAGATCTCGGAGAGCCGTTCGGTGTGATGCACTGCGTTATGTGTCACTGCGAGCCGCAGAGGAGCCGGCGAGGGAAGGTGTTTGGGAAGGTGAGCTGCAGGAATATGAAACAGGACTGTCCCGATCCGACCTGCGACGATCCCGTCTTGCTTCCAGGACACTGCTGCAAAACATGCCCAAAAGGCAACTCAGGGAAAAAGGAGGTGGAGTCTCTGTTTGAGTTCTTCCAGGAGAAAGATGACGACCTGCACAAGTCTTACAACGACAGATCCTACATCAGCTCTGAGGAGAACAGCAACCGAGACAGCGCTGCCGATTTTGTGGCTGTACTCACGGGCGTGACAGACTCGTGGCTGCCGAGCTCCAGCGGCGTCGCACGGGCACGATTCACACTCGCTCGAACGAGCCTGACCTTCTCTATCACCTTCCAGAGGATGAACAGGCCGAGCCTCATCACGTTCCTGGACTCTGATGGAAACACAGCGTTTGAGTTCAGAGTACCACTGGCGGATACAGACATGATCTGTGGAGTTTGGAGGAACCTGCCAAAGTCTCACCTGCGTCAGCTGGAGGCGGAGCAGCTGCATGTTTCCATGACAACCGCTGACAACAAGAAGGAGGAGATACAGGGCAAAATCATCAAACACCGAGCGCTGTTCGCAGAAACGTTCAGCGCGATCCTGACGTCTGACGAGGTGCATTCTGGGATGGGAGGAATCGCAATGTTGACGCTCAGTGACACGGAAAACAATCTGCATTTCATCCTGATCCTGCAGGGACTCGTTTCTCACGGGAGCTCTTCTGTAAAGGTGCCAGTCCGAG

In [28]:
# Sequence length: number of nucleotides in the CDS
print(len(seq_cds))

2832


In [31]:
# Translation of the CDS nucleotide sequence into protein.
# The trailing '*' in the printed output is the stop codon symbol.
print(seq_cds.translate())

MEASRALWILCCAFLASALGSRLKTPALPIQPEREPMISKGLSGCSFGGRFYSLEDTWHPDLGEPFGVMHCVMCHCEPQRSRRGKVFGKVSCRNMKQDCPDPTCDDPVLLPGHCCKTCPKGNSGKKEVESLFEFFQEKDDDLHKSYNDRSYISSEENSNRDSAADFVAVLTGVTDSWLPSSSGVARARFTLARTSLTFSITFQRMNRPSLITFLDSDGNTAFEFRVPLADTDMICGVWRNLPKSHLRQLEAEQLHVSMTTADNKKEEIQGKIIKHRALFAETFSAILTSDEVHSGMGGIAMLTLSDTENNLHFILILQGLVSHGSSSVKVPVRVKLLYRQHLLREIQANISADDSDLAEVLADLNSRELFWLSRGQLQISVETEGQNSRQVSGFISGKRSCDTLQSVMSSGSALTPGKTGGVGSAVFTLHHNGSLDFQVLVAGLSSAVVGVTIEMKPRRRSKRSVLYDITADFSTAGERGGGRAMGSCGRVEARHIHMLLQNELFINIATAEQQESELRGQIRMLPYNGLDARRNELPVPLAGQFVSPPVRTGAAGHAWVSVDEQCHLHYEIVINGLSNSEDTSVNAHLHGLAEIGEMDDSSTNHKRLLTGFYGQQAQGVLKDISVELLRHLDEGTAYIQVSTKMNPRGEIRGRIHVPNSCELGSRGEVVEEAEFDELVFVRDPAELRKDPHTCFFEGEHHAHGSQWTPQYNTCFTCICQKKTVICDPVICPALSCPHTIQPEDQCCPICDEKKESKQTTAVEKVEEDPEGCYFEGDQKMHAPGTTWHPFVPPFGYIKCAVCTCKGSTGEVHCEKLTCPPLTCSRPIRRNPSDCCKECPAEDTPPLEDDEMMQADGTRHCKFGDNYYQNSEHWHPRVPLVGEMKCITCWCDHGVTKCQKKQCPLLSCSNPIRREGKCCPECIEDFMEKEEMAKMVEKKKNWRH*


In [36]:
# Length of the translation. This count includes the trailing '*' stop symbol,
# so the protein itself is one residue shorter than the number shown.
len(seq_cds.translate())

944