<a href="https://colab.research.google.com/github/hongqin/covid19_sequence_analysis_tutorial/blob/master/covid19_genbank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References:

https://colab.research.google.com/github/chris-rands/biopython-coronavirus/blob/master/biopython-coronavirus-notebook.ipynb#scrollTo=l12U0dJcKXL5

https://www.ncbi.nlm.nih.gov/nuccore/1798174254


In [43]:
 !pip install biopython



In [44]:
import Bio
from Bio import SeqIO, SearchIO, Entrez
from Bio.Seq import Seq
from Bio.Blast import NCBIWWW
from Bio.Data import CodonTable

# Bio.SeqUtils.GC was deprecated in Biopython 1.80 and removed in 1.82, so with
# an unpinned install the plain import raises ImportError. Fall back to an
# equivalent wrapper around gc_fraction so the name GC stays available.
try:
    from Bio.SeqUtils import GC
except ImportError:
    from Bio.SeqUtils import gc_fraction

    def GC(seq):
        """Return the GC content of ``seq`` as a percentage (0-100).

        Uses ``ambiguous="ignore"`` so that, like the removed ``GC`` function,
        only G, C and S are counted and the full sequence length is the
        denominator.
        """
        return 100 * gc_fraction(seq, ambiguous="ignore")


In [45]:
# Mount Google Drive at /content/drive so files stored there can be read.
# The google.colab module is only available inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
# Location of the GenBank file on the mounted Drive.
gb_file = "/content/drive/My Drive/tmp/NC_045512.gb"
# Give SeqIO.read the path itself so Biopython opens and closes the file;
# passing open(gb_file, "r") left the file handle open for the whole session.
gb_record = SeqIO.read(gb_file, "genbank")


In [47]:
# Report the record name followed by the number of annotated features.
print(f"{gb_record.name} {len(gb_record.features)}")

NC_045512 57


In [48]:
# Distinct feature types present among the record's features.
set(feat.type for feat in gb_record.features)

{"3'UTR", "5'UTR", 'CDS', 'gene', 'mat_peptide', 'source', 'stem_loop'}

In [49]:
# Keep only the features whose type is "CDS", then display how many were found.
CDSs = [feat for feat in gb_record.features if feat.type == "CDS"]
len(CDSs)

12

In [50]:
# Display the "gene" qualifier of the first CDS feature (the value is a list).
CDSs[0].qualifiers["gene"]

['ORF1ab']

In [51]:
# Print the "gene" qualifier of every CDS feature. Iterating over the list
# itself, instead of a hard-coded range(12), keeps the cell correct no matter
# how many CDS features the loaded record contains.
for cds in CDSs:
  print(cds.qualifiers["gene"])

['ORF1ab']
['ORF1ab']
['S']
['ORF3a']
['E']
['M']
['ORF6']
['ORF7a']
['ORF7b']
['ORF8']
['N']
['ORF10']


In [52]:
# Wrap the first entry of the first CDS's "translation" qualifier in a Seq
# object, then display it.
protein_seq = Seq(CDSs[0].qualifiers["translation"][0])
protein_seq

Seq('MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLV...VNN')

In [53]:
# NOTE(review): protein_seq is built from the "translation" qualifier, so this
# tests for a leading "M" residue (the amino acid a start codon translates to),
# not for a nucleotide codon itself.
print("Does the sequence begin with a start codon?\n",
      protein_seq.startswith("M"))

Does the sequence begin with a start codon?
 True


In [54]:
# Keep only the features whose type is "mat_peptide", then display the count.
mat_peptides = [feat for feat in gb_record.features if feat.type == "mat_peptide"]
len(mat_peptides)

26

In [55]:
# Inspect the class of a single mat_peptide feature.
type(mat_peptides[0])

Bio.SeqFeature.SeqFeature

In [56]:
# List the attributes and methods available on a feature object (exploratory;
# the output is long).
dir(mat_peptides[0])

['__bool__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_flip',
 '_get_location_operator',
 '_get_ref',
 '_get_ref_db',
 '_get_strand',
 '_set_location_operator',
 '_set_ref',
 '_set_ref_db',
 '_set_strand',
 '_shift',
 'extract',
 'id',
 'location',
 'location_operator',
 'qualifiers',
 'ref',
 'ref_db',
 'strand',
 'translate',
 'type']

In [57]:
# Print the index, "product" qualifier and location of every mat_peptide
# feature. enumerate() replaces the hard-coded range(26), so the loop always
# matches the actual number of features in the list.
for i, peptide in enumerate(mat_peptides):
  print(i, peptide.qualifiers["product"], peptide.location)

0 ['leader protein'] [265:805](+)
1 ['nsp2'] [805:2719](+)
2 ['nsp3'] [2719:8554](+)
3 ['nsp4'] [8554:10054](+)
4 ['3C-like proteinase'] [10054:10972](+)
5 ['nsp6'] [10972:11842](+)
6 ['nsp7'] [11842:12091](+)
7 ['nsp8'] [12091:12685](+)
8 ['nsp9'] [12685:13024](+)
9 ['nsp10'] [13024:13441](+)
10 ['RNA-dependent RNA polymerase'] join{[13441:13468](+), [13467:16236](+)}
11 ['helicase'] [16236:18039](+)
12 ["3'-to-5' exonuclease"] [18039:19620](+)
13 ['endoRNAse'] [19620:20658](+)
14 ["2'-O-ribose methyltransferase"] [20658:21552](+)
15 ['leader protein'] [265:805](+)
16 ['nsp2'] [805:2719](+)
17 ['nsp3'] [2719:8554](+)
18 ['nsp4'] [8554:10054](+)
19 ['3C-like proteinase'] [10054:10972](+)
20 ['nsp6'] [10972:11842](+)
21 ['nsp7'] [11842:12091](+)
22 ['nsp8'] [12091:12685](+)
23 ['nsp9'] [12685:13024](+)
24 ['nsp10'] [13024:13441](+)
25 ['nsp11'] [13441:13480](+)
