In [64]:
import os

import numpy as np
import pandas as pd
from Bio import Entrez
from Bio import SeqIO

In [65]:
# Earlier, smaller selection (kept for reference):
# ncbi_accession_ids = ['MT019529.1', 'GU190215.1', 'KJ473815.1', 'GQ153547.1', 'MN996532.1']

# Informal virus labels jotted down while assembling the list. They used to be
# assigned to `ncbi_accession_ids` and were then immediately overwritten; they are
# NOT aligned one-to-one with the accessions below (8 labels vs. 11 accessions):
#   'RaTG13', 'PCov-GX', 'SARS-CoV-1', 'MERS', 'HCov-229E', 'HCoV-OC43', 'HCoV-HKU1', 'HCoV-NL63'

# GenBank/RefSeq accession.version identifiers of the genomes to download.
ncbi_accession_ids = [
    'MN996532.2',
    'MT072864.1',
    'MT040334.1',
    'KT029139.1',
    'MH940245.1',
    'AB691767.2',
    'AY391777.1',
    'NC_005831.2',
    'MT019529.1',
    'MG772934.1',
    'MG772933.1',
]


In [66]:
# Contact address sent to NCBI with every Entrez request.
# Set the ENTREZ_EMAIL environment variable to use your own address; the
# original hard-coded address is kept as the fallback so existing runs still work.
Entrez.email = os.environ.get('ENTREZ_EMAIL', 'kavindaperera97@gmail.com')

In [67]:
# Download every genome listed in `ncbi_accession_ids` as a GenBank record and
# collect the parsed records in `genomes` (same order as the accession list).
genomes = []
for accession_id in ncbi_accession_ids:  # named so the builtin `id` is not shadowed
    print(accession_id)
    handle = Entrez.efetch(db='nucleotide', id=accession_id, rettype='gb')
    try:
        record = SeqIO.read(handle, "genbank")
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()
    print(record.description)
    genomes.append(record)
    print(record.annotations['source'] + " ✔️")
    print("============================")

MN996532.2
Bat coronavirus RaTG13, complete genome
Bat coronavirus RaTG13 ✔️
MT072864.1
Pangolin coronavirus isolate PCoV_GX-P2V, complete genome
Pangolin coronavirus ✔️
MT040334.1
Pangolin coronavirus isolate PCoV_GX-P1E, complete genome
Pangolin coronavirus ✔️
KT029139.1
Middle East respiratory syndrome coronavirus isolate MERS-CoV/KOR/KNIH/002_05_2015, complete genome
Middle East respiratory syndrome-related coronavirus (MERS-CoV) ✔️
MH940245.1
Human coronavirus HKU1 isolate SI17244, complete genome
Human coronavirus HKU1 (HCoV-HKU1) ✔️
AB691767.2
Human coronavirus 229E Niigata/01/08 RNA, complete genome
Human coronavirus 229E ✔️
AY391777.1
Human coronavirus OC43, complete genome
Human coronavirus OC43 (HCoV-OC43) ✔️
NC_005831.2
Human Coronavirus NL63, complete genome
Human coronavirus NL63 (HCoV-NL63) ✔️
MT019529.1
Severe acute respiratory syndrome coronavirus 2 isolate BetaCoV/Wuhan/IPBCAMS-WH-01/2019, complete genome
Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) ✔️

In [68]:
# Save all downloaded records into a single multi-FASTA file. The call is left as a
# bare expression so its return value (the number of records written) is shown as
# the cell output.
SeqIO.write(genomes, "./complete_genomes.fasta", "fasta")

11

In [69]:
# Lower-case fragments searched for (as substrings) in each CDS product name.
# Order matters: the extraction loop stops at the first fragment that matches.
characteristic_genes = [
    # canonical gene names (these are also the keys of the per-gene dictionary)
    'orf1ab',
    'spike',
    'orf3a',
    'orf3b',
    'envelope',
    'membrane',
    'orf7a',
    'orf7b',
    'orf8',
    'nucleocapsid',
    # alternative product names used by some GenBank records
    'surface glycoprotein',
    's protein',
    'replicase polyprotein',
    'non-structural polyprotein 1ab',
]

In [70]:
# One (initially empty) list of FASTA entries per canonical gene name.
# The comprehension creates a separate list object for every key.
characteristic_genes_dict = {
    gene_name: []
    for gene_name in (
        'orf1ab',
        'spike',
        'orf3a',
        'orf3b',
        'envelope',
        'membrane',
        'orf7a',
        'orf7b',
        'orf8',
        'nucleocapsid',
    )
}

In [71]:
# For every accession: download the GenBank record, walk its CDS features and file
# the nucleotide span of each recognised gene under its key in
# `characteristic_genes_dict`, prefixed with a FASTA-style header line.

# Product-name fragments that are synonyms of a key in `characteristic_genes_dict`.
gene_aliases = {
    "surface glycoprotein": "spike",              # the spike protein in SARS-CoV-2
    "s protein": "spike",                         # the spike protein in HCoV-OC43
    "replicase polyprotein": "orf1ab",            # the orf1 in HCoV-OC43
    "non-structural polyprotein 1ab": "orf1ab",   # another product name filed under orf1ab
}

for accession_id in ncbi_accession_ids:  # named so the builtin `id` is not shadowed
    print(accession_id)
    handle = Entrez.efetch(db='nucleotide', id=accession_id, rettype='gb')
    try:
        record = SeqIO.read(handle, "genbank")
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()

    for feature in record.features:
        if feature.type != 'CDS':
            continue

        # A CDS without a /product qualifier cannot be classified; skip it instead
        # of aborting the whole run with a KeyError.
        products = feature.qualifiers.get('product')
        if not products:
            continue
        gene = products[0].lower()

        # First fragment (in list order) contained in the product name wins.
        matched = next((name for name in characteristic_genes if name in gene), None)
        if matched is None:
            continue

        # Header line; every space (including the one before the newline) becomes
        # '-', so headers end in "-". Kept as-is so labels match existing outputs.
        entry = f">{record.annotations['source']} ({accession_id}) \n".replace(" ", "-")
        # Genomic span from feature start to feature end.
        # NOTE(review): this ignores strand and joined locations — confirm that the
        # plain start:end span (rather than feature.extract) is what is wanted.
        entry += record.seq[feature.location.start : feature.location.end]

        characteristic_genes_dict[gene_aliases.get(matched, matched)].append(entry)
        print(gene + " ✔️")

    print("=======================")


MN996532.2
orf1ab polyprotein ✔️
spike glycoprotein ✔️
envelope protein ✔️
membrane protein ✔️
nucleocapsid protein ✔️
MT072864.1
orf1ab polyprotein ✔️
spike protein ✔️
envelope protein ✔️
membrane protein ✔️
orf7a protein ✔️
orf7b protein ✔️
orf8 protein ✔️
nucleocapsid protein ✔️
MT040334.1
orf1ab polyprotein ✔️
spike protein ✔️
orf3a protein ✔️
envelope protein ✔️
orf7a protein ✔️
orf8 protein ✔️
nucleocapsid phosphoprotein ✔️
KT029139.1
orf1ab ✔️
spike protein ✔️
envelope protein ✔️
membrane protein ✔️
MH940245.1
orf1ab polyprotein ✔️
spike glycoprotein ✔️
small membrane protein ✔️
membrane glycoprotein ✔️
nucleocapsid phosphoprotein ✔️
AB691767.2
orf1ab polyprotein ✔️
spike protein ✔️
envelope protein ✔️
membrane protein ✔️
nucleocapsid protein ✔️
AY391777.1
replicase polyprotein ✔️
s protein ✔️
NC_005831.2
orf1ab polyprotein ✔️
spike protein ✔️
envelope protein ✔️
membrane protein ✔️
nucleocapsid protein ✔️
MT019529.1
orf1ab polyprotein ✔️
surface glycoprotein ✔️
orf3a protein ✔️

In [62]:
# Inspect the entries collected per gene (bare expression, rendered as the cell output).
characteristic_genes_dict

{'orf1ab': [Seq('>Bat-coronavirus-RaTG13-(MN996532.2)-
  ATGGAGAGCCTTGTCC...TAA'),
  Seq('>Pangolin-coronavirus-(MT072864.1)-
  ATGGAGAGCCTTGTCCCT...TAA'),
  Seq('>SARS-coronavirus-BJ01-(AY278488.2)-
  ATGGAGAGCCTTGTTCT...TAA'),
  Seq('>Middle-East-respiratory-syndrome-related-coronavirus-...TGA'),
  Seq('>Human-coronavirus-229E-(AB691767.2)-
  ATGGCCTGCAACCGTG...TAA'),
  Seq('>Human-coronavirus-OC43-(HCoV-OC43)-(AY391777.1)-
  ATGT...TAA'),
  Seq('>Human-coronavirus-HKU1-(HCoV-HKU1)-(MH940245.1)-
  ATGA...TAG'),
  Seq('>Human-coronavirus-NL63-(HCoV-NL63)-(NC_005831.2)-
  ATG...TGA')],
 'spike': [Seq('>Bat-coronavirus-RaTG13-(MN996532.2)-
  ATGTTTGTTTTTCTTG...TAA'),
  Seq('>Pangolin-coronavirus-(MT072864.1)-
  ATGTTTGTTTTTCTTTTT...TAA'),
  Seq('>SARS-coronavirus-BJ01-(AY278488.2)-
  ATGTTTATTTTCTTATT...TAA'),
  Seq('>Middle-East-respiratory-syndrome-related-coronavirus-...TAA'),
  Seq('>Human-coronavirus-229E-(AB691767.2)-
  ATGTTTGTTTTACTTG...TAA'),
  Seq('>Human-coronavirus-OC43-(HCo

In [63]:
# Write one multi-FASTA file per gene into ./gene_sequences_new/.
output_dir = './gene_sequences_new'
# open(..., "w") does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for characteristic_gene, sequences in characteristic_genes_dict.items():
    # Each entry already holds "header\nsequence"; terminate every entry with a newline.
    results = "".join(str(sequence) + "\n" for sequence in sequences)

    # The context manager closes the file even if the write fails.
    with open(f'{output_dir}/{characteristic_gene}.fasta', "w") as file:
        file.write(results)
    print(characteristic_gene + " | count: " + str(len(sequences)) + " ✔️")

orf1ab | count: 8 ✔️
spike | count: 8 ✔️
orf3a | count: 0 ✔️
orf3b | count: 0 ✔️
envelope | count: 6 ✔️
membrane | count: 8 ✔️
orf7a | count: 1 ✔️
orf7b | count: 1 ✔️
orf8 | count: 1 ✔️
nucleocapsid | count: 6 ✔️
