In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio import Entrez

In [2]:
ncbi_accession_ids = ['NC_045512.2', 'MT371050.1', 'MT371049.1', 'MT371048.1', 'MT371047.1', 'MZ801830.1', 'OK035181.1']

In [3]:
Entrez.email='kavindaperera97@gmail.com'

In [4]:
genomes = []
for id in ncbi_accession_ids:
    print(id)
    handle = Entrez.efetch(db='nucleotide', id=id, rettype='gb')
    record = SeqIO.read(handle, "genbank")
    handle.close()
    print(record.description)
    genomes.append(record)
    print(record.annotations['source'] + " ✔️")
    print("============================")

NC_045512.2
Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) ✔️
MT371050.1
Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/LKA/COV486/2020, complete genome
Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) ✔️
MT371049.1
Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/LKA/COV91/2020, complete genome
Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) ✔️
MT371048.1
Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/LKA/COV53/2020, complete genome
Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) ✔️
MT371047.1
Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/LKA/COV38/2020, complete genome
Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) ✔️
MZ801830.1
Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/LKA/KDU9116/2021 ORF1ab polyprotein

In [5]:
SeqIO.write(genomes, "./complete_genomes.fasta", "fasta")

7

In [6]:
characteristic_genes = ['orf1ab', 'spike', 'orf3a', 'orf3b', 'envelope', 'membrane', 'orf7a', 'orf7b', 'orf8', 'nucleocapsid']

In [7]:
characteristic_genes_dict = {'orf1ab':[], 'spike':[],  'orf3a':[], 'orf3b':[], 'envelope':[], 'membrane':[], 'orf7a':[], 'orf7b':[], 'orf8':[], 'nucleocapsid':[],}

In [8]:
for id in ncbi_accession_ids:
    print(id)
    handle = Entrez.efetch(db='nucleotide', id=id, rettype='gb')
    record = SeqIO.read(handle, "genbank")
    handle.close()
    for feature in record.features:
        if(feature.type == 'CDS'):
            gene = feature.qualifiers['product'][0].lower()
            for characteristic_gene in characteristic_genes:
                if characteristic_gene in gene:
                    entry = f">{record.annotations['source']} ({id}) \n"
                    entry = entry.replace(" ","-")
                    entry += record.seq[feature.location.start : feature.location.end]
                    characteristic_genes_dict[characteristic_gene].append(entry)
                    print(gene + " ✔️")
                    break
                    
    print("=======================")         


NC_045512.2
orf1ab polyprotein ✔️
orf3a protein ✔️
envelope protein ✔️
membrane glycoprotein ✔️
orf7a protein ✔️
orf7b ✔️
orf8 protein ✔️
nucleocapsid phosphoprotein ✔️
MT371050.1
orf1ab polyprotein ✔️
orf3a protein ✔️
envelope protein ✔️
membrane glycoprotein ✔️
orf7a protein ✔️
orf7b ✔️
orf8 protein ✔️
nucleocapsid phosphoprotein ✔️
MT371049.1
orf1ab polyprotein ✔️
orf3a protein ✔️
envelope protein ✔️
membrane glycoprotein ✔️
orf7a protein ✔️
orf7b ✔️
orf8 protein ✔️
nucleocapsid phosphoprotein ✔️
MT371048.1
orf1ab polyprotein ✔️
orf3a protein ✔️
envelope protein ✔️
membrane glycoprotein ✔️
orf7a protein ✔️
orf7b ✔️
orf8 protein ✔️
nucleocapsid phosphoprotein ✔️
MT371047.1
orf1ab polyprotein ✔️
orf3a protein ✔️
envelope protein ✔️
membrane glycoprotein ✔️
orf7a protein ✔️
orf7b ✔️
orf8 protein ✔️
nucleocapsid phosphoprotein ✔️
MZ801830.1
orf1ab polyprotein ✔️
orf3a protein ✔️
envelope protein ✔️
membrane glycoprotein ✔️
orf7b ✔️
orf8 protein ✔️
nucleocapsid phosphoprotein ✔️
OK035181

In [9]:
characteristic_genes_dict

{'orf1ab': [Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA')],
 'spike': [],
 'orf3a': [Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-c

In [13]:
for characteristic_gene, sequences in characteristic_genes_dict.items():
    
    results = ""
    for sequence in sequences:
        results += sequence + "\n"
    
    file = open(f'./gene_sequences/{characteristic_gene}.fasta', "w")
    file.write(str(results))
    file.close()
    print(characteristic_gene + " ✔️")

orf1ab ✔️
spike ✔️
orf3a ✔️
orf3b ✔️
envelope ✔️
membrane ✔️
orf7a ✔️
orf7b ✔️
orf8 ✔️
nucleocapsid ✔️
