In [1]:
import os

import numpy as np
import pandas as pd
from Bio import Entrez
from Bio import SeqIO

In [2]:

# NCBI nucleotide accession IDs of the genomes fetched by the cells below.
ncbi_accession_ids = [
    'MT019529.1',
    'MT371050.1',
    'MT371049.1',
    'MT371048.1',
    'MT371047.1',
    'MZ801830.1',
    'OK035181.1',
]

In [3]:
# Contact e-mail address configured on the Entrez module before any request
# to NCBI is made.
Entrez.email = 'kavindaperera97@gmail.com'

In [4]:
# Download every accession from the NCBI nucleotide database as a GenBank
# record and collect the parsed records, in list order, in `genomes`.
genomes = []
for accession_id in ncbi_accession_ids:
    print(accession_id)
    handle = Entrez.efetch(db='nucleotide', id=accession_id, rettype='gb')
    try:
        record = SeqIO.read(handle, "genbank")
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()
    print(record.description)
    genomes.append(record)
    print(record.annotations['source'] + " ✔️")
    print("============================")

MT019529.1
Severe acute respiratory syndrome coronavirus 2 isolate BetaCoV/Wuhan/IPBCAMS-WH-01/2019, complete genome
Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) ✔️
MT371050.1
Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/LKA/COV486/2020, complete genome
Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) ✔️
MT371049.1
Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/LKA/COV91/2020, complete genome
Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) ✔️
MT371048.1
Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/LKA/COV53/2020, complete genome
Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) ✔️
MT371047.1
Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/LKA/COV38/2020, complete genome
Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) ✔️
MZ801830.1
Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/LKA/KDU9116/20

In [5]:
# Save all fetched genomes into a single FASTA file. The call is the last
# expression of the cell, so its return value is displayed as the cell output.
SeqIO.write(
    genomes,
    "./complete_genomes_variants_sl.fasta",
    "fasta",
)

7

In [6]:
# Lower-case fragments that are searched for inside each CDS product name.
# They are tried in this order and the first one that matches is used.
characteristic_genes = [
    # names that are also keys of characteristic_genes_dict
    'orf1ab',
    'spike',
    'orf3a',
    'orf3b',
    'envelope',
    'membrane',
    'orf7a',
    'orf7b',
    'orf8',
    'nucleocapsid',
    # alternative product names that are stored under 'spike' / 'orf1ab'
    'surface glycoprotein',
    's protein',
    'replicase polyprotein',
    'non-structural polyprotein 1ab',
]

In [7]:
# One independent, initially empty list of FASTA entries per gene key.
characteristic_genes_dict = {
    gene_name: []
    for gene_name in (
        'orf1ab',
        'spike',
        'orf3a',
        'orf3b',
        'envelope',
        'membrane',
        'orf7a',
        'orf7b',
        'orf8',
        'nucleocapsid',
    )
}

In [8]:
# For every accession, fetch its GenBank record, look at each CDS feature and,
# when its product name contains one of the `characteristic_genes` fragments,
# store a FASTA-style entry (header line + nucleotide slice) under the
# matching key of `characteristic_genes_dict`.

# Fragments that are alternative product names for a canonical dictionary key.
# Any fragment not listed here is used as the dictionary key directly.
gene_key_aliases = {
    "surface glycoprotein": "spike",             # the spike protein in SARS-CoV-2
    "s protein": "spike",                        # the spike protein in HCoV-OC43
    "replicase polyprotein": "orf1ab",           # the orf1 in HCoV-OC43
    "non-structural polyprotein 1ab": "orf1ab",  # also stored under orf1ab
}

# Empty the lists first so that re-running this cell does not append every
# sequence a second time.
for collected in characteristic_genes_dict.values():
    collected.clear()

for accession_id in ncbi_accession_ids:
    print(accession_id)
    handle = Entrez.efetch(db='nucleotide', id=accession_id, rettype='gb')
    try:
        record = SeqIO.read(handle, "genbank")
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()

    for feature in record.features:
        if feature.type != 'CDS':
            continue

        # Skip CDS features without a 'product' qualifier instead of raising
        # KeyError; without a product name they cannot be classified.
        products = feature.qualifiers.get('product')
        if not products:
            continue
        gene = products[0].lower()

        # The first fragment (in list order) contained in the product name wins.
        matched = next(
            (name for name in characteristic_genes if name in gene),
            None,
        )
        if matched is None:
            continue

        # Header line; every space (including the one before the newline)
        # becomes "-".
        entry = f">{record.annotations['source']} ({accession_id}) \n"
        entry = entry.replace(" ", "-")
        # str + Seq produces a Seq, so the stored entry is a Seq that begins
        # with the header line.
        # NOTE(review): this slices from the location's start to its end on the
        # record sequence; for multi-part or reverse-strand locations that is
        # presumably the spanning region rather than the spliced CDS - confirm
        # this is intended.
        entry += record.seq[feature.location.start : feature.location.end]

        characteristic_genes_dict[gene_key_aliases.get(matched, matched)].append(entry)
        print(gene + " ✔️")

    print("=======================")


MT019529.1
orf1ab polyprotein ✔️
surface glycoprotein ✔️
orf3a protein ✔️
envelope protein ✔️
membrane glycoprotein ✔️
orf7a protein ✔️
orf8 protein ✔️
nucleocapsid phosphoprotein ✔️
MT371050.1
orf1ab polyprotein ✔️
surface glycoprotein ✔️
orf3a protein ✔️
envelope protein ✔️
membrane glycoprotein ✔️
orf7a protein ✔️
orf7b ✔️
orf8 protein ✔️
nucleocapsid phosphoprotein ✔️
MT371049.1
orf1ab polyprotein ✔️
surface glycoprotein ✔️
orf3a protein ✔️
envelope protein ✔️
membrane glycoprotein ✔️
orf7a protein ✔️
orf7b ✔️
orf8 protein ✔️
nucleocapsid phosphoprotein ✔️
MT371048.1
orf1ab polyprotein ✔️
surface glycoprotein ✔️
orf3a protein ✔️
envelope protein ✔️
membrane glycoprotein ✔️
orf7a protein ✔️
orf7b ✔️
orf8 protein ✔️
nucleocapsid phosphoprotein ✔️
MT371047.1
orf1ab polyprotein ✔️
surface glycoprotein ✔️
orf3a protein ✔️
envelope protein ✔️
membrane glycoprotein ✔️
orf7a protein ✔️
orf7b ✔️
orf8 protein ✔️
nucleocapsid phosphoprotein ✔️
MZ801830.1
orf1ab polyprotein ✔️
surface glycopro

In [9]:
# Display the collected entries, grouped by gene key, as the cell output.
characteristic_genes_dict

{'orf1ab': [Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA')],
 'spike': [Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(SARS...TAA'),
  Seq('>Severe-acute-respiratory-syndrome-coronavirus-2-(

In [10]:
# Write one FASTA file per gene key, containing every entry collected for it.
output_dir = './gene_sequences_new_variants_sl'
# open(..., "w") does not create missing directories, so make sure the target
# directory exists before writing into it.
os.makedirs(output_dir, exist_ok=True)

for characteristic_gene, sequences in characteristic_genes_dict.items():
    # Each entry already starts with its ">" header line; terminate every
    # entry with a newline. An empty list yields an empty file.
    results = "".join(str(sequence) + "\n" for sequence in sequences)

    # The context manager closes the file even if the write fails.
    with open(f'{output_dir}/{characteristic_gene}.fasta', "w") as file:
        file.write(results)
    print(characteristic_gene + " | count: " + str(len(sequences)) + " ✔️")

orf1ab | count: 7 ✔️
spike | count: 7 ✔️
orf3a | count: 7 ✔️
orf3b | count: 0 ✔️
envelope | count: 7 ✔️
membrane | count: 7 ✔️
orf7a | count: 6 ✔️
orf7b | count: 6 ✔️
orf8 | count: 7 ✔️
nucleocapsid | count: 7 ✔️
