In [23]:
import pandas as pd
import numpy as np
import gzip
from Bio import Entrez
from Bio import SeqIO
from joblib import Parallel, delayed
from ete3 import NCBITaxa
# Shared ete3 NCBITaxa handle; later cells call its get_lineage, get_rank
# and get_taxid_translator methods to turn taxon ids into named ranks.
ncbi = NCBITaxa()

In [2]:
def get_taxid(key):
    """Look up the NCBI taxon id for a nucleotide accession.

    Fetches the GenBank record for ``key`` through ``Entrez.efetch`` and
    scans the ``db_xref`` qualifiers of the record's features, in order,
    for a ``taxon:<id>`` entry.

    Parameters
    ----------
    key : str
        Accession passed to ``Entrez.efetch`` as ``id``.

    Returns
    -------
    int or float
        The taxon id as an ``int``. ``np.nan`` when no feature carries a
        ``taxon`` cross-reference; in that case the key and the first
        feature's qualifiers are printed for inspection.

    Notes
    -----
    Errors raised by the fetch or by GenBank parsing propagate to the
    caller. The fetch handle is always closed.
    """
    Entrez.email = "myemailaddress"
    handle = Entrez.efetch(db='nucleotide', id=key, rettype='gb')
    try:
        record = SeqIO.read(handle, 'genbank')
    finally:
        # Release the connection even when parsing fails.
        handle.close()

    for feature in record.features:
        # A feature without db_xref qualifiers is skipped rather than
        # raising KeyError, so the NaN fallback below stays reachable.
        for entry in feature.qualifiers.get('db_xref', []):
            tag, _, value = entry.partition(':')
            if tag == 'taxon':
                return int(value)

    # No taxon cross-reference found: show what the record did contain.
    print(key)
    print(record.features[0].qualifiers if record.features else {})
    return np.nan

In [12]:
# Load the gzip-compressed, tab-separated BLAST hit table. The file has no
# header row, so the twelve column names are supplied here; lines starting
# with '#' are treated as comments and skipped.
df_bact = pd.read_csv(
    'bact_blast.out.gz',
    sep='\t',
    header=None,
    names=[
        "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
        "qstart", "qend", "sstart", "send", "evalue", "bitscore",
    ],
    comment='#',
    compression='gzip',
    engine='python',
)
df_bact.head()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,ASV_2,NR_114743.1,100.0,332,0,0,1,332,579,910,1.2400000000000001e-175,614.0
1,ASV_2,NR_114742.1,100.0,332,0,0,1,332,577,908,1.2400000000000001e-175,614.0
2,ASV_2,NR_114452.1,100.0,332,0,0,1,332,567,898,1.2400000000000001e-175,614.0
3,ASV_2,NR_075022.1,100.0,332,0,0,1,332,591,922,1.2400000000000001e-175,614.0
4,ASV_2,NR_037082.1,100.0,332,0,0,1,332,579,910,1.2400000000000001e-175,614.0


In [96]:
# Resolve every distinct subject accession to a taxon id with get_taxid and
# save the mapping to CSV so later cells can reload it instead of querying
# again.
all_bact_acc = df_bact['sseqid'].drop_duplicates().tolist()
bact_acc_taxids = []
for key in all_bact_acc:
    try:
        bact_acc_taxids.append([key, get_taxid(key)])
    except Exception as err:
        # Catch Exception rather than everything: a bare except would also
        # swallow KeyboardInterrupt and make this long loop impossible to
        # stop. Failed accessions are reported with their cause and left
        # out of the mapping.
        print("query fails: %s (%s: %s)" % (key, type(err).__name__, err))
df_bact_acc_taxids = pd.DataFrame(bact_acc_taxids, columns=['sseqid', 'taxid'])
df_bact_acc_taxids.to_csv("bact_sseqid_taxid.csv")

In [20]:
# Reload the saved accession -> taxon id mapping and use it to replace the
# accession in df_bact.sseqid with its taxon id.
# NOTE: this overwrites df_bact.sseqid, so the cell cannot be re-run without
# first reloading df_bact (the keys would then be taxon ids, not accessions).
df_bact_acc_taxids = pd.read_csv("bact_sseqid_taxid.csv", index_col=0)
dict_replace = dict(zip(df_bact_acc_taxids.sseqid, df_bact_acc_taxids.taxid))

def get_taxid2(key):
    """Return the saved taxon id for accession ``key``.

    Raises ``KeyError`` when ``key`` is not in ``dict_replace``.
    """
    return dict_replace[key]

# A dict lookup is far cheaper than dispatching it to worker processes, so
# the lookups run in-process; results and the KeyError on an unknown
# accession are unchanged.
res = [get_taxid2(key) for key in df_bact.sseqid]
df_bact['sseqid'] = res
df_bact

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,ASV_2,1354,100.000,332,0,0,1,332,579,910,1.240000e-175,614.0
1,ASV_2,1352,100.000,332,0,0,1,332,577,908,1.240000e-175,614.0
2,ASV_2,1354,100.000,332,0,0,1,332,567,898,1.240000e-175,614.0
3,ASV_2,768486,100.000,332,0,0,1,332,591,922,1.240000e-175,614.0
4,ASV_2,1354,100.000,332,0,0,1,332,579,910,1.240000e-175,614.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9019551,ASV_9998,1119056,94.864,331,17,0,1,331,572,902,9.990000e-147,518.0
9019552,ASV_9998,644,94.864,331,17,0,1,331,553,883,9.990000e-147,518.0
9019553,ASV_9998,80746,94.864,331,17,0,1,331,574,904,9.990000e-147,518.0
9019554,ASV_9998,1505588,94.864,331,17,0,1,331,524,854,9.990000e-147,518.0


In [21]:
# Pick one top hit per query sequence: lowest e-value first, then highest
# percent identity, then smallest taxon id as the final tie-breaker.
df_bact.sseqid = df_bact.sseqid.astype(int)
# Sort pident descending through `ascending` instead of negating the column,
# sorting, and negating it back: the data is never left sign-flipped if the
# cell is interrupted part-way.
df_bact = df_bact.sort_values(
    ['qseqid', 'evalue', 'pident', 'sseqid'],
    ascending=[True, True, False, True],
)
# After sorting, the first row of each qseqid is its best hit.
df_bact_top_hit = (
    df_bact
    .drop_duplicates(subset='qseqid', keep='first')
    .reset_index(drop=True)
)
df_bact_top_hit.to_csv("bact_tophit_taxid.csv", index=False)
df_bact_top_hit

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,ASV_1,1598,100.000,332,0,0,1,332,584,915,1.240000e-175,614.0
1,ASV_10,1532,100.000,331,0,0,1,331,560,890,4.460000e-175,612.0
2,ASV_100,821,99.695,328,1,0,1,328,539,866,9.550000e-172,601.0
3,ASV_1000,1776391,98.480,329,5,0,1,329,533,861,1.250000e-165,580.0
4,ASV_10000,1121115,99.398,332,1,1,1,331,509,840,9.640000e-172,601.0
...,...,...,...,...,...,...,...,...,...,...,...,...
18115,ASV_9994,180164,97.885,331,7,0,1,331,539,869,2.100000e-163,573.0
18116,ASV_9994_TR,2038680,95.455,330,15,0,1,330,561,890,1.650000e-149,527.0
18117,ASV_9996,471875,99.396,331,2,0,1,331,528,858,9.640000e-172,601.0
18118,ASV_9997,180164,96.073,331,13,0,1,331,539,869,2.130000e-153,540.0


In [26]:
# Translate each top-hit taxon id into named taxonomic ranks and attach the
# names to the top-hit table, then save the result.
df_bact_top_hit = pd.read_csv("bact_tophit_taxid.csv")
desired_ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
taxonomy = []
for asv, taxid in zip(df_bact_top_hit.qseqid, df_bact_top_hit.sseqid):
    lineage = ncbi.get_lineage(taxid)
    lineage2ranks = ncbi.get_rank(lineage)
    # Invert {taxid: rank} into {rank: taxid} for lookup by rank name.
    ranks2lineage = {rank: node for node, rank in lineage2ranks.items()}
    # Translate all wanted ranks of this lineage in a single query instead
    # of issuing one translator call per rank.
    wanted_ids = [ranks2lineage[rank] for rank in desired_ranks if rank in ranks2lineage]
    id2name = ncbi.get_taxid_translator(wanted_ids) if wanted_ids else {}
    local = [asv, taxid]
    for rank in desired_ranks:
        if rank in ranks2lineage:
            local.append(id2name[ranks2lineage[rank]])
        else:
            # Rank missing from this lineage: keep the placeholder text.
            local.append('<not present>')
    taxonomy.append(local)
df_bact_top_hit_names = pd.DataFrame(
    taxonomy,
    columns=['qseqid', 'sseqid', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'],
)
# Bring the BLAST statistics of each top hit back alongside the names.
df_bact_top_hit_names = pd.merge(df_bact_top_hit_names, df_bact_top_hit, on=['qseqid', 'sseqid'], how='left')
df_bact_top_hit_names = df_bact_top_hit_names.rename(columns={'qseqid': 'ASV', 'sseqid': 'TaxonID'})
df_bact_top_hit_names.to_csv("bact_tophit_tax_names.csv", index=False)
df_bact_top_hit_names.head()

Unnamed: 0,ASV,TaxonID,Kingdom,Phylum,Class,Order,Family,Genus,Species,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,ASV_1,1598,Bacteria,Firmicutes,Bacilli,Lactobacillales,Lactobacillaceae,Limosilactobacillus,Limosilactobacillus reuteri,100.0,332,0,0,1,332,584,915,1.2400000000000001e-175,614.0
1,ASV_10,1532,Bacteria,Firmicutes,Clostridia,Eubacteriales,Lachnospiraceae,Blautia,Blautia coccoides,100.0,331,0,0,1,331,560,890,4.4600000000000004e-175,612.0
2,ASV_100,821,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Phocaeicola,Phocaeicola vulgatus,99.695,328,1,0,1,328,539,866,9.55e-172,601.0
3,ASV_1000,1776391,Bacteria,Firmicutes,Clostridia,Eubacteriales,Peptostreptococcaceae,Romboutsia,Romboutsia timonensis,98.48,329,5,0,1,329,533,861,1.25e-165,580.0
4,ASV_10000,1121115,Bacteria,Firmicutes,Clostridia,Eubacteriales,Lachnospiraceae,Blautia,Blautia wexlerae,99.398,332,1,1,1,331,509,840,9.64e-172,601.0


In [28]:
# Attach the top-hit taxonomy to the ASV/Sequence columns of the existing
# table. The left join keeps every ASV; any value left missing afterwards,
# in any column, is written as the text '<not present>'.
df_new = pd.read_csv("tblASVtaxonomy_silva138_bact.csv")
df_new = (
    df_new[['ASV', 'Sequence']]
    .merge(df_bact_top_hit_names, on='ASV', how='left')
    .fillna('<not present>')
)
df_new.to_csv("tblASVtaxonomy_refseq_bact.csv", index=False)