In [77]:
import pandas as pd
import numpy as np
import gzip
from Bio import Entrez
from Bio import SeqIO
from joblib import Parallel, delayed
from ete3 import NCBITaxa
ncbi = NCBITaxa()

In [2]:
def get_taxid(key):
    """Look up the NCBI taxonomy ID attached to a nucleotide accession.

    Fetches the GenBank record for ``key`` through ``Entrez.efetch``, reads
    the ``db_xref`` qualifiers of the record's first feature and returns the
    number that follows the ``taxon:`` prefix.

    Parameters
    ----------
    key : str
        Accession passed as ``id`` to ``Entrez.efetch`` (``db='nucleotide'``).

    Returns
    -------
    int or float
        The taxon ID as an ``int``. When the first feature carries no
        ``taxon:`` cross-reference (or the record has no features / no
        ``db_xref`` qualifier) the key and the qualifiers are printed and
        ``np.nan`` is returned.
    """
    # NOTE(review): this looks like a placeholder address -- confirm a real
    # contact e-mail is configured before running against NCBI.
    Entrez.email = "myemailaddress"
    handle = Entrez.efetch(db='nucleotide', id=key, rettype='gb')
    try:
        record = SeqIO.read(handle, 'genbank')
    finally:
        # Always release the connection, even if parsing fails; this function
        # is called once per accession, so leaked handles add up quickly.
        handle.close()

    # Tolerate records without features or without a db_xref qualifier so the
    # fallback below is reached instead of an IndexError/KeyError.
    qualifiers = record.features[0].qualifiers if record.features else {}
    for entry in qualifiers.get('db_xref', []):
        parts = entry.split(':')
        if parts[0] == 'taxon':
            return int(parts[1])

    # No taxon cross-reference found: report the offending record.
    print(key)
    print(qualifiers)
    # np.nan (lower case) is the spelling that exists in every NumPy release;
    # the np.NaN alias was removed in NumPy 2.0.
    return np.nan

In [3]:
# Load the gzip-compressed, tab-separated BLAST hit table. The file has no
# header row (header=None) and lines starting with '#' are skipped, so the
# twelve column names are supplied explicitly.
df_fungi = pd.read_csv(
    'fungi_blast.out.gz',
    sep='\t',
    header=None,
    comment='#',
    compression='gzip',
    engine='python',
    names=[
        "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
        "qstart", "qend", "sstart", "send", "evalue", "bitscore",
    ],
)
df_fungi.head()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,ASV_1,NR_130673.1,100.0,263,0,0,1,263,263,1,5.8e-138,486.0
1,ASV_1,NR_159741.1,98.828,256,1,2,1,254,256,1,1.6400000000000002e-128,455.0
2,ASV_1,NR_165186.1,90.444,293,7,15,1,279,294,9,7.88e-102,366.0
3,ASV_1,NR_130661.1,93.004,243,4,7,1,241,232,1,1.33e-94,342.0
4,ASV_1,NR_111473.1,88.256,281,18,11,1,274,273,1,1.73e-88,322.0


In [94]:
# Resolve every distinct subject accession to a taxon ID (one get_taxid call
# per accession, in order of first appearance) and cache the mapping on disk.
all_fungi_acc = df_fungi['sseqid'].drop_duplicates().tolist()
fungi_acc_taxids = [[key, get_taxid(key)] for key in all_fungi_acc]
df_fungi_acc_taxids = pd.DataFrame(fungi_acc_taxids, columns=['sseqid','taxid'])
df_fungi_acc_taxids.to_csv("fungi_sseqid_taxid.csv")

In [7]:
# Reload the cached accession -> taxid table and overwrite the accession in
# df_fungi.sseqid with its taxon ID. From here on `sseqid` holds taxon IDs.
df_fungi_acc_taxids = pd.read_csv("fungi_sseqid_taxid.csv", index_col=0)
df_fungi['sseqid'] = df_fungi['sseqid'].replace(
    dict(zip(df_fungi_acc_taxids['sseqid'], df_fungi_acc_taxids['taxid']))
)
df_fungi

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,ASV_1,5480,100.000,263,0,0,1,263,263,1,5.800000e-138,486.0
1,ASV_1,454356,98.828,256,1,2,1,254,256,1,1.640000e-128,455.0
2,ASV_1,273372,90.444,293,7,15,1,279,294,9,7.880000e-102,366.0
3,ASV_1,273371,93.004,243,4,7,1,241,232,1,1.330000e-94,342.0
4,ASV_1,497109,88.256,281,18,11,1,274,273,1,1.730000e-88,322.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4440563,ASV_11763,2834548,97.849,93,2,0,1,93,212,120,7.800000e-40,161.0
4440564,ASV_11763,2796458,97.849,93,2,0,1,93,318,226,7.800000e-40,161.0
4440565,ASV_11763,2796459,97.849,93,2,0,1,93,328,236,7.800000e-40,161.0
4440566,ASV_11763,1381677,97.872,94,1,1,1,93,329,236,7.800000e-40,161.0


In [66]:
# Pick one "top hit" per query: order hits by query, then lowest e-value,
# then highest percent identity, then smallest taxon ID as the tie-breaker,
# and keep the first row of each query.
df_fungi['sseqid'] = df_fungi['sseqid'].astype(int)
df_fungi = df_fungi.sort_values(
    ['qseqid', 'evalue', 'pident', 'sseqid'],
    ascending=[True, True, False, True],
)
df_fungi_top_hit = (
    df_fungi
    .drop_duplicates(subset='qseqid', keep='first')
    .reset_index(drop=True)
)
df_fungi_top_hit.to_csv("fungi_tophit_taxid.csv", index=False)
df_fungi_top_hit

In [72]:
# Translate each top-hit taxon ID into scientific names at seven ranks using
# the ete3 NCBITaxa handle `ncbi`; ranks absent from a lineage are recorded
# as '<not present>'.
df_fungi_top_hit = pd.read_csv("fungi_tophit_taxid.csv")
desired_ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
taxonomy = []
for asv, taxid in zip(df_fungi_top_hit.qseqid, df_fungi_top_hit.sseqid):
    lineage = ncbi.get_lineage(taxid)
    lineage2ranks = ncbi.get_rank(lineage)
    # Invert taxid -> rank into rank -> taxid so ranks can be looked up by name.
    ranks2lineage = {rank: tid for tid, rank in lineage2ranks.items()}
    local = [asv, taxid]
    for rank in desired_ranks:
        if rank in ranks2lineage:
            id_ = ranks2lineage[rank]
            local.append(ncbi.get_taxid_translator([id_])[id_])
        else:
            local.append('<not present>')
    taxonomy.append(local)

df_fungi_top_hit_names = pd.DataFrame(
    taxonomy,
    columns=['qseqid','sseqid','Kingdom','Phylum','Class','Order','Family','Genus','Species'],
)
# Re-attach the alignment statistics of the top hit to the lineage names.
df_fungi_top_hit_names = df_fungi_top_hit_names.merge(
    df_fungi_top_hit, on=['qseqid','sseqid'], how='left'
)
df_fungi_top_hit_names = df_fungi_top_hit_names.rename(
    columns={'qseqid':'ASV', 'sseqid':'TaxonID'}
)
df_fungi_top_hit_names.to_csv("fungi_tophit_tax_names.csv", index=False)
df_fungi_top_hit_names.head()

Unnamed: 0,ASV,TaxonID,Kingdom,Phylum,Class,Order,Family,Genus,Species,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,ASV_1,5480,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Debaryomycetaceae,Candida,Candida parapsilosis,100.0,263,0,0,1,263,263,1,5.8e-138,486.0
1,ASV_10,76775,Fungi,Basidiomycota,Malasseziomycetes,Malasseziales,Malasseziaceae,Malassezia,Malassezia restricta,100.0,302,0,0,1,302,302,1,1.5e-159,558.0
2,ASV_100,453955,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Pichiaceae,Pichia,Pichia cecembensis,96.327,245,6,3,5,248,243,1,7.89e-112,399.0
3,ASV_1001,1895941,Fungi,Basidiomycota,Tremellomycetes,Trichosporonales,Trichosporonaceae,Cutaneotrichosporon,Cutaneotrichosporon moniliiforme,100.0,241,0,0,1,241,241,1,9.38e-126,446.0
4,ASV_1002,241526,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Debaryomycetaceae,Candida,Candida africana,99.197,249,1,1,1,249,248,1,2.63e-126,448.0


In [75]:
# Take the ASV/Sequence pairs from the table in the parent directory,
# left-join the top-hit annotation onto them and write the result to a file
# of the same name in the current directory.
# NOTE(review): fillna is applied to every column, so ASVs without a hit get
# the string '<not present>' in the numeric columns as well -- confirm that
# is intended.
df_new = pd.read_csv("../tblASVtaxonomy_refseq_fungi.csv")
df_new = (
    df_new[['ASV','Sequence']]
    .merge(df_fungi_top_hit_names, on='ASV', how='left')
    .fillna('<not present>')
)
df_new.to_csv("tblASVtaxonomy_refseq_fungi.csv", index=False)

Unnamed: 0,ASV,Sequence,TaxonID,Kingdom,Phylum,Class,Order,Family,Genus,Species,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,ASV_1,GAATATCTGCAATTCATATTACTTATCGCATTTCGCTGCGTTCTTC...,5480.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Debaryomycetaceae,Candida,Candida parapsilosis,100.0,263.0,0.0,0.0,1.0,263.0,263.0,1.0,0.0,486.0
1,ASV_2,GAATATCTGCAATTCATATTACGTATCGCATTTCGCTGCGTTCTTC...,241526.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Debaryomycetaceae,Candida,Candida africana,99.598,249.0,0.0,1.0,1.0,249.0,248.0,1.0,0.0,453.0
2,ASV_3,GAATATCTGCAATTCATATTACGTATCGCATTTCGCTGCGTTCTTC...,241526.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Debaryomycetaceae,Candida,Candida africana,100.0,248.0,0.0,0.0,1.0,248.0,248.0,1.0,0.0,459.0
3,ASV_4,GAATATCTGCAATTCATATTACTTATCGCATTTCGCTGCGTTCTTC...,5480.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Debaryomycetaceae,Candida,Candida parapsilosis,99.62,263.0,1.0,0.0,1.0,263.0,263.0,1.0,0.0,481.0
4,ASV_5,GGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTC...,4932.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Saccharomycetaceae,Saccharomyces,Saccharomyces cerevisiae,98.67,451.0,1.0,2.0,1.0,450.0,447.0,1.0,0.0,795.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7097,ASV_11652,TCTTTCCCTACACGACGCTCTTCCGATCTCCCTAAGAGCATCGTTC...,1206149.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Dipodascaceae,Galactomyces,Galactomyces pseudocandidus,89.712,243.0,18.0,6.0,42.0,279.0,273.0,33.0,0.0,303.0
7098,ASV_11760,CCTACACGACGCTCTTCCGATCTACCAGACCAGTGACTCAGTTCAA...,45357.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Metschnikowiaceae,Clavispora,[Candida] haemuloni,94.527,201.0,7.0,1.0,41.0,241.0,197.0,1.0,0.0,307.0
7099,ASV_11761,TGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTC...,41959.0,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Aspergillus,Aspergillus penicillioides,95.669,254.0,10.0,1.0,1.0,254.0,288.0,36.0,0.0,407.0
7100,ASV_11762,TGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTC...,1915367.0,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Aspergillus,Aspergillus clavatophorus,93.684,190.0,10.0,1.0,1.0,188.0,239.0,50.0,0.0,283.0


In [98]:
# Attach a display colour to every ASV via its species and number the colours
# 1, 2, 3, ... in order of first appearance in the table.
df_new = pd.read_csv("tblASVtaxonomy_refseq_fungi.csv")
df_color = pd.read_csv("../fungal_color.csv")
df_color.columns = ['Species', 'HexColor']

# validate='many_to_one' makes pandas raise a MergeError if the colour table
# lists a species more than once; without it such duplicates would silently
# multiply ASV rows in the output.
df_new = pd.merge(df_new, df_color, on='Species', how='left', validate='many_to_one')

# Every species must have a colour; name the offenders if one does not.
missing_color = df_new.loc[df_new.HexColor.isnull(), 'Species'].unique()
assert len(missing_color) == 0, (
    "species without a colour in fungal_color.csv: {}".format(list(missing_color))
)

# factorize assigns 0-based codes in order of first appearance; shift to
# 1-based so the first colour encountered gets ColorOrder 1.
df_new['ColorOrder'] = pd.factorize(df_new.HexColor)[0] + 1
df_new.to_csv("tblASVtaxonomy_refseq_fungi.csv", index=False)
df_new

Unnamed: 0,ASV,Sequence,TaxonID,Kingdom,Phylum,Class,Order,Family,Genus,Species,...,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,HexColor,ColorOrder
0,ASV_1,GAATATCTGCAATTCATATTACTTATCGCATTTCGCTGCGTTCTTC...,5480.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Debaryomycetaceae,Candida,Candida parapsilosis,...,0.0,0.0,1.0,263.0,263.0,1.0,5.8e-138,486.0,#EB6B6B,1
1,ASV_2,GAATATCTGCAATTCATATTACGTATCGCATTTCGCTGCGTTCTTC...,241526.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Debaryomycetaceae,Candida,Candida africana,...,0.0,1.0,1.0,249.0,248.0,1.0,5.649999999999999e-128,453.0,#E64040,2
2,ASV_3,GAATATCTGCAATTCATATTACGTATCGCATTTCGCTGCGTTCTTC...,241526.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Debaryomycetaceae,Candida,Candida africana,...,0.0,0.0,1.0,248.0,248.0,1.0,1.21e-129,459.0,#E64040,2
3,ASV_4,GAATATCTGCAATTCATATTACTTATCGCATTTCGCTGCGTTCTTC...,5480.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Debaryomycetaceae,Candida,Candida parapsilosis,...,1.0,0.0,1.0,263.0,263.0,1.0,2.7e-136,481.0,#EB6B6B,1
4,ASV_5,GGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTC...,4932.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Saccharomycetaceae,Saccharomyces,Saccharomyces cerevisiae,...,1.0,2.0,1.0,450.0,447.0,1.0,0.0,795.0,#756EAC,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7097,ASV_11652,TCTTTCCCTACACGACGCTCTTCCGATCTCCCTAAGAGCATCGTTC...,1206149.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Dipodascaceae,Galactomyces,Galactomyces pseudocandidus,...,18.0,6.0,42.0,279.0,273.0,33.0,7.29e-83,303.0,#F0C3C3,17
7098,ASV_11760,CCTACACGACGCTCTTCCGATCTACCAGACCAGTGACTCAGTTCAA...,45357.0,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Metschnikowiaceae,Clavispora,[Candida] haemuloni,...,7.0,1.0,41.0,241.0,197.0,1.0,5.6e-84,307.0,#C09C9C,9
7099,ASV_11761,TGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTC...,41959.0,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Aspergillus,Aspergillus penicillioides,...,10.0,1.0,1.0,254.0,288.0,36.0,9.07e-114,407.0,#327130,18
7100,ASV_11762,TGAATTCTGCAATTCACATTACGTATCGCATTTCGCTGCGTTCTTC...,1915367.0,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Aspergillus,Aspergillus clavatophorus,...,10.0,1.0,1.0,188.0,239.0,50.0,1.6e-76,283.0,#3F8D3D,16
