# 211019 Get taxonomy tree

In [1]:
import os
from pathlib import Path
from shutil import copyfileobj

import numpy as np
import pandas as pd
from tqdm import tqdm
from Bio import Entrez

In [2]:
from src.esummary_store import BasicEsummaryStore

## Setup

In [3]:
# Date prefix of this notebook; NBNAME is reused below to name its output directory.
DATESTR = '211019'
NBNAME = f'{DATESTR}-get-taxonomy-tree'

In [4]:
# NCBI credentials are read from the environment so that the API key is not
# stored in (and shared with) the notebook itself.
#   NCBI_EMAIL   - contact address sent with each request (falls back to the
#                  address previously used by this notebook).
#   NCBI_API_KEY - optional; when unset, api_key is None and requests are made
#                  without a key.
# NOTE(review): the key that used to be hard-coded here should be considered
# exposed and rotated in the NCBI account settings.
Entrez.email = os.environ.get('NCBI_EMAIL', 'mjlumpe@gmail.com')
Entrez.api_key = os.environ.get('NCBI_API_KEY')

In [5]:
# Input files consumed by this notebook, keyed by a short name.
infiles = {
    'genomes': Path('data-processed/211015-combine-data/211015-gambit-ani-genomes.csv'),
}

In [6]:
# Scratch space; downloaded taxonomy XML files are cached under tmp/taxa.
tmpdir = Path('tmp')

taxa_dir = tmpdir / 'taxa'
# parents=True also creates tmp/ itself, so the cell works in a fresh checkout
# instead of raising FileNotFoundError when tmp/ does not exist yet.
taxa_dir.mkdir(parents=True, exist_ok=True)

In [7]:
# Output directory for this notebook's intermediate results, named after the notebook.
intermediate_out = Path('data-intermediate') / NBNAME
# parents=True so that a missing data-intermediate/ directory is created too,
# rather than raising FileNotFoundError on the first run.
intermediate_out.mkdir(parents=True, exist_ok=True)

# Files written by the "Save" section at the end of the notebook.
outfiles = dict(
    genome_taxids=intermediate_out / 'genome-taxids.csv',
    taxa=intermediate_out / 'taxa.csv',
)

## Load data

In [8]:
# Genome table indexed by its first two columns; assembly UIDs are kept as
# strings so they are not parsed as numbers.
genomes = pd.read_csv(
    infiles['genomes'],
    index_col=[0, 1],
    dtype={'assembly_uid': str},
)

## Fetch taxonomy data

In [9]:
# Store that is indexed by assembly UID below to look up each genome's taxid.
# NOTE(review): this is an absolute, machine-specific path; presumably it holds
# previously downloaded assembly esummary data - confirm before running elsewhere.
esummary_dir = '/home/jared/projects/gambit/data/ncbi/assembly/esummary/'
esummaries = BasicEsummaryStore('assembly', esummary_dir)

In [10]:
# Attach a taxid to every genome that has an assembly UID; rows without a UID
# keep the initial None.
genomes['taxid'] = None

# Series.items() is used instead of Series.iteritems(), which was removed in
# pandas 2.0; items() yields the same (index, value) pairs on older versions.
for ix, uid in genomes['assembly_uid'].dropna().items():
    genomes.loc[ix, 'taxid'] = esummaries[uid]['taxid']

In [11]:
# Parsed taxonomy record for each distinct taxid assigned to a genome.
# Raw XML responses are cached in taxa_dir so re-runs do not hit the network.
taxon_xml = dict()

for taxid in tqdm(set(genomes['taxid'].dropna())):
    file = taxa_dir / f'{taxid}.xml'

    if not file.is_file():
        # Download to a sibling ".part" file and rename it only once the copy
        # has finished, so an interrupted download never leaves a truncated
        # file that later runs would mistake for a valid cache entry.
        partial = file.with_name(file.name + '.part')
        with Entrez.efetch(db='taxonomy', id=taxid) as src:
            with open(partial, 'wb') as dst:
                copyfileobj(src, dst)
        partial.replace(file)

    with open(file, 'rb') as f:
        data = Entrez.read(f)

    # Exactly one record is expected per requested ID, and it must be the one asked for.
    assert len(data) == 1
    taxon = data[0]
    assert taxon['TaxId'] == taxid
    taxon_xml[taxid] = taxon

100%|██████████| 499/499 [00:00<00:00, 1738.69it/s]


## Format data

In [12]:
# Flatten every fetched record (its LineageEx entries plus the record itself)
# into one table of taxa keyed by taxid. Within a lineage, each entry's parent
# is the entry immediately before it; the first entry has no parent.
taxa = {}

for record in taxon_xml.values():
    nodes = [*record['LineageEx'], record]
    parent_ids = [None, *(node['TaxId'] for node in nodes[:-1])]

    entries = [
        {
            'taxid': node['TaxId'],
            'name': node['ScientificName'],
            'rank': node['Rank'],
            'parent': parent_id,
        }
        for node, parent_id in zip(nodes, parent_ids)
    ]

    for entry in entries:
        # First sighting stores the entry; later sightings must agree with it.
        known = taxa.setdefault(entry['taxid'], entry)
        assert known == entry

In [13]:
# One row per taxon, indexed by taxid, with a fixed column order.
taxa_columns = ['taxid', 'name', 'rank', 'parent']
taxa_records = list(taxa.values())
taxa_df = pd.DataFrame.from_records(taxa_records, columns=taxa_columns).set_index('taxid')

## Save

In [15]:
# Write only the taxid column (plus the index) for each genome.
genome_taxids = genomes[['taxid']]
genome_taxids.to_csv(outfiles['genome_taxids'])

In [16]:
# Write the taxon table (indexed by taxid) to the intermediate output directory.
taxa_csv = outfiles['taxa']
taxa_df.to_csv(taxa_csv)