# 220912 Download taxa

Download NCBI Taxonomy records (via Entrez EFetch) for the taxa assigned to the genomes, plus their species-level ancestors where needed.

In [1]:
from pathlib import Path
import json

In [2]:
import pandas as pd
from Bio import Entrez
from tqdm import tqdm

## Setup

In [3]:
# Contact address Biopython attaches to Entrez requests so NCBI can identify the client.
Entrez.email = 'mjlumpe@gmail.com'

In [4]:
# Date stamp (YYMMDD) that prefixes this notebook's name.
DATESTR = '220912'
# Notebook name; also names this notebook's intermediate-data directory.
NBNAME = DATESTR + '-download-taxa'

In [5]:
# Input files read by this notebook, keyed by a short name.
infiles = {
    'genomes': 'data-src/genomes.csv',
}

In [6]:
# Directory holding this notebook's intermediate outputs.
# parents=True so the cell also works on a fresh checkout where
# 'data-intermediate' itself has not been created yet.
intermediate_data = Path('data-intermediate') / NBNAME
intermediate_data.mkdir(parents=True, exist_ok=True)

# Output locations written by this notebook, keyed by a short name.
outfiles = dict(
    taxa=intermediate_data / 'taxa',
)

# One JSON file per taxid is cached in this directory.
outfiles['taxa'].mkdir(exist_ok=True)

## Load data


In [7]:
# Genome table; its 'ncbi_taxid' column seeds the taxonomy download below.
genomes = pd.read_csv(infiles['genomes'])

## Download taxonomy data

In [8]:
# Work list of taxids still to be fetched, seeded with the taxon assigned to
# each genome. Cast to plain int so dict keys and cache file names do not
# depend on the dtype pandas chose for the column.
to_download = {int(t) for t in genomes['ncbi_taxid']}

# Taxonomy records fetched so far, keyed by taxid.
taxa = dict()


# Map taxids of genomes to their species-level ancestors
# (taxon assigned to genome by NCBI may be subspecies or strain).
# Only taxa that are not themselves species get an entry, so an empty dict
# means every genome taxon was already at the species level.
species_map = dict()


while to_download:
    # Next to download (left in the work list until it has been stored)
    taxid = next(iter(to_download))
    print(taxid)
    file = outfiles['taxa'] / f'{taxid}.json'

    if not file.exists():
        # Fetch data from NCBI, making sure the network handle is released
        # even if parsing fails.
        handle = Entrez.efetch(db='taxonomy', id=taxid)
        try:
            result = Entrez.read(handle)
        finally:
            handle.close()

        # Exactly one record is expected for a single taxid; anything else
        # raises ValueError here.
        (taxon,) = result

        # Save to local file
        with open(file, 'w') as f:
            json.dump(taxon, f)

    else:
        # Already downloaded, read from existing file
        with open(file) as f:
            taxon = json.load(f)

    taxa[taxid] = taxon
    to_download.remove(taxid)

    # If not a species, find species ancestor, record the mapping and add the
    # ancestor to the download list
    if taxon['Rank'] != 'species':
        for ancestor in taxon['LineageEx']:
            if ancestor['Rank'] == 'species':
                tid = int(ancestor['TaxId'])
                species_map[taxid] = tid

                if tid not in taxa:
                    to_download.add(tid)

                break

        else:
            raise RuntimeError('No species ancestor found')

498019
4932
5478
4909
4911
36911
4952


In [9]:
# Display the current contents of species_map.
species_map

{}

All of the genome taxa were already at the species level, so no remapping to species-level ancestors is needed.