# 211203 Download taxonomy

In [1]:
import gzip
import json
import os
import shutil
from pathlib import Path

In [2]:
import pandas as pd
from Bio import Entrez
from tqdm import tqdm

## Setup

In [3]:
# Notebook identifiers: date prefix plus a short slug, matching this notebook's
# title ("211203 Download taxonomy"). The previous values ('212103' and
# '-download-genomes') were a digit transposition and a copy-paste leftover.
DATESTR = '211203'
NBNAME = DATESTR + '-download-taxonomy'

In [4]:
# Credentials are read from the environment so they are never stored in the notebook.
# NOTE(review): the variable names NCBI_EMAIL / NCBI_API_KEY are chosen here; the API
# key that was previously hardcoded in this cell should be treated as leaked and revoked.
Entrez.email = os.environ['NCBI_EMAIL']  # fails loudly if the contact address is not configured
Entrez.api_key = os.environ.get('NCBI_API_KEY')  # None when unset, i.e. no API key is sent

In [5]:
# Scratch directory for intermediate files (the taxa.json cache is stored in it).
# Created up front so that later writes into it cannot fail on a missing directory.
tmpdir = Path('tmp')
tmpdir.mkdir(exist_ok=True)

In [6]:
# Input files read by this notebook.
infiles = dict(
    genomes_table=Path('data-processed/211111-inspect-genomes/genomes.csv'),
)

# Directory for compressed archives.
archive_dir = Path('archive')
archive_dir.mkdir(exist_ok=True)

# Directory for tabular outputs of this notebook.
# NOTE(review): directory and file names below are chosen to mirror the existing
# 'data-processed/<date>-<slug>/' and dated-archive patterns -- adjust if the project
# uses a different layout.
processed_dir = Path('data-processed/211203-download-taxonomy')
processed_dir.mkdir(parents=True, exist_ok=True)

# Output files written by this notebook. 'taxon_data' and 'taxon_table' are the keys
# used by the "Archive" and "Taxonomy Table" cells; they were previously missing,
# which made those cells fail with KeyError.
outfiles = dict(
    # Kept for backward compatibility; not referenced by the cells in this notebook.
    genomes=archive_dir / '211109-ncbi-representative-genomes.tar.gz',
    taxon_data=archive_dir / '211203-ncbi-taxonomy.json.gz',
    taxon_table=processed_dir / 'taxa.csv',
)

## Load data

In [7]:
# Read the genomes table, forcing the taxid column to str instead of letting
# pandas infer its dtype.
genomes_df = pd.read_csv(
    infiles['genomes_table'],
    dtype={'taxid': str},
)

## Download taxonomy tree

In [8]:
# Enable IPython's autoreload extension. Mode 1 reloads only the modules that
# have been registered with %aimport, before each cell execution.
%load_ext autoreload
%autoreload 1

In [25]:
# Register the taxonomy helper module for autoreload, then import the helpers
# used in the cells below.
# NOTE(review): this import sits mid-notebook and its execution count is out of
# sequence; consider moving it to the import cells at the top.
%aimport entrez_tools.db.taxonomy
from entrez_tools.db.taxonomy import fetch_taxa, fetch_taxonomy_tree, get_aka_taxids

In [10]:
# JSON cache of taxon records in the scratch directory.
taxa_file = tmpdir / 'taxa.json'

# Start with an empty mapping when no cache exists yet; otherwise load the cached records.
if not taxa_file.is_file():
    taxon_data = {}
else:
    with taxa_file.open() as cache_fh:
        taxon_data = json.load(cache_fh)

In [18]:
# Run fetch_taxonomy_tree over the genome taxids, passing taxon_data as the record
# store (presumably filled in place -- TODO confirm against entrez_tools).
# NOTE(review): the recorded run of this cell ended in an AssertionError. The probe
# cells below show fetch_taxa('1648870') returning a record whose TaxId is '202772';
# that ID mismatch is a possible trigger, to be confirmed.
# The trailing ';' suppresses the cell's displayed output.
fetch_taxonomy_tree(genomes_df['taxid'], taxon_data);

AssertionError: 

In [None]:
# NOTE(review): leftover interactive debugging (presumably the %debug post-mortem
# magic invoked via automagic). It blocks an unattended Restart & Run All; remove
# once the AssertionError above is resolved.
debug

In [22]:
# Two taxids probed individually below: per the recorded outputs, a query for
# '1648870' returns a record whose TaxId is '202772'.
taxids = ['1648870', '202772']

In [27]:
# Inspect what get_aka_taxids derives from the cached records. In the recorded
# output this is a dict mapping one taxid string to another (exact semantics are
# defined in entrez_tools, not visible here).
get_aka_taxids(taxon_data.values())

{'2071625': '2042995', '1278819': '1114873', '519423': '2016499'}

In [23]:
# Fetch the record for the first probe taxid ('1648870'). In the recorded output the
# result is keyed by the queried ID but the record's own TaxId is '202772'.
fetch_taxa(taxids[0])

{'1648870': {'TaxId': '202772',
  'ScientificName': 'Zooshikella ganghwensis',
  'OtherNames': {'Synonym': ['Zooshikella marina'],
   'Teleomorph': [],
   'Misspelling': [],
   'GenbankAnamorph': [],
   'CommonName': [],
   'Acronym': [],
   'Inpart': [],
   'Anamorph': [],
   'Includes': ['Zooshikella sp. JC333'],
   'EquivalentName': [],
   'Name': [{'ClassCDE': 'authority', 'DispName': 'Zooshikella ganghwensis Yi et al. 2003'},
    {'ClassCDE': 'authority', 'DispName': 'Zooshikella ganghwensis Yi et al. 2003 emend. Huang et al. 2021'},
    {'ClassCDE': 'authority', 'DispName': 'Zooshikella marina Ramaprasad et al. 2015'},
    {'ClassCDE': 'type material', 'DispName': 'BCCM/LMG:28823'},
    {'ClassCDE': 'type material', 'DispName': 'DSM 15267'},
    {'ClassCDE': 'type material', 'DispName': 'DSM:15267'},
    {'ClassCDE': 'type material', 'DispName': 'IMSNU 14003'},
    {'ClassCDE': 'type material', 'DispName': 'IMSNU:14003'},
    {'ClassCDE': 'type material', 'DispName': 'KCTC 12044'

In [24]:
# Fetch the record for the second probe taxid ('202772'). In the recorded output the
# result key and the record's TaxId are both '202772'.
fetch_taxa(taxids[1])

{'202772': {'TaxId': '202772',
  'ScientificName': 'Zooshikella ganghwensis',
  'OtherNames': {'Synonym': ['Zooshikella marina'],
   'Teleomorph': [],
   'Misspelling': [],
   'GenbankAnamorph': [],
   'CommonName': [],
   'Acronym': [],
   'Inpart': [],
   'Anamorph': [],
   'Includes': ['Zooshikella sp. JC333'],
   'EquivalentName': [],
   'Name': [{'ClassCDE': 'authority', 'DispName': 'Zooshikella ganghwensis Yi et al. 2003'},
    {'ClassCDE': 'authority', 'DispName': 'Zooshikella ganghwensis Yi et al. 2003 emend. Huang et al. 2021'},
    {'ClassCDE': 'authority', 'DispName': 'Zooshikella marina Ramaprasad et al. 2015'},
    {'ClassCDE': 'type material', 'DispName': 'BCCM/LMG:28823'},
    {'ClassCDE': 'type material', 'DispName': 'DSM 15267'},
    {'ClassCDE': 'type material', 'DispName': 'DSM:15267'},
    {'ClassCDE': 'type material', 'DispName': 'IMSNU 14003'},
    {'ClassCDE': 'type material', 'DispName': 'IMSNU:14003'},
    {'ClassCDE': 'type material', 'DispName': 'KCTC 12044'}

In [None]:
# Preview only the first few cached records; displaying the whole mapping would
# flood the notebook output.
dict(list(taxon_data.items())[:5])

In [None]:
# Display the cached record stored under taxid '2555902'.
taxon_data['2555902']

In [None]:
# Show the 'AkaTaxIds' field of the record for taxid '2555902'. The record is indexed
# explicitly rather than through `_` (the previous cell's output), which only works
# when the cells are executed back to back.
taxon_data['2555902']['AkaTaxIds']

In [None]:
# NOTE(review): leftover interactive debugging (presumably the %debug post-mortem
# magic invoked via automagic). It blocks an unattended Restart & Run All; remove
# before sharing the notebook.
debug

In [None]:
# Persist the taxon records to the JSON cache.
# The data is serialized to a string first: opening the file in 'w' mode truncates
# it immediately, so a serialization error raised mid-dump would otherwise destroy
# the existing cache.
taxon_json = json.dumps(taxon_data)
with open(taxa_file, 'w') as f:
    f.write(taxon_json)

In [None]:
# Number of taxon records currently held in taxon_data.
len(taxon_data)

### Archive

In [None]:
# Compress the JSON cache into the archive using the standard library instead of
# shelling out to gzip: no dependency on a gzip binary and no shell quoting issues
# with the interpolated paths.
# NOTE(review): requires outfiles to have a 'taxon_data' entry.
with open(taxa_file, 'rb') as src, gzip.open(outfiles['taxon_data'], 'wb') as dst:
    shutil.copyfileobj(src, dst)

## Taxonomy Table

In [None]:
# Flatten the cached taxon records into a table indexed by taxid. Each
# (column, field) pair maps an output column name to the record key it is read from.
taxa_df = pd.DataFrame.from_records(
    [
        {
            column: record[field]
            for column, field in (
                ('taxid', 'TaxId'),
                ('parent_taxid', 'ParentTaxId'),
                ('name', 'ScientificName'),
                ('rank', 'Rank'),
            )
        }
        for record in taxon_data.values()
    ],
    index='taxid',
)

In [None]:
# Write the taxonomy table to CSV (the taxid index is written as the first column).
# NOTE(review): requires outfiles to have a 'taxon_table' entry.
taxa_df.to_csv(outfiles['taxon_table'])