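# 211207 Get taxonomy

Assign each genome an up-to-date NCBI taxid and a species-level taxid, build a taxonomy tree over the usable genomes restricted to the standard ranks (phylum through species), and write the resulting taxon and genome tables to CSV.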

In [1]:
from pathlib import Path
import json

In [2]:
import pandas as pd
from tqdm import tqdm

In [3]:
from ete3 import NCBITaxa



## Setup

In [4]:
# Date prefix shared by this notebook's name and its output directory
DATESTR = '211207'
NBNAME = f'{DATESTR}-get-taxonomy'

In [5]:
# Input files read by this notebook.
# NOTE(review): the two taxdump entries are absolute, machine-specific paths;
# adjust them when running anywhere other than the original workstation.
infiles = {
    'genomes_table': Path('data-processed/211111-inspect-genomes/genomes.csv'),
    'taxdump_archive': Path('/home/jared/projects/gambit/data/ncbi/taxonomy/taxdumps/taxdump-211204.tar.gz'),
    'taxdump_db': Path('/home/jared/projects/gambit/data/ncbi/taxonomy/taxdumps/taxdump-211204.db'),
}

In [6]:
# Output directory for this notebook's intermediate results.
# parents=True lets the cell succeed on a fresh checkout where the top-level
# 'data-intermediate' directory has not been created yet.
intermediate_out = Path('data-intermediate') / NBNAME
intermediate_out.mkdir(parents=True, exist_ok=True)

# Files written in the "Write output" section
outfiles = dict(
    taxa=intermediate_out / 'taxa.csv',
    genomes=intermediate_out / 'genome-taxonomy.csv',
)

## Load data

### Genomes table

In [7]:
# Genome table, indexed by its first CSV column
genomes_df = pd.read_csv(infiles['genomes_table'], index_col=0)
ngenomes = len(genomes_df)

### NCBI taxonomy dump

In [8]:
# Build the SQLite taxonomy database from the taxdump archive the first time
# this notebook runs; later runs reuse the existing file.
_db_path = infiles['taxdump_db']
if not _db_path.is_file():
    NCBITaxa(dbfile=str(_db_path), taxdump_file=str(infiles['taxdump_archive']))

In [9]:
# Open the (now existing) taxonomy database for querying
ncbi = NCBITaxa(dbfile=str(infiles['taxdump_db']))

### Updated taxids

In [10]:
# Old taxid -> new taxid, read from the database's `merged` table
merge_map = {
    old: new
    for old, new in ncbi.db.execute('select taxid_old, taxid_new from merged')
}

In [11]:
# Replace merged (obsolete) taxids with their current ID; others pass through
genomes_df['primary_taxid'] = genomes_df['taxid'].map(lambda t: merge_map.get(t, t))

In [12]:
# For every genome, walk its lineage from the genome's own taxid towards the
# root and record the first taxon whose rank is 'species'.
_sp_taxids = []

for taxid in tqdm(genomes_df['primary_taxid']):
    lineage = ncbi.get_lineage(taxid)
    # One rank query for the whole lineage rather than one query per ancestor
    ranks = ncbi.get_rank(lineage)

    species = next((t for t in reversed(lineage) if ranks[t] == 'species'), None)
    if species is None:
        # Fail loudly and name the offending taxid; a bare `assert 0` carries
        # no diagnostic and is skipped entirely under `python -O`.
        raise ValueError(f'No species-rank ancestor found for taxid {taxid}')

    _sp_taxids.append(species)

genomes_df['species_taxid'] = _sp_taxids

100%|██████████| 14388/14388 [00:00<00:00, 14953.33it/s]


## Filter genomes for tree

In [13]:
# Start from genomes whose taxonomy check passed
genomes_df['use'] = genomes_df['taxonomy_check_status'].eq('OK')

In [14]:
# Five genomes for which sequence data could not be obtained; exclude them
# regardless of their taxonomy check status. Values are row labels of
# genomes_df.
_noseq = [
    10898551,
    11011431,
    11411721,
    11411751,
    11411881,
]
genomes_df.loc[_noseq, 'use'] = False

In [15]:
# Rows flagged for use in the tree
genomes_filtered = genomes_df.loc[genomes_df['use']]
nfiltered = len(genomes_filtered)

In [16]:
# Total genomes loaded vs. genomes kept after filtering
ngenomes, nfiltered

(14388, 11844)

## Build taxonomy tree

### Full tree

In [17]:
# NCBI topology spanning the species of all filtered genomes.
# intermediate_nodes=True keeps the complete lineage of each leaf (including
# single-child internal nodes) instead of collapsing to common ancestors.
full_tree = ncbi.get_topology(
    genomes_filtered['species_taxid'],
    intermediate_nodes=True,
)

full_tree.describe()

Number of leaf nodes:	11836
Total number of nodes:	15435
Rooted:	No children
Most distant node:	35793
Max. distance:	10.000000


### Remove nodes with non-standard rank

In [18]:
# Standard ranks retained in the pruned tree, ordered from highest to lowest.
# Kept as a list because it is later used as a row selector with .loc.
RANKS = [
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'species',
]

In [19]:
# Work on the subtree rooted at the node named "2" (NCBI taxid 2, Bacteria).
# Copying it means the pruning below leaves full_tree untouched.
_subtree_root = full_tree & 2
tree = _subtree_root.copy()

In [20]:
# Gather every non-root node whose rank is not one of the standard RANKS
to_delete = set()
for node in tree.traverse():
    if node.is_root() or node.rank in RANKS:
        continue
    to_delete.add(node)

len(to_delete)

131

In [21]:
# Splice each flagged node out of the tree; its children are handed to the
# next available parent. prevent_nondicotomic=False stops ete3 from also
# removing parents that are left with a single child, so exactly the flagged
# nodes disappear.
for node in to_delete:
    node.delete(prevent_nondicotomic=False)

In [22]:
# Summary of the pruned tree (leaf count should match the full tree's)
tree.describe()

Number of leaf nodes:	11836
Total number of nodes:	15303
Rooted:	No
Most distant node:	7
Max. distance:	6.000000


In [23]:
from collections import Counter

# Tally, over all leaves, the sequence of ancestor ranks from the leaf's
# parent up to the root. Shows which standard ranks are missing from some
# lineages.
Counter(
    tuple(ancestor.rank for ancestor in leaf.get_ancestors())
    for leaf in tree
)

Counter({('genus',
          'family',
          'order',
          'class',
          'phylum',
          'superkingdom'): 11640,
         ('genus', 'order', 'class', 'phylum', 'superkingdom'): 93,
         ('genus', 'family', 'class', 'phylum', 'superkingdom'): 6,
         ('genus', 'class', 'phylum', 'superkingdom'): 40,
         ('genus', 'phylum', 'superkingdom'): 6,
         ('genus', 'family', 'order', 'phylum', 'superkingdom'): 51})

## Add extra attributes

### Genome counts

In [24]:
# One (initially empty) attribute dict per tree node, keyed by taxid
node_attrs = {}
for node in tree.traverse():
    node_attrs[node.taxid] = {}

In [25]:
# Number of filtered genomes assigned directly to each species taxid.
# Computed once so the traversal below does a dict lookup per node instead of
# scanning the whole genome column for every node.
_direct_counts = genomes_filtered['species_taxid'].value_counts().to_dict()

# Post-order traversal visits children before their parent, so each child's
# total is already available when the parent is processed.
for node in tree.traverse('postorder'):
    n = int(_direct_counts.get(node.taxid, 0))
    n += sum(node_attrs[child.taxid]['ngenomes'] for child in node.children)

    node_attrs[node.taxid]['ngenomes'] = n

# Sanity check: every filtered genome is counted exactly once under the root
assert node_attrs[tree.taxid]['ngenomes'] == nfiltered

## To table

In [26]:
# Flatten the tree into one record per node.
_rows = []

for node in tree.traverse():
    # Path from the root down to (and including) this node
    lineage = [*reversed(node.get_ancestors()), node]
    lineage_ids = [a.taxid for a in lineage]
    parent = node.up

    _rows.append({
        'taxid': node.taxid,
        # The root has no parent; it is recorded with parent_taxid 0
        'parent_taxid': parent.taxid if parent is not None else 0,
        'name': node.sci_name,
        'rank': node.rank,
        # Temporary column: sorting by the taxid path groups each subtree
        'sort_key': tuple(lineage_ids),
        'lineage_taxids': ', '.join(str(t) for t in lineage_ids),
        'lineage': ' '.join(a.sci_name for a in lineage),
        'nchildren': len(node.children),
        **node_attrs[node.taxid],
    })

taxonomy_df = (
    pd.DataFrame.from_records(_rows)
    .set_index('taxid')
    .sort_values('sort_key')
    .drop(columns='sort_key')
)

In [27]:
# Per-rank summary statistics of child and genome counts, in RANKS order
_rank_stats = taxonomy_df.groupby('rank')[['nchildren', 'ngenomes']].describe()
_rank_stats.loc[RANKS]

Unnamed: 0_level_0,nchildren,nchildren,nchildren,nchildren,nchildren,nchildren,nchildren,nchildren,ngenomes,ngenomes,ngenomes,ngenomes,ngenomes,ngenomes,ngenomes,ngenomes
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
rank,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
phylum,38.0,2.684211,3.032306,1.0,1.0,1.0,2.0,11.0,38.0,311.684211,934.815666,1.0,3.0,12.0,46.75,4701.0
class,83.0,2.771084,4.899789,1.0,1.0,1.0,2.0,35.0,83.0,142.012048,427.495357,1.0,2.0,7.0,42.0,2452.0
order,215.0,2.655814,4.510906,1.0,1.0,1.0,2.0,41.0,215.0,54.846512,131.570597,1.0,2.0,6.0,37.5,969.0
family,529.0,4.773157,8.820042,1.0,1.0,2.0,5.0,91.0,529.0,22.126654,52.23016,1.0,1.0,4.0,18.0,547.0
genus,2601.0,4.550557,13.583355,1.0,1.0,1.0,3.0,387.0,2601.0,4.553633,13.589515,1.0,1.0,1.0,3.0,387.0
species,11836.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11836.0,1.000676,0.02906,1.0,1.0,1.0,1.0,3.0


## Write output

In [29]:
# Write the taxon table (indexed by taxid) to the intermediate output directory
taxonomy_df.to_csv(outfiles['taxa'])

In [30]:
# Write the per-genome taxonomy assignment. All genomes are written; the
# 'use' column marks which ones were included in the tree.
_cols = ['refseq_acc', 'primary_taxid', 'species_taxid', 'use']
genomes_df.loc[:, _cols].to_csv(outfiles['genomes'])