# 220412 Genome Set 2 taxonomy

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from ete3 import NCBITaxa

## Setup

In [2]:
# Date stamp (presumably YYMMDD) used as a prefix for this notebook's output file names.
DATESTR = '220412'
# Notebook name; also used as the name of the processed-data output directory.
NBNAME = f'{DATESTR}-set2-taxonomy'

In [3]:
# Input locations: the workflow directory (source of the genome table) and the
# NCBI taxonomy database file.
infiles = {
    'workflow': Path('/home/jared/code/gambit/gambit-publication/'),
    'taxonomy_db': Path('/home/jared/projects/gambit/data/ncbi/taxonomy/taxdumps/taxdump-211204.db'),
}

In [4]:
# Directory that receives this notebook's processed output files.
processed_out = Path('data-processed') / NBNAME
# parents=True also creates data-processed/ itself, so the cell does not fail
# with FileNotFoundError when the parent directory does not exist yet.
processed_out.mkdir(parents=True, exist_ok=True)

In [5]:
# Output CSV paths, all inside the processed-data directory.
outfiles = {
    'taxa': processed_out / f'{DATESTR}-set2-taxa.csv',
    'lineages': processed_out / f'{DATESTR}-set2-lineages.csv',
}

## Load data

In [6]:
# Open the NCBI taxonomy database file configured in `infiles`.
taxonomy_db_path = infiles['taxonomy_db']
taxa = NCBITaxa(taxonomy_db_path)

In [7]:
# Read the set 2 genome table from the workflow directory; its `taxid` column
# drives the lineage lookup below.
genomes_csv = infiles['workflow'] / 'resources/genomes/set2/genomes.csv'
genomes = pd.read_csv(genomes_csv)

## Taxonomy tables

In [8]:
# Taxonomic ranks kept from each lineage; these become the columns of the
# lineage tables, in this order.
ranks = [
    'superkingdom',
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'species',
]

In [9]:
# Build one row of taxids per genome, with one column per entry of `ranks`.
rows = []

for taxid in genomes['taxid']:
    full_lineage = taxa.get_lineage(taxid)
    # Look up the ranks of the whole lineage with a single query instead of
    # issuing one query per ancestor.
    rank_of = taxa.get_rank(full_lineage)
    lineage = [tid for tid in full_lineage if rank_of[tid] in ranks]

    # The columns below are labelled with `ranks`, so the filtered lineage must
    # match it exactly (same ranks, same order), not merely in length.
    lineage_ranks = [rank_of[tid] for tid in lineage]
    if lineage_ranks != ranks:
        raise ValueError(
            f'taxid {taxid}: expected lineage ranks {ranks}, got {lineage_ranks}'
        )
    rows.append(lineage)

lineages_taxid = pd.DataFrame(rows, columns=ranks)

In [10]:
# Every distinct taxid appearing anywhere in the lineage table, in ascending order.
unique_ids = set(lineages_taxid.to_numpy().ravel())
all_taxids = sorted(unique_ids)

# taxid -> name and taxid -> rank lookups for those taxa.
taxon_names = taxa.translate_to_names(all_taxids)
name_map = {tid: name for tid, name in zip(all_taxids, taxon_names)}
rank_map = taxa.get_rank(all_taxids)

# Names are used in place of taxids in the lineage table, so they must be unique.
assert len(set(name_map.values())) == len(all_taxids)

In [11]:
# Replace every taxid in the lineage table with its name; a taxid missing from
# name_map raises KeyError.
# NOTE: DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; kept here for compatibility with older pandas.
lineages = lineages_taxid.applymap(lambda tid: name_map[tid])

In [12]:
# One row per distinct taxon, indexed by taxid, with its name and rank.
taxid_index = pd.Series(all_taxids, name='taxid')
taxa_df = pd.DataFrame(
    {
        'name': [name_map[tid] for tid in all_taxids],
        'rank': [rank_map[tid] for tid in all_taxids],
    },
    index=taxid_index,
)

## Save

In [13]:
# Write the name-based lineage table without the row index.
lineages_csv = outfiles['lineages']
lineages.to_csv(lineages_csv, index=False)

In [14]:
# Write the taxon table; the taxid index is kept as the first column.
taxa_csv = outfiles['taxa']
taxa_df.to_csv(taxa_csv)