# 211201 Konstantinidis-2005

In [1]:
import json
import os
from pathlib import Path

In [2]:
import pandas as pd
from Bio import Entrez

In [3]:
from entrez_tools.db.assembly import format_summary_meta
from entrez_tools.db.taxonomy import fetch_taxonomy_tree

## Setup

In [4]:
# Date prefix shared by this notebook's name and all of its output files.
DATESTR = '211201'
NBNAME = f'{DATESTR}-konstantinidis-2005'

In [5]:
# Contact address sent with every E-utilities request made through Bio.Entrez.
Entrez.email = 'mjlumpe@gmail.com'

# The API key is a secret and must not be committed with the notebook: read it
# from the NCBI_API_KEY environment variable instead. When the variable is
# unset this evaluates to None, i.e. requests are sent without a key.
Entrez.api_key = os.environ.get('NCBI_API_KEY')

In [6]:
# Input locations, keyed by role.
# NOTE(review): 'genome_summaries' is an absolute path on one specific machine;
# the notebook will not run elsewhere until it is pointed at a local copy.
infiles = {
    'genomes_table': Path('../../data/external/konstantinidis-2005/210910-genomes.csv'),
    'genome_summaries': Path('/home/jared/projects/gambit/data/ncbi/assembly/esummary/'),
}

In [7]:
# Per-notebook output directories. parents=True also creates the top-level
# 'data-intermediate' / 'data-processed' directories when they are missing, so
# the cell works on a fresh checkout instead of raising FileNotFoundError.
intermediate_out = Path('data-intermediate') / NBNAME
intermediate_out.mkdir(parents=True, exist_ok=True)

processed_out = Path('data-processed') / NBNAME
processed_out.mkdir(parents=True, exist_ok=True)

# Output file locations, keyed by role.
outfiles = dict(
    genome_table=processed_out / f'{DATESTR}-konstantinidis-2005-genomes.csv',
    taxon_data=intermediate_out / f'{DATESTR}-konstantinidis-2005-taxa.json.gz',
    taxon_table=processed_out / f'{DATESTR}-konstantinidis-2005-taxa.csv',
)

In [8]:
# Scratch directory, relative to the working directory; created if missing.
tmpdir = Path('tmp')
tmpdir.mkdir(exist_ok=True)

## Load assembly data

In [9]:
# Genome table for this study; its 'assembly' column is used below to look up
# one esummary JSON file per row.
genomes_df = pd.read_csv(infiles['genomes_table'])

In [10]:
# Map each assembly accession to its stored esummary JSON document.
summaries = {}

for acc in genomes_df['assembly']:
    summary_path = infiles['genome_summaries'] / f'{acc}.json'
    with summary_path.open() as fh:
        summaries[acc] = json.load(fh)
        # Sanity check: the file must describe the accession it is named after.
        assert summaries[acc]['assemblyaccession'] == acc

## Format assembly data

In [11]:
# Build one output record per input genome, combining the hand-curated columns
# from the genome table with fields taken from the matching esummary document.
_rows = []

for _, genome in genomes_df.iterrows():
    summary = summaries[genome['assembly']]
    meta = format_summary_meta(summary['meta'])

    record = {
        'group': genome['group'],
        'strain': genome['strain'],
        'description': genome['description'],
        'assembly_acc': genome['assembly'],
        'assembly_uid': summary['uid'],
        'taxid': summary['taxid'],
        'organism': summary['organism'],
        'refseq_category': summary['refseq_category'],
        'assembly_status': meta['assembly-status'],
        'taxonomy_check_status': meta['taxonomy-check-status'],
        'ftppath_refseq': summary['ftppath_refseq'],
    }
    _rows.append(record)

genomes_df2 = pd.DataFrame.from_records(_rows)

### Inspect

In [12]:
# Number of genomes per taxonomy-check status.
genomes_df2.value_counts('taxonomy_check_status')

taxonomy_check_status
OK              63
Inconclusive     5
Failed           2
dtype: int64

In [13]:
# Number of genomes per assembly status.
genomes_df2.value_counts('assembly_status')

assembly_status
Complete Genome    70
dtype: int64

### Save

In [14]:
# Write the formatted genome table; the default integer index is not saved.
genomes_df2.to_csv(outfiles['genome_table'], index=False)

## Download taxonomy tree

In [15]:
# Local JSON cache of taxonomy records from a previous run, if there is one.
taxa_file = tmpdir / 'taxa.json'

if not taxa_file.is_file():
    # No cache yet: start from an empty mapping.
    taxon_data = {}
else:
    with taxa_file.open() as cache:
        taxon_data = json.load(cache)

In [17]:
# The return value is discarded (trailing ';' suppresses the cell output).
# NOTE(review): presumably this adds records for the given taxids and their
# ancestors to `taxon_data` in place — confirm against entrez_tools.
fetch_taxonomy_tree(genomes_df2['taxid'], taxon_data);

In [18]:
# Persist the taxonomy records so the load cell above can reuse them on the next run.
with taxa_file.open('w') as cache:
    json.dump(taxon_data, cache)

In [19]:
# Number of taxonomy records collected.
len(taxon_data)

201

### Archive

In [20]:
# Compress the JSON cache into the intermediate-data directory; `gzip -c`
# writes to stdout, so the cache file itself is left untouched.
! gzip -c {taxa_file} > {outfiles['taxon_data']}

## Taxonomy Table

In [21]:
# Flatten the taxonomy records into a table indexed by taxid, keeping only the
# parent link, scientific name and rank of each taxon.
_taxon_records = []

for t in taxon_data.values():
    _taxon_records.append({
        'taxid': t['TaxId'],
        'parent_taxid': t['ParentTaxId'],
        'name': t['ScientificName'],
        'rank': t['Rank'],
    })

taxa_df = pd.DataFrame.from_records(_taxon_records, index='taxid')

In [22]:
# Write the taxonomy table; the 'taxid' index is saved as the first column.
taxa_df.to_csv(outfiles['taxon_table'])