# 211011 get genomes list

In [1]:
import json
import os
import sys
from pathlib import Path

In [2]:
import pandas as pd
from Bio import Entrez
from tqdm.notebook import tqdm

In [3]:
from gambit.util.progress import iter_progress
import entrez_tools as ez

In [4]:
# Put the notebook-local ./src directory at the front of the import path.
# The entry is relative to the kernel's working directory, and it has to be
# in place before the import below runs (presumably esummary_store lives in
# ./src -- confirm).
sys.path.insert(0, './src')

from esummary_store import BasicEsummaryStore

## Setup

In [5]:
# Contact address sent with Entrez requests. ENTREZ_EMAIL overrides the
# notebook's default address.
Entrez.email = os.environ.get('ENTREZ_EMAIL', 'mjlumpe@gmail.com')

# SECURITY: the NCBI API key is a credential and must not be stored in the
# notebook. It is read from the NCBI_API_KEY environment variable instead;
# when the variable is unset this assigns None, i.e. no key is configured.
# NOTE(review): the key that was previously committed here should be revoked.
Entrez.api_key = os.environ.get('NCBI_API_KEY')

In [6]:
# Date stamp (YYMMDD) shared by the notebook name and its output files.
DATESTR = '211011'
# Notebook identifier; also used as the name of the output sub-directory.
NBNAME = f'{DATESTR}-get-genomes-list'

In [7]:
# Input locations, keyed by the name used throughout the notebook.
infiles = {
    # Genome tables of the two source data sets (relative to the working directory).
    'konstantinidis_2005': Path('../../data/external/konstaninidis-2005/210910-genomes.csv'),
    'snitkin_2012': Path('../../data/processed/211010-snitkin-2012-genomes/211010-snitkin-2012-genomes/211010-snitkin-2012-genomes.csv'),
    # Directory handed to BasicEsummaryStore. The default is a machine-specific
    # absolute path; set GAMBIT_ASSEMBLY_ESUMMARY_DIR to point somewhere else
    # without editing the notebook.
    'assembly_esummary': Path(os.environ.get(
        'GAMBIT_ASSEMBLY_ESUMMARY_DIR',
        '/home/jared/projects/gambit/data/ncbi/assembly/esummary/',
    )),
}

In [8]:
# Output directory for this notebook: data-processed/<NBNAME>, relative to the
# working directory.
data_processed = Path('data-processed') / NBNAME
# parents=True also creates the top-level 'data-processed' directory when it
# does not exist yet (e.g. on a fresh checkout); without it mkdir() raises
# FileNotFoundError. exist_ok=True keeps re-runs from failing.
data_processed.mkdir(parents=True, exist_ok=True)

In [9]:
# Output locations, keyed by name; everything is written under data_processed.
outfiles = {
    'genomes_list': data_processed / f'{DATESTR}-gambit-ani-additional-genomes.csv',
}

## Get list of genomes

In [10]:
# Per-data-set genome tables, keyed by data set name; filled in by the
# sections below and concatenated afterwards.
genome_chunks = {}

### Konstantinidis 2005

In [11]:
# Read the Konstantinidis 2005 genome table from its CSV file.
konstantinidis_2005 = pd.read_csv(
    filepath_or_buffer=infiles['konstantinidis_2005'],
)

In [12]:
# Reduce the table to the shared layout: keep the description and accession,
# rename the accession column to 'assembly_accession', and name the row index
# 'ds_index'.
konstantinidis_2005 = (
    konstantinidis_2005
    .loc[:, ['description', 'assembly']]
    .rename(columns={'assembly': 'assembly_accession'})
    .rename_axis(index='ds_index')
)

# The accession doubles as the genome id; it goes in as the first column.
konstantinidis_2005.insert(0, 'id', konstantinidis_2005['assembly_accession'])

In [13]:
# Register the standardized table under its data set name.
genome_chunks.update(konstantinidis_2005=konstantinidis_2005)

### Snitkin 2012

In [14]:
# Read the Snitkin 2012 genome table from its CSV file.
snitkin_2012 = pd.read_csv(
    filepath_or_buffer=infiles['snitkin_2012'],
)

In [15]:
# Reduce the table to the shared layout: keep the description and accession,
# rename the accession column to 'assembly_accession', and name the row index
# 'ds_index'.
snitkin_2012 = (
    snitkin_2012
    .loc[:, ['description', 'assembly_acc']]
    .rename(columns={'assembly_acc': 'assembly_accession'})
    .rename_axis(index='ds_index')
)
# The accession doubles as the genome id; it goes in as the first column.
snitkin_2012.insert(0, 'id', snitkin_2012['assembly_accession'])

In [16]:
# Register the standardized table under its data set name.
genome_chunks.update(snitkin_2012=snitkin_2012)

### Combine

In [17]:
# Stack the per-data-set tables into one frame. The data set name becomes the
# outer index level ('data_set'); each table's own row index stays as the
# inner level.
chunk_names = list(genome_chunks)
genomes = pd.concat(
    [genome_chunks[name] for name in chunk_names],
    keys=chunk_names,
    names=['data_set'],
)

In [18]:
# Display the combined table as a sanity check of the concatenation above.
genomes

Unnamed: 0_level_0,Unnamed: 1_level_0,id,description,assembly_accession
data_set,ds_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
konstantinidis_2005,0,GCF_000008865.2,"Escherichia coli O157:H7 str. Sakai DNA, compl...",GCF_000008865.2
konstantinidis_2005,1,GCF_000732965.1,Escherichia coli O157:H7 str. EDL933 chromosom...,GCF_000732965.1
konstantinidis_2005,2,GCF_000005845.2,"Escherichia coli str. K-12 substr. MG1655, com...",GCF_000005845.2
konstantinidis_2005,3,GCF_014262945.1,Escherichia coli CFT073 (E. coli),GCF_014262945.1
konstantinidis_2005,4,GCF_000007405.1,Shigella flexneri 2a str. 2457T,GCF_000007405.1
...,...,...,...,...
snitkin_2012,15,GCF_000281615.1,Klebsiella pneumoniae subsp. pneumoniae KPNIH19,GCF_000281615.1
snitkin_2012,16,GCF_000281375.1,Klebsiella pneumoniae subsp. pneumoniae KPNIH20,GCF_000281375.1
snitkin_2012,17,GCF_000281495.1,Klebsiella pneumoniae subsp. pneumoniae KPNIH21,GCF_000281495.1
snitkin_2012,18,GCF_000281515.1,Klebsiella pneumoniae subsp. pneumoniae KPNIH22,GCF_000281515.1


## Get Assembly UIDs and ESummary data

In [19]:
# Store of ESummary records for the 'assembly' database, opened on the
# configured esummary directory.
esummaries = BasicEsummaryStore(
    'assembly',
    infiles['assembly_esummary'],
)

In [20]:
# Look up every accession in the esummary store. Entries that come back null
# are treated as "not in the store" by the lookup step that follows.
genomes['assembly_uid'] = list(map(esummaries.acc_to_uid, genomes['assembly_accession']))

### Lookup missing UIDs

In [21]:
# Rows whose UID could not be resolved from the store.
missing_uid = genomes['assembly_uid'].isnull()

# Resolve each of those accessions through ez.esearch_accession instead,
# one row at a time.
for row_key in tqdm(genomes.index[missing_uid]):
    accession = genomes.loc[row_key, 'assembly_accession']
    genomes.loc[row_key, 'assembly_uid'] = ez.esearch_accession('assembly', accession)

0it [00:00, ?it/s]

In [22]:
# Every genome must have a UID by now; stop here if any is still null.
assert genomes['assembly_uid'].notnull().all()

### Download missing summaries

In [23]:
# UIDs that have no record in the esummary store yet.
# dict.fromkeys() drops repeated UIDs while keeping first-seen order, so a UID
# that occurs in several rows is downloaded and added to the store only once.
missing_summary = [
    uid
    for uid in dict.fromkeys(genomes['assembly_uid'])
    if uid not in esummaries
]

In [24]:
# Fetch the ESummary record of every missing UID (one request per UID, JSON
# format) and add it to the store.
for uid in tqdm(missing_summary):
    with Entrez.esummary(db='assembly', id=uid, retmode='json') as response:
        payload = json.load(response)
    esummaries.add(ez.get_esummary_result_json(payload))

0it [00:00, ?it/s]

## Done

In [25]:
# Write the final genome list (including the assembly_uid column) to CSV.
genomes.to_csv(path_or_buf=outfiles['genomes_list'])