# 211010 Snitkin-2012 genomes

In [1]:
from pathlib import Path
import json

In [2]:
import pandas as pd
from Bio import Entrez
from tqdm.notebook import tqdm

In [3]:
import entrez_tools as ez

In [4]:
import os

# Identify this client to NCBI E-utilities.
# NOTE(review): the e-mail address and API key below are committed in the
# notebook. The environment variables (NCBI_EMAIL / NCBI_API_KEY) take
# precedence when set; the literals remain only as a backward-compatible
# fallback and the key should be rotated and removed from source control.
Entrez.email = os.environ.get('NCBI_EMAIL', 'mjlumpe@gmail.com')
Entrez.api_key = os.environ.get('NCBI_API_KEY', 'adea2a53e6e44c3cc42c932f9ba4bdffa809')

## Setup

In [5]:
# Date prefix shared by this notebook's name and its output files.
DATESTR = '211010'
# Notebook name; also used as the name of the processed-data subdirectory.
NBNAME = f'{DATESTR}-snitkin-2012-genomes'

In [6]:
# Input files read by this notebook, keyed by a short descriptive name.
infiles = {
    'nuccore_accs': Path('/home/jared/projects/gambit/data/external/snitkin-2012/nuccore-accs.txt'),
}

In [7]:
# Directories holding cached ESummary JSON files, keyed by Entrez database name.
esummary_dirs = dict(
    nuccore=Path('/home/jared/projects/gambit/data/ncbi/nuccore/esummary'),
    assembly=Path('/home/jared/projects/gambit/data/ncbi/assembly/esummary'),
)


def esummary_file(db, acc_or_uid):
    """Return the path of the cached ESummary JSON file for one record.

    Parameters
    ----------
    db
        Key into ``esummary_dirs`` (``'nuccore'`` or ``'assembly'``).
    acc_or_uid
        Accession or UID used as the file stem. It is formatted into the
        file name, so non-string identifiers (e.g. integer UIDs) work too.

    Raises
    ------
    KeyError
        If ``db`` is not a key of ``esummary_dirs``.
    """
    return esummary_dirs[db] / f'{acc_or_uid}.json'

In [8]:
# Output directory for this notebook, relative to the current working directory.
data_processed = Path('data-processed') / NBNAME
# parents=True: also create 'data-processed/' itself, so a fresh working
# directory does not fail with FileNotFoundError.
data_processed.mkdir(parents=True, exist_ok=True)

In [9]:
# Output CSV files written at the end of the notebook, keyed by short name.
outfiles = {
    'genomes_table': data_processed / f'{DATESTR}-snitkin-2012-genomes.csv',
    'nuccore_table': data_processed / f'{DATESTR}-snitkin-2012-nuccore-esummary-data.csv',
}

## Read data

In [10]:
# Read one accession per line. Blank lines (e.g. a trailing newline at the end
# of the file) are skipped so they do not become empty accessions, which would
# map to a '.json' cache file and an Entrez query with an empty id.
with open(infiles['nuccore_accs']) as f:
    stripped_lines = (line.strip() for line in f)
    genome_accs_noversion = [acc for acc in stripped_lines if acc]

## Entrez queries

### Nuccore esummaries

In [11]:
# Download and cache the nuccore ESummary record for every accession.
# Each record is stored once under '<uid>.json'; '<accession>.json' and
# '<accession.version>.json' are relative symlinks to that file.
for acc in tqdm(genome_accs_noversion):
    acc_file = esummary_file('nuccore', acc)
    if acc_file.is_file():
        continue  # already cached by a previous run

    # Close the HTTP handle even if JSON parsing fails.
    handle = Entrez.esummary(db='nuccore', id=acc, retmode='json')
    try:
        data = json.load(handle)
    finally:
        handle.close()
    # presumably extracts the single record dict from the response — see entrez_tools
    summary = ez.get_esummary_result_json(data)

    uid = summary['uid']
    uid_file = esummary_file('nuccore', uid)

    with uid_file.open('w') as f:
        json.dump(summary, f)

    # Create the alias links. Skip any that already exist (a stale link from an
    # interrupted run, or the input accession already carrying its version) so
    # that symlink_to() does not raise FileExistsError.
    for alias in (acc, summary['accessionversion']):
        alias_file = esummary_file('nuccore', alias)
        if not (alias_file.exists() or alias_file.is_symlink()):
            alias_file.symlink_to(uid_file.name)

  0%|          | 0/20 [00:00<?, ?it/s]

In [12]:
# Load the cached nuccore summaries, in the same order as the input accessions.
nuccore_esummaries = [
    json.loads(esummary_file('nuccore', accession).read_text())
    for accession in genome_accs_noversion
]

### Nuccore summaries to table

In [13]:
# ESummary fields kept as columns of the nuccore summary table.
nuccore_summary_columns = [
    'uid',
    'accessionversion',
    'title',
    'organism',
    'createdate',
    'updatedate',
    'projectid',
    'biosample',
    'status',
    'comment',
]

# One row per nuccore record, in input order.
nuccore_summary_df = pd.DataFrame.from_records(nuccore_esummaries, columns=nuccore_summary_columns)

In [14]:
# Start the genomes table from three nuccore summary fields, renamed for output.
genomes_df = nuccore_summary_df[['organism', 'uid', 'accessionversion']].rename(
    columns={
        'organism': 'description',
        'uid': 'nuccore_uid',
        'accessionversion': 'nuccore_accession',
    },
)

In [15]:
# Flag records whose nuccore status field is exactly 'dead'.
genomes_df['nuccore_dead'] = nuccore_summary_df['status'].eq('dead')

### Link nuccore to assembly

In [16]:
# Link every nuccore UID to the assembly database with a single ELink request.
# The ids are passed as a plain list (matching the biosample link request)
# and the handle is closed once the response has been parsed.
elink_handle = Entrez.elink(db='assembly', dbfrom='nuccore', id=list(genomes_df['nuccore_uid']))
try:
    nuccore2assembly_result = Entrez.read(elink_handle)
finally:
    elink_handle.close()
# presumably maps each source UID to its single linked UID, or None — see entrez_tools
nuccore2assembly = ez.get_elink_map_single(nuccore2assembly_result)

In [17]:
# Show the nuccore UIDs for which no assembly link was found.
[nuccore_uid for nuccore_uid, assembly_uid in nuccore2assembly.items() if assembly_uid is None]

['397353112', '397390985']

### Link samples to assembly

The two "dead" nuccore entries could not be linked to assembly directly, so link through their biosamples instead.

In [18]:
# Map each biosample accession to its Entrez UID.
biosample_uids = dict()

for sample in tqdm(nuccore_summary_df['biosample']):
    # Assign directly rather than through a variable named `id`, which would
    # rebind the builtin id() for the rest of the notebook session.
    # presumably esearch_unique returns the one UID matching the accession — see entrez_tools
    biosample_uids[sample] = ez.esearch_unique('biosample', 'Accession', sample)

  0%|          | 0/20 [00:00<?, ?it/s]

In [19]:
# Link every biosample UID to the assembly database with a single ELink
# request, closing the handle once the response has been parsed.
elink_handle = Entrez.elink(db='assembly', dbfrom='biosample', id=list(biosample_uids.values()))
try:
    biosample2assembly_result = Entrez.read(elink_handle)
finally:
    elink_handle.close()
# presumably maps each source UID to its single linked UID, or None — see entrez_tools
biosample2assembly = ez.get_elink_map_single(biosample2assembly_result)

In [20]:
# Show the biosample UIDs for which no assembly link was found.
[biosample_uid for biosample_uid, assembly_uid in biosample2assembly.items() if assembly_uid is None]

[]

### Combine assembly links

In [21]:
# Add an empty 'assembly_uid' column, filled in row by row in the next cell.
genomes_df['assembly_uid'] = None

In [22]:
# Fill in the assembly UID for every genome. The biosample-derived link is the
# one stored; the nuccore-derived link, when present, is used as a cross-check.
# genomes_df was derived from nuccore_summary_df, so both share the same index.
for i, row in nuccore_summary_df.iterrows():
    from_nuccore_id = nuccore2assembly[row['uid']]
    from_biosample_id = biosample2assembly[biosample_uids[row['biosample']]]

    # Explicit check instead of a bare assert: it is not stripped under -O and
    # it reports which record disagrees.
    if from_nuccore_id is not None and from_nuccore_id != from_biosample_id:
        raise ValueError(
            f"Conflicting assembly links for nuccore uid {row['uid']}: "
            f"{from_nuccore_id!r} via nuccore, {from_biosample_id!r} via biosample {row['biosample']}"
        )

    genomes_df.loc[i, 'assembly_uid'] = from_biosample_id

### Get assembly summaries

In [23]:
# Download and cache the assembly ESummary record for every genome.
# Each record is stored under '<uid>.json'; '<assembly accession>.json' is a
# relative symlink to that file.
for uid in tqdm(genomes_df['assembly_uid']):
    uid_file = esummary_file('assembly', uid)
    if uid_file.is_file():
        continue  # already cached by a previous run

    # Close the HTTP handle even if JSON parsing fails.
    handle = Entrez.esummary(db='assembly', id=uid, retmode='json')
    try:
        data = json.load(handle)
    finally:
        handle.close()
    # presumably extracts the single record dict from the response — see entrez_tools
    summary = ez.get_esummary_result_json(data)

    with uid_file.open('w') as f:
        json.dump(summary, f)

    acc = summary['assemblyaccession']
    acc_file = esummary_file('assembly', acc)
    # Skip an alias left over from an interrupted run so that symlink_to()
    # does not raise FileExistsError.
    if not (acc_file.exists() or acc_file.is_symlink()):
        acc_file.symlink_to(uid_file.name)

  0%|          | 0/20 [00:00<?, ?it/s]

In [24]:
# Load the cached assembly summaries, in the same row order as genomes_df.
assembly_esummaries = [
    json.loads(esummary_file('assembly', assembly_uid).read_text())
    for assembly_uid in genomes_df['assembly_uid']
]

In [25]:
# Assembly accession of each genome, taken from its cached ESummary record.
assembly_accessions = [record['assemblyaccession'] for record in assembly_esummaries]
genomes_df['assembly_acc'] = assembly_accessions

## Write output

In [26]:
# Write the combined genomes table, without the DataFrame index column.
genomes_table_path = outfiles['genomes_table']
genomes_df.to_csv(genomes_table_path, index=False)

In [27]:
# Write the nuccore ESummary table, without the DataFrame index column.
nuccore_table_path = outfiles['nuccore_table']
nuccore_summary_df.to_csv(nuccore_table_path, index=False)