In [1]:
import json
import tarfile
import tempfile
from pathlib import Path
from urllib.request import urlopen

In [2]:
import pandas as pd
from tqdm import tqdm
from Bio import Entrez

In [3]:
from entrez_tools.esearch import esearch_accession
from entrez_tools.esummary import esummary_json
from entrez_tools.db.assembly import seq_url_from_esummary

In [4]:
# Contact address set on Bio.Entrez before any Entrez queries are issued below.
# NOTE(review): presumably sent along with each NCBI E-utilities request so NCBI
# can identify the client -- confirm against the Biopython Entrez documentation.
Entrez.email = 'mjlumpe@gmail.com'

## Setup

In [5]:
# Date stamp (as a string) that prefixes this notebook's name.
DATESTR = '220220'
# Notebook identifier; reused for the output directory and output file names.
NBNAME = f'{DATESTR}-konstantinidis-2005'

In [6]:
# Input files read by this notebook, keyed by a short label.
infiles = {
    'genomes': Path('../../data/external/konstantinidis-2005/210910-genomes.csv'),
}

In [7]:
# Everything this notebook writes goes into a directory named after the notebook.
processed_out = Path('data-processed') / NBNAME
# parents=True: also create `data-processed/` itself when it does not exist yet
# (e.g. on a fresh checkout); without it mkdir raises FileNotFoundError.
processed_out.mkdir(parents=True, exist_ok=True)

# Output files written by this notebook, keyed by a short label.
outfiles = dict(
    table=processed_out / f'{NBNAME}-genomes.csv',
    esummaries=processed_out / f'{NBNAME}-genome-esummaries.tar.gz',
)

## Reformat existing table

In [8]:
# Load the existing genome table from the CSV registered under infiles['genomes'].
df = pd.read_csv(infiles['genomes'])

In [9]:
# Discard the two columns that are not carried over into the reformatted table.
df = df.drop(columns=['description', 'former_name'])

In [10]:
# Use the more explicit column name `assembly_accession` from here on.
df = df.rename(columns={'assembly': 'assembly_accession'})

## Get NCBI data

In [11]:
# Add a placeholder `assembly_uid` column at position 2, filled with None.
# The lookup loop in the next cell only queries rows whose value is still None.
# Note: DataFrame.insert raises if the column already exists, so this cell
# cannot be re-run on the same `df`.
df.insert(2, 'assembly_uid', None)

In [12]:
# Fill in `assembly_uid` for every row that does not have one yet, by searching
# the 'assembly' database for the row's accession. Rows that already hold a
# value are skipped, so the cell can be re-run after an interruption.
for idx, accession, uid in tqdm(zip(df.index, df['assembly_accession'], df['assembly_uid'])):
    if uid is not None:
        continue
    df.loc[idx, 'assembly_uid'] = esearch_accession('assembly', accession)

70it [00:27,  2.59it/s]


In [13]:
# Fetch the esummary records for all assembly UIDs in a single call.
# `summaries` is indexed by assembly UID in the cells below.
summaries = esummary_json('assembly', df['assembly_uid'].tolist())

## Add additional columns

In [14]:
# One record of derived columns per row of `df`, in row order, built from the
# esummary belonging to that row's assembly UID.
extra = [
    {
        'organism': summary['organism'],
        'taxid': summary['taxid'],
        'filename': summary['assemblyaccession'] + '.fasta.gz',
        'url': seq_url_from_esummary(summary),
    }
    for summary in (summaries[uid] for uid in df['assembly_uid'])
]

In [15]:
# Append the derived columns to the right of the existing table (axis=1).
# NOTE(review): column-wise concat matches rows by index label; this assumes `df`
# still has the default 0..n-1 index that the new records frame gets -- confirm.
df = pd.concat([df, pd.DataFrame.from_records(extra)], axis=1)

## Get sequence file checksums

In [16]:
def get_md5(ftp_url):
    """Look up the published MD5 checksum of a remote sequence file.

    Downloads ``md5checksums.txt`` from the directory that contains
    ``ftp_url`` and returns the checksum listed for ``./<filename>``, where
    ``<filename>`` is the last path component of ``ftp_url``.

    Parameters
    ----------
    ftp_url : str
        Full URL of the file whose checksum is wanted. Must contain at least
        one ``/``.

    Returns
    -------
    str
        The checksum string from the matching line of ``md5checksums.txt``.

    Raises
    ------
    AssertionError
        If ``md5checksums.txt`` has no entry for the file.
    """
    ftp_dir, filename = ftp_url.rsplit('/', 1)
    target = './' + filename

    # Close the connection deterministically instead of leaving it to the GC.
    with urlopen(ftp_dir + '/md5checksums.txt') as response:
        data = response.read().decode()

    for line in data.splitlines():
        # Split on the first run of whitespace only, so blank or malformed lines
        # are skipped (instead of raising on unpacking) and a path containing
        # spaces stays in one piece.
        parts = line.split(maxsplit=1)
        if len(parts) != 2:
            continue
        checksum, file = parts
        if file.strip() == target:
            return checksum

    # Raise explicitly: a bare `assert 0` is removed under `python -O`, which
    # would make this function silently return None.
    raise AssertionError(
        f'no checksum entry for {filename!r} in {ftp_dir}/md5checksums.txt'
    )

In [17]:
# Placeholder `md5` column; the loop in the next cell fills in rows that are None.
# Note: re-running this cell resets every value to None, discarding any
# checksums that were already fetched.
df['md5'] = None

In [18]:
# Fetch the checksum for every row that does not have one yet. Rows that
# already hold a value are skipped, so the cell can be re-run after a failed
# download without repeating finished requests.
for idx, url, md5 in tqdm(zip(df.index, df['url'], df['md5'])):
    if md5 is not None:
        continue
    df.loc[idx, 'md5'] = get_md5(url)

70it [00:54,  1.29it/s]


## Write output

In [19]:
# Write the finished table to the path in outfiles['table'], without the row index.
df.to_csv(outfiles['table'], index=False)

In [20]:
esummary_file = outfiles['esummaries']

# Stage one `<assembly_accession>.json` file per row in a scratch directory,
# then pack them into a gzipped tarball. TemporaryDirectory removes the scratch
# directory on exit even when a write fails, and never collides with leftovers
# from an earlier interrupted run.
with tempfile.TemporaryDirectory() as scratch:
    scratch = Path(scratch)

    for accession, uid in zip(df['assembly_accession'], df['assembly_uid']):
        with open(scratch / (accession + '.json'), 'w') as f:
            json.dump(summaries[uid], f)

    # The archive is only opened once every JSON file has been written, so a
    # failure above does not leave a truncated tarball behind.
    with tarfile.open(esummary_file, 'w:gz') as archive:
        for json_path in sorted(scratch.iterdir()):
            # Keep the './<name>' member names that `tar -C <dir> .` produced.
            archive.add(json_path, arcname='./' + json_path.name)