In [1]:
from pathlib import Path
import json
from urllib.request import urlopen
import tarfile

In [2]:
import pandas as pd
from tqdm import tqdm

In [None]:
# Contact address attached to `Entrez`.
# NOTE(review): `Entrez` is not imported in any cell visible in this notebook, so this
# line raises NameError on a fresh kernel — presumably it refers to Bio.Entrez; confirm
# and add the import, or drop the cell. Consider reading the address from configuration
# instead of hardcoding it in the notebook.
Entrez.email = 'mjlumpe@gmail.com'

## Setup

In [4]:
# Date stamp (presumably YYMMDD) and the notebook name derived from it.
# NBNAME is used below to build the output directory and file names.
DATESTR = '220222'
NBNAME = f'{DATESTR}-ondov-2016'

In [5]:
# Input files, keyed by role.
infiles = {
    # Tab-separated text file; the second field of each line is read as the ID.
    'ids': Path('../../data/external/ondov-2016/Escherichia/ids.txt'),
    # Gzipped tar archive holding one JSON summary per assembly.
    'esummaries': Path('../../data/external/ondov-2016/Escherichia/210902-Escherichia-genome-assembly-summaries.tar.gz'),
    # CSV table of genomes that this notebook reformats.
    'genomes': Path('../../data/processed/210902-mash-Escherichia-genomes/210902-get-genomes/210902-mash-genomes-filtered.csv'),
}

In [6]:
# Directory that receives this notebook's outputs.
# parents=True so the cell also works when the top-level 'data-processed' directory
# does not exist yet (plain exist_ok=True raises FileNotFoundError in that case).
processed_out = Path('data-processed') / NBNAME
processed_out.mkdir(parents=True, exist_ok=True)

# Output files, keyed by role.
outfiles = dict(
    table=processed_out / f'{NBNAME}-genomes.csv',
)

## Reformat existing table

In [7]:
# Preview the first three rows of an existing genomes table (the path suggests it was
# produced by the 220220-konstantinidis-2005 notebook).
# NOTE(review): presumably shown as a reference for the target column layout — confirm.
pd.read_csv('data-processed/220220-konstantinidis-2005/220220-konstantinidis-2005-genomes.csv').head(3)

Unnamed: 0,group,strain,assembly_uid,assembly_accession,organism,taxid,filename,url,md5
0,Enterics,E. coli O157:H7 Sakai,1755381,GCF_000008865.2,Escherichia coli O157:H7 str. Sakai (E. coli),386585,GCF_000008865.2.fasta.gz,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,e7bf4b6a0fa4cbd17a2cac69125311e0
1,Enterics,E. coli 0157:H7 EDL933,199821,GCF_000732965.1,Escherichia coli O157:H7 str. EDL933 (E. coli),155864,GCF_000732965.1.fasta.gz,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,c16ebef14b4b9536207b545b80e0b2e5
2,Enterics,E. coli K12,79781,GCF_000005845.2,Escherichia coli str. K-12 substr. MG1655 (E. ...,511145,GCF_000005845.2.fasta.gz,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,c13d459b5caa702ff7e1f26fe44b8ad7


In [11]:
import dtale

In [8]:
# Load the genomes table that the rest of the notebook reformats.
df = pd.read_csv(infiles['genomes'])

In [12]:
# Open the table in D-Tale for interactive inspection.
# NOTE(review): exploratory only — no later cell uses the result; consider removing
# this cell together with the late `import dtale`.
dtale.show(df)

2022-03-01 15:51:53,942 - INFO     - Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-03-01 15:51:53,942 - INFO     - NumExpr defaulting to 8 threads.




Exception occurred while processing request: 'NoneType' object is not subscriptable
Traceback (most recent call last):
  File "/home/jared/opt/anaconda3/envs/gambit/lib/python3.9/site-packages/dtale/views.py", line 114, in _handle_exceptions
    return func(*args, **kwargs)
  File "/home/jared/opt/anaconda3/envs/gambit/lib/python3.9/site-packages/dtale/views.py", line 2923, in get_scatter
    y_cols = [cols[1], idx_col]
TypeError: 'NoneType' object is not subscriptable


2022-03-01 15:54:55,457 - ERROR    - Exception occurred while processing request: 'NoneType' object is not subscriptable
Traceback (most recent call last):
  File "/home/jared/opt/anaconda3/envs/gambit/lib/python3.9/site-packages/dtale/views.py", line 114, in _handle_exceptions
    return func(*args, **kwargs)
  File "/home/jared/opt/anaconda3/envs/gambit/lib/python3.9/site-packages/dtale/views.py", line 2923, in get_scatter
    y_cols = [cols[1], idx_col]
TypeError: 'NoneType' object is not subscriptable


In [None]:
# Discard the existing 'index' column, index the table by 'mash_index' instead,
# and label that new index 'index'.
df = (
    df
    .drop(columns='index')
    .set_index('mash_index')
    .rename_axis('index')
)

Add original MASH ID

In [None]:
# The ID is the second tab-separated field on each line of the ids file.
with open(infiles['ids']) as ids_file:
    ids = [entry.strip().split('\t')[1] for entry in ids_file]

# Index values are offset by one relative to positions in `ids`
# (presumably 1-based line numbers — confirm against the ids file).
df.insert(0, 'id', [ids[position - 1] for position in df.index])

Additional columns

In [None]:
# Placeholder columns with constant defaults.
for column, default in (('genus', ''), ('species', ''), ('taxid', 0)):
    df[column] = default

# Sequence file name derived from the assembly accession.
df['filename'] = ['{}.fasta.gz'.format(accession) for accession in df['assembly_accession']]
df['md5'] = ''

In [None]:
# Display the table as it stands after the preceding cells.
df

## Add values from ESummary data

In [None]:
# Look up each assembly's ESummary record in the archive and copy its taxonomy ID into
# the table. Archive members are addressed as './<assembly_uid>.json'.
with tarfile.open(infiles['esummaries'], 'r:gz') as tar:
    # Iterate over the one column that is needed instead of df.iterrows(): iterrows
    # builds a full Series per row and does not preserve per-column dtypes, which
    # could change how the UID is formatted into the member name.
    for i, assembly_uid in df['assembly_uid'].items():
        with tar.extractfile(f'./{assembly_uid}.json') as f:
            summary = json.load(f)

        df.loc[i, 'taxid'] = int(summary['taxid'])

In [None]:
# Inspect the ESummary record left in `summary` by the last iteration of the loop above.
summary

In [None]:
# NOTE(review): looks like a leftover from interactive tab-completion. The archive bound
# to `tar` has already been closed by the `with` block above, and tarfile.TarFile does
# not appear to define a `get` attribute, so this likely raises AttributeError —
# consider removing this cell.
tar.get

## Add additional columns

In [None]:
# Build one record of extra columns per assembly, in the table's row order.
# NOTE(review): `summaries` and `seq_url_from_esummary` are not defined in any cell
# visible in this notebook — confirm where they come from.
extra = []

for uid in df['assembly_uid']:
    esummary = summaries[uid]
    record = {
        'organism': esummary['organism'],
        'taxid': esummary['taxid'],
        'filename': esummary['assemblyaccession'] + '.fasta.gz',
        'url': seq_url_from_esummary(esummary),
    }
    extra.append(record)

In [None]:
# Attach the extra columns to the table.
# The records in `extra` follow df's row order, but a frame built from them gets a
# default 0..n-1 index while df is indexed by 'mash_index'. A column-wise concat aligns
# on index labels, so the new frame is built on df's own index to keep each record on
# the row it was computed for.
# NOTE(review): `extra` also carries 'taxid' and 'filename', which df already has from
# earlier cells, so the result will contain duplicate column names — confirm intent.
df = pd.concat([df, pd.DataFrame(extra, index=df.index)], axis=1)

## Get sequence file checksums

In [None]:
def get_md5(ftp_url):
    """Look up a remote file's MD5 checksum in its directory's ``md5checksums.txt``.

    Parameters
    ----------
    ftp_url : str
        URL of the file. The listing is fetched from ``md5checksums.txt`` in the
        same directory (everything before the last ``/``).

    Returns
    -------
    str
        The checksum listed for ``./<filename>``.

    Raises
    ------
    LookupError
        If the listing has no entry for the file.
    """
    ftp_dir, filename = ftp_url.rsplit('/', 1)
    target = './' + filename

    # Close the connection deterministically rather than leaving it to the GC.
    with urlopen(ftp_dir + '/md5checksums.txt') as response:
        data = response.read().decode()

    for line in data.splitlines():
        # Each entry is "<checksum> <path>". Split once so blank or malformed lines
        # are skipped instead of raising on tuple unpacking.
        fields = line.split(maxsplit=1)
        if len(fields) != 2:
            continue
        checksum, file = fields
        if file.rstrip() == target:
            return checksum

    # Explicit exception instead of `assert 0`, which is stripped under `python -O`
    # and carries no diagnostic message.
    raise LookupError(f'No checksum entry for {target!r} in {ftp_dir}/md5checksums.txt')

In [None]:
# Mark every row as lacking a checksum; the loop in the next cell fills in rows whose
# value is None. This replaces the '' placeholders assigned to 'md5' in an earlier cell.
df['md5'] = None

In [None]:
# Fetch the checksum for every row that does not have one yet, so the cell can be
# re-run after an interruption without repeating lookups that already succeeded.
# total= gives tqdm a length to report progress against (the iterrows() generator has
# none); pd.isna() matches both None and NaN, the latter being what a missing value
# looks like after a CSV round-trip.
for i, row in tqdm(df.iterrows(), total=len(df)):
    if pd.isna(row['md5']):
        df.loc[i, 'md5'] = get_md5(row['url'])

## Write output

In [None]:
# Write the final table to the output CSV.
# NOTE(review): index=False omits the index, which an earlier cell set to the
# 'mash_index' values (named 'index') — confirm it is not needed in the output file.
df.to_csv(outfiles['table'], index=False)

In [None]:
# Bundle one ESummary JSON file per genome (named '<assembly_accession>.json') into a
# gzipped tar archive.
# NOTE(review): `outfiles` as defined in the visible cells has no 'esummaries' key, and
# `summaries` is not defined in any visible cell — confirm where they come from.
esummary_file = outfiles['esummaries']

# Staging directory next to the archive, named after the part of the archive's file
# name before its first '.'.
tmpdir = esummary_file.parent / esummary_file.name.split('.')[0]
tmpdir.mkdir()

try:
    for accession, uid in zip(df['assembly_accession'], df['assembly_uid']):
        with open(tmpdir / f'{accession}.json', 'w') as f:
            json.dump(summaries[uid], f)

    # Same layout as `tar -czf <archive> -C <tmpdir> .` (members named './<file>'),
    # written with the already-imported tarfile module instead of shelling out with
    # interpolated paths.
    with tarfile.open(esummary_file, 'w:gz') as archive:
        archive.add(tmpdir, arcname='.')
finally:
    # Remove the staging directory even if writing failed part-way through.
    for staged in tmpdir.iterdir():
        staged.unlink()
    tmpdir.rmdir()