In [1]:
from pathlib import Path
import json
import re
from urllib.request import urlretrieve

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
import os

from Bio import Entrez

# Contact address that Bio.Entrez attaches to its requests. It can be
# overridden through the ENTREZ_EMAIL environment variable so that other users
# do not have to edit the notebook; the original address stays the default.
Entrez.email = os.environ.get('ENTREZ_EMAIL', 'mjlumpe@gmail.com')

## Setup

In [4]:
# Date prefix used in the names of this notebook's outputs.
DATESTR = '210902'
# Notebook identifier: the date prefix followed by a short description.
NBNAME = f'{DATESTR}-get-genomes'

In [5]:
# Input files read by this notebook, keyed by a short name.
# NOTE(review): this is an absolute path on the author's machine; adjust it
# when running elsewhere.
infiles = {
    'ids': Path('/home/jared/projects/gambit/data/external/mash/Escherichia/ids.txt'),
}

In [6]:
# Directory for this notebook's final (processed) outputs.
processed_out = Path('data-processed') / NBNAME
# parents=True so a fresh checkout without 'data-processed/' does not fail.
processed_out.mkdir(exist_ok=True, parents=True)

In [7]:
# Directory for this notebook's intermediate outputs.
intermediate_out = Path('data-intermediate') / NBNAME
# parents=True so a fresh checkout without 'data-intermediate/' does not fail.
intermediate_out.mkdir(exist_ok=True, parents=True)

In [8]:
# Scratch directory shared between notebooks (relative to the working
# directory); created if it does not exist yet.
tmpdir = Path('tmp')
tmpdir.mkdir(exist_ok=True)

In [9]:
# Scratch directory private to this notebook, nested under the shared one.
nb_tmpdir = tmpdir.joinpath('notebooks', NBNAME)
nb_tmpdir.mkdir(parents=True, exist_ok=True)

## Load and format data

In [10]:
# Parse the ID list. Every line is expected to look like
#     <index>\trefseq-<field>-<field>-...-<field>.fna
# where <index> is the 1-based line number and the dash-separated fields after
# "refseq-" become the remaining columns.
# The dot before "fna" is escaped so it only matches a literal '.', and the
# trailing newline is optional so a final line without one is still accepted.
_line_re = re.compile(r'(\d+)\trefseq-(.*)\.fna\n?')

_rows = []

with open(infiles['ids']) as f:
    for i, line in enumerate(f):
        m = _line_re.fullmatch(line)
        if m is None:
            # Fail with a message naming the offending line instead of an
            # AttributeError on None.
            raise ValueError(f'Unexpected format on line {i + 1} of {infiles["ids"]}: {line!r}')

        index = int(m.group(1))
        rest = m.group(2).split('-')

        # The index column must simply count the lines, starting from 1.
        assert index == i + 1
        _rows.append((index, *rest))

df = pd.DataFrame.from_records(
    _rows,
    columns=['index', 'col1', 'col2', 'col3-bioproject', 'col4-biosample', 'col5', 'col6', 'col7-organism'],
).set_index('index')

Dots (`.`) appear to be used as placeholders for null values; replace them with `None`.

In [11]:
# Overwrite every cell that is exactly the string '.' with None (in place).
df[df == '.'] = None

Column 6 (`col6`) seems to be all dots, i.e. entirely null. Check that and drop it.

In [12]:
# col6 must contain nothing but None before it is discarded.
assert all(entry is None for entry in df['col6'].values)
df.drop(columns='col6', inplace=True)

Identify assembly accessions from column 5

In [13]:
# Flag rows whose column 5 value begins with the 'GCF_' prefix; null values
# count as False.
df['col5_is_acc'] = df['col5'].str.startswith('GCF_', na=False)

### Summary

In [14]:
# Number of non-null values in each column.
df.count()

col1               500
col2               500
col3-bioproject    499
col4-biosample     489
col5               494
col7-organism      500
col5_is_acc        500
dtype: int64

In [15]:
# Display the table built so far.
df

Unnamed: 0_level_0,col1,col2,col3-bioproject,col4-biosample,col5,col7-organism,col5_is_acc
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,NZ,1115511,PRJNA224116,SAMD00019863,GCF_000759775.1,Escherichia_albertii_NBRC_107761,True
2,NZ,1115512,PRJNA224116,,NZ_BAFF,Escherichia_hermannii_NBRC_105704,False
3,NZ,1115515,PRJNA224116,SAMD00018687,GCF_000759795.1,Escherichia_vulneris_NBRC_102420,True
4,NZ,1169321,PRJNA224116,SAMN00847639,GCF_000407765.1,Escherichia_sp._KTE114,True
5,NZ,1169331,PRJNA224116,SAMN00847649,GCF_000350705.1,Escherichia_sp._KTE11,True
...,...,...,...,...,...,...,...
496,NZ,869672,PRJNA179711,SAMN02436638,NZ_AEZL,Escherichia_coli_97.0259,False
497,NZ,869679,PRJNA179715,SAMN02436332,NZ_AEZS,Escherichia_coli_3.2608,False
498,NZ,869687,PRJNA224116,SAMN02436474,GCF_000194495.1,Escherichia_coli_4.0967,True
499,NZ,869693,PRJNA179728,SAMN00116811,NZ_AFAG,Escherichia_coli_TW07793,False


## Get biosample UIDs

The biosample accession seems to be the most consistently present ID value, but ELink won't accept the accession number for some reason. Look up the biosample UIDs from the accession numbers first.

In [16]:
# Map each biosample accession to its numeric UID, using one ESearch query per
# accession. The mapping is cached as JSON in the notebook's scratch directory
# so that re-running the notebook does not repeat the queries.
_file = nb_tmpdir / 'biosample-acc-to-uid.json'

if _file.is_file():
    with _file.open() as f:
        biosample_acc_to_uid = json.load(f)

else:
    biosample_acc_to_uid = dict()

    for acc in tqdm(df['col4-biosample']):
        # Skip rows without a biosample and accessions already looked up.
        if pd.isnull(acc) or acc in biosample_acc_to_uid:
            continue

        # Close the response handle once it has been parsed.
        with Entrez.esearch(db='biosample', term=acc, field='accession') as response:
            result = Entrez.read(response)

        ids = result['IdList']
        assert len(ids) == 1, f'expected exactly one biosample UID for {acc!r}, got {list(ids)!r}'

        biosample_acc_to_uid[acc] = int(ids[0])

    # Only written after every lookup succeeded, so the cache is never partial.
    with _file.open('w') as f:
        json.dump(biosample_acc_to_uid, f)

In [17]:
# Object array so that missing entries stay None instead of becoming NaN
# (np.empty with dtype=object starts out filled with None).
_biosample_uids = np.empty(len(df), dtype=object)
for pos, acc in enumerate(df['col4-biosample']):
    if acc is not None:
        _biosample_uids[pos] = biosample_acc_to_uid[acc]

df['biosample_uid'] = _biosample_uids

## Link biosamples to assembly

In [18]:
# Raw ELink response (biosample -> assembly), cached in the notebook's scratch
# directory so the query is only made once.
biosample_to_assembly_file = nb_tmpdir / 'biosample-to-assembly.xml'

if not biosample_to_assembly_file.is_file():
    ids = [uid for uid in df['biosample_uid'] if uid is not None]

    # NOTE(review): the parsing step expects one link set per input UID;
    # confirm that passing a list as `id` yields that.
    # Close the response handle once the payload has been read.
    with Entrez.elink(dbfrom='biosample', db='assembly', id=ids) as response:
        _data = response.read()

    with biosample_to_assembly_file.open('wb') as f:
        f.write(_data)

In [19]:
# Parse the cached ELink response into {biosample UID: [assembly UIDs]}.
with biosample_to_assembly_file.open('rb') as f:
    results = Entrez.read(f)

biosample_to_assembly_list = {}

for item in results:
    # Each link set must describe exactly one source biosample ...
    source_ids = item['IdList']
    assert len(source_ids) == 1
    # ... and exactly one target database.
    link_dbs = item['LinkSetDb']
    assert len(link_dbs) == 1

    idfrom = int(source_ids[0])
    idto = list(map(int, (link['Id'] for link in link_dbs[0]['Link'])))

    # The source must be one of our biosamples, and must not repeat.
    assert idfrom in df['biosample_uid'].values
    assert idfrom not in biosample_to_assembly_list

    biosample_to_assembly_list[idfrom] = idto

### Resolve

In [20]:
# Biosamples that are not linked to exactly one assembly.
{
    biosample: assemblies
    for biosample, assemblies in biosample_to_assembly_list.items()
    if len(assemblies) != 1
}

{1894029: [691078, 690978]}

In [21]:
# Show the row belonging to the biosample with more than one linked assembly.
df.loc[df['biosample_uid'].eq(1894029)]

Unnamed: 0_level_0,col1,col2,col3-bioproject,col4-biosample,col5,col7-organism,col5_is_acc,biosample_uid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
319,NZ,1286350,PRJNA203638,SAMN01894029,NZ_AORA,Escherichia_coli_TOP2652,False,1894029


Both are assemblies of the same biosample, submitted on the same day: 690978 has assembly method "Velvet v. 0.7" with 90x coverage, while 691078 has assembly method "MIRA v. 3.4.0" with 20x coverage. Go with the former (690978).

### Add to table

In [22]:
# Keep the unambiguous links (exactly one assembly per biosample) ...
biosample_to_assembly = {
    biosample: assemblies[0]
    for biosample, assemblies in biosample_to_assembly_list.items()
    if len(assemblies) == 1
}
# ... and add the manually chosen assembly for the biosample with two.
biosample_to_assembly.update({1894029: 690978})

In [23]:
# Object array so that rows without a biosample UID keep None
# (np.empty with dtype=object starts out filled with None).
_assembly_uids = np.empty(len(df), dtype=object)
for pos, uid in enumerate(df['biosample_uid']):
    if uid is not None:
        _assembly_uids[pos] = biosample_to_assembly[uid]

df['assembly_uid'] = _assembly_uids

In [24]:
# A row lacks an assembly UID exactly when it lacks a biosample UID.
assert (df['biosample_uid'].isnull() == df['assembly_uid'].isnull()).all()

In [25]:
# Rows that still have no assembly UID.
df.loc[pd.isnull(df['assembly_uid'])]

Unnamed: 0_level_0,col1,col2,col3-bioproject,col4-biosample,col5,col7-organism,col5_is_acc,biosample_uid,assembly_uid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,NZ,1115512,PRJNA224116,,NZ_BAFF,Escherichia_hermannii_NBRC_105704,False,,
30,NC,198214,PRJNA62907,,,Shigella_flexneri_2a_str._301,False,,
31,NC,216599,PRJNA224116,,GCF_000283715.1,Shigella_sonnei_53G,True,,
32,NC,300267,PRJNA58213,,,Shigella_dysenteriae_Sd197,False,,
160,NC,316401,PRJNA224116,,GCF_000210475.1,Escherichia_coli_ETEC_H10407,True,,
162,NC,386585,PRJNA57781,,,Escherichia_coli_O157_H7_str._Sakai,False,,
164,NC,431946,PRJNA224116,,GCF_000010485.1,Escherichia_coli_SE15,True,,
165,NC,585057,PRJNA59381,,,Escherichia_coli_IAI39,False,,
280,NZ,1268238,PRJNA186642,,NZ_CAPL,Escherichia_coli_O5_K4_L_H4_str._ATCC_23502,False,,
341,NZ,1433129,PRJNA224116,,NZ_CBWB,Escherichia_coli_IS5,False,,


## Find remaining assembly UIDs by accession

There are a few rows which have an assembly accession in column 5 but no biosample and so no linked assembly UID.

In [26]:
# Rows flagged as having an accession in column 5 but no assembly UID yet.
assembly_acc_no_uid = df['col5_is_acc'] & df['assembly_uid'].isnull()
df.loc[assembly_acc_no_uid]

Unnamed: 0_level_0,col1,col2,col3-bioproject,col4-biosample,col5,col7-organism,col5_is_acc,biosample_uid,assembly_uid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
31,NC,216599,PRJNA224116,,GCF_000283715.1,Shigella_sonnei_53G,True,,
160,NC,316401,PRJNA224116,,GCF_000210475.1,Escherichia_coli_ETEC_H10407,True,,
164,NC,431946,PRJNA224116,,GCF_000010485.1,Escherichia_coli_SE15,True,,


Search by versionless accession, to find the latest version.

In [27]:
# Map each versionless assembly accession to the UIDs ESearch returns for it.
# The mapping is cached as JSON in the notebook's scratch directory.
_file = nb_tmpdir / 'assembly-acc-to-uids.json'

if not _file.is_file():
    assembly_acc_to_uids = {}

    for acc in tqdm(df.loc[assembly_acc_no_uid, 'col5']):
        # Drop the ".<version>" suffix before searching.
        acc2 = acc.partition('.')[0]

        with Entrez.esearch(db='assembly', term=acc2, field='Assembly Accession') as response:
            data = Entrez.read(response)

        ids = [int(found) for found in data['IdList']]
        # Every accession must resolve to at least one UID.
        assert ids

        assembly_acc_to_uids[acc2] = ids

    with _file.open('w') as f:
        json.dump(assembly_acc_to_uids, f)

else:
    with _file.open() as f:
        assembly_acc_to_uids = json.load(f)

In [28]:
# Show the accession -> UID list mapping for inspection.
assembly_acc_to_uids

{'GCF_000283715': [406998],
 'GCF_000210475': [379808],
 'GCF_000010485': [380618]}

Each accession maps to exactly one UID, so assign them.

In [29]:
# Fill in the assembly UID for the rows resolved by accession search.
for idx, acc in df.loc[assembly_acc_no_uid, 'col5'].items():
    # The lookup table is keyed by the versionless accession.
    uids = assembly_acc_to_uids[acc.split('.')[0]]
    assert len(uids) == 1
    df.loc[idx, 'assembly_uid'] = uids[0]

## Get assembly summaries

In [30]:
# Cache directory for assembly summaries, one JSON file per assembly UID.
summaries_dir = tmpdir.joinpath('assembly-summaries')
summaries_dir.mkdir(exist_ok=True)

In [31]:
# Fetch the ESummary record of every assembly UID that is not cached yet.
for uid in tqdm(df['assembly_uid']):
    if uid is None:
        continue

    file = summaries_dir / f'{uid}.json'
    if not file.is_file():
        with Entrez.esummary(db='assembly', id=uid, retmode='json') as response:
            data = json.load(response)

        # The response must describe exactly the UID that was requested.
        assert data['result']['uids'] == [str(uid)]
        data2 = data['result'][str(uid)]

        # Cache only the record for this UID, not the response envelope.
        file.write_text(json.dumps(data2))

100%|██████████| 500/500 [00:00<00:00, 106714.43it/s]


In [32]:
# Load the cached summary record of every assembly, keyed by assembly UID.
summary_data = dict()

# Iterate over the values directly: Series.iteritems() was removed in
# pandas 2.0 and the index label was never used here.
for uid in df['assembly_uid']:
    if uid is None:
        continue

    with open(summaries_dir / f'{uid}.json') as f:
        summary_data[uid] = json.load(f)

## Extract information from summary data

In [33]:
# Collect accession, download URL and organism for each row; rows without an
# assembly UID get None in all three columns.
_accs, _urls, _organisms = [], [], []

for uid in df['assembly_uid']:
    if uid is None:
        for _values in (_accs, _urls, _organisms):
            _values.append(None)
        continue

    data = summary_data[uid]

    # The genome file is named after the last component of the directory path.
    ftp_dir = data['ftppath_refseq']
    basename = ftp_dir.rsplit('/', 1)[1]
    url = f'{ftp_dir}/{basename}_genomic.fna.gz'

    _accs.append(data['assemblyaccession'])
    _urls.append(url)
    _organisms.append(data['organism'])

df['assembly_accession'] = np.asarray(_accs)
df['url'] = np.asarray(_urls)
df['organism'] = np.asarray(_organisms)

### Check where accessions don't match column 5

In [34]:
# Rows where column 5 holds an accession that differs from the one reported in
# the assembly summary.
acc_changed = df['col5_is_acc'] & df['col5'].ne(df['assembly_accession'])
acc_changed.sum()

24

Check they are upgraded versions:

In [35]:
# For every changed accession: same base accession, strictly higher version.
for idx in df.index[acc_changed]:
    acc1, acc2 = df.loc[idx, ['col5', 'assembly_accession']]

    a1, v1 = acc1.split('.')
    a2, v2 = acc2.split('.')

    assert a1 == a2
    assert int(v1) < int(v2)

## Download genomes

In [36]:
# Number of rows without a download URL.
df['url'].isnull().sum()

8

In [37]:
# Download directory for genome files, inside the shared scratch directory.
genomes_dir = tmpdir.joinpath('genomes')
genomes_dir.mkdir(exist_ok=True)

In [38]:
# Download the genome file of every row that has a URL, skipping files that
# are already present.
# total= is passed because itertuples() is a generator without a length.
for row in tqdm(df.itertuples(), total=len(df)):
    if row.url is None:
        continue

    file = genomes_dir / (row.assembly_accession + '.fa.gz')
    if file.is_file():
        continue

    # Download under a temporary name and rename on success. Otherwise an
    # interrupted transfer would leave a truncated file at the final path,
    # which the is_file() check above would treat as complete on the next run.
    partial = file.with_name(file.name + '.part')
    urlretrieve(row.url, partial)
    partial.replace(file)

500it [00:00, 49471.63it/s]


## Save data

In [39]:
# Write the full table, including rows without an assembly.
df.to_csv(processed_out / (DATESTR + '-mash-genomes.csv'))

In [40]:
# Filtered table: only rows with an assembly accession, limited to the assembly
# columns. The original row label is kept as 'mash_index' and the rows are
# renumbered from 1.
df2 = df.loc[
    pd.notnull(df['assembly_accession']),
    ['assembly_accession', 'assembly_uid', 'organism', 'url'],
].copy()

df2.insert(0, 'mash_index', df2.index)
df2.index = pd.RangeIndex(1, df2.shape[0] + 1, name='index')

df2.to_csv(processed_out / f'{DATESTR}-mash-genomes-filtered.csv')