# 221005 Set4 NCBI Data

In [1]:
import json
import re
from collections import Counter
from pathlib import Path

In [2]:
import pandas as pd

from Bio import Entrez

## Setup

In [3]:
# Date stamp; reused as the prefix of the notebook name and of the output file names.
DATESTR = '221005'
# Notebook identifier; used below as the name of the per-notebook output directory.
NBNAME = f'{DATESTR}-set4-ncbi-data'

In [4]:
# Input files keyed by role; 'table' is the CSV that is loaded into set4_df.
infiles = dict(
    table='src/221005-ag-data_v2.csv',
)

In [5]:
# Output directory for this notebook. parents=True so that the cell also works
# on a fresh checkout where 'data-processed/' itself does not exist yet.
data_processed = Path('data-processed') / NBNAME
data_processed.mkdir(parents=True, exist_ok=True)

# Output CSV paths, keyed by the table they hold.
outfiles = dict(
    biosamples=data_processed / f'{DATESTR}-set4-biosamples.csv',
    bioprojects=data_processed / f'{DATESTR}-set4-bioprojects.csv',
)

In [6]:
# Contact address that Bio.Entrez passes along with its NCBI E-utilities requests.
# NOTE(review): a personal address is hard-coded here — consider reading it
# from an environment variable before sharing the notebook.
Entrez.email = 'mjlumpe@gmail.com'

In [7]:
# Root of the local cache for downloaded esummary JSON files.
tmpdir = Path('tmp')

# One cache sub-directory per Entrez database, keyed by database name.
esummary_dirs = {db: tmpdir / db for db in ['biosample', 'bioproject']}

for cache_dir in esummary_dirs.values():
    # parents=True: also create 'tmp/' itself when it does not exist yet,
    # so the notebook runs from a clean working directory.
    cache_dir.mkdir(parents=True, exist_ok=True)

## Helper functions

In [8]:
def read_json_dir(directory, pattern: str = None, d: dict = None):
    """Read all JSON files in directory, return keyed by file name minus ext.

    Parameters
    ----------
    directory
        Directory to scan (non-recursively) for ``*.json`` files. May be a
        ``Path`` or anything ``Path()`` accepts, e.g. a string.
    pattern
        Optional regular expression. When given, only files whose stem matches
        it completely (``re.fullmatch``) are read.
    d
        Optional dict to add the results to (updated in place). A new dict is
        created when omitted.

    Returns
    -------
    dict
        ``d`` (or the new dict), mapping file stem to the decoded JSON content.
    """
    if d is None:
        d = dict()

    # Sorted so that the insertion order of the result does not depend on the
    # order in which the file system happens to list the directory.
    for f in sorted(Path(directory).glob('*.json')):
        if pattern and not re.fullmatch(pattern, f.stem):
            continue

        with open(f, encoding='utf-8') as fobj:
            d[f.stem] = json.load(fobj)

    return d

In [9]:
def fetch_esummaries(db: str, uids, directory: Path, overwrite=False):
    """Fetch esummaries, save to JSON files in path named by UID.

    UIDs whose ``<uid>.json`` file already exists in ``directory`` are skipped
    *before* contacting NCBI unless ``overwrite`` is true, so a fully cached
    (or empty) set of UIDs results in no network request at all.

    Parameters
    ----------
    db
        Entrez database name passed to ``Entrez.esummary``.
    uids
        Iterable of UIDs; duplicates are ignored and each is converted to str.
    directory
        Directory the ``<uid>.json`` files are written to.
    overwrite
        If true, re-fetch and replace files that already exist.

    Returns
    -------
    None
    """
    directory = Path(directory)
    files = {str(uid): directory / f'{uid}.json' for uid in uids}

    to_fetch = []
    for uid in sorted(files):
        if files[uid].is_file() and not overwrite:
            print('Skipping existing UID', uid)
        else:
            to_fetch.append(uid)

    if not to_fetch:
        # Nothing left to download; avoid an esummary call with an empty id list.
        return

    handle = Entrez.esummary(db=db, id=','.join(to_fetch), post=True)
    try:
        result = Entrez.read(handle)
    finally:
        handle.close()

    for summary in result['DocumentSummarySet']['DocumentSummary']:
        # Merging below must not silently drop a field that exists under both.
        assert not summary.keys() & summary.attributes.keys()
        summary_dict = {**summary, **summary.attributes}  # UID in attributes

        uid = summary_dict['uid']
        file = files[uid]

        if file.is_file():
            if overwrite:
                print('Overwriting UID', uid)
            else:
                # Only reachable if the file appeared since the check above,
                # e.g. the response lists the same UID twice.
                print('Skipping existing UID', uid)
                continue

        with open(file, 'w') as f:
            json.dump(summary_dict, f)

In [10]:
def parse_elink_result(result, unique: bool, by_name: bool):
    """Turn a parsed elink result into a mapping keyed by source UID.

    Each element of ``result`` must have exactly one entry in ``'IdList'``
    (the source UID) and a ``'LinkSetDb'`` list whose items carry a
    ``'LinkName'`` and a ``'Link'`` list of ``{'Id': ...}`` entries.

    The shape of the returned dict depends on the two flags:

    * ``by_name`` and ``unique``:   ``{from_uid: {linkname: to_uid}}``
    * ``by_name`` only:             ``{from_uid: {linkname: [to_uid, ...]}}``
    * ``unique`` only:              ``{from_uid: to_uid}``
    * neither:                      ``{from_uid: [to_uid, ...]}``

    With ``unique`` set, an ``AssertionError`` is raised as soon as a second
    target shows up for the same key. Source UIDs without any link do not
    appear in the result.
    """

    def iter_links():
        # Flatten the nested result into (source uid, link name, target uid).
        for linkset in result:
            (source,) = linkset['IdList']
            for linksetdb in linkset['LinkSetDb']:
                name = linksetdb['LinkName']
                for link in linksetdb['Link']:
                    yield source, name, link['Id']

    id_map = {}

    for source, name, target in iter_links():
        if by_name and unique:
            per_name = id_map.setdefault(source, {})
            assert name not in per_name
            per_name[name] = target
        elif by_name:
            id_map.setdefault(source, {}).setdefault(name, []).append(target)
        elif unique:
            assert source not in id_map
            id_map[source] = target
        else:
            id_map.setdefault(source, []).append(target)

    return id_map

## Load data

In [11]:
# Load the input table; the first CSV column becomes the index.
set4_df = pd.read_csv(infiles['table'], index_col=0)

In [12]:
# Keep only rows that have a biosample accession, and re-index them by it.
# The previous index is preserved as an ordinary column by reset_index().
cols = ['entity:miniseq_id', 'biosample_accession', 'submission_id']
bs_df = (
    set4_df
    .loc[set4_df.biosample_accession.notnull(), cols]
    .reset_index()
    .set_index('biosample_accession')
)

In [13]:
# The biosample accessions are the index of bs_df; they are used as keys
# throughout, so they must be unique.
bs_accs = bs_df.index
assert bs_accs.is_unique
# Number of biosamples; later cells assert against this count.
num_bs = len(bs_accs)
num_bs

572

## Get biosamples

### Fetch summaries

In [14]:
# Load the biosample summaries that are already in the local cache.
# The r'\d+' pattern keeps only files whose stem consists of digits.
bs_summaries_by_uid = read_json_dir(esummary_dirs['biosample'], r'\d+')
len(bs_summaries_by_uid)

572

In [15]:
# Accessions from the table for which no summary is cached yet.
bs_to_fetch = set(bs_accs) - {summary['Accession'] for summary in bs_summaries_by_uid.values()}
print(len(bs_to_fetch), 'to fetch')

if bs_to_fetch:
    # Find uids: one esearch over the ACCN field with all missing accessions OR-ed together.
    result = Entrez.read(Entrez.esearch(db='biosample', term=' OR '.join(bs_to_fetch), field='ACCN', post=True, retmax=len(bs_to_fetch)))
    # Printed for manual comparison only; a mismatch does not stop the cell.
    print('Got %s results of expected %d' % (result['Count'], len(bs_to_fetch)))

    # Download the summaries for the UIDs found and write them to the cache.
    fetch_esummaries('biosample', result['IdList'], esummary_dirs['biosample'])

0 to fetch


In [16]:
# Re-read the cache into the same dict (updated in place) so that summaries
# fetched in the previous cell are included, then check the total count.
read_json_dir(esummary_dirs['biosample'], r'\d+', bs_summaries_by_uid)
assert len(bs_summaries_by_uid) == num_bs

### Extract summary info

In [17]:
# UID -> accession, read from each cached summary.
bs_uid_to_acc = {
    uid: summary['Accession']
    for uid, summary in bs_summaries_by_uid.items()
}
# Accession -> UID (inverse of the above).
bs_acc_to_uid = {acc: uid for uid, acc in bs_uid_to_acc.items()}
# Accession -> full summary dict.
bs_summaries = {
    bs_uid_to_acc[uid]: summary
    for uid, summary in bs_summaries_by_uid.items()
}

# A duplicated accession would collapse entries and make these counts differ.
assert len(bs_summaries) == num_bs
assert len(bs_uid_to_acc) == num_bs
assert len(bs_acc_to_uid) == num_bs

bs_uids = set(bs_uid_to_acc)

In [18]:
# The Series is indexed by accession, so the assignment aligns on bs_df's
# accession index; the assert verifies that every row received a UID.
bs_df['biosample_uid'] = pd.Series(bs_acc_to_uid)
assert not bs_df.biosample_uid.isnull().any()

## Inspect "Identifiers" field

In [19]:
def parse_identifiers(identifiers: str):
    """Parse an "Identifiers" string into a dict.

    The input is a ``'; '``-separated list of ``'key: value'`` items, e.g.
    ``'BioSample: SAMN01; SRA: SRS02'``.

    Parameters
    ----------
    identifiers
        The string to parse. An empty string yields an empty dict.

    Returns
    -------
    dict
        Maps each key to its value. If a key occurs more than once, the last
        value wins.

    Raises
    ------
    ValueError
        If a non-blank item does not contain the ``': '`` separator.
    """
    ids = dict()
    for s in identifiers.split('; '):
        if not s.strip():
            # Empty input or a stray separator: nothing to parse in this item.
            continue
        # Split on the first separator only, so that values which themselves
        # contain ': ' are kept intact instead of breaking the unpacking.
        (k, v) = s.split(': ', 1)
        ids[k] = v
    return ids

In [20]:
# One row per biosample summary, one column per identifier key found in its
# 'Identifiers' string; rows are indexed by the 'BioSample' identifier.
identifiers = pd.DataFrame([
    parse_identifiers(summary['Identifiers'])
    for summary in bs_summaries.values()
]).set_index('BioSample')

# The parsed BioSample identifiers must be exactly the accessions from the table.
assert set(identifiers.index) == set(bs_accs)

In [21]:
# Number of non-null values per identifier column.
identifiers.count()

SRA         493
EDLB-CDC    159
NSPHL       413
dtype: int64

### Add SRA links

In [22]:
# Both frames are indexed by biosample accession, so this aligns row by row;
# biosamples without an SRA identifier are left as missing values.
bs_df['sra'] = identifiers.SRA

## Link biosample to bioproject

In [23]:
# Query the links from every biosample UID to the bioproject database.
# No linkname is given, so several link names are returned per biosample.
result = Entrez.read(Entrez.elink(db='bioproject', dbfrom='biosample', id=bs_uids))
# Unused alternative that restricts the query to a single link name:
# result = Entrez.read(Entrez.elink(db='bioproject', dbfrom='biosample', linkname='biosample_bioproject', id=bs_uids))

In [24]:
# Per biosample UID, a dict of link name -> list of linked bioproject UIDs.
link_bs_to_bp = parse_elink_result(result, unique=False, by_name=True)
# Every biosample must have come back with at least one link.
assert link_bs_to_bp.keys() == bs_uids

### Inspect and organize

In [25]:
# Tally (link name, number of linked bioprojects) over all biosamples to see
# which link names are always single-valued.
Counter(
    (name, len(ids))
    for links in link_bs_to_bp.values()
    for name, ids in links.items()
)

Counter({('biosample_bioproject_sp', 1): 572,
         ('biosample_bioproject', 1): 572,
         ('biosample_bioproject_all', 1): 535,
         ('biosample_bioproject_all', 2): 37})

In [26]:
bs_to_bp = {}        # biosample UID -> its single 'biosample_bioproject' UID
bs_extra_bps = {}    # biosample UID -> remaining UIDs from 'biosample_bioproject_all'
all_bp_uids = set()  # every UID seen under 'biosample_bioproject_all'

for sample_uid, by_linkname in link_bs_to_bp.items():
    # Exactly one 'biosample_bioproject' link is expected; unpacking fails otherwise.
    (main_bp,) = by_linkname['biosample_bioproject']
    assert by_linkname['biosample_bioproject'] == by_linkname['biosample_bioproject_sp']

    bs_to_bp[sample_uid] = main_bp

    # The '_all' list must contain the main project, possibly alongside others.
    bps_all = by_linkname['biosample_bioproject_all']
    assert main_bp in bps_all
    all_bp_uids.update(bps_all)

    if len(bps_all) > 1:
        bs_extra_bps[sample_uid] = [uid for uid in bps_all if uid != main_bp]

In [27]:
# Number of distinct bioprojects linked to any of the biosamples.
len(all_bp_uids)

8

In [28]:
# Number of biosamples per main bioproject UID.
bp_counts_main = Counter(bs_to_bp.values())
bp_counts_main

Counter({'857686': 413,
         '218110': 89,
         '230403': 35,
         '239251': 24,
         '266293': 9,
         '212117': 2})

In [29]:
# Number of biosamples for which each bioproject appears as an additional
# (non-main) link.
bp_counts_extra = Counter(e for extra in bs_extra_bps.values() for e in extra)
bp_counts_extra

Counter({'290730': 35, '211456': 2})

In [30]:
# Main bioproject UID for each row, looked up through the row's biosample UID.
# A plain list is assigned, so values follow bs_df's row order; a biosample
# UID missing from bs_to_bp raises KeyError.
bs_df['bioproject_uid'] = [bs_to_bp[bs] for bs in bs_df.biosample_uid]

### Fetch bioproject summaries

In [31]:
# Download summaries for all linked bioprojects (main and additional) into
# the bioproject cache directory.
fetch_esummaries('bioproject', all_bp_uids, esummary_dirs['bioproject'])

Skipping existing UID 212117
Skipping existing UID 230403
Skipping existing UID 290730
Skipping existing UID 857686
Skipping existing UID 218110
Skipping existing UID 239251
Skipping existing UID 266293
Skipping existing UID 211456


In [32]:
# Load the cached bioproject summaries, keeping only the projects linked to
# this sample set: the cache directory is not specific to this notebook, so
# it may hold summaries that do not belong in this notebook's output table.
bioprojects = {
    uid: summary
    for uid, summary in read_json_dir(esummary_dirs['bioproject']).items()
    if uid in all_bp_uids
}
# Every linked bioproject must have a cached summary at this point.
assert bioprojects.keys() == all_bp_uids, 'missing bioproject summaries'

In [33]:
# One row per bioproject summary, indexed by UID, columns in alphabetical order.
bioproj_df = (
    pd.DataFrame(list(bioprojects.values()))
    .set_index('uid')
    .sort_index(axis=1)
)

# Put the two biosample counts in front; the Counters give 0 for projects
# that never occur in the respective role.
main_counts = [bp_counts_main[uid] for uid in bioproj_df.index]
extra_counts = [bp_counts_extra[uid] for uid in bioproj_df.index]
bioproj_df.insert(0, 'biosample_count', main_counts)
bioproj_df.insert(1, 'biosample_count_extra', extra_counts)

# Largest projects first.
bioproj_df = bioproj_df.sort_values('biosample_count', axis='index', ascending=False)

In [34]:
# Columns to display for a readable overview of the bioprojects.
# Note: this rebinds `cols`, which earlier held the biosample table columns.
cols = ['Project_Acc', 'Project_Name', 'Project_Title', 'Project_Description', 'Submitter_Organization', 'biosample_count', 'biosample_count_extra']
bioproj_df[cols]

Unnamed: 0_level_0,Project_Acc,Project_Name,Project_Title,Project_Description,Submitter_Organization,biosample_count,biosample_count_extra
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
857686,PRJNA857686,hospital metagenome,hospital metagenome Raw sequence reads,Fastq files for HAI bacteria sequenced at the ...,Nevada State Public Health Laboratory submissi...,413,0
218110,PRJNA218110,Enterobacteriaceae,PulseNet Escherichia coli and Shigella genome ...,PulseNet STEC genome reference library,CDC,89,0
230403,PRJNA230403,Salmonella enterica,PulseNet Salmonella enterica Genome sequencing,Salmonella enterica genomes sequenced as part ...,CDC,35,0
239251,PRJNA239251,Campylobacter,Campylobacter Genome sequencing and assembly,Campylobacter genome sequencing,Centers for Disease Control and Prevention,24,0
266293,PRJNA266293,Bacteria,PulseNet Vibrio spp. Genome sequencing,Vibrio spp. genomes sequenced for PulseNet sur...,CDC,9,0
212117,PRJNA212117,Listeria monocytogenes,Listeria monocytogenes Genome sequencing and a...,Genome sequencing of Listeria monocytogenes,"Enteric Diseases Laboratory Branch, Centers fo...",2,0
290730,PRJNA290730,Salmonella enterica,EDLB's sequencing of Salmonella enterica,Genome sequencing of Salmonella enterica,Centers for Disease Control and Prevention,0,35
211456,PRJNA211456,Listeria monocytogenes,Listeria monocytogenes,,CDC,0,2


In [35]:
# Translate each row's main bioproject UID into the project accession taken
# from the 'Project_Acc' field of that project's summary.
bs_df['bioproject_accession'] = [bioprojects[uid]['Project_Acc'] for uid in bs_df.bioproject_uid]

## Check SRA coverage per bioproject

In [36]:
# Per bioproject accession: total number of biosamples and how many of them
# have a non-null 'sra' value.
gb = bs_df.groupby('bioproject_accession')

pd.DataFrame(dict(
    count=gb.size(),
    has_sra=gb['sra'].count(),
))

Unnamed: 0_level_0,count,has_sra
bioproject_accession,Unnamed: 1_level_1,Unnamed: 2_level_1
PRJNA212117,2,2
PRJNA218110,89,89
PRJNA230403,35,35
PRJNA239251,24,24
PRJNA266293,9,9
PRJNA857686,413,334


## Output

In [37]:
# Write the biosample table (indexed by biosample accession) to CSV.
bs_df.to_csv(outfiles['biosamples'])

In [38]:
# Write the bioproject table (indexed by bioproject UID) to CSV.
bioproj_df.to_csv(outfiles['bioprojects'])