In [1]:
import pandas as pd
from mg_toolkit.metadata import OriginalMetadata
from mg_toolkit.bulk_download import BulkDownloader
import os
from requests import get
from pathlib import Path

In [2]:
def fetch_metadata(study_accession):
    '''
    Return the sample metadata table of a study, downloading it on first use.

    The table is cached at
    ``../tables/<study_accession>/<study_accession>_sample.csv``. When that
    file already exists nothing is fetched; otherwise the metadata is obtained
    through ``OriginalMetadata`` and written to that path first.

    Parameters
    ----------
    study_accession : str
        Study accession used for the query and for the cache path
        (e.g. ``'PRJEB26427'``).

    Returns
    -------
    pandas.DataFrame
        The cached CSV read back with its first column as the index, so the
        result has the same form whether or not the download ran in this call.
    '''
    outfile = Path('../tables') / study_accession / f'{study_accession}_sample.csv'
    if not outfile.exists():
        # Create missing parents too: a plain os.mkdir raises FileNotFoundError
        # when ../tables itself does not exist yet (e.g. on a fresh checkout).
        outfile.parent.mkdir(parents=True, exist_ok=True)
        # Transposed before saving; presumably the fetched mapping is keyed by
        # run/sample id, which then becomes the row index -- TODO confirm.
        metadata = pd.DataFrame(OriginalMetadata(study_accession).fetch_metadata()).T
        metadata.to_csv(outfile)
    return pd.read_csv(outfile, index_col=0)


In [3]:
def stats_downloader(study_accession, pipeline, result_group='statistics'):
    '''
    Run a bulk download for a study unless it has been done before.

    The file ``../tables/<study_accession>/<study_accession>_metadata.tsv`` is
    used as the "already downloaded" marker: when it exists the function
    returns immediately. Otherwise the study folder is created and a
    ``BulkDownloader`` is run with ``../tables`` as its output directory.
    NOTE(review): the marker file is presumably written by
    ``BulkDownloader.run()`` -- confirm against mg_toolkit.

    Parameters
    ----------
    study_accession : str
        Study accession to download (e.g. ``'PRJEB26427'``).
    pipeline : str
        Pipeline version handed to ``BulkDownloader`` (e.g. ``'4.1'``).
    result_group : str, optional
        Result group handed to ``BulkDownloader``; defaults to ``'statistics'``.

    Returns
    -------
    None
    '''
    outdir = Path('../tables')
    study_dir = outdir / study_accession
    marker = study_dir / f'{study_accession}_metadata.tsv'
    if marker.exists():
        return
    # parents=True also creates ../tables itself; a plain os.mkdir raises
    # FileNotFoundError when that parent folder is missing.
    study_dir.mkdir(parents=True, exist_ok=True)
    data = BulkDownloader(study_accession, outdir, pipeline, result_group)
    data.run()
    return

In [4]:
def read_statistics(study_accession):
    '''
    Load the metadata table of a study into one DataFrame.

    Reads the tab-separated file
    ``../tables/<study_accession>/<study_accession>_metadata.tsv`` in pieces of
    20000 rows and stacks the pieces row-wise.

    Parameters
    ----------
    study_accession : str
        Study accession that names both the folder and the file.

    Returns
    -------
    pandas.DataFrame
        All rows of the file, concatenated along axis 0.
    '''
    # The file is read chunk by chunk because of the tokenizing "out of memory" error discussed in
    # https://stackoverflow.com/questions/41303246/error-tokenizing-data-c-error-out-of-memory-pandas-python-large-file-csv/41303449
    tsv_path = f'../tables/{study_accession}/{study_accession}_metadata.tsv'
    reader = pd.read_csv(tsv_path, sep='\t', chunksize=20000)
    pieces = [piece for piece in reader]
    return pd.concat(pieces, axis=0)

In [5]:
def item_downloader(accession, df):
    '''
    Download every file listed in ``df`` that is not on disk yet.

    Each row is saved to
    ``../data/<accession>/<pipeline_version>/<group_type>/<name>``, where
    ``group_type`` is lower-cased and has its spaces replaced by underscores.
    Files that already exist are skipped, so the function can simply be
    re-run to resume an interrupted download. The path of every newly written
    file is printed.

    Parameters
    ----------
    accession : str
        Study accession; names the top-level folder under ``../data``.
    df : pandas.DataFrame
        One row per file, with the columns ``pipeline_version``,
        ``group_type``, ``name`` and ``download_url``.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If a download is answered with an HTTP error status. Nothing is
        written for that file, so an error page is never cached as data.
    '''
    # itertuples walks the rows positionally, so duplicate index labels cannot
    # turn a cell lookup into a Series the way df.loc[i, col] would.
    for row in df.itertuples(index=False):
        group_dir = str(row.group_type).replace(' ', '_').lower()
        # Use the `accession` argument, not a notebook-level global.
        outdir = Path('../data') / accession / str(row.pipeline_version) / group_dir
        outfile = outdir / row.name
        if outfile.exists():
            continue
        # The target is several levels deep; create all missing parents.
        outdir.mkdir(parents=True, exist_ok=True)
        # One request per file; redirects are followed to reach the payload.
        r = get(row.download_url, allow_redirects=True)
        r.raise_for_status()
        # write_bytes opens, writes and closes the file in one step.
        outfile.write_bytes(r.content)
        print(outfile)
    return

The list of studies was downloaded as a CSV file from https://www.ebi.ac.uk/metagenomics/search.
Here, studies of the human skin microbiome are downloaded.

In [6]:
# Study list exported from the metagenomics search page; preview its first two rows.
df_studies = pd.read_csv(Path('../data') / 'search_download.csv')
df_studies.head(n=2)

Unnamed: 0,ENA_PROJECT,METAGENOMICS_ANALYSES,METAGENOMICS_SAMPLES,biome_name,centre_name,creation_date,description,domain_source,id,last_modification_date,name,releaseDate_date
0,PRJEB26427,MGYA00381378,ERS2431659,Skin,P&G Singapore Innovation Center,Metagenomics samples of multiple skin sites (u...,metagenomics_projects,MGYS00005102,Understanding the microbial basis of body odor...,,,
1,PRJNA314604,MGYA00497609,SRS1333647,Skin,NYU Langone Medical Center,To characterize the diversity of cutaneous mic...,metagenomics_projects,MGYS00005212,Body site is a more determinant factor than hu...,,,


In [7]:
# Inside a notebook, progress bars should come from tqdm.notebook.tqdm; the plain tqdm prints new bars repeatedly:
# https://stackoverflow.com/questions/42212810/tqdm-in-jupyter-notebook-prints-new-progress-bars-repeatedly
study_accession = 'PRJEB26427'
# Sample metadata first, then the bulk download, then the table read back from disk.
PRJEB26427_sample = fetch_metadata(study_accession)
stats_downloader(study_accession, pipeline='4.1', result_group='statistics')
PRJEB26427_downloadables = read_statistics(study_accession)

In [8]:
# Display the sample metadata table of the study (the rendered output elides the middle rows).
PRJEB26427_sample

Unnamed: 0,collection date,environment (biome),environment (feature),environment (material),geographic location (country and/or sea),geographic location (latitude),geographic location (longitude),human skin environmental package,investigation type,project name,sequencing method,ENA-CHECKLIST,ENA-SPOT-COUNT,ENA-BASE-COUNT,ENA-FIRST-PUBLIC,ENA-LAST-UPDATE,Sample,Read depth
ERR2538349,2015-08,Human-associated habitat,Malodor aspect,skin,Philippines,14.5995DD,120.9842DD,human-skin,metagenome,Understanding the microbial basis of body odor...,Illumina HiSeq,ERC000017,31417827,3136503308,2018-05-07,2018-04-26,ERS2431609,
ERR2538350,2015-08,Human-associated habitat,Malodor aspect,skin,Philippines,14.5995DD,120.9842DD,human-skin,metagenome,Understanding the microbial basis of body odor...,Illumina HiSeq,ERC000017,27827945,2778965519,2018-05-07,2018-04-26,ERS2431610,
ERR2538351,2015-08,Human-associated habitat,Malodor aspect,skin,Philippines,14.5995DD,120.9842DD,human-skin,metagenome,Understanding the microbial basis of body odor...,Illumina HiSeq,ERC000017,38533917,3848100187,2018-05-07,2018-04-26,ERS2431611,
ERR2538352,2015-08,Human-associated habitat,Malodor aspect,skin,Philippines,14.5995DD,120.9842DD,human-skin,metagenome,Understanding the microbial basis of body odor...,Illumina HiSeq,ERC000017,35959808,3590778644,2018-05-07,2018-04-26,ERS2431612,
ERR2538353,2015-08,Human-associated habitat,Malodor aspect,skin,Philippines,14.5995DD,120.9842DD,human-skin,metagenome,Understanding the microbial basis of body odor...,Illumina HiSeq,ERC000017,26637196,2660198624,2018-05-07,2018-04-26,ERS2431613,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERR2538523,2015-08,Human-associated habitat,Malodor aspect,skin,Philippines,14.5995DD,120.9842DD,human-skin,metagenome,Understanding the microbial basis of body odor...,Illumina HiSeq,ERC000017,27391255,2732856825,2018-05-07,2018-04-26,ERS2431783,
ERR2538524,2015-08,Human-associated habitat,Malodor aspect,skin,Philippines,14.5995DD,120.9842DD,human-skin,metagenome,Understanding the microbial basis of body odor...,Illumina HiSeq,ERC000017,11858320,1181401371,2018-05-07,2018-04-26,ERS2431784,
ERR2538525,2015-08,Human-associated habitat,Malodor aspect,skin,Philippines,14.5995DD,120.9842DD,human-skin,metagenome,Understanding the microbial basis of body odor...,Illumina HiSeq,ERC000017,12816015,1277202640,2018-05-07,2018-04-26,ERS2431785,
ERR2538526,2015-08,Human-associated habitat,Malodor aspect,skin,Philippines,14.5995DD,120.9842DD,human-skin,metagenome,Understanding the microbial basis of body odor...,Illumina HiSeq,ERC000017,14769354,1473118680,2018-05-07,2018-04-26,ERS2431786,


In [9]:
# Number of rows per result group in the downloaded table.
PRJEB26427_downloadables['group_type'].value_counts()

Sequence data                  2413
non-coding RNAs                2048
Taxonomic analysis SSU rRNA     895
Taxonomic analysis LSU rRNA     895
Functional analysis             599
Name: group_type, dtype: int64

In [10]:
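# Not run: the 'functional_analysis' group is large (FASTQ_InterPro.tsv.gz is roughly 500 MB).
# NOTE(review): `downloader` is not defined in the cells shown here; `stats_downloader` takes the same three arguments -- confirm.
#downloader(study_accession, '4.1', 'functional_analysis')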

In [11]:
# Rows of the downloaded table whose result group is 'Functional analysis'.
PRJEB26427_downloadables.loc[PRJEB26427_downloadables['group_type'].eq('Functional analysis')]

Unnamed: 0,analysis_id,name,group_type,description,download_url,pipeline_version,experiment_type
15,MGYA00381356,ERR2538349_FASTQ_GO.csv,Functional analysis,Complete GO annotation,https://www.ebi.ac.uk/metagenomics/api/v1/anal...,4.1,metagenomic
16,MGYA00381356,ERR2538349_FASTQ_GO_slim.csv,Functional analysis,GO slim annotation,https://www.ebi.ac.uk/metagenomics/api/v1/anal...,4.1,metagenomic
17,MGYA00381356,ERR2538349_FASTQ_InterPro.tsv.gz,Functional analysis,InterPro matches,https://www.ebi.ac.uk/metagenomics/api/v1/anal...,4.1,metagenomic
55,MGYA00381357,ERR2538350_FASTQ_GO.csv,Functional analysis,Complete GO annotation,https://www.ebi.ac.uk/metagenomics/api/v1/anal...,4.1,metagenomic
56,MGYA00381357,ERR2538350_FASTQ_GO_slim.csv,Functional analysis,GO slim annotation,https://www.ebi.ac.uk/metagenomics/api/v1/anal...,4.1,metagenomic
...,...,...,...,...,...,...,...
6783,MGYA00381533,ERR2538526_FASTQ_InterPro.tsv.gz,Functional analysis,InterPro matches,https://www.ebi.ac.uk/metagenomics/api/v1/anal...,4.1,metagenomic
6823,MGYA00381534,ERR2538527_FASTQ_GO.csv,Functional analysis,Complete GO annotation,https://www.ebi.ac.uk/metagenomics/api/v1/anal...,4.1,metagenomic
6824,MGYA00381534,ERR2538527_FASTQ_GO_slim.csv,Functional analysis,GO slim annotation,https://www.ebi.ac.uk/metagenomics/api/v1/anal...,4.1,metagenomic
6825,MGYA00381534,ERR2538527_FASTQ_InterPro_1.tsv.gz,Functional analysis,InterPro matches,https://www.ebi.ac.uk/metagenomics/api/v1/anal...,4.1,metagenomic


In [12]:
# Fetch only the files described as GO slim annotation for this study.
study_accession = 'PRJEB26427'
PRJEB26427_goslim = PRJEB26427_downloadables.loc[
    PRJEB26427_downloadables['description'].eq('GO slim annotation')
]
item_downloader(accession=study_accession, df=PRJEB26427_goslim)