In [1]:
from pandas import DataFrame, read_csv
from urllib.parse import urlencode
from jsonapi_client import Session, Filter
import html
import requests
import json

In [2]:
# ENA Portal API "filereport" endpoint; queried per sample for its FASTQ links.
ena_url = 'https://www.ebi.ac.uk/ena/portal/api/filereport'

# Root URL handed to the jsonapi_client Session that talks to MGnify.
MGNIFY_API_BASE = 'https://www.ebi.ac.uk/metagenomics/api/latest/'

In [3]:
def get_metadata(metadata, key):
    """Return the formatted value of the first metadata entry matching `key`.

    Parameters
    ----------
    metadata : iterable
        Entries that support item access for 'key', 'value' and 'unit'.
    key : str
        Metadata key to look for; the comparison ignores case.

    Returns
    -------
    str or None
        "<value> <unit>" when the matching entry has a non-empty unit,
        "<value>" alone (no trailing space) when it has none, and None
        when no entry matches `key`.
    """
    for m in metadata:
        if m['key'].lower() != key.lower():
            continue
        value = "{value}".format(value=m['value'])
        # Units can contain HTML entities, so decode them before display.
        unit = html.unescape(m['unit']) if m['unit'] else ""
        # Only append the unit when there is one; otherwise the result would
        # end with a dangling space.
        return "{value} {unit}".format(value=value, unit=unit) if unit else value
    return None

In [11]:
# Column headers of the per-sample table, in the order each row is filled in.
column_names = [
    'sample name', 'lineage',
    'biome', 'feature', 'material', 'package',
    'latitude', 'longitude', 'depth', 'elevation', 'region',
    'instrument model', 'investigation type', 'sequencing method',
    'collection-date',
    'forward reads', 'reverse reads',
]

# Start from an empty frame; rows are keyed by sample accession.
df = DataFrame(columns=column_names).rename_axis('sample accession')

# Keys passed to get_metadata() when reading each sample's metadata list.

# Environment description.
env_biome_label = 'environment (biome)'
env_feature_label = 'environment (feature)'
env_material_label = 'environment (material)'
env_package_label = 'environmental package'

# Sampling location.
latitude_label = 'geographic location (latitude)'
longitude_label = 'geographic location (longitude)'
depth_label = 'geographic location (depth)'
elevation_label = 'elevation'
region_label = 'geographic location (country and/or sea,region)'

# Sequencing and collection details.
instrument_model_label = 'instrument model'
investigation_label = 'investigation type'
sequence_method_label = 'sequencing method'
date_label = 'collection date'

# Number of samples to pull from MGnify in this cell (keeps the run short).
MAX_SAMPLES = 6
# Seconds to wait on an ENA request before giving up instead of hanging.
ENA_TIMEOUT = 60

# Metadata keys, ordered to match the DataFrame columns that sit between
# 'lineage' and 'forward reads'.
metadata_labels = (
    env_biome_label,
    env_feature_label,
    env_material_label,
    env_package_label,
    latitude_label,
    longitude_label,
    depth_label,
    elevation_label,
    region_label,
    instrument_model_label,
    investigation_label,
    sequence_method_label,
    date_label,
)

with Session(MGNIFY_API_BASE) as mg:
    # Restrict the listing to terrestrial metagenome samples.
    params = {
        'metadata_key': 'investigation type',
        'metadata_value': 'metagenome',
        'lineage': 'root:Environmental:Terrestrial',
        'include': 'runs'
    }
    fltr = Filter(urlencode(params))

    for sample_count, sample in enumerate(mg.iterate('samples', fltr)):
        if sample_count >= MAX_SAMPLES:
            break

        # Ask ENA for the FASTQ links of every read run of this sample.
        ena_params = {
            'accession': sample.accession,
            'result': 'read_run',
            'fields': 'fastq_ftp',
            'format': 'JSON'
        }
        response = requests.get(ena_url, params=ena_params, timeout=ENA_TIMEOUT)
        # Fail loudly on HTTP errors rather than trying to parse an error page.
        response.raise_for_status()
        # An empty body cannot be parsed as JSON; treat it as "no runs".
        data = response.json() if response.text.strip() else []

        # Each run reports its files as one ';'-separated string, so flatten
        # all runs into a single list of links.
        fastq_links = ';'.join(d['fastq_ftp'] for d in data).split(';')
        # The two characters in front of a 9-character extension such as
        # '.fastq.gz' are '_2' for reverse-read files; everything else is
        # counted as forward reads.
        forward_reads = ';'.join(s for s in fastq_links if s[-11:-9] != '_2')
        reverse_reads = ';'.join(s for s in fastq_links if s[-11:-9] == '_2')

        # Walk sample -> runs -> analyses -> downloads and keep the links of
        # the 'JSON Biom' files.
        # NOTE: these are only printed as progress output, not stored in df.
        analyzes = []
        downloads = []
        print(sample.accession, end=': ')
        for run in mg.iterate(f'samples/{sample.accession}/runs'):
            for an in mg.iterate(f'runs/{run.id}/analyses'):
                analyzes.append(an.id)
                for dnl in mg.iterate(f'analyses/{an.id}/downloads'):
                    if dnl.attributes.file_format.name == 'JSON Biom':
                        downloads.append(str(dnl.links.self))
        print(';'.join(analyzes), ';'.join(downloads), sep=': ')

        # One row per sample, in the column order the DataFrame was built with.
        df.loc[sample.accession] = [
            sample.sample_name,
            sample.biome.id,
            *(get_metadata(sample.sample_metadata, label) for label in metadata_labels),
            forward_reads,
            reverse_reads,
        ]

df

ERS3341288: MGYA00594700;MGYA00594735;MGYA00594774: https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00594700/file/ERR3256289_MERGED_FASTQ_SSU_OTU_TABLE_JSON.biom;https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00594735/file/ERR3255920_MERGED_FASTQ_SSU_OTU_TABLE_JSON.biom;https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00594735/file/ERR3255920_MERGED_FASTQ_ITS_ITSoneDB_OTU_TABLE_JSON.biom;https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00594735/file/ERR3255920_MERGED_FASTQ_ITS_UNITE_OTU_TABLE_JSON.biom;https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00594774/file/ERR3255616_MERGED_FASTQ_SSU_OTU_TABLE_JSON.biom
ERS3341289: MGYA00594699;MGYA00594734;MGYA00594773: https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00594699/file/ERR3256290_MERGED_FASTQ_SSU_OTU_TABLE_JSON.biom;https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00594734/file/ERR3255921_MERGED_FASTQ_SSU_OTU_TABLE_JSON.biom;https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA0059473

Unnamed: 0_level_0,sample name,lineage,biome,feature,material,package,latitude,longitude,depth,elevation,region,instrument model,investigation type,sequencing method,collection-date,forward reads,reverse reads
sample accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
ERS3341288,SK-16S-01_S21,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/006/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/006/ERR325...
ERS3341289,SK-16S-02_S22,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/007/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/007/ERR325...
ERS3341290,SK-16S-03_S23,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/008/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/008/ERR325...
ERS3341291,SK-16S-04_S24,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/009/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/009/ERR325...
ERS3341292,SK-16S-05_S25,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/000/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/000/ERR325...
ERS3341293,SK-16S-06_S26,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/001/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/001/ERR325...
