In [1]:
from pandas import DataFrame, read_csv
from urllib.parse import urlencode
from jsonapi_client import Session, Filter
import html
import requests
import json

In [2]:
# Root URL of the MGnify API; handed to jsonapi_client's Session when samples are listed.
MGNIFY_API_BASE = 'https://www.ebi.ac.uk/metagenomics/api/latest/'
# ENA Portal API "filereport" endpoint; queried once per sample for its FASTQ links.
ena_url = 'https://www.ebi.ac.uk/ena/portal/api/filereport'

In [3]:
def get_metadata(metadata, key):
    """Look up one metadata entry and render it as text.

    Args:
        metadata: iterable of entries that support ``entry['key']``,
            ``entry['value']`` and ``entry['unit']``.
        key: name of the entry to find; compared case-insensitively.

    Returns:
        ``"<value> <unit>"`` for the first matching entry when it has a
        non-empty unit (HTML entities in the unit are unescaped),
        ``"<value>"`` alone when the unit is empty or None, and ``None`` when
        no entry matches.
    """
    wanted = key.lower()
    for m in metadata:
        if m['key'].lower() != wanted:
            continue
        text = "{value}".format(value=m['value'])
        unit = m['unit']
        if unit:
            # Only add the separator when there is a unit, so unit-less
            # values do not end up with a trailing space.
            text = "{text} {unit}".format(text=text, unit=html.unescape(unit))
        return text
    return None

In [9]:
# Result table: one row per MGnify sample, indexed by sample accession.
# The column order must match the row list assigned inside the loop below.
df = DataFrame(columns=('sample name',
                        'lineage',
                        'biome',
                        'feature',
                        'material',
                        'package',
                        'latitude',
                        'longitude',
                        'depth',
                        'elevation',
                        'region',
                        'instrument model',
                        'investigation type',
                        'sequencing method',
                        'collection-date',
                        'forward reads',
                        'reverse reads',
                        'analyzes id',
                        'analyzes links'))
df.index.name = 'sample accession'

# Keys looked up in each sample's metadata.
depth_label = 'geographic location (depth)'
latitude_label = 'geographic location (latitude)'
longitude_label = 'geographic location (longitude)'
region_label = 'geographic location (country and/or sea,region)'
date_label = 'collection date'
elevation_label = 'elevation'
env_biome_label = 'environment (biome)'
env_feature_label = 'environment (feature)'
env_material_label = 'environment (material)'
env_package_label = 'environmental package'
instrument_model_label = 'instrument model'
investigation_label = 'investigation type'
sequence_method_label = 'sequencing method'

# Metadata keys in the order of the 'biome' ... 'collection-date' columns of df.
metadata_labels = (
    env_biome_label,
    env_feature_label,
    env_material_label,
    env_package_label,
    latitude_label,
    longitude_label,
    depth_label,
    elevation_label,
    region_label,
    instrument_model_label,
    investigation_label,
    sequence_method_label,
    date_label,
)

# Upper bound (seconds) on a single ENA request, so a stalled connection
# cannot block the whole collection run.
ena_timeout = 60
# Links ending in this suffix are treated as reverse reads; every other link
# is treated as a forward read.
reverse_suffix = '_2.fastq.gz'

with Session(MGNIFY_API_BASE) as mg:
    params = {
        'metadata_key': 'investigation type',
        'metadata_value': 'metagenome',
        'lineage': 'root:Environmental:Terrestrial',
        'include': 'runs'
    }
    fltr = Filter(urlencode(params))
    # Last sample the loop reached; kept at top level so that a later cell can
    # resume from it if this cell is interrupted.
    last_sample = None
    for sample in mg.iterate('samples', fltr):
        last_sample = sample

        # Ask ENA for the FASTQ FTP links of the read runs of this sample.
        ena_params = {
            'accession': sample.accession,
            'result': 'read_run',
            'fields': 'fastq_ftp',
            'format': 'JSON'
        }
        try:
            response = requests.get(ena_url, params=ena_params, timeout=ena_timeout)
            response.raise_for_status()
            data = response.json()
            # One record may carry several ';'-separated links; records
            # without files contribute nothing.
            fastq_links = [link
                           for d in data
                           for link in d['fastq_ftp'].split(';')
                           if link]
            forward_reads = ';'.join(link for link in fastq_links
                                     if not link.endswith(reverse_suffix))
            reverse_reads = ';'.join(link for link in fastq_links
                                     if link.endswith(reverse_suffix))
        except Exception as err:
            # Best effort: a sample whose links cannot be fetched is still
            # recorded, but the failure is reported instead of being hidden.
            print(f'{sample.accession}: could not get read links from ENA ({err!r})')
            forward_reads = None
            reverse_reads = None

        # Walk sample -> runs -> analyses -> downloads and gather the analysis
        # ids and the download links.
        analysis_ids = []
        download_links = []
        for run in mg.iterate(f'samples/{sample.accession}/runs'):
            for an in mg.iterate(f'runs/{run.id}/analyses'):
                analysis_ids.append(an.id)
                for dnl in mg.iterate(f'analyses/{an.id}/downloads'):
                    download_links.append(str(dnl.links.self))
        analyzes = ';'.join(analysis_ids)
        downloads = ';'.join(download_links)

        # Rows are written one at a time so that everything gathered before an
        # interruption is still available in df.
        sample_metadata = sample.sample_metadata
        df.loc[sample.accession] = [
            sample.sample_name,
            sample.biome.id,
            *(get_metadata(sample_metadata, label) for label in metadata_labels),
            forward_reads,
            reverse_reads,
            analyzes,
            downloads
        ]

df.to_csv('diploma_samples_terresrial')

df

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [10]:
# Show the rows that have been written to df so far.
df

Unnamed: 0_level_0,sample name,lineage,biome,feature,material,package,latitude,longitude,depth,elevation,region,instrument model,investigation type,sequencing method,collection-date,forward reads,reverse reads,analyzes id,analyzes links
sample accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
ERS3341288,SK-16S-01_S21,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/006/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/006/ERR325...,MGYA00594700;MGYA00594735;MGYA00594774,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS3341289,SK-16S-02_S22,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/007/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/007/ERR325...,MGYA00594699;MGYA00594734;MGYA00594773,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS3341290,SK-16S-03_S23,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/008/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/008/ERR325...,MGYA00594698;MGYA00594733;MGYA00594772,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS3341291,SK-16S-04_S24,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/009/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/009/ERR325...,MGYA00594732;MGYA00594771,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS3341292,SK-16S-05_S25,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/000/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/000/ERR325...,MGYA00594697;MGYA00594731;MGYA00594770,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERS1456720,Greenland Soils,root:Environmental:Terrestrial:Soil:Oil-contam...,,,,,,,1 m,,,Illumina MiSeq,metagenome,Illumina Miseq,2010-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/004/ERR174...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/004/ERR174...,MGYA00086983,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS1456721,Greenland Soils,root:Environmental:Terrestrial:Soil:Oil-contam...,,,,,,,1 m,,,Illumina MiSeq,metagenome,Illumina Miseq,2010-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/005/ERR174...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/005/ERR174...,MGYA00087066,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS1456722,Greenland Soils,root:Environmental:Terrestrial:Soil:Oil-contam...,,,,,,,1 m,,,Illumina MiSeq,metagenome,Illumina Miseq,2010-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/006/ERR174...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/006/ERR174...,MGYA00087070,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS1456723,Greenland Soils,root:Environmental:Terrestrial:Soil:Oil-contam...,,,,,,,1 m,,,Illumina MiSeq,metagenome,Illumina Miseq,2010-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/007/ERR174...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/007/ERR174...,MGYA00087027,https://www.ebi.ac.uk/metagenomics/api/v1/anal...


In [14]:
# Write the current contents of df to a separate file
# (presumably a backup of the partial result - confirm).
df.to_csv('diploma_samples_terresrial_reserve')

In [13]:
# Accession of the last sample the collection loop reached; the resume cell
# below compares against it to find where to continue.
last_sample.accession

'ERS1456725'

In [15]:
# Resume the collection: list the same samples again, skip everything before
# the sample recorded in `last_sample`, and process from that sample onwards.
# The resume sample itself is processed again; df.loc assigns by accession, so
# an existing row for it is replaced rather than duplicated.
resume_accession = last_sample.accession
resume_point_reached = False

# Metadata keys in the order of the 'biome' ... 'collection-date' columns of df.
resume_metadata_labels = (
    env_biome_label,
    env_feature_label,
    env_material_label,
    env_package_label,
    latitude_label,
    longitude_label,
    depth_label,
    elevation_label,
    region_label,
    instrument_model_label,
    investigation_label,
    sequence_method_label,
    date_label,
)

# Upper bound (seconds) on a single ENA request, so a stalled connection
# cannot block the whole run.
resume_ena_timeout = 60
# Links ending in this suffix are treated as reverse reads; every other link
# is treated as a forward read.
resume_reverse_suffix = '_2.fastq.gz'

# Open a session for this cell instead of reusing one whose `with` block has
# already exited.
with Session(MGNIFY_API_BASE) as mg:
    for sample in mg.iterate('samples', fltr):
        if sample.accession == resume_accession:
            resume_point_reached = True
            print('I am here')

        if not resume_point_reached:
            continue

        last_sample = sample

        # Ask ENA for the FASTQ FTP links of the read runs of this sample.
        ena_params = {
            'accession': sample.accession,
            'result': 'read_run',
            'fields': 'fastq_ftp',
            'format': 'JSON'
        }
        try:
            response = requests.get(ena_url, params=ena_params,
                                    timeout=resume_ena_timeout)
            response.raise_for_status()
            data = response.json()
            # One record may carry several ';'-separated links; records
            # without files contribute nothing.
            fastq_links = [link
                           for d in data
                           for link in d['fastq_ftp'].split(';')
                           if link]
            forward_reads = ';'.join(link for link in fastq_links
                                     if not link.endswith(resume_reverse_suffix))
            reverse_reads = ';'.join(link for link in fastq_links
                                     if link.endswith(resume_reverse_suffix))
        except Exception as err:
            # Best effort: a sample whose links cannot be fetched is still
            # recorded, but the failure is reported instead of being hidden.
            print(f'{sample.accession}: could not get read links from ENA ({err!r})')
            forward_reads = None
            reverse_reads = None

        # Walk sample -> runs -> analyses -> downloads and gather the analysis
        # ids and the download links.
        analysis_ids = []
        download_links = []
        for run in mg.iterate(f'samples/{sample.accession}/runs'):
            for an in mg.iterate(f'runs/{run.id}/analyses'):
                analysis_ids.append(an.id)
                for dnl in mg.iterate(f'analyses/{an.id}/downloads'):
                    download_links.append(str(dnl.links.self))
        analyzes = ';'.join(analysis_ids)
        downloads = ';'.join(download_links)

        sample_metadata = sample.sample_metadata
        df.loc[sample.accession] = [
            sample.sample_name,
            sample.biome.id,
            *(get_metadata(sample_metadata, label) for label in resume_metadata_labels),
            forward_reads,
            reverse_reads,
            analyzes,
            downloads
        ]

if not resume_point_reached:
    # Without this notice an unmatched accession would look like a successful
    # run that simply added nothing.
    print(f'Resume point {resume_accession} was not found; no rows were added')

df.to_csv('diploma_samples_terresrial')

df

I am here


Unnamed: 0_level_0,sample name,lineage,biome,feature,material,package,latitude,longitude,depth,elevation,region,instrument model,investigation type,sequencing method,collection-date,forward reads,reverse reads,analyzes id,analyzes links
sample accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
ERS3341288,SK-16S-01_S21,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/006/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/006/ERR325...,MGYA00594700;MGYA00594735;MGYA00594774,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS3341289,SK-16S-02_S22,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/007/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/007/ERR325...,MGYA00594699;MGYA00594734;MGYA00594773,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS3341290,SK-16S-03_S23,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/008/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/008/ERR325...,MGYA00594698;MGYA00594733;MGYA00594772,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS3341291,SK-16S-04_S24,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/009/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/009/ERR325...,MGYA00594732;MGYA00594771,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS3341292,SK-16S-05_S25,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/000/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/000/ERR325...,MGYA00594697;MGYA00594731;MGYA00594770,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERS227009,2400m3,root:Environmental:Terrestrial:Soil:Tropical r...,,,,soil,,,.1 m,,,454 GS FLX Titanium,metagenome,Roche 454 GS FLX Titanium,2010-10,ftp.sra.ebi.ac.uk/vol1/fastq/ERR249/ERR249387/...,,MGYA00000692,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS227010,3000m1,root:Environmental:Terrestrial:Soil:Tropical r...,,,,soil,,,.1 m,,,454 GS FLX Titanium,metagenome,Roche 454 GS FLX Titanium,2010-10,ftp.sra.ebi.ac.uk/vol1/fastq/ERR249/ERR249388/...,,MGYA00000693,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS227011,3000m2,root:Environmental:Terrestrial:Soil:Tropical r...,,,,soil,,,.1 m,,,454 GS FLX Titanium,metagenome,Roche 454 GS FLX Titanium,2010-10,ftp.sra.ebi.ac.uk/vol1/fastq/ERR249/ERR249389/...,,MGYA00000694,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
ERS227012,3000m3,root:Environmental:Terrestrial:Soil:Tropical r...,,,,soil,,,.1 m,,,454 GS FLX Titanium,metagenome,Roche 454 GS FLX Titanium,2010-10,ftp.sra.ebi.ac.uk/vol1/fastq/ERR249/ERR249390/...,,MGYA00000695,https://www.ebi.ac.uk/metagenomics/api/v1/anal...


In [17]:
# Read the saved table back from disk and display it.
# No index column is specified here, so in the output below 'sample accession'
# appears as an ordinary column next to a fresh 0..N-1 index.
df2 = read_csv('diploma_samples_terresrial')
df2

Unnamed: 0,sample accession,sample name,lineage,biome,feature,material,package,latitude,longitude,depth,elevation,region,instrument model,investigation type,sequencing method,collection-date,forward reads,reverse reads,analyzes id,analyzes links
0,ERS3341288,SK-16S-01_S21,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/006/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/006/ERR325...,MGYA00594700;MGYA00594735;MGYA00594774,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
1,ERS3341289,SK-16S-02_S22,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/007/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/007/ERR325...,MGYA00594699;MGYA00594734;MGYA00594773,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
2,ERS3341290,SK-16S-03_S23,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/008/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/008/ERR325...,MGYA00594698;MGYA00594733;MGYA00594772,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
3,ERS3341291,SK-16S-04_S24,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/009/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/009/ERR325...,MGYA00594732;MGYA00594771,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
4,ERS3341292,SK-16S-05_S25,root:Environmental:Terrestrial:Soil,arid,agricultural,soil,soil,-26.73,25.99,,1479.0,South Africa,,metagenome,Illumina Miseq 16S,2017-02-01,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/000/ERR325...,ftp.sra.ebi.ac.uk/vol1/fastq/ERR325/000/ERR325...,MGYA00594697;MGYA00594731;MGYA00594770,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6447,ERS227009,2400m3,root:Environmental:Terrestrial:Soil:Tropical r...,,,,soil,,,.1 m,,,454 GS FLX Titanium,metagenome,Roche 454 GS FLX Titanium,2010-10,ftp.sra.ebi.ac.uk/vol1/fastq/ERR249/ERR249387/...,,MGYA00000692,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
6448,ERS227010,3000m1,root:Environmental:Terrestrial:Soil:Tropical r...,,,,soil,,,.1 m,,,454 GS FLX Titanium,metagenome,Roche 454 GS FLX Titanium,2010-10,ftp.sra.ebi.ac.uk/vol1/fastq/ERR249/ERR249388/...,,MGYA00000693,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
6449,ERS227011,3000m2,root:Environmental:Terrestrial:Soil:Tropical r...,,,,soil,,,.1 m,,,454 GS FLX Titanium,metagenome,Roche 454 GS FLX Titanium,2010-10,ftp.sra.ebi.ac.uk/vol1/fastq/ERR249/ERR249389/...,,MGYA00000694,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
6450,ERS227012,3000m3,root:Environmental:Terrestrial:Soil:Tropical r...,,,,soil,,,.1 m,,,454 GS FLX Titanium,metagenome,Roche 454 GS FLX Titanium,2010-10,ftp.sra.ebi.ac.uk/vol1/fastq/ERR249/ERR249390/...,,MGYA00000695,https://www.ebi.ac.uk/metagenomics/api/v1/anal...
