In [30]:
import os
from pandas import DataFrame
import pandas as pd

try:
    from urllib import urlencode
except ImportError:
    from urllib.parse import urlencode

# Data Cleaning

In [2]:
# Load samples
sample_data = pd.read_csv("data_sample.csv")

# Tidy the labels: collapse every run of whitespace in Description into a
# single space and strip both ends (same result as " ".join(text.split())).
# The whole column is assigned in one step through the .str accessor, which
# avoids chained assignment (`sample_data.Description[x] = ...`) and leaves
# missing descriptions as NaN instead of failing on `.split()`.
sample_data["Description"] = sample_data["Description"].str.split().str.join(" ")
sample_data.head()

Unnamed: 0,Sample,MGnify ID,Name,Description
0,SRS731606,MGYS00000604,MET0253,Human Skin Metagenome
1,SRS927202,"MGYS00005034,MGYS00005101",SKCT13_b2_58,This sample has been submitted by Centre for I...
2,SRS927151,"MGYS00005034,MGYS00005101",SKCT13_b2_29,This sample has been submitted by Centre for I...
3,SRS732144,MGYS00000604,MET0320,Human Skin Metagenome
4,ERS2431693,MGYS00005102,MBS490,Head_kid_1h


# Persiapan Scraping

In [3]:
# Collect the distinct values of the 'MGnify ID' column; they are used
# further down to select the samples of one study.
study_id = sample_data.loc[:, 'MGnify ID'].unique()
study_id

array(['MGYS00000604', 'MGYS00005034,MGYS00005101', 'MGYS00005102',
       'MGYS00005101', 'MGYS00000518', 'MGYS00003598,MGYS00005037',
       'MGYS00005098', 'MGYS00000520'], dtype=object)

In [4]:
# Keep only the samples of a single study, so downloads happen one study at
# a time. Change the index into study_id to pick another study; index 4 was
# chosen first because that study has the fewest samples.
filtered_data = (
    sample_data
    .loc[sample_data['MGnify ID'].eq(study_id[4])]
    .reset_index(drop=True)  # renumber rows from 0 so positional lookups work
)
filtered_data

Unnamed: 0,Sample,MGnify ID,Name,Description
0,ERS805755,MGYS00000518,P1709_1005,AD upper_back UB LES
1,ERS805759,MGYS00000518,P1761_1106,TRL lower_back LB NON_LES
2,ERS805760,MGYS00000518,P1896_1043,AD posterior_thigh PT NON_LES
3,ERS805756,MGYS00000518,P1709_1072,P1709_1072
4,ERS805757,MGYS00000518,P1709_1083,AD upper_back UB LES
5,ERS805754,MGYS00000518,P752_123,P752_123
6,ERS805762,MGYS00000518,P1896_1090,CTRL thigh T NON_LES
7,ERS805761,MGYS00000518,P1896_1079,CTRL lower_back LB NON_LES
8,ERS805763,MGYS00000518,P1900_1076,PSO lower_back LB LES
9,ERS805753,MGYS00000518,P752_105,AD posterior_thigh PT LES


# Data Scraping

In [27]:
from jsonapi_client import Session, Filter
import html
import pycurl

In [28]:
# create get function for sample
def ebi_sample(sample):
    """Fetch one sample record from the MGnify (EBI Metagenomics) API.

    Parameters
    ----------
    sample : str
        Sample accession to look up, e.g. ``'ERS805755'``.

    Returns
    -------
    The ``resource`` attribute of the document returned by ``Session.get``
    for ``samples/<sample>``.
    """
    # Same base URL and (resource type, id) call shape as download_GO_Slim;
    # the request still resolves to <API_BASE>/samples/<sample>.
    API_BASE = 'https://www.ebi.ac.uk/metagenomics/api/latest'
    with Session(API_BASE) as s:
        # A single resource is fetched by id, so no page-size filter is needed
        # (the previous one was built but never passed to the request).
        return s.get('samples', sample).resource

In [7]:
# Accession of the first sample in the selected study (row label 0)
filtered_data.loc[0, 'Sample']

'ERS805755'

### Mari kita lihat data apa saja yang bisa diperoleh dari sample

In [141]:
# Fetch one known sample and print each attribute of interest, one per line,
# in the order listed below.
sampled = ebi_sample('ERS805755')

sample_fields = (
    'accession',
    'analysis_completed',
    'biosample',
    'collection_date',
    'environment_biome',
    'environment_feature',
    'environment_material',
    'geo_loc_name',
    'host_tax_id',
    'last_update',
    'latitude',
    'longitude',
    'runs',
    'sample_alias',
    'sample_desc',
    'sample_metadata',
    'species',
)
for field_name in sample_fields:
    print(getattr(sampled, field_name))

ERS805755
2015-09-11
SAMEA3498606
None
None
None
None
None
9606
2015-10-09T16:11:00
None
None
[<ResourceObject: runs: ERR981247 (2308902433232) (2308902433232)>]
P1709_1005
AD          upper_back                        UB      LES
[{'unit': None, 'key': 'NCBI sample classification', 'value': '9606'}, {'unit': None, 'key': 'instrument model', 'value': 'Illumina HiSeq 2500'}, {'unit': None, 'key': 'ENA checklist', 'value': 'ENA default sample checklist (ERC000011)'}]
Homo sapiens


## Kita tertarik untuk mengambil data: 1. accession, 2. runs, dan 3. sample_metadata

In [31]:
# For every sample of the selected study, query the API once and keep a
# three-item row: [accession, runs, sample_metadata].
scraping = [
    [record.accession, record.runs, record.sample_metadata]
    for record in map(ebi_sample, filtered_data.Sample)
]
scraping

[['ERS805755',
  [<ResourceObject: runs: ERR981247 (1625090802240) (1625090802240)>],
  [{'key': 'NCBI sample classification', 'value': '9606', 'unit': None},
   {'key': 'instrument model', 'value': 'Illumina HiSeq 2500', 'unit': None},
   {'key': 'ENA checklist',
    'value': 'ENA default sample checklist (ERC000011)',
    'unit': None}]],
 ['ERS805759',
  [<ResourceObject: runs: ERR981250 (1625098634744) (1625098634744)>],
  [{'key': 'NCBI sample classification', 'value': '9606', 'unit': None},
   {'key': 'instrument model', 'value': 'Illumina HiSeq 2500', 'unit': None},
   {'key': 'ENA checklist',
    'value': 'ENA default sample checklist (ERC000011)',
    'unit': None}]],
 ['ERS805760',
  [<ResourceObject: runs: ERR981251 (1625074047632) (1625074047632)>],
  [{'unit': None, 'key': 'NCBI sample classification', 'value': '9606'},
   {'unit': None, 'key': 'instrument model', 'value': 'Illumina HiSeq 2500'},
   {'unit': None,
    'key': 'ENA checklist',
    'value': 'ENA default sampl

In [34]:
# Turn the scraped rows into a table, one named column per collected field
scrape_columns = ["Sample", "Runs", "Metadata"]
df_scrape = pd.DataFrame(data=scraping, columns=scrape_columns)
df_scrape

Unnamed: 0,Sample,Runs,Metadata
0,ERS805755,[runs: ERR981247 (1625090802240)],"[{'key': 'NCBI sample classification', 'value'..."
1,ERS805759,[runs: ERR981250 (1625098634744)],"[{'key': 'NCBI sample classification', 'value'..."
2,ERS805760,[runs: ERR981251 (1625074047632)],"[{'unit': None, 'key': 'NCBI sample classifica..."
3,ERS805756,[runs: ERR981248 (1625142707200)],"[{'unit': None, 'value': '9606', 'key': 'NCBI ..."
4,ERS805757,[runs: ERR981249 (1625096597232)],"[{'unit': None, 'key': 'NCBI sample classifica..."
5,ERS805754,[runs: ERR981246 (1625142926416)],"[{'key': 'NCBI sample classification', 'value'..."
6,ERS805762,[runs: ERR981253 (1625093160744)],"[{'unit': None, 'key': 'NCBI sample classifica..."
7,ERS805761,[runs: ERR981252 (1625088466000)],"[{'key': 'NCBI sample classification', 'value'..."
8,ERS805763,[runs: ERR981254 (1625141692064)],"[{'unit': None, 'value': '9606', 'key': 'NCBI ..."
9,ERS805753,[runs: ERR981245 (1625141816176)],"[{'unit': None, 'key': 'NCBI sample classifica..."


# Merge and write

In [35]:
# Attach the scraped runs and metadata to the study's sample rows by joining
# on the shared "Sample" column.
df_scrapingresult = filtered_data.merge(df_scrape, on="Sample")
df_scrapingresult

Unnamed: 0,Sample,MGnify ID,Name,Description,Runs,Metadata
0,ERS805755,MGYS00000518,P1709_1005,AD upper_back UB LES,[runs: ERR981247 (1625090802240)],"[{'key': 'NCBI sample classification', 'value'..."
1,ERS805759,MGYS00000518,P1761_1106,TRL lower_back LB NON_LES,[runs: ERR981250 (1625098634744)],"[{'key': 'NCBI sample classification', 'value'..."
2,ERS805760,MGYS00000518,P1896_1043,AD posterior_thigh PT NON_LES,[runs: ERR981251 (1625074047632)],"[{'unit': None, 'key': 'NCBI sample classifica..."
3,ERS805756,MGYS00000518,P1709_1072,P1709_1072,[runs: ERR981248 (1625142707200)],"[{'unit': None, 'value': '9606', 'key': 'NCBI ..."
4,ERS805757,MGYS00000518,P1709_1083,AD upper_back UB LES,[runs: ERR981249 (1625096597232)],"[{'unit': None, 'key': 'NCBI sample classifica..."
5,ERS805754,MGYS00000518,P752_123,P752_123,[runs: ERR981246 (1625142926416)],"[{'key': 'NCBI sample classification', 'value'..."
6,ERS805762,MGYS00000518,P1896_1090,CTRL thigh T NON_LES,[runs: ERR981253 (1625093160744)],"[{'unit': None, 'key': 'NCBI sample classifica..."
7,ERS805761,MGYS00000518,P1896_1079,CTRL lower_back LB NON_LES,[runs: ERR981252 (1625088466000)],"[{'key': 'NCBI sample classification', 'value'..."
8,ERS805763,MGYS00000518,P1900_1076,PSO lower_back LB LES,[runs: ERR981254 (1625141692064)],"[{'unit': None, 'value': '9606', 'key': 'NCBI ..."
9,ERS805753,MGYS00000518,P752_105,AD posterior_thigh PT LES,[runs: ERR981245 (1625141816176)],"[{'unit': None, 'key': 'NCBI sample classifica..."


In [157]:
# Write the merged table to CSV. The study accession in the file name is
# taken from the table itself (every row belongs to the one selected study),
# so it cannot get out of sync when a different study is chosen.
# NOTE(review): the Runs column holds API resource objects, so the CSV stores
# their text representation rather than plain run accessions.
study_accession = df_scrapingresult['MGnify ID'].iloc[0]
df_scrapingresult.to_csv(
    'scraping_result_study_{}.csv'.format(study_accession), index=False
)

# Download analysis results

In [51]:
import os
def download_GO_Slim(run_id):
    """Download the GO-slim annotation counts of one run and save them as CSV.

    Every analysis of the run is visited and each of its GO-slim annotations
    becomes one row, indexed by the annotation accession. If the same
    accession occurs more than once, the last value seen is kept.

    Parameters
    ----------
    run_id : str
        Run accession, e.g. ``'ERR981247'``. It also names the counts column
        and the output file.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'GO term'`` with columns ``'category'``,
        ``'description'`` and ``<run_id>`` (the annotation counts).

    Side effects
    ------------
    Creates the directory ``result_GO_slim`` if needed and writes the table
    to ``result_GO_slim/<run_id>_FASTQ_GO_slim.csv``.
    """
    outname = run_id + '_FASTQ_GO_slim.csv'
    outdir = 'result_GO_slim'

    # exist_ok makes this safe when the directory already exists, without a
    # separate check-then-create step.
    os.makedirs(outdir, exist_ok=True)
    fullname = os.path.join(outdir, outname)

    df = DataFrame(columns=('category', 'description', 'annotation counts'))
    df.index.name = 'GO term'

    API_BASE = 'https://www.ebi.ac.uk/metagenomics/api/latest'
    with Session(API_BASE) as s:
        run = s.get('runs', run_id).resource
        for a in run.analyses:
            for ann in a.go_slim:
                df.loc[ann.accession] = [
                    ann.lineage, ann.description, ann.count
                ]

    # Name the counts column after the run so tables of several runs can be
    # merged side by side without column clashes.
    df = df.rename(columns={'annotation counts': run_id})

    # The output path used to be built but never written; persist the table
    # so the per-run result is actually saved.
    df.to_csv(fullname)
    return df

In [57]:
# Build one wide table: the GO-slim table of the first sample's first run is
# the base, and each later sample's first run is joined on as a new column.
# NOTE(review): how='left' keeps only the GO terms present in the base table;
# terms that appear only in later runs are dropped. Confirm this is intended.
go_tables = (download_GO_Slim(runs[0].id) for runs in df_scrapingresult.Runs)

# With no runs at all the result stays an empty DataFrame.
df_merge = next(go_tables, pd.DataFrame())
for run_table in go_tables:
    df_merge = pd.merge(
        df_merge,
        run_table,
        on=['GO term', 'category', 'description'],
        how='left',
    )
df_merge

0
1
2
3
4
5
6
7
8
9


Unnamed: 0_level_0,category,description,ERR981247,ERR981250,ERR981251,ERR981248,ERR981249,ERR981246,ERR981253,ERR981252,ERR981254,ERR981245
GO term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GO:0030031,biological_process,cell projection assembly,39,0,26,0,10,0,1,0,10,10
GO:0071554,biological_process,cell wall organization or biogenesis,2030,131,3567,155,2399,674,259,1062,167,1037
GO:0016043,biological_process,cellular component organization,58626,445,13692,599,8081,40735,8309,3739,3416,34477
GO:0051301,biological_process,cell division,2385,236,4018,295,4467,4859,39,1166,335,894
GO:0016049,biological_process,cell growth,133,0,80,0,7,0,0,0,48,201
...,...,...,...,...,...,...,...,...,...,...,...,...
GO:0000988,molecular_function,protein binding transcription factor activity,2229,226,3950,498,5472,5222,306,1835,486,756
GO:0004872,molecular_function,receptor activity,19313,1086,17212,1266,6795,38163,1938,5269,4999,12234
GO:0045182,molecular_function,translation regulator activity,3,0,18,15,8,0,0,0,2,0
GO:0005215,molecular_function,transporter activity,59812,4610,79841,5452,96470,74178,6009,38988,9254,29484


In [None]:
# Save the combined GO-slim table. The study accession in the file name is
# read from df_scrapingresult (all of its rows belong to the selected study),
# so it does not have to be edited by hand when another study is processed.
study_accession = df_scrapingresult['MGnify ID'].iloc[0]
df_merge.to_csv('{}_FASTQ_GO_slim.csv'.format(study_accession))