In [8]:
import glob
import os
import shlex
import xml.etree.ElementTree as ET

import pandas as pd
import sarge

In [5]:
def read_config(conf = ".img_config"):
    '''
    Parses a config file whose first line is the username and whose second
    line is the password. Surrounding whitespace is stripped from both, and
    blank lines after the second line (e.g. a trailing newline) are ignored.

    Args:
        conf: path to config file
    Returns:
        username, password
    Raises:
        ValueError: if the file has fewer than two lines, or has non-blank
            content after the second line
    '''
    # `with` closes the file handle as soon as the lines have been read
    with open(conf) as handle:
        lines = [line.strip() for line in handle]

    # Tolerate trailing blank lines, but still reject unexpected extra content
    if len(lines) < 2 or any(lines[2:]):
        raise ValueError(
            "expected exactly two lines (username, password) in {conf}".format(conf = conf))

    name, password = lines[:2]
    return name, password


# Base URL of the JGI genome portal; request paths are appended to it when
# the curl commands are built.
database = 'https://genome.jgi.doe.gov'

In [6]:
# Read the username and password from the default config path ('.img_config').
name, password = read_config()

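Establish a session with the JGI genome portal: log in through the JGI sign-on service and save the session cookie to the file `cookies`, which the later download commands reuse.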

In [4]:
curl_login_cmd = "curl 'https://signon.jgi.doe.gov/signon/create' \
            --data-urlencode 'login={name}' --data-urlencode \
            'password={password}' -c cookies > /dev/null".format(name = name, 
                                                                 password = password)

print(curl_login_cmd)
!{curl_login_cmd}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   164    0   107  100    57    181     96 --:--:-- --:--:-- --:--:--   278


Download the XML file listing for a given portal ID.

In [14]:
portal_id = 'OKS_WetSedge1_19_FD'

curlcmd = "curl '{database}/portal/ext-api/downloads/get-directory?organism={portal_id}' -b cookies > {portal_id}.xml".format(database = database,
                                                                                                                          portal_id = portal_id)


print(curlcmd, "\n")
print('output file is {portal_id}.xml'.format(portal_id = portal_id), )
!{curlcmd}

curl 'https://genome.jgi.doe.gov/portal/ext-api/downloads/get-directory?organism=OKS_WetSedge1_19_FD' -b cookies > OKS_WetSedge1_19_FD.xml 

output file is OKS_WetSedge1_19_FD.xml
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0


The downloaded XML file can now be used to work out how to download the various data files via IMG's API.

In [15]:
# Path of the XML listing written by the portal download command above.
xml = '{portal_id}.xml'.format(portal_id = portal_id)

In [16]:
# Parse the XML listing and keep its root element.
# ET.parse raises ParseError ("no element found") when the file is empty.
tree = ET.parse(xml)
root = tree.getroot()

ParseError: no element found: line 1, column 0 (<string>)

In [6]:
# Show the attributes of the listing's root element.
root.attrib

{'name': 'OKS_Pond6_8July2_FD'}

In [21]:
# Locate the 'Binning Data' section and print the names of all other sections.
# Reset first so that a `binchild` left over from a previously parsed listing
# is never silently reused when this listing has no such section.
binchild = None

for child in root:
    if child.attrib['name'] == 'Binning Data':
        binchild = child
    else:
        print(child.attrib['name'])

if binchild is None:
    print("no 'Binning Data' section found in this listing")

Sequencing QC Reports
Raw Data
QC and Genome Assembly
Metagenome Report Tables
IMG Data
Filtered Raw Data


In [6]:
# glob returns matches in arbitrary order; sort so that xmls[0] is the same
# file on every run.
xmls = sorted(glob.glob('*2019.xml'))

In [84]:
# Fail with an explicit message instead of a bare IndexError on xmls[0]
if not xmls:
    raise FileNotFoundError('xmls is empty: no XML listing matched the glob pattern')

xml = xmls[0]

tree = ET.parse(xml)
root = tree.getroot()

# Metagenome id: the listing's file name up to its first '.'
mgid = os.path.basename(xml).split(".")[0]

# All downloads for this listing go into a directory named after the id
outdir = mgid

# exist_ok makes the cell safe to re-run
os.makedirs(outdir, exist_ok=True)

In [85]:
# Print the attribute dict of each direct child of the root element.
for child in root:
    print(child.attrib)

{'name': 'Sequencing QC Reports'}
{'name': 'Raw Data'}
{'name': 'QC and Genome Assembly'}
{'name': 'Metagenome Report Tables'}
{'name': 'IMG Data'}
{'name': 'Filtered Raw Data'}
{'name': 'Binning Data'}


In [10]:
def curlcmd(url, database, destout):
    '''
    Builds the shell command that downloads one portal file with curl, using
    the cookie file `cookies`, and redirects the response to `destout`.

    Args:
        url: request path (including any query string) appended to `database`
        database: base URL of the portal
        destout: path the downloaded content is written to
    Returns:
        the command line as a single string
    '''
    # shlex.quote stops '&', '?', spaces and quotes in either value from being
    # interpreted by the shell; values made only of safe characters are left
    # unquoted, which is equivalent for the shell.
    target = shlex.quote('{database}{url}'.format(database = database, url = url))
    return "curl {target} -b cookies > {destout}".format(target = target,
                                                    destout = shlex.quote(str(destout)))

In [66]:
# bins
bindir = os.path.join(outdir, 'bins')

if not os.path.exists(bindir):
    os.mkdir(bindir)

for child in binchild:
    filename = child.attrib['filename']
    url = child.attrib['url']
    destout = os.path.join(bindir, filename)

    if not os.path.exists(destout):
        !{curlcmd(url, database, destout)}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3733k    0 3733k    0     0  2226k      0 --:--:--  0:00:01 --:--:-- 2227k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  591k    0  591k    0     0   539k      0 --:--:--  0:00:01 --:--:--  539k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  985k    0  985k    0     0  1311k      0 --:--:-- --:--:-- --:--:-- 1310k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2018k    0 2018k    0     0  1737k      0 --:--:--  0:00:01 --:--:-- 1738k
  % Total    % Received % Xferd  Average Speed   Tim

In [67]:
# Filtered Raw Data
frddir = os.path.join(outdir, 'filtered_raw_data')
keep_file_phrases = ['filtered-report', 'METAGENOME.fastq.gz']

if not os.path.exists(frddir):
    os.mkdir(frddir)

for child in root:
    if child.attrib['name'] == 'Filtered Raw Data':
        
        for frchild in child:
            for phrase in keep_file_phrases:
                if phrase in frchild.attrib['filename']:
                    filename = frchild.attrib['filename']

                    destout = os.path.join(frddir, filename)
                    url = frchild.attrib['url']

                    if not os.path.exists(destout):
                        !{curlcmd(url, database, destout)}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  3647  100  3647    0     0   7833      0 --:--:-- --:--:-- --:--:--  7843
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 23.4G    0 23.4G    0     0  35.6M      0 --:--:--  0:11:15 --:--:-- 49.8M--:-- --:--:-- 19256   0  44.0M      0 --:--:--  0:01:37 --:--:-- 42.5M--:--  0:02:10 --:--:-- 37.5M  0 --:--:--  0:05:12 --:--:-- 6208k05:35 --:--:-- 4548k  0:06:00 --:--:-- 21.1MG    0     0  28.8M      0 --:--:--  0:06:26 --:--:-- 44.7M 0:06:43 --:--:-- 40.4M-:--:--  0:09:14 --:--:-- 51.0M


In [70]:
# QC and Genome Assembly

qcadir = os.path.join(outdir, 'qc_and_assembly')

# exist_ok makes the cell safe to re-run; missing parent directories are created too
os.makedirs(qcadir, exist_ok=True)

for child in root:
    if child.attrib['name'] == 'QC and Genome Assembly':
        for qchild in child:
            for stepchild in qchild:
                # only the assembled contigs are fetched from this section
                if stepchild.attrib['filename'] == 'assembly.contigs.fasta':
                    filename = stepchild.attrib['filename']
                    url = stepchild.attrib['url']

                    destout = os.path.join(qcadir, filename)

                    # skip files that are already on disk
                    if not os.path.exists(destout):
                        sarge.run(curlcmd(url, database, destout))

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3355M    0 3355M    0     0  5045k      0 --:--:--  0:11:20 --:--:-- 18.5M34 --:--:-- 3807k40 --:--:-- 3080kk      0 --:--:--  0:08:45 --:--:-- 3803k


In [79]:
# Metagenome Report Tables: fetch every file listed in that section
mgtbldir = os.path.join(outdir, 'mg_report_tables')

if not os.path.exists(mgtbldir):
    os.mkdir(mgtbldir)


for section in root:
    # every other section is ignored by this cell
    if section.attrib['name'] != 'Metagenome Report Tables':
        continue

    for table in section:
        filename = table.attrib['filename']
        url = table.attrib['url']

        destout = os.path.join(mgtbldir, filename)
        if os.path.exists(destout):
            # already on disk, nothing to fetch
            continue
        sarge.run(curlcmd(url, database, destout))

In [82]:
# IMG Data: fetch the .tar.gz archives listed in that section
imgdir = os.path.join(outdir, 'img_data')

# exist_ok makes the cell safe to re-run; missing parent directories are created too
os.makedirs(imgdir, exist_ok=True)

for child in root:
    if child.attrib['name'] == 'IMG Data':
        for stepchild in child:
            filename = stepchild.attrib['filename']
            if filename.endswith('.tar.gz'):
                url = stepchild.attrib['url']

                destout = os.path.join(imgdir, filename)

                # Same guard as the other download cells: do not fetch an
                # archive again when it is already on disk.
                if not os.path.exists(destout):
                    sarge.run(curlcmd(url, database, destout))

curl 'https://genome.jgi.doe.gov/portal/ext-api/downloads/get_tape_file?blocking=true&url=/OKS_Pond6_8July2_2/download/_JAMO/60e644c6c399d4ad32fe5440/3300045971.tar.gz' -b cookies > OKS_Pond6_19July2019/img_data/3300045971.tar.gz


In [None]:
# NOTE(review): this cell repeats the 'Metagenome Report Tables' download cell
# that appears earlier in the notebook. Files already on disk are skipped, so
# it only fetches tables that are still missing. Consider removing one copy.
mgtbldir = os.path.join(outdir, 'mg_report_tables')

if not os.path.exists(mgtbldir):
    os.mkdir(mgtbldir)


# lazily select the report-table section(s) of the listing
report_sections = (sec for sec in root
                   if sec.attrib['name'] == 'Metagenome Report Tables')

for sec in report_sections:
    for entry in sec:
        filename, url = entry.attrib['filename'], entry.attrib['url']

        destout = os.path.join(mgtbldir, filename)
        already_there = os.path.exists(destout)
        if not already_there:
            sarge.run(curlcmd(url, database, destout))