In [9]:
import argparse
from copy import copy
from hashlib import md5
import json
import logging
import os
import re
import sys
import time

from requests.exceptions import ConnectionError
import yaxil
from yaxil.exceptions import RestApiError

# DEBUG level makes the yaxil / urllib3 request logging visible in cell output.
logging.basicConfig(level=logging.DEBUG)

logger = logging.getLogger(__name__)
# Upper bound on re-download attempts after a digest mismatch (see download_file).
MAX_RETRIES = 3
# Working directory for the notebook; download paths below are built as
# relative paths, so files land under this directory.
DATA_ROOT = '/net/holynfs01/srv/export/ncf_hcp/share_root/data/intradb/'

parser = argparse.ArgumentParser(description='Download from Remote XNAT')
parser.add_argument('--project', '-p', type=str)
parser.add_argument('--collections', '-c', type=str, nargs='+')
parser.add_argument('--ignore-list', type=str, nargs='+', default=['OTHER_FILES'])
parser.add_argument('--subjects', '-s', type=str, nargs='+', default=[], help='Explicit list of subjects')
# nargs='+' collects one or more session labels. The previous `type=list`
# converted a single value into a list of its characters.
parser.add_argument('--sessions', type=str, nargs='+', default=[], help='Explicit list of sessions')

os.chdir(DATA_ROOT)

# Arguments are supplied inline because the notebook has no command line.
args = parser.parse_args(['-p', 'CCF_HCD_ITK',  '-s', 'HCD0737657', 'HCD0551239'])
opts = vars(args)
opts


{'project': 'CCF_HCD_ITK',
 'collections': None,
 'ignore_list': ['OTHER_FILES'],
 'subjects': ['HCD0737657', 'HCD0551239'],
 'sessions': []}

In [10]:
# Bind each parsed option to a notebook-level name explicitly instead of
# writing into locals(), so it is visible where these names come from.
project = opts['project']
collections = opts['collections']
ignore_list = opts['ignore_list']
subjects = opts['subjects']
sessions = opts['sessions']
opts

{'project': 'CCF_HCD_ITK',
 'collections': None,
 'ignore_list': ['OTHER_FILES'],
 'subjects': ['HCD0737657', 'HCD0551239'],
 'sessions': []}

In [11]:
# Normalise the optional filters so later code can treat them as lists.
ignore_list = ignore_list or []
subjects = subjects or []
sessions = sessions or []

auth = yaxil.auth('intradb')  # Requires setup and description
start_time = time.time()
# Defined before the branch so the summary expression at the end of the cell
# works even when explicit sessions were supplied.
experiments = []
with yaxil.session(auth) as sess:
    if not sessions:
        subject_labels = subjects
        logger.info('Fetching list of experiments')

        for label in subject_labels:
            try:
                # Take the first matching subject; report a missing subject
                # explicitly rather than as "list index out of range".
                sub = next(iter(sess.subjects(label=label, project=project)), None)
                if sub is None:
                    raise LookupError(
                        'No subject {} in project {}'.format(label, project))
                experiments.extend(sess.experiments(subject=sub))
            except Exception:
                # Best effort: one failing subject must not stop the others.
                logger.exception('Error with subject {}'.format(label))
        logger.info('Found {} experiments'.format(len(experiments)))

[x._asdict() for x in experiments]

INFO:__main__:Fetching list of experiments
DEBUG:yaxil:issuing http request https://intradb.humanconnectome.org/data/subjects
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): intradb.humanconnectome.org:443
DEBUG:urllib3.connectionpool:https://intradb.humanconnectome.org:443 "GET /data/subjects?columns=ID%2Clabel%2Cproject&label=HCD0737657&project=CCF_HCD_ITK HTTP/1.1" 200 None
DEBUG:yaxil:issuing http request https://intradb.humanconnectome.org/data/experiments
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): intradb.humanconnectome.org:443
DEBUG:urllib3.connectionpool:https://intradb.humanconnectome.org:443 "GET /data/experiments?columns=ID%2Clabel%2Cproject%2Cxnat%3Asubjectassessordata%2Fsubject_id%2Csubject_label%2Cinsert_date&project=CCF_HCD_ITK&xnat%3Asubjectassessordata%2Fsubject_id=HCPIntradb03_S00662 HTTP/1.1" 200 None
DEBUG:yaxil:issuing http request https://intradb.humanconnectome.org/data/subjects
DEBUG:urllib3.connectionpool:Starting new HTTPS

[OrderedDict([('uri', '/data/experiments/HCPIntradb03_E05029'),
              ('label', 'HCD0737657_V1_A'),
              ('id', 'HCPIntradb03_E05029'),
              ('project', 'CCF_HCD_ITK'),
              ('subject_id', 'HCPIntradb03_S00662'),
              ('subject_label', 'HCD0737657'),
              ('archived_date', '2018-03-21 13:35:49.681')]),
 OrderedDict([('uri', '/data/experiments/HCPIntradb03_E06508'),
              ('label', 'HCD0737657_V1_B'),
              ('id', 'HCPIntradb03_E06508'),
              ('project', 'CCF_HCD_ITK'),
              ('subject_id', 'HCPIntradb03_S00662'),
              ('subject_label', 'HCD0737657'),
              ('archived_date', '2018-04-25 10:38:51.284')]),
 OrderedDict([('uri', '/data/experiments/HCPIntradb07_E00342'),
              ('label', 'HCD0551239_V3_A'),
              ('id', 'HCPIntradb07_E00342'),
              ('project', 'CCF_HCD_ITK'),
              ('subject_id', 'HCPIntradb_S04903'),
              ('subject_label', 'HCD0551

In [21]:
# Series descriptions to keep when filtering scans; this replaces the
# `collections` value taken from the parsed options, which was None.
collections = ['tfMRI_CARIT_PA', 'tfMRI_CARIT_AP']
# Show the first experiment's fields as a dict.
experiments[0]._asdict()

OrderedDict([('uri', '/data/experiments/HCPIntradb03_E05029'),
             ('label', 'HCD0737657_V1_A'),
             ('id', 'HCPIntradb03_E05029'),
             ('project', 'CCF_HCD_ITK'),
             ('subject_id', 'HCPIntradb03_S00662'),
             ('subject_label', 'HCD0737657'),
             ('archived_date', '2018-03-21 13:35:49.681')])

In [22]:
# Walk through a single experiment: the first one found above.
exp_info = experiments[0]._asdict()

with yaxil.session(auth) as sess:
    logger.info('Syncing experiment {}'.format(exp_info['label']))
    start_time = time.time()
    # Scan listing endpoint for this experiment, filled in from exp_info.
    resources_url_pat = 'data/projects/{project}/subjects/{subject_label}/experiments/{label}/scans'
    base_url = resources_url_pat.format(**exp_info)
    _, response = yaxil._get(sess._auth, base_url, yaxil.Format.JSON)

    # Keep only scans whose series description was requested; with no
    # requested collections every scan is kept.
    resources = (
        [
            scan for scan in response['ResultSet']['Result']
            if scan['series_description'] in collections
        ]
        if collections
        else response['ResultSet']['Result']
    )
    

INFO:__main__:Syncing experiment HCD0737657_V1_A
DEBUG:yaxil:issuing http request https://intradb.humanconnectome.org/data/projects/CCF_HCD_ITK/subjects/HCD0737657/experiments/HCD0737657_V1_A/scans
DEBUG:yaxil:query parameters {'format': 'json'}
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): intradb.humanconnectome.org:443
DEBUG:urllib3.connectionpool:https://intradb.humanconnectome.org:443 "GET /data/projects/CCF_HCD_ITK/subjects/HCD0737657/experiments/HCD0737657_V1_A/scans?format=json HTTP/1.1" 200 None


In [24]:
# Display the scan records kept by the collection filter in the previous cell.
resources

[{'xsiType': 'xnat:mrScanData',
  'xnat_imagescandata_id': '436235',
  'note': '',
  'series_description': 'tfMRI_CARIT_PA',
  'ID': '34',
  'type': 'tfMRI',
  'URI': '/data/experiments/HCPIntradb03_E05029/scans/34',
  'quality': 'usable'},
 {'xsiType': 'xnat:mrScanData',
  'xnat_imagescandata_id': '436243',
  'note': '',
  'series_description': 'tfMRI_CARIT_AP',
  'ID': '37',
  'type': 'tfMRI',
  'URI': '/data/experiments/HCPIntradb03_E05029/scans/37',
  'quality': 'usable'}]

In [29]:
# Use the first filtered scan record for the single-resource walkthrough.
resource_info = resources[0]
# NOTE(review): this bare expression is evaluated but not displayed, because
# only the last expression of the cell is shown.
ignore_list
resource_info

{'xsiType': 'xnat:mrScanData',
 'xnat_imagescandata_id': '436235',
 'note': '',
 'series_description': 'tfMRI_CARIT_PA',
 'ID': '34',
 'type': 'tfMRI',
 'URI': '/data/experiments/HCPIntradb03_E05029/scans/34',
 'quality': 'usable'}

In [26]:
# Scratch version of the first lines of fetch_resource, run at notebook level.
# NOTE(review): this cell raised KeyError: 'label'. The scan record displayed
# for resource_info has the keys xsiType, xnat_imagescandata_id, note,
# series_description, ID, type, URI and quality, and no 'label' key. Which key
# should name the cookie directory is not determinable from this notebook.
if not ignore_list:
    ignore_list = list()
# Use a cookie to mark a resource as complete
success_cookie = os.path.join(exp_info['label'], resource_info['label'],
                              'SUCCESS')
print(success_cookie)
print(os.path.exists(success_cookie))
# NOTE(review): redundant; os is already imported in the first cell.
import os
print(os.getcwd())

KeyError: 'label'

In [36]:
# NOTE(review): url_info is only assigned in a cell further down, so this
# display depends on out-of-order execution and fails on a fresh kernel.
url_info

OrderedDict([('uri', '/data/experiments/HCPIntradb03_E05029'),
             ('label', 'HCD0737657_V1_A'),
             ('id', 'HCPIntradb03_E05029'),
             ('project', 'CCF_HCD_ITK'),
             ('subject_id', 'HCPIntradb03_S00662'),
             ('subject_label', 'HCD0737657'),
             ('archived_date', '2018-03-21 13:35:49.681')])

In [None]:
# Scratch version of the file-listing step of fetch_resource.
# File listing endpoint for one resource of one experiment.
resource_url_pat = (
    'data/projects/{project}/subjects/{subject_label}/'
    'experiments/{label}/resources/{xnat_abstractresource_id}/files')
url_info = copy(exp_info)  # Combine resource and experiment
# NOTE(review): the scan record displayed for resource_info earlier has no
# 'xnat_abstractresource_id' key (it has 'xnat_imagescandata_id'), so this
# lookup presumably raises KeyError for that record; confirm the intended
# source of resource_info.
url_info['xnat_abstractresource_id'] = resource_info[
    'xnat_abstractresource_id']

base_url = resource_url_pat.format(**url_info)
# NOTE(review): `sess` is the name bound by a `with yaxil.session(...)` block
# in an earlier cell that has already exited; only its `_auth` is used here.
_, response = yaxil._get(sess._auth, base_url, yaxil.Format.JSON)

filelist = response['ResultSet']['Result']
filelist


In [None]:
# Preview the message for the first listed file; entries still carry the
# upper-case 'URI' key at this point.
print('Downloading ' + filelist[0]['URI'])

In [None]:
def ignore_file(uri, ignores_list):
    """Tell whether ``uri`` should be skipped.

    Args:
        uri: string to test.
        ignores_list: iterable of regular-expression patterns, each applied
            with ``re.search`` (so a pattern may match anywhere in ``uri``).

    Returns:
        bool: True if at least one pattern matches, False otherwise
        (including when ``ignores_list`` is empty).
    """
    # any() stops at the first matching pattern instead of testing them all.
    return any(re.search(ignore_pat, uri) for ignore_pat in ignores_list)
def fetch_resource(sess, exp_info, resource_info, always_checksum=False, ignore_list=None):
    """Download every file of one resource into ``<experiment label>/``.

    A ``SUCCESS`` cookie file at
    ``<experiment label>/<resource label>/SUCCESS`` marks a resource as
    complete. The cookie is only written when no file failed.

    Args:
        sess: object whose ``_auth`` attribute is handed to ``yaxil._get``.
        exp_info: mapping providing ``label``, ``project`` and
            ``subject_label`` for the experiment.
        resource_info: mapping providing ``label`` and
            ``xnat_abstractresource_id`` for the resource.
        always_checksum: when true, process the file list even if the
            cookie already exists.
        ignore_list: regular-expression patterns; files whose URI matches
            any of them are skipped.

    Raises:
        ValueError: if the file listing for the resource is empty.
    """
    ignore_list = ignore_list or []
    # Use a cookie to mark a resource as complete
    success_cookie = os.path.join(exp_info['label'], resource_info['label'],
                                  'SUCCESS')
    if os.path.exists(success_cookie) and not always_checksum:
        return

    resource_url_pat = (
        'data/projects/{project}/subjects/{subject_label}/'
        'experiments/{label}/resources/{xnat_abstractresource_id}/files')
    url_info = copy(exp_info)  # Combine resource and experiment
    url_info['xnat_abstractresource_id'] = resource_info[
        'xnat_abstractresource_id']

    base_url = resource_url_pat.format(**url_info)
    _, response = yaxil._get(sess._auth, base_url, yaxil.Format.JSON)

    filelist = response['ResultSet']['Result']
    if not len(filelist):
        raise ValueError('No files could be read from {} in json response: {}'.format(base_url, response))
    logger.info('Syncing {} file (resources) from {}'.format(len(filelist), base_url))

    start_time = time.time()
    failures = 0
    for fileinfo in filelist:
        try:
            # Rename URI (python variable case)
            fileinfo['uri'] = fileinfo.pop('URI')
            if ignore_file(fileinfo['uri'], ignore_list):
                logger.debug('Ignoring {}'.format(fileinfo['uri']))
                continue
            print('Downloading ' + fileinfo['uri'])
            download_file(sess, out_dir=exp_info['label'], **fileinfo)
        except RuntimeError:
            failures += 1
            logger.info('Digest failed on {}'.format(fileinfo['uri']))
        except RestApiError as err:
            failures += 1
            logger.info('Download Error on {}: {}'.format(
                fileinfo['uri'], err))

    elapsed_time = time.time() - start_time
    logger.info('Finished in {}'.format(
        time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

    if failures:
        # Per-file errors are logged and swallowed above, so reaching this
        # point does not mean success. Leave the cookie absent so the
        # resource is retried on the next run.
        logger.warning('{} of {} files failed for {}; not marking as complete'.format(
            failures, len(filelist), base_url))
        return

    # Every file was downloaded, verified or ignored: "touch" the cookie.
    # The directory may not exist yet, e.g. when every file was ignored.
    os.makedirs(os.path.dirname(success_cookie), exist_ok=True)
    with open(success_cookie, 'w'):
        pass
def _md5_hexdigest(path, chunk_size=1024 * 1024):
    """Return the hex MD5 digest of the file at ``path``, read in chunks."""
    checksum = md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            checksum.update(chunk)
    return checksum.hexdigest()
def download_file(sess,
                  uri,
                  digest,
                  collection,
                  out_dir='.',
                  overwrite=False,
                  **kwargs):
    """Download one file to ``out_dir/collection/`` and verify its MD5 digest.

    The destination path below ``collection`` is the part of ``uri`` after
    the last ``'files/'``.

    Args:
        sess: object whose ``_auth`` attribute is handed to ``yaxil._get``.
        uri: URI of the file to fetch.
        digest: expected hex MD5 digest of the file content.
        collection: sub-directory of ``out_dir`` the file is stored under.
        out_dir: base output directory.
        overwrite: when true, an existing file with a wrong digest is removed
            and downloaded again instead of raising.
        **kwargs: ``retries`` (int) is the number of attempts already made;
            any other keys are accepted and ignored.

    Raises:
        RuntimeError: if an existing file has a wrong digest and
            ``overwrite`` is false, or if the downloaded content still has a
            wrong digest once ``MAX_RETRIES`` retries have been used.
        RestApiError: re-raised from ``yaxil._get`` unless its message
            contains ``'response is empty'``.
    """
    basename = uri.split('files/')[-1]
    fname = os.path.join(out_dir, collection, basename)
    dirname = os.path.dirname(fname)

    # exist_ok avoids failing when the directory appears between a check and
    # the creation.
    os.makedirs(dirname, exist_ok=True)
    if os.path.exists(fname):
        disk_digest = _md5_hexdigest(fname)
        if disk_digest == digest:
            logger.debug('Digest matched - Skipping {}'.format(fname))
            return
        elif overwrite:
            logger.info(
                'Digest failed - removing {} and trying again'.format(fname))
            os.remove(fname)
        else:
            raise RuntimeError(
                '{} exisited with incorrect digest '.format(fname) +
                'but cowardly moving on')

    try:
        _, result = yaxil._get(
            sess._auth,
            uri,
            yaxil.Format.JSON,  # Format is ignored for _file_ downloads
            autobox=False)
    except RestApiError as err:
        # Empty responses are acceptable for some scripts and onset files
        if 'response is empty' in str(err):
            result = b''
        else:
            raise

    with open(fname, 'wb') as f:
        f.write(result)

    # Hash what actually landed on disk, not the in-memory response.
    disk_digest = _md5_hexdigest(fname)
    if disk_digest != digest:
        retries = kwargs.get('retries', 0)
        if overwrite:
            os.remove(fname)
        if retries >= MAX_RETRIES:
            raise RuntimeError(
                'Digest failed - ' +
                '{} may need to be re-downloaded.'.format(fname))
        else:
            # Retry with overwrite=True so the bad copy is replaced.
            retries += 1
            download_file(sess,
                          uri,
                          digest,
                          collection,
                          out_dir,
                          overwrite=True,
                          retries=retries)

In [None]:
# Inspect the first entry of the file listing.
print(filelist[0])

In [None]:
# Check the ignore patterns and the filtered scan records before downloading.
print(ignore_list)
print(resources)

In [None]:
# Copy the listing entry before renaming its key. Renaming in place would
# strip 'URI' from filelist[1], so this cell could not be re-run and the
# download loop over `filelist` below would hit a KeyError on that entry.
fileinfo = dict(filelist[1])
fileinfo['uri'] = fileinfo.pop('URI')


In [None]:
# Download the single entry prepared in the previous cell into a directory
# named after the experiment label. The entry's keys are passed as keyword
# arguments, so it must provide download_file's 'uri', 'digest' and
# 'collection' parameters.
# NOTE(review): `sess` comes from a `with yaxil.session(...)` block in an
# earlier cell that has already exited; confirm it is still usable here.
download_file(sess, out_dir=exp_info['label'], **fileinfo)

In [None]:
# Notebook-level copy of the per-file loop in fetch_resource.
for listed in filelist:
    # Work on a copy so the shared `filelist` entries keep their 'URI' key.
    # Renaming in place made a second run of this cell fail with an uncaught
    # KeyError.
    fileinfo = dict(listed)
    if 'URI' in fileinfo:
        # Rename URI (python variable case)
        fileinfo['uri'] = fileinfo.pop('URI')
    try:
        if ignore_file(fileinfo['uri'], ignore_list):
            logger.debug('Ignoring {}'.format(fileinfo['uri']))
            continue
        download_file(sess, out_dir=exp_info['label'], **fileinfo)
    except RuntimeError:
        logger.info('Digest failed on {}'.format(fileinfo['uri']))
    except RestApiError as err:
        logger.info('Download Error on {}: {}'.format(
            fileinfo['uri'], err))

In [None]:
# Sync the first filtered record, processing the file list even if a SUCCESS
# cookie already exists (always_checksum=True).
# NOTE(review): fetch_resource reads 'label' and 'xnat_abstractresource_id'
# from its resource argument; the scan records displayed for `resources`
# earlier contain neither key, so confirm this call works with them.
fetch_resource(sess, exp_info, resources[0], always_checksum=True, ignore_list=ignore_list)

In [None]:
# Connection failures are collected here; the name was previously used
# without ever being defined, so the first ConnectionError raised NameError.
resource_errors = []
for resource in resources:
    try:
        fetch_resource(sess, exp_info, resource, always_checksum=True, ignore_list=ignore_list)
    except ValueError as err:
        # Skip resources whose listing is not valid JSON. Python 3's json
        # module raises JSONDecodeError (a ValueError subclass) with a
        # different message from the Python 2 text matched originally, so
        # both forms are accepted. Any other ValueError is re-raised.
        undecodable = (isinstance(err, json.JSONDecodeError)
                       or 'No JSON object could be decoded' in str(err))
        if not undecodable:
            raise
        logger.error(err)
    except ConnectionError as err:
        logger.error(err)
        resource_errors.append('ConnectionError: {}'.format(
            exp_info['label']))