In [143]:
import os
import cv2
import glob
import shutil

# Main tool to connect and manage an S3 AWS bucket (https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-examples.html)
import boto3

In [150]:
def bucket_list(s3obj, nout=10):
    """
    Lists the buckets in an S3 account
    S3OBJ is captured by the command:
    S3OBJ = boto3.resource('s3')
    
    By default, outputs the first 10 buckets (the order is just the one given by the function), 
    this can be changed by setting the optional parameter NOUT to a different value
    """
    nbuckets = len(list(s3obj.buckets.all()))
    if nbuckets > nout:
        print(f'There are {nbuckets} buckets in the account')
        print(f'Listing the first {min([nbuckets, nout])} buckets:')
    else:
        print(f'Listing all the {nbuckets} buckets in the account:')
    for idx, bucket in enumerate(s3obj.buckets.all()):
        if idx > nout:
            break
        print(f'[{idx}]\t{bucket.name}')

def list_bucket_content(selected_bucket, nout=10):
    """
    Takes a list_objects output and lists the file names (taken from the KEY field)
    SELECTED_BUCKET is captured by the command:
    SELECTED_BUCKET =  S3obj.Bucket(BUCKET_NAME).objects.all()
    
    By default, outputs a few objects in the bucket (the order is just the one given by the function), 
    this can be changed by setting the optional parameter NOUT to a different value
    """
    nels = len(list(selected_bucket))
    if nels > nout:
        print(f'There are {nels} objects in the bucket')
        print(f'Listing the first {nout} objects:')
    else:
        print(f'Listing all the {nels} objects in the bucket:')
    print('Index\tFile Size (Mb)\t File Name')
    for idx, bckt_element in enumerate(selected_bucket):
        if idx > nout:
            break
        print(f'[{idx}]\t{bckt_element.size/(1024**2):.3f}\t\t{bckt_element.key}')

    return nels

In [145]:
# Assuming your credentials to access AWS resources has been setup using `aws configure` (which saves them in ~/.aws/credentials)
# there is no need to explicitely connect to your AWS, by default it'll use the credentials in that file
S3obj = boto3.resource('s3')

In [146]:
# List the buckets available:
bucket_list(S3obj)

Listing all the 7 buckets in the account:
[0]	isandexdcm
[1]	jose-ulloabankingapplication
[2]	jose-ulloashoppingcartapp
[3]	logs.scua.cl
[4]	orthanc.scua.cl
[5]	scua.cl
[6]	www.scua.cl


In [147]:
bcktindex = 4
BUCKET_NAME=list(S3obj.buckets.all())[bcktindex].name
print(f'Data will be fetched from bucket: {BUCKET_NAME}')

Data will be fetched from bucket: orthanc.scua.cl


In [148]:
# Gets the bucket's content:
bckt2proc = S3obj.Bucket(BUCKET_NAME)
bckt_content = bckt2proc.objects.all()

In [149]:
# List the first elements in the bucket:
nels = list_bucket_content(bckt_content)

There are 12600 objects in the bucket
Listing the first 10 objects:
Index	File Size (Mb)	 File Name
[0]	0.993		000df9d8-3120-47a7-9a17-ae8541e34dff.dcm
[1]	0.993		00169169-5d8c-4b5f-97b4-27561eabe992.dcm
[2]	0.993		001e218d-10e6-4681-82e9-24fadfe7f000.dcm
[3]	0.993		002a10a2-cf9e-4bdb-96d7-db6ad1579792.dcm
[4]	0.536		002d108b-4bd0-4494-976b-67ae7b5e7f58.dcm
[5]	0.536		002d475d-6e59-4ade-ac11-ceba84d0108c.dcm
[6]	0.993		002d78c8-23bf-4a38-8043-9d2f9e9a10c7.dcm
[7]	0.536		002dcff2-fd46-46ad-ab1b-6c1f202f3439.dcm
[8]	0.536		0034eb09-dd44-4daf-a575-e1ec796f4438.dcm
[9]	0.536		004526b6-65f3-491f-90c7-d483fb4fb56a.dcm
[10]	0.536		00469e14-9f5b-46bb-aba4-d718457c9138.dcm


In [151]:
# Setup everything to download the data:
HOMEPATH = os.getenv('HOME')
DATAPATH = os.path.join(HOMEPATH, 'Data', 'fMRIBreastData')
SRCDIR = 'rawS3'
FEXT = 'dcm' # Only interested in the DCM files

In [152]:
SRCPATH = os.path.join(DATAPATH, SRCDIR)
os.makedirs(SRCPATH, exist_ok=True)

In [158]:
# Finally, run the download loop. To avoid duplicate downloads, check whether the file exist before downloading
for idx, bckt_element in enumerate(bckt_content):
    element_name = os.path.splitext(bckt_element.key)
    if element_name[1] == '.dcm':
        lfilename = os.path.join(SRCPATH, ''.join(element_name))
        if not (os.path.isfile(lfilename)):
            if ((idx > 0) & ((idx % 100) == 0) & (idx < (nels - 1))):
                print(f'Downloading bucket element {bckt_element.key}')
                print(f'{nels-idx} remaining objects to download. Please wait...')
            bckt2proc.download_file(bckt_element.key, lfilename)
        else:
            print(f'File {bckt_element.key} already exists locally. Nothing done')
    else:
        print(f'Skipping bucket object {bckt_element.key}, it is not {FEXT.upper()} format')

File 000df9d8-3120-47a7-9a17-ae8541e34dff.dcm already exists locally. Nothing done
File 00169169-5d8c-4b5f-97b4-27561eabe992.dcm already exists locally. Nothing done
File 001e218d-10e6-4681-82e9-24fadfe7f000.dcm already exists locally. Nothing done
File 002a10a2-cf9e-4bdb-96d7-db6ad1579792.dcm already exists locally. Nothing done
File 002d108b-4bd0-4494-976b-67ae7b5e7f58.dcm already exists locally. Nothing done
File 002d475d-6e59-4ade-ac11-ceba84d0108c.dcm already exists locally. Nothing done
File 002d78c8-23bf-4a38-8043-9d2f9e9a10c7.dcm already exists locally. Nothing done
File 002dcff2-fd46-46ad-ab1b-6c1f202f3439.dcm already exists locally. Nothing done
File 0034eb09-dd44-4daf-a575-e1ec796f4438.dcm already exists locally. Nothing done
File 004526b6-65f3-491f-90c7-d483fb4fb56a.dcm already exists locally. Nothing done
File 00469e14-9f5b-46bb-aba4-d718457c9138.dcm already exists locally. Nothing done
File 004a1697-12e2-4537-abb3-a3ee014682a7.dcm already exists locally. Nothing done
File