# Download de ficheiros da AWS Registry of Open Data


Este notebook permite fazer o download de dados armazenados num bucket S3 da AWS. Para este trabalho, optámos por utilizar o data lake **Department of Energy's Open Energy Data Initiative (OEDI)**, mais concretamente a informação de produção de energia fotovoltaica de várias centrais localizadas nos EUA: **Photovoltaic Data Acquisition (PVDAQ) Public Datasets**.

A informação está disponível em https://registry.opendata.aws/oedi-data-lake/

In [1]:
import os
import boto3

In [2]:
from botocore import UNSIGNED
from botocore.client import Config

# FUNÇÕES

In [4]:
# Print out the objects' name in a sub-folder of a S3 bucket, given the s3 resource.
# Optionally, filters specific names

def print_all_objects_name_in_folder(s3_resource, bucket_name, prefix, name_word = ""):
    """Print the key of every object stored under ``prefix`` in an S3 bucket.

    Args:
        s3_resource: boto3 S3 service resource (any object exposing
            ``Bucket(name).objects.filter(Prefix=...)`` works).
        bucket_name: Name of the bucket to list.
        prefix: Key prefix ("sub-folder") selecting the objects to list.
        name_word: Optional substring that the object's file name (the part
            of the key after the last "/") must contain. An empty value
            (the default) disables the filter.
    """
    bucket = s3_resource.Bucket(bucket_name)

    for obj in bucket.objects.filter(Prefix=prefix):
        # Test the file name only, the same way download_all_objects_in_folder
        # does. Testing the whole key would match every object whenever the
        # word also appears in the prefix (e.g. "csv" in "pvdaq/csv/..."),
        # so the listing would not reflect what actually gets downloaded.
        if name_word and name_word not in os.path.basename(obj.key):
            continue
        print(obj.key)
            

In [5]:
# Download all objects in a sub-folder of a S3 bucket, given the s3 resource.
# Optionally, filters specific names

def download_all_objects_in_folder(s3_resource, bucket_name, prefix, name_word = ""):
    """Download every object stored under ``prefix`` in an S3 bucket.

    Each object is saved under its file name only (the part of the key after
    the last "/"), passed to ``download_file`` as a relative path. The key's
    directory structure is not reproduced, so objects that share a file name
    under different sub-prefixes overwrite one another.

    Args:
        s3_resource: boto3 S3 service resource (any object exposing
            ``Bucket(name)`` with ``objects.filter`` and ``download_file``).
        bucket_name: Name of the bucket to download from.
        prefix: Key prefix ("sub-folder") selecting the objects to download.
        name_word: Optional substring that the file name must contain. An
            empty value (the default) disables the filter.
    """
    bucket = s3_resource.Bucket(bucket_name)

    for obj in bucket.objects.filter(Prefix=prefix):
        filename = os.path.basename(obj.key)
        # A key ending in "/" is a folder placeholder: it has no file name,
        # and passing "" as the download target would fail.
        if not filename:
            continue
        if name_word and name_word not in filename:
            continue
        bucket.download_file(obj.key, filename)
        

# DEFINIÇÕES

In [8]:
# Initiate the S3 resource.
# signature_version=UNSIGNED turns off request signing, so the requests are
# sent anonymously and no AWS credentials need to be configured.
s3_resource = boto3.resource('s3', config=Config(signature_version=UNSIGNED))

In [9]:
# Name of the S3 bucket to read from
BUCKET_NAME = 'oedi-data-lake' 

In [10]:
# Particular word in the filename, if it is of interest
# e.g. csv, json, parquet
# We chose to use only the csv files, which contain the production data of
# the photovoltaic plants.

WORD_IN_FILENAME = 'csv' 

In [12]:
# Key prefix (path inside the bucket) to restrict the objects to, if it is of interest
# To limit the amount of data used, we chose to retrieve only the data of
# plant (system) 10.

PATH_NAME = 'pvdaq/csv/pvdata/system_id=10/'

# Other paths left over as notes (unused):
#C:/Users/jc_co/Downloads/aws/
#s3://oedi-data-lake/umcm/
# Narrower alternative prefix, restricted to year=2021:
#PATH_NAME = 'pvdaq/csv/pvdata/system_id=10/year=2021/'

# CODE

In [None]:
# Variant without the name filter (prints every key under PATH_NAME):
#print_all_objects_name_in_folder(s3_resource, BUCKET_NAME, PATH_NAME)

# Print the keys under PATH_NAME, filtered by WORD_IN_FILENAME
print_all_objects_name_in_folder(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)

In [13]:
# Variant without the name filter (downloads every object under PATH_NAME):
#download_all_objects_in_folder(s3_resource, BUCKET_NAME, PATH_NAME)

# Download the objects under PATH_NAME whose file name contains WORD_IN_FILENAME
download_all_objects_in_folder(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)

# NOTE(review): the helper below is not defined in this notebook as shown;
# presumably a variant that keeps the full key in the local name. Confirm.
#download_all_objects_in_folder_with_complete_name(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)

KeyboardInterrupt: 