## AWS Pipeline
This notebook downloads the files under selected prefixes of the SafeGraph S3 bucket into the local raw data folder, skipping files that are already present locally.

In [10]:
import os
from dotenv import load_dotenv, find_dotenv

In [11]:
# Find the .env file automatically by walking up directories until it's found,
# then load its entries as environment variables.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Connection settings. The two credential values may legitimately be None:
# boto3.Session accepts None for them.
aws_access_key = os.environ.get("AWS_ACCESS_KEY")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_bucket = os.environ.get("AWS_BUCKET")
root_dir = os.environ.get("ROOT_DIR")

# Fail fast with a readable message: the bucket name and the project root are
# used unconditionally by later cells, where a None value would otherwise
# surface as an opaque TypeError far from its cause.
_missing_settings = [
    name
    for name, value in (("AWS_BUCKET", aws_bucket), ("ROOT_DIR", root_dir))
    if value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in the environment or in the project's .env file."
    )

In [12]:
import boto3

In [13]:
# Authenticated boto3 session built from the credentials read from the
# environment.
session = boto3.Session(
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key,
)

# Resource-level S3 interface. Note the endpoint: requests go to Wasabi's
# S3-compatible service rather than to AWS itself.
s3_resource = session.resource(
    service_name="s3",
    endpoint_url="https://s3.wasabisys.com",
)

# Handle on the bucket named by the AWS_BUCKET setting.
s3_bucket = s3_resource.Bucket(aws_bucket)

In [14]:
# Key prefixes that select which bucket objects get downloaded.
# This must stay a tuple: it is passed directly to str.startswith(), which
# accepts a tuple of prefixes but not a list.
paths = (
    # Core places and geography
    "core/",
    "geo-supplement/",
    # Monthly patterns, including the 2019/2020 backfill
    "monthly-patterns/normalization_stats/",
    "monthly-patterns/patterns/",
    "monthly-patterns/patterns_backfill/2020/05/07/12/2019/",
    "monthly-patterns/patterns_backfill/2020/05/07/12/2020/",
    # Census data
    "open-census-data/",
    # NOTE(review): these two prefixes have no trailing slash, so they match
    # any key that merely begins with this text -- confirm that is intended.
    "core-places-delivery/brand_info",
    "core-places-delivery/core_poi",
    # Monthly patterns, 2020-12 delivery
    "monthly-patterns-2020-12/normalization_stats/",
    "monthly-patterns-2020-12/patterns/",
)

In [15]:
# Sync the raw data folder with the Safegraph bucket (download-only: nothing
# is ever deleted or overwritten locally).

# Make a list of all files in the raw data folder, expressed relative to that
# folder and with "/" separators so the entries are directly comparable with
# S3 object keys on every operating system (os.walk yields "\" on Windows).
local = os.path.join(root_dir, 'data/raw')
files = []
for dirpath, _dirnames, filenames in os.walk(local):
    for filename in filenames:
        relative_path = os.path.relpath(os.path.join(dirpath, filename), local)
        files.append(relative_path.replace(os.sep, "/"))

# A set makes the per-object "already downloaded?" check constant-time
# instead of a scan over the whole list for every object in the bucket.
existing_files = set(files)

# Download the objects in the bucket that are not in the raw data folder.
for obj in s3_bucket.objects.all():
    key = obj.key
    # Keys ending in "/" are folder placeholder objects, not files; their
    # local destination would be a directory, so downloading them would fail.
    if key.endswith("/"):
        continue
    if key.startswith(paths) and key not in existing_files:
        print(key)
        filepath = os.path.join(local, key)
        # Create the destination directory tree on demand.
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        s3_bucket.download_file(key, filepath)

# TODO (HK): ideally we would delete any files from the local folder that are
#     no longer in the bucket.
#     Ideally we would also check if the files that overlap are the same in
#     both places.