# Upload to S3
This notebook uploads the following to S3:
- data (~5.8 GB)
- configuration files (EMR bootstrap, EMR Spark configuration)
- apps, i.e. `.py` files executed as steps on EMR using `spark-submit`

In [7]:
import boto3
import glob
import configparser
import pathlib

In [8]:
# Load the AWS region and credentials from the project configuration file.
config = configparser.ConfigParser()
config_path = 'config/prj.cfg'
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with a clear message instead of
# letting the config.get() calls below raise a misleading NoSectionError.
if not config.read(config_path):
    raise FileNotFoundError(
        "Could not read configuration file: {}".format(config_path))

aws_region = config.get("AWS", "REGION")
access_id = config.get("AWS", "AWS_ACCESS_KEY_ID")
access_key = config.get("AWS", "AWS_SECRET_ACCESS_KEY")
# Echo only the region as the cell output; the two keys are secrets and are
# deliberately not displayed.
aws_region

'us-east-1'

In [9]:
# Name of the S3 bucket that the bucket-creation and upload cells below
# pass to `create_bucket` / `upload_file`.
bucket_name = "airbnbprj-us"

In [10]:
# Build the low-level client and the resource handle from the SAME explicit
# region and credentials. Previously only the client received them, so the
# resource fell back to boto3's default credential chain and could point at a
# different account/region (or fail when no default credentials exist).
aws_credentials = dict(region_name=aws_region,
                       aws_access_key_id=access_id,
                       aws_secret_access_key=access_key)
s3_client = boto3.client('s3', **aws_credentials)
s3 = boto3.resource('s3', **aws_credentials)

In [11]:
# --- S3 side -----------------------------------------------------------------
# Top-level key prefix for the raw data, and the "<prefix>/<relative path>"
# template used to build each object key.
raw_data_folder = "raw"
s3_key = "{}/{}"

# --- Local side --------------------------------------------------------------
# Local root directory of the data; the glob patterns below are relative to it
# and always use "/" as separator.
data_parent = "data"

# Airbnb listings and reviews.
path_global_listings = "airbnb-listings.csv"
path_city_listings = "cities/*/*/listings.csv"
path_city_reviews = "cities/*/*/reviews.csv"

# Weather station files (temperature and rain).
path_city_temperature = "weather/ECA_blend_tg/*.txt"
path_city_rain = "weather/ECA_blend_rr/*.txt"

In [None]:
# Create the target bucket in the configured region.
# S3 treats us-east-1 as the default location: a bucket there must be created
# WITHOUT a LocationConstraint, while every other region requires one.
# Choosing the call from `aws_region` replaces the manual (un)commenting.
if aws_region == "us-east-1":
    s3_client.create_bucket(Bucket=bucket_name)
else:
    s3_client.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={'LocationConstraint': aws_region},
    )

In [12]:
# Upload the global listings file to "<raw_data_folder>/<path relative to data_parent>".
pattern = str(pathlib.Path(data_parent, *path_global_listings.split("/")))
for filepath in glob.glob(pattern):
    # relative_to() strips the whole `data_parent` prefix (not just the first
    # path component) and as_posix() always yields "/" separators, so the key
    # is also correct on Windows, where glob returns backslash paths.
    relative_key = pathlib.Path(filepath).relative_to(data_parent).as_posix()
    print(relative_key)
    s3_client.upload_file(filepath, bucket_name,
                          s3_key.format(raw_data_folder, relative_key))

airbnb-listings.csv


In [13]:
# Upload every per-city listings file to "<raw_data_folder>/<path relative to data_parent>".
pattern = str(pathlib.Path(data_parent, *path_city_listings.split("/")))
for filepath in glob.glob(pattern):
    # relative_to() strips the whole `data_parent` prefix (not just the first
    # path component) and as_posix() always yields "/" separators, so the key
    # is also correct on Windows, where glob returns backslash paths.
    relative_key = pathlib.Path(filepath).relative_to(data_parent).as_posix()
    print(relative_key)
    s3_client.upload_file(filepath, bucket_name,
                          s3_key.format(raw_data_folder, relative_key))

cities/Amsterdam/2021-03/listings.csv
cities/Amsterdam/2021-02/listings.csv
cities/Amsterdam/2021-01/listings.csv
cities/Paris/2021-03/listings.csv
cities/Paris/2021-02/listings.csv
cities/Paris/2021-01/listings.csv
cities/Berlin/2021-03/listings.csv
cities/Berlin/2021-02/listings.csv
cities/Berlin/2021-01/listings.csv
cities/London/2021-03/listings.csv
cities/London/2021-02/listings.csv
cities/London/2021-01/listings.csv


In [14]:
# Upload every per-city reviews file to "<raw_data_folder>/<path relative to data_parent>".
pattern = str(pathlib.Path(data_parent, *path_city_reviews.split("/")))
for filepath in glob.glob(pattern):
    # relative_to() strips the whole `data_parent` prefix (not just the first
    # path component) and as_posix() always yields "/" separators, so the key
    # is also correct on Windows, where glob returns backslash paths.
    relative_key = pathlib.Path(filepath).relative_to(data_parent).as_posix()
    print(relative_key)
    s3_client.upload_file(filepath, bucket_name,
                          s3_key.format(raw_data_folder, relative_key))

cities/Amsterdam/2021-03/reviews.csv
cities/Amsterdam/2021-02/reviews.csv
cities/Amsterdam/2021-01/reviews.csv
cities/Paris/2021-03/reviews.csv
cities/Paris/2021-02/reviews.csv
cities/Paris/2021-01/reviews.csv
cities/Berlin/2021-03/reviews.csv
cities/Berlin/2021-02/reviews.csv
cities/Berlin/2021-01/reviews.csv
cities/London/2021-03/reviews.csv
cities/London/2021-02/reviews.csv
cities/London/2021-01/reviews.csv


In [15]:
# Upload the temperature station files to "<raw_data_folder>/<path relative to data_parent>".
pattern = str(pathlib.Path(data_parent, *path_city_temperature.split("/")))
for filepath in glob.glob(pattern):
    # relative_to() strips the whole `data_parent` prefix (not just the first
    # path component) and as_posix() always yields "/" separators, so the key
    # is also correct on Windows, where glob returns backslash paths.
    relative_key = pathlib.Path(filepath).relative_to(data_parent).as_posix()
    print(relative_key)
    s3_client.upload_file(filepath, bucket_name,
                          s3_key.format(raw_data_folder, relative_key))

weather/ECA_blend_tg/TG_STAID011249.txt
weather/ECA_blend_tg/TG_STAID000041.txt
weather/ECA_blend_tg/TG_STAID000593.txt
weather/ECA_blend_tg/TG_STAID001860.txt


In [16]:
# Upload the rain station files to "<raw_data_folder>/<path relative to data_parent>".
pattern = str(pathlib.Path(data_parent, *path_city_rain.split("/")))
for filepath in glob.glob(pattern):
    # relative_to() strips the whole `data_parent` prefix (not just the first
    # path component) and as_posix() always yields "/" separators, so the key
    # is also correct on Windows, where glob returns backslash paths.
    relative_key = pathlib.Path(filepath).relative_to(data_parent).as_posix()
    print(relative_key)
    s3_client.upload_file(filepath, bucket_name,
                          s3_key.format(raw_data_folder, relative_key))

weather/ECA_blend_rr/RR_STAID000041.txt
weather/ECA_blend_rr/RR_STAID000593.txt
weather/ECA_blend_rr/RR_STAID011249.txt
weather/ECA_blend_rr/RR_STAID001860.txt


In [6]:
# upload config
# Each file directly under config/ is uploaded under the same relative path
# ("config/<name>") as its S3 key.
# NOTE(review): this pattern also matches config/prj.cfg, the file this
# notebook reads the AWS access key and secret from, so the credentials end up
# in the bucket. Confirm that is intended, or exclude that file.
filepaths = 'config/*'
for filepath in glob.glob(filepaths):
    local_path = pathlib.Path(filepath)
    # The pattern can also match sub-directories, which upload_file cannot send.
    if not local_path.is_file():
        continue
    # as_posix() keeps "/" in the key even where glob returns backslash paths.
    s3_client.upload_file(filepath, bucket_name, local_path.as_posix())

In [13]:
# upload apps
# Each file directly under apps/ is uploaded under the same relative path
# ("apps/<name>") as its S3 key.
filepaths = 'apps/*'
for filepath in glob.glob(filepaths):
    local_path = pathlib.Path(filepath)
    # The pattern can also match sub-directories (e.g. __pycache__), which
    # upload_file cannot send.
    if not local_path.is_file():
        continue
    # as_posix() keeps "/" in the key even where glob returns backslash paths.
    s3_client.upload_file(filepath, bucket_name, local_path.as_posix())