In [None]:
# Load data from ./data/ to S3 exploration notebook

In [1]:
# Imports
import os
import boto3
import zipfile
import shutil
import progressbar

# Notebook specific
from IPython.display import clear_output

# Exploring data
import json
from pathlib import Path

In [2]:
# Create connections
region = 'us-east-1'
s3_client = boto3.client('s3', region_name=region)

In [3]:
# Retrieve the list of existing buckets
response = s3_client.list_buckets()

In [4]:
# Output the bucket names
print('Existing buckets:')
for bucket in response['Buckets']:
    print(f'  {bucket["Name"]}')

Existing buckets:
  arxiv-etl
  jbma-data-engineering-us-east


In [5]:
# Check if bucket already exists
bucket_name = 'arxiv-etl'
bucket_exists = False
for obj in response['Buckets']:
    if obj['Name'] == bucket_name:
        bucket_exists = True
print(f"Bucket exists: {bucket_exists}")

Bucket exists: True


In [6]:
# Create bucket if it doesn't exist
if not bucket_exists:
    s3_client.create_bucket(Bucket=bucket_name)
    print(f"Bucket {bucket_name} created")
else: 
    print(f"Bucket {bucket_name} already exists")

Bucket arxiv-etl already exists


In [7]:
# Upload data to bucket - replace existing files as needed

In [8]:
path = '../data/loading/'
directory = os.fsencode(path)

for folder in os.listdir(directory):
    dir_name = os.fsdecode(folder)
    print("Path:", dir_name)
    dir = os.fsencode(os.path.join(path, dir_name))
    for file in os.listdir(dir):
        file_name = os.fsdecode(file)
        print("File:", file_name)

Path: authors
File: authors-parsed.json
Path: citations
File: internal-citations.json
Path: classifications
File: subject-classifications.csv
Path: metadata
File: arxiv-metadata-oai-snapshot-sample.json


In [9]:
def upload_file(s3_client, bucket_name, path, folder_name, file_name):
    # https://stackoverflow.com/questions/41827963/track-download-progress-of-s3-file-using-boto3-and-callbacks
    full_name = os.path.join(path, folder_name, file_name)
    s3_path = f'staging/{folder_name}/{file_name}'

    statinfo = os.stat(full_name)

    up_progress = progressbar.progressbar.ProgressBar(maxval=statinfo.st_size)

    up_progress.start()

    def upload_progress(chunk):
        clear_output(wait = True) # Only for IPython (Notebook)
        up_progress.update(up_progress.currval + chunk)

    response = s3_client.upload_file(full_name, bucket_name, s3_path, Callback=upload_progress)

    up_progress.finish()

    return response

In [39]:
# Sync data/loaded folder to s3
# https://dev.to/razcodes/how-to-copy-files-to-s3-using-boto3-41fp

path = '../data/loading/'
directory = os.fsencode(path)

# https://stackoverflow.com/questions/10377998/how-can-i-iterate-over-files-in-a-given-directory

for folder in os.listdir(directory):
    folder_name = os.fsdecode(folder)
    print("Path:", folder_name)
    dir = os.fsencode(os.path.join(path, folder_name))
    for file in os.listdir(dir):
        file_name = os.fsdecode(file)
        if file_name.endswith(".json") or file_name.endswith(".csv"): 
            print("Uploading", file_name)
            #full_name = os.path.join(folder_name, file_name)
            #response = s3_client.upload_file(full_name, bucket_name, f'staging/{file_name}')
            response = upload_file(s3_client, bucket_name, path, folder_name, file_name)
            if response is not None:
                print("HTTPStatusCode:", response['ResponseMetadata']['HTTPStatusCode'])

FileNotFoundError: [WinError 3] The system cannot find the path specified: b'metadata'

In [None]:
# Delete data/loaded folder 
shutil.rmtree(folder_name)