# Set Up Data in S3

## Import libraries

In [1]:
import boto3
import botocore
import os
import pandas as pd
import sagemaker
import random
from pyathena import connect

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# check dependencies are stored
# `%store` with no arguments lists every variable currently persisted in
# IPython's store, so the values this notebook relies on can be inspected
# before any AWS resources are touched.
%store

Stored variables and their in-db values:
bucket_name                            -> 'housing-dataset-54355'
set_up_dependencies_passed             -> True
set_up_s3_bucket_passed                -> True


## Set Up S3 Bucket

In [3]:
# Collect the AWS identity and session objects used by the cells below.

# Account ID of the credentials this kernel is running under.
caller_identity = boto3.client("sts").get_caller_identity()
account_id = caller_identity.get("Account")

# Region configured for the default boto3 session.
default_boto_session = boto3.Session()
region = default_boto_session.region_name

# SageMaker execution role and session.
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

# S3 client bound to the region of the SageMaker session.
s3_region = sagemaker_session.boto_region_name
s3 = boto3.client('s3', region_name=s3_region)

In [4]:
# define bucket name
# NOTE: a fresh random suffix is drawn on every run, so re-running this cell
# targets a different bucket each time rather than reusing a previous one.
bucket_name = 'housing-dataset-'
random_int = random.randint(1000, 9999)  # Generate a random integer between 1000 and 9999
bucket_name += str(random_int)  # Append the random integer to the end of bucket_name

# delete bucket if it already exists
try:
    s3.head_bucket(Bucket=bucket_name)
except botocore.exceptions.ClientError as err:
    # Only "bucket does not exist" is an expected outcome here. Anything else
    # (e.g. 403: the name is taken by a bucket we cannot access) would make the
    # create call below fail anyway, so surface it instead of hiding it.
    if err.response.get("Error", {}).get("Code") not in ("404", "NoSuchBucket"):
        raise
else:
    print(f"Bucket {bucket_name} already exists. Deleting it.")
    # S3 rejects delete_bucket on a non-empty bucket, so remove the objects
    # first. Each listing page holds at most 1,000 keys, which is also the
    # per-request limit of delete_objects.
    # NOTE(review): assumes the bucket is not versioned; old object versions
    # would need list_object_versions to be purged.
    for page in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket_name):
        stale_keys = [{"Key": obj["Key"]} for obj in page.get("Contents", [])]
        if stale_keys:
            s3.delete_objects(Bucket=bucket_name, Delete={"Objects": stale_keys})
    s3.delete_bucket(Bucket=bucket_name)

# create a new S3 bucket
# Outside us-east-1 the target region must be passed as a LocationConstraint;
# us-east-1 itself must NOT be passed as one.
create_bucket_kwargs = {"Bucket": bucket_name}
bucket_region = s3.meta.region_name
if bucket_region != "us-east-1":
    create_bucket_kwargs["CreateBucketConfiguration"] = {"LocationConstraint": bucket_region}
# Kept as the cell's last expression so the response is still displayed.
s3.create_bucket(**create_bucket_kwargs)

{'ResponseMetadata': {'RequestId': 'QA33JZ1ZTH2DC7HD',
  'HostId': 'X1tEidKwI7mEwIU/EHgvrX3Sz9OqQlxXan3KzmQ+9kv5xSvkPzPfQPCrapcnY8bPBbS/91TvxCw=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'X1tEidKwI7mEwIU/EHgvrX3Sz9OqQlxXan3KzmQ+9kv5xSvkPzPfQPCrapcnY8bPBbS/91TvxCw=',
   'x-amz-request-id': 'QA33JZ1ZTH2DC7HD',
   'date': 'Thu, 20 Jun 2024 02:50:02 GMT',
   'location': '/housing-dataset-2111',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'Location': '/housing-dataset-2111'}

In [5]:
# Show which bucket the remaining cells will read from and write to.
bucket_message = "Default bucket: {}".format(bucket_name)
print(bucket_message)

Default bucket: housing-dataset-2111


## Copy Data to S3 Bucket

In [6]:
# save path to local datasets
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
local_path = os.path.join(parent_directory, 'data')
print(local_path)

# copy local datasets to bucket
!aws s3 cp "$local_path" s3://{bucket_name}/data/ --recursive

/home/sagemaker-user/AAI_540_SU_04/data
upload: ../data/test.csv to s3://housing-dataset-2111/data/test.csv
upload: ../data/merged_data.csv to s3://housing-dataset-2111/data/merged_data.csv
upload: ../data/train.csv to s3://housing-dataset-2111/data/train.csv


In [7]:
# check it uploaded by listing objects in the bucket
# A single list_objects_v2 call returns at most 1,000 keys; iterate over the
# paginator so every object is shown regardless of how many were uploaded.
object_pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket_name)
for response in object_pages:
    # 'Contents' is absent from a page with no objects, hence the default.
    for item in response.get('Contents', []):
        print(f"Key: {item['Key']}, Last Modified: {item['LastModified']}")

Key: data/merged_data.csv, Last Modified: 2024-06-20 02:50:04+00:00
Key: data/test.csv, Last Modified: 2024-06-20 02:50:04+00:00
Key: data/train.csv, Last Modified: 2024-06-20 02:50:04+00:00


In [8]:
# store variable that s3 bucket has been setup
set_up_s3_bucket_passed = True
%store set_up_s3_bucket_passed
%store bucket_name

Stored 'set_up_s3_bucket_passed' (bool)
Stored 'bucket_name' (str)


In [9]:
# check that variables are stored
# With no arguments, `%store` prints every persisted variable and its value,
# which should now include the bucket name and setup flag saved above.
%store

Stored variables and their in-db values:
bucket_name                            -> 'housing-dataset-2111'
set_up_dependencies_passed             -> True
set_up_s3_bucket_passed                -> True


## Shut down notebook resources

In [10]:
%%javascript

// Best-effort shutdown: save a checkpoint of the notebook, then delete its
// session through the front end's `Jupyter` object.
// NOTE(review): presumably the `Jupyter` global only exists in some front
// ends; where it is undefined the calls throw and the catch below ignores
// the error - confirm which front ends are targeted.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // Deliberately ignored: a failed shutdown must not surface as a cell error.
}

<IPython.core.display.Javascript object>

In [11]:
%%html

<!-- Notice shown to the reader, plus a hidden button wired to the
     "kernelmenu:shutdown" command through its data attribute. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
// Programmatically click the hidden button above so that the shutdown command
// fires without user interaction. Best-effort: any failure (for example, no
// matching element) is ignored.
try {
    // Declared with const so the lookup result does not leak into the page's
    // global scope as an implicit global.
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // Deliberately ignored: a failed shutdown must not surface as an error.
}
</script>