# Setup Data in S3

## Import libraries

In [2]:
import boto3
import botocore
import os
import pandas as pd
import sagemaker

from IPython.core.display import display, HTML
from pyathena import connect

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


  from IPython.core.display import display, HTML


In [3]:
# Sanity check: list the variables currently persisted with %store.
# A bare `%store` only prints the stored names and values; it does not load
# them into this kernel. Presumably `setup_dependencies_passed` was stored by
# an earlier setup notebook -- confirm before relying on it here.
%store

Stored variables and their in-db values:
setup_dependencies_passed             -> True
setup_s3_bucket_passed                -> True


## Setup S3 Bucket

In [4]:
# Gather the AWS context (session, role, region, account, S3 client) that the
# remaining cells of this notebook refer to.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# The STS caller identity reports the account this notebook is running under.
caller_identity = boto3.client("sts").get_caller_identity()
account_id = caller_identity.get("Account")

# S3 client bound to the same region as the SageMaker session.
s3 = boto3.client(
    's3',
    region_name=sagemaker_session.boto_region_name,
)

In [5]:
# define bucket name
bucket_name = 'wizard-of-tasks-dataset-5432'

# Region the S3 client is bound to; needed for the LocationConstraint below.
bucket_region = s3.meta.region_name

# delete bucket if it already exists
try:
    s3.head_bucket(Bucket=bucket_name)
except botocore.exceptions.ClientError as err:
    # Only "not found" means there is nothing to clean up. Any other error
    # (e.g. 403: the name belongs to another account, or missing permissions)
    # must surface instead of being silently ignored.
    if err.response.get("Error", {}).get("Code") not in ("404", "NoSuchBucket", "NotFound"):
        raise
else:
    print(f"Bucket {bucket_name} already exists. Deleting it.")
    # S3 refuses to delete a non-empty bucket, so remove every object first.
    # Each page holds at most 1,000 keys, which is also the delete_objects limit.
    # NOTE(review): object versions are not removed; this assumes the bucket is
    # unversioned, as it is when created by this cell.
    for page in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket_name):
        stale_keys = [{"Key": obj["Key"]} for obj in page.get("Contents", [])]
        if stale_keys:
            s3.delete_objects(Bucket=bucket_name, Delete={"Objects": stale_keys})
    s3.delete_bucket(Bucket=bucket_name)
    # Wait until S3 reports the bucket gone before re-creating the same name.
    s3.get_waiter("bucket_not_exists").wait(Bucket=bucket_name)

# create a new S3 bucket
# Outside us-east-1 the API requires an explicit LocationConstraint; us-east-1
# itself must be created without one.
create_kwargs = {"Bucket": bucket_name}
if bucket_region != "us-east-1":
    create_kwargs["CreateBucketConfiguration"] = {"LocationConstraint": bucket_region}
s3.create_bucket(**create_kwargs)

Bucket wizard-of-tasks-dataset-5432 already exists. Deleting it.


{'ResponseMetadata': {'RequestId': '1X9VJ7ZFQGBM2GHB',
  'HostId': 'JGRjPlhUKaNcS7CIiGh4WZBV7pA4e9nnuHg30gG5u7YYPhT4PeuG/uq48D9hwqQfgHZZhE2nu+6CSrTyVb0AX1XE3CfyvUor',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'JGRjPlhUKaNcS7CIiGh4WZBV7pA4e9nnuHg30gG5u7YYPhT4PeuG/uq48D9hwqQfgHZZhE2nu+6CSrTyVb0AX1XE3CfyvUor',
   'x-amz-request-id': '1X9VJ7ZFQGBM2GHB',
   'date': 'Tue, 04 Jun 2024 02:18:51 GMT',
   'location': '/wizard-of-tasks-dataset-5432',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'Location': '/wizard-of-tasks-dataset-5432'}

In [6]:
# Echo the bucket this notebook works with.
print(f"Default bucket: {bucket_name}")

Default bucket: wizard-of-tasks-dataset-5432


## Copy Data to S3 Bucket

In [7]:
# Copy the dataset from the public `wizard-of-tasks` bucket into our own bucket.
# `!` runs the AWS CLI in a shell and IPython substitutes `{bucket_name}` with
# the Python variable; `--recursive` copies every object under the source.
# NOTE: a failing shell command does not raise in the notebook, so check the
# output below (or the listing cell further down) to confirm the copy worked.
!aws s3 cp s3://wizard-of-tasks/ s3://{bucket_name}/ --recursive

copy: s3://wizard-of-tasks/README.md to s3://wizard-of-tasks-dataset-5432/README.md
copy: s3://wizard-of-tasks/wizard_of_tasks_cooking_v1.0.json to s3://wizard-of-tasks-dataset-5432/wizard_of_tasks_cooking_v1.0.json
copy: s3://wizard-of-tasks/wizard_of_tasks_diy_v1.0.json to s3://wizard-of-tasks-dataset-5432/wizard_of_tasks_diy_v1.0.json


In [8]:
# Render a link to the bucket created above in the S3 console so the upload
# can be checked by eye. The link must use `bucket_name` (the bucket this
# notebook populated), and `_blank` is the keyword that opens a new tab.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/?region={}&tab=overview">S3 Bucket</a></b>'.format(
            bucket_name, region
        )
    )
)

In [9]:
# List the bucket contents to confirm the copy. A single list_objects_v2 call
# returns at most 1,000 keys, so page through the results to see everything.
paginator = s3.get_paginator("list_objects_v2")

# iterate over the objects of every page (an empty bucket has no 'Contents')
for page in paginator.paginate(Bucket=bucket_name):
    for item in page.get('Contents', []):
        print(f"Key: {item['Key']}, Last Modified: {item['LastModified']}")

Key: README.md, Last Modified: 2024-06-04 02:18:53+00:00
Key: wizard_of_tasks_cooking_v1.0.json, Last Modified: 2024-06-04 02:18:53+00:00
Key: wizard_of_tasks_diy_v1.0.json, Last Modified: 2024-06-04 02:18:53+00:00


In [10]:
# store variable that s3 bucket has been setup
setup_s3_bucket_passed = True
%store setup_s3_bucket_passed
%store bucket_name

Stored 'setup_s3_bucket_passed' (bool)
Stored 'bucket_name' (str)


In [11]:
# List the persisted variables again to confirm `bucket_name` and
# `setup_s3_bucket_passed` were written by the previous cell.
%store

Stored variables and their in-db values:
bucket_name                           -> 'wizard-of-tasks-dataset-5432'
setup_dependencies_passed             -> True
setup_s3_bucket_passed                -> True


## Shut down notebook resources

In [12]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
// Programmatically click the hidden button above; its
// data-commandlinker-command attribute names the "kernelmenu:shutdown" command.
try {
    // `const` keeps the lookup block-scoped instead of leaking an implicit global.
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // Deliberately ignored: if the button cannot be found or clicked,
    // the kernel is simply left running.
}
</script>

In [13]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>