# 00 - S3 Datalake and Athena Database

In [2]:
# skip local file copy when using full dataset
full_dataset = True
# dataset should be copied via AWS CLI:
# aws s3 sync <LocalPath> <S3Uri>
# safety/data/images
# safety/data/labels
# place all images in these directories
# don't include subfolders

# Install Required Packages

In [3]:
!pip install pyathena

[0m

# Import Required Libraries

In [4]:
import boto3 # aws sdk for python
import csv # csv file reading and writing
import sagemaker # machine learning platform
import pandas as pd # python data analysis
from IPython.display import display, HTML # for S3 bucket review
from pyathena import connect # athena client

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


# Perform Prerequisites

In [5]:
# establish S3 bucket, provide permissions
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

# create a boto3 session for the sagemaker service
sm = boto3.Session().client(service_name="sagemaker", region_name=region)

# Copy Raw Data to S3 Datalake

## Set Raw Data Source Location

In [6]:
# path contents should contain an 'images' and a 'labels' folder
# images are .jpg files and labels are .txt files
path_raw_data = './data'

# store variable
%store path_raw_data

# print raw data source location
print('Raw Data Source Location:', path_raw_data)

Stored 'path_raw_data' (str)
Raw Data Source Location: ./data


## Set S3 Destination Location

In [7]:
# define paths to store raw data within s3 bucket
path_s3_data = "s3://{}/safety/data".format(bucket) # raw images and labels
path_s3_catalog = "s3://{}/safety/catalog".format(bucket) # catalog for athena queries

# store variables
%store path_s3_data
%store path_s3_catalog

# print s3 destination locations
print('S3 Destination Data Location:', path_s3_data)
print('S3 Destination Catalog Location:', path_s3_catalog)

Stored 'path_s3_data' (str)
Stored 'path_s3_catalog' (str)
S3 Destination Data Location: s3://sagemaker-us-east-1-414754026690/safety/data
S3 Destination Catalog Location: s3://sagemaker-us-east-1-414754026690/safety/catalog


## Perform Data Copy

In [8]:
if not full_dataset:

    # copy raw data into s3 datalake
    s3_uri = sess.upload_data(
        path=path_raw_data, # source path
        bucket=bucket, # destination bucket
        key_prefix='safety/data' # destination directory within bucket
    )

## Verify Successful Copy

### List Contents of S3 Datalake

In [9]:
# limit to summary
print('Images:')
!aws s3 ls $path_s3_data/'images'/ --recursive --human-readable --summarize | tail -n 2
print('\nLabels:')
!aws s3 ls $path_s3_data/'labels'/ --recursive --human-readable --summarize | tail -n 2

Images:
Total Objects: 22141
   Total Size: 4.3 GiB

Labels:
Total Objects: 22141
   Total Size: 13.7 MiB


### Review S3 Bucket

In [10]:
# display link to s3 bucket for review
display(
    HTML(
        '<b>Review <a target="blank" href="https://s3.console.aws.amazon.com/s3/buckets/sagemaker-{}-{}/safety/?region={}&tab=overview">S3 Bucket</a></b>'.format(
            region, account_id, region
        )
    )
)

## Copy Catalog Data to S3 Datalake

### Create Catalog Data

In [11]:
# access bucket for obtaining image and label filenames
resource_s3 = boto3.resource('s3')
bucket_s3 = resource_s3.Bucket(bucket)

# define prefixes to locate images and labels
prefix_images = 'safety/data/images'
prefix_labels = 'safety/data/labels'

# filter for images and labels
objects_images =  bucket_s3.objects.filter(Prefix=prefix_images)
objects_labels =  bucket_s3.objects.filter(Prefix=prefix_labels)

# extract filenames for images and labels
filenames_images = [object_image.Object().key.split('/')[-1] for object_image in objects_images]
filenames_labels = [object_label.Object().key.split('/')[-1] for object_label in objects_labels]

# confirm number of images and labels are equal before proceeding
assert len(filenames_images) == len(filenames_labels)

# create list of rows to write to csv file
rows_csv = [['sample_id', 'img_filename', 'label_filename']]

# make sure there is a matching labels .txt file for every images .jpg file
for sample in filenames_images:
    if sample.split('.')[0] + '.txt' not in filenames_labels:
        print(f"Warning: {sample} does not have matching labels {sample.split('.')[0] + '.txt'} file.")
        continue
    # append row for writing to csv file
    rows_csv.append([sample.split('.')[0], sample, sample.split('.')[0] + '.txt'])

# create and write csv file
with open('catalog.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(rows_csv)

### Perform Catalog Copy

In [12]:
!aws s3 cp ./catalog.csv $path_s3_catalog/catalog.csv

upload: ./catalog.csv to s3://sagemaker-us-east-1-414754026690/safety/catalog/catalog.csv


## Verify Successful Copy

### List Contents of S3 Datalake

In [13]:
# list contents recursively
!aws s3 ls $path_s3_catalog/ --recursive --human-readable --summarize

2024-02-09 06:32:15  897.8 KiB safety/catalog/catalog.csv

Total Objects: 1
   Total Size: 897.8 KiB


### Review S3 Bucket

In [14]:
# display link to s3 bucket for review
display(
    HTML(
        '<b>Review <a target="blank" href="https://s3.console.aws.amazon.com/s3/buckets/sagemaker-{}-{}/safety/?region={}&tab=overview">S3 Bucket</a></b>'.format(
            region, account_id, region
        )
    )
)

# Create Athena Database

## Establish Parameters

In [15]:
# define database name
database_name = 'safetydb'

# set s3 temporary staging directory
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

# define connection parameters
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

# define and print database create statement
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print('Database CREATE statement:\n', statement)

Database CREATE statement:
 CREATE DATABASE IF NOT EXISTS safetydb


## Perform Database Creation

In [16]:
# execute connection parameters to create db
pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


## Verify Database Creation

In [17]:
# define statement to show existing databases
statement = "SHOW DATABASES"

# execute and display resulting dataframe
df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,default
1,dsoaws
2,musicdb
3,safetydb
4,sagemaker_featurestore


# Create Athena Table

In [18]:
# define table name
table_name_csv = 'catalog_csv'

# sql statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         sample_id string,
         img_filename string,
         label_filename string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv, path_s3_catalog
)

# print sql statement for review before executing
print('Database CREATE statement:\n', statement)

Database CREATE statement:
 CREATE EXTERNAL TABLE IF NOT EXISTS safetydb.catalog_csv(
         sample_id string,
         img_filename string,
         label_filename string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://sagemaker-us-east-1-414754026690/safety/catalog'
TBLPROPERTIES ('skip.header.line.count'='1')


## Perform Table Creation

In [19]:
# execute connection parameters to create table
pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


## Verify Table Creation

In [20]:
# define statement to show existing tables
statement = "SHOW TABLES in {}".format(database_name)

# execute and display resulting dataframe
df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,catalog_csv


In [21]:
# also programmatically verify and store variable
if table_name_csv in df_show.values:
    ingest_create_athena_table_csv_passed = True
    
%store ingest_create_athena_table_csv_passed

Stored 'ingest_create_athena_table_csv_passed' (bool)


# Run A Sample Query

In [22]:
# define sql query statement
statement = """SELECT * FROM {}.{}
    WHERE sample_id >= '000015'
    AND img_filename like '%.jpg'
    AND label_filename like '%.txt'
    LIMIT 100""".format(
    database_name, table_name_csv
)

# print sql statement for review before executing
print('SQL query SELECT statement:\n', statement)

SQL query SELECT statement:
 SELECT * FROM safetydb.catalog_csv
    WHERE sample_id >= '000015'
    AND img_filename like '%.jpg'
    AND label_filename like '%.txt'
    LIMIT 100


In [23]:
# execute sql query and display results
df = pd.read_sql(statement, conn)
df.head(5)

  df = pd.read_sql(statement, conn)


Unnamed: 0,sample_id,img_filename,label_filename
0,15,000015.jpg,000015.txt
1,16,000016.jpg,000016.txt
2,17,000017.jpg,000017.txt
3,18,000018.jpg,000018.txt
4,19,000019.jpg,000019.txt


# Review Athena Table in Glue Catalog

In [24]:
# display link to aws glue catalog for review
display(
    HTML(
        '<b>Review <a target="top" href="https://console.aws.amazon.com/glue/home?region={}#">AWS Glue Catalog</a></b>'.format(
            region
        )
    )
)

# List Stored Variables

In [25]:
# list stored variables for use in other notebooks
%store

Stored variables and their in-db values:
ingest_create_athena_table_csv_passed             -> True
path_raw_data                                     -> './data'
path_s3_catalog                                   -> 's3://sagemaker-us-east-1-414754026690/safety/cata
path_s3_data                                      -> 's3://sagemaker-us-east-1-414754026690/safety/data
