# Install Required Packages

In [2]:
!pip install pyathena
!pip3 install -U sagemaker

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Import Required Libraries

In [3]:
import boto3 # aws sdk for python
import sagemaker # machine learning platform
import numpy as np # array manipulation
import os # operating system interfaces
import pandas as pd # python data analysis
import re # regular expressions
from pyathena import connect # athena client
from sagemaker.pytorch.estimator import PyTorch # PyTorch estimator
from time import gmtime, strftime, sleep # time-related functions

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


# Perform Prerequisites

In [4]:
# establish sagemaker session, provide permissions
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# create a boto3 session for the sagemaker service
sm = boto3.Session().client(service_name='sagemaker', region_name=region)

# client to make featurestore record calls
featurestore_runtime = boto3.Session().client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

# create boto3 session to establish feature store session
boto_session = boto3.Session(region_name=region)

# create featurestore session
feature_store_session = sagemaker.Session(
    boto_session=boto_session,
    sagemaker_client=sm,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

# define prefixes for the safety data directory and featurestore
prefix_data = 'safety/data'
prefix_featurestore = 'safety-featurestore'

# print s3 locations
print('Data directory location:', f"s3://{bucket}/{prefix_data}")
print('FeatureStore directory location:', f"s3://{bucket}/{prefix_featurestore}\n")

# print current IAM role for notebook instance
role = sagemaker.get_execution_role()
print('Execution Role:', role)

Data directory location: s3://sagemaker-us-east-1-414754026690/safety/data
FeatureStore directory location: s3://sagemaker-us-east-1-414754026690/safety-featurestore

Execution Role: arn:aws:iam::414754026690:role/LabRole


# Data Preparation

## Query Catalog Data

In [5]:
# define database name
database_name = 'safetydb'

# define table name
table_name_csv = 'catalog_csv'

# set s3 temporary staging directory
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

# define connection parameters
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

# define sql query statement
statement = """SELECT * FROM {}.{}
    WHERE img_filename like '%.jpg'
    AND label_filename like '%.txt'
    LIMIT 25000""".format(
    database_name, table_name_csv
)

# print sql statement for review before executing
print('SQL query SELECT statement:\n', statement)

SQL query SELECT statement:
 SELECT * FROM safetydb.catalog_csv
    WHERE img_filename like '%.jpg'
    AND label_filename like '%.txt'
    LIMIT 25000


In [6]:
# execute sql query and display results
df_catalog_query = pd.read_sql(statement, conn)
df_catalog_query.head(10)

  df_catalog_query = pd.read_sql(statement, conn)


Unnamed: 0,sample_id,img_filename,label_filename
0,1,000001.jpg,000001.txt
1,2,000002.jpg,000002.txt
2,3,000003.jpg,000003.txt
3,4,000004.jpg,000004.txt
4,5,000005.jpg,000005.txt
5,6,000006.jpg,000006.txt
6,7,000007.jpg,000007.txt
7,8,000008.jpg,000008.txt
8,9,000009.jpg,000009.txt
9,10,000010.jpg,000010.txt


## Combine with FeatureGroup Data

### Get Batch Records from FeatureGroups

In [7]:
# define featuregroup name patterns
image_feature_group_name_pattern = 'image-feature-group-'
label_feature_group_name_pattern = 'label-feature-group-'

# obtain latest version of featuregroup names, if multiples exist
all_image_feature_groups = sm.list_feature_groups(NameContains=image_feature_group_name_pattern, SortBy='CreationTime', SortOrder='Descending')
all_label_feature_groups = sm.list_feature_groups(NameContains=label_feature_group_name_pattern, SortBy='CreationTime', SortOrder='Descending')
image_feature_group_name = all_image_feature_groups['FeatureGroupSummaries'][0]['FeatureGroupName']
label_feature_group_name = all_label_feature_groups['FeatureGroupSummaries'][0]['FeatureGroupName']

# print featuregroup names
print('Image FeatureGroup Name:', image_feature_group_name)
print('Label FeatureGroup Name:', label_feature_group_name)

Image FeatureGroup Name: image-feature-group-03-04-13-19
Label FeatureGroup Name: label-feature-group-03-04-13-19


In [8]:
# specify record identifiers (i.e., sample ids) from previous catalog query
record_identifiers_value = df_catalog_query['sample_id'].values.astype(str).tolist()

# query image featuregroup by using record_id as primary key
df_image_feature_group = featurestore_runtime.batch_get_record(
    Identifiers=[
        {
            'FeatureGroupName': image_feature_group_name,
            'RecordIdentifiersValueAsString':record_identifiers_value,
        }
    ]
)

# query label featuregroup by using record_id as primary key
df_label_feature_group = featurestore_runtime.batch_get_record(
    Identifiers=[
        {
            'FeatureGroupName': label_feature_group_name,
            'RecordIdentifiersValueAsString': record_identifiers_value,
        }
    ]
)

### Display Image FeatureGroup Records

In [9]:
# create list of image records and feature names, exclude eventtime
image_records = [sample['Record'] for sample in df_image_feature_group['Records']]
image_feature_names = [feature['FeatureName'] for feature in image_records[0] if feature['FeatureName'] != 'EventTime']

# create list of image data
image_data = [image_feature_names]

# iterate through each record in image featuregroup
for record in image_records:
    
    # iterate through each feature in individual record
    image_data.append([feature['ValueAsString'] for feature in record if feature['FeatureName'] != 'EventTime'])
                       
# create images dataframe
df_images = pd.DataFrame(image_data[1:], columns=image_data[0])
                       
# display dataframe head
df_images.head()

Unnamed: 0,sample_id,img_format,img_mode,img_height,img_width
0,12,JPEG,RGB,640,640
1,6,JPEG,RGB,640,640
2,16,JPEG,RGB,640,640
3,11,JPEG,RGB,640,640
4,2,JPEG,RGB,640,640


### Display Label FeatureGroup Records

In [10]:
# create list of label records and feature names, exclude eventtime
label_records = [sample['Record'] for sample in df_label_feature_group['Records']]
label_feature_names = [feature['FeatureName'] for feature in label_records[0] if feature['FeatureName'] != 'EventTime']

# create list of label data
label_data = [label_feature_names]

# iterate through each record in label featuregroup
for record in label_records:
    
    # iterate through each feature in individual record
    label_data.append([feature['ValueAsString'] for feature in record if feature['FeatureName'] != 'EventTime'])
                       
# create labels dataframe
df_labels = pd.DataFrame(label_data[1:], columns=label_data[0])
                       
# display dataframe head
df_labels.head()

Unnamed: 0,sample_id,count_helmet,count_vest,count_head
0,12,4,0,0
1,6,0,0,5
2,16,1,0,0
3,11,13,0,0
4,2,2,1,0


### Join Records

In [11]:
# merge catalog query and images dataframes, then the resulting with labels dataframe
df_combined = pd.merge(df_catalog_query, df_images, on='sample_id')
df_combined = pd.merge(df_combined, df_labels, on='sample_id')

# display resulting datafrmae
df_combined.head()

Unnamed: 0,sample_id,img_filename,label_filename,img_format,img_mode,img_height,img_width,count_helmet,count_vest,count_head
0,1,000001.jpg,000001.txt,JPEG,RGB,640,640,2,0,0
1,2,000002.jpg,000002.txt,JPEG,RGB,640,640,2,1,0
2,3,000003.jpg,000003.txt,JPEG,RGB,640,640,4,0,0
3,4,000004.jpg,000004.txt,JPEG,RGB,640,640,0,0,5
4,5,000005.jpg,000005.txt,JPEG,RGB,640,640,2,0,0


## Split Data

In [13]:
# use a copy of dataframe, can manipulate if desired for experimentation
data = df_combined.copy()

# data split in four sets - training, validation, test, and batch inference
rand_split = np.random.rand(len(data))
train_list = rand_split < 0.4
val_list = (rand_split >= 0.4) & (rand_split < 0.5)
test_list = (rand_split >= 0.5) & (rand_split < 0.6)
batch_list = rand_split >= 0.6 # "production" data

# print data splits
print('Data Splits:')
print('------------')
print(f"Train :   {sum(train_list)} samples")
print(f"Val   :   {sum(val_list)} samples")
print(f"Test  :   {sum(test_list)} samples")
print(f"Batch :   {sum(batch_list)} samples")

Data Splits:
------------
Train :   11 samples
Val   :   3 samples
Test  :   1 samples
Batch :   5 samples


## Create Split Datasets in S3

In [14]:
# define and print source s3 locations
s3_images_source = f"s3://{bucket}/{prefix_data}/images/"
s3_labels_source = f"s3://{bucket}/{prefix_data}/labels/"
print('Images source directory location:', s3_images_source)
print('Labels source directory location:', s3_labels_source, '\n')

# define and print destination s3 location for data splits
s3_split_dest = f"s3://{bucket}/{prefix_data}/split/"
print('Split destination directory location:', s3_split_dest)

Images source directory location: s3://sagemaker-us-east-1-414754026690/safety/data/images/
Labels source directory location: s3://sagemaker-us-east-1-414754026690/safety/data/labels/ 

Split destination directory location: s3://sagemaker-us-east-1-414754026690/safety/data/split/


In [15]:
# define function to copy files for respective data splits to corresponding s3 destinations
# provide 'split_name' as either 'train', 'val', 'test', or 'batch'
# provide 'split_list' as either 'train_list', 'val_list', 'test_list', or 'batch_list'
def split_dataset(split_name, split_list):

    # iterate through each sample in split
    for index, sample in data[split_list].iterrows():

        # source/destination variables for individual sample
        cp_image_source = f"{s3_images_source}{sample['img_filename']}"
        cp_image_dest = f"{s3_split_dest}{split_name}/images/"
        cp_label_source = f"{s3_labels_source}{sample['label_filename']}"
        cp_label_dest = f"{s3_split_dest}{split_name}/labels/"

        # copy from source to destination
        !aws s3 cp $cp_image_source $cp_image_dest
        !aws s3 cp $cp_label_source $cp_label_dest

In [16]:
# perform data copies
print('Beginning TRAIN data split copies.')
split_dataset(split_name='train', split_list=train_list)
print('Completed TRAIN data split copies.\n')

print('Beginning VAL data split copies.')
split_dataset(split_name='val', split_list=val_list)
print('Completed VAL data split copies.\n')

print('Beginning TEST data split copies.')
split_dataset(split_name='test', split_list=test_list)
print('Completed TEST data split copies.\n')

print('Beginning BATCH data split copies.')
split_dataset(split_name='batch', split_list=batch_list)
print('Completed BATCH data split copies.')

Beginning TRAIN data split copies.
copy: s3://sagemaker-us-east-1-414754026690/safety/data/images/000003.jpg to s3://sagemaker-us-east-1-414754026690/safety/data/split/train/images/000003.jpg
copy: s3://sagemaker-us-east-1-414754026690/safety/data/labels/000003.txt to s3://sagemaker-us-east-1-414754026690/safety/data/split/train/labels/000003.txt
copy: s3://sagemaker-us-east-1-414754026690/safety/data/images/000005.jpg to s3://sagemaker-us-east-1-414754026690/safety/data/split/train/images/000005.jpg
copy: s3://sagemaker-us-east-1-414754026690/safety/data/labels/000005.txt to s3://sagemaker-us-east-1-414754026690/safety/data/split/train/labels/000005.txt
copy: s3://sagemaker-us-east-1-414754026690/safety/data/images/000006.jpg to s3://sagemaker-us-east-1-414754026690/safety/data/split/train/images/000006.jpg
copy: s3://sagemaker-us-east-1-414754026690/safety/data/labels/000006.txt to s3://sagemaker-us-east-1-414754026690/safety/data/split/train/labels/000006.txt
copy: s3://sagemaker-us

# Training Job and Model Creation

## Create and Run Training Job

In [17]:
%%time

# define job name and output location
job_name = 'yolov8-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
output_location = "s3://{}/{}/output/{}".format(bucket, prefix_data, job_name)

# build a PyTorch estimator
pytorch_estimator = PyTorch(
    role=role,
    entry_point='train.py', # custom training script, locate in code directory
    framework_version='2.0.1', # training - CPU - Python 3.10
    py_version='py310',
    source_dir='./code',
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=output_location,
    sagemaker_session=sess,
    hyperparameters = {'data': 'data.yaml', # yaml config file for custom dataset
                       'epochs': 5, # number of training epochs
                       'yolo_model': 'yolov8n.pt', # pretrained base model
                       'saved_model_name': 'benchmark_model.pt' # name for model export
                      }
)

# s3 location where training data is saved
inputs = s3_split_dest[:-1]

# begin training job
pytorch_estimator.fit(inputs=inputs, job_name=job_name, logs='All')

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: yolov8-2024-02-05-02-49-54


2024-02-05 02:49:54 Starting - Starting the training job...
2024-02-05 02:50:09 Starting - Preparing the instances for training......
2024-02-05 02:51:17 Downloading - Downloading input data...
2024-02-05 02:51:51 Downloading - Downloading the training image......
2024-02-05 02:52:31 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-02-05 02:52:37,505 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-02-05 02:52:37,506 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-02-05 02:52:37,507 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-02-05 02:52:37,518 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-02-05 02:52:37,520 sagem

## Batch Transform

In [None]:
# create a transform job...