# Install Required Packages

In [None]:
!pip install pyathena
!pip3 install -U sagemaker

Collecting pyathena
  Using cached pyathena-3.2.0-py3-none-any.whl.metadata (85 kB)
Collecting urllib3<2.1,>=1.25.4 (from botocore>=1.29.4->pyathena)
  Using cached urllib3-2.0.7-py3-none-any.whl.metadata (6.6 kB)
Using cached pyathena-3.2.0-py3-none-any.whl (79 kB)
Using cached urllib3-2.0.7-py3-none-any.whl (124 kB)
Installing collected packages: urllib3, pyathena
  Attempting uninstall: urllib3
    Found existing installation: urllib3 2.1.0
    Uninstalling urllib3-2.1.0:
      Successfully uninstalled urllib3-2.1.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
distributed 2022.7.0 requires tornado<6.2,>=6.0.3, but you have tornado 6.4 which is incompatible.
sagemaker 2.199.0 requires urllib3<1.27, but you have urllib3 2.0.7 which is incompatible.[0m[31m
[0mSuccessfully installed pyathena-3.2.0 urllib3-2.0.7
[0m
[1m[[0m[34;49mnotice[0m[1;39;4

# Import Required Libraries

In [None]:
import boto3 # aws sdk for python
#import cv2 # computer vision
import sagemaker # machine learning platform
#import matplotlib.pyplot as plt # plotting functions
import numpy as np # array manipulation
import os # operating system interfaces
import pandas as pd # python data analysis
#import random # pseudo-random numbers
import re # regular expressions
#import seaborn as sns # statistical graphics
#import time # time access and conversions
#from PIL import Image # python imaging library module
from pyathena import connect # athena client
#from sagemaker import get_execution_role # get execution role for notebook
#from sagemaker.feature_store.feature_group import FeatureGroup # featuregroup definition
#from sagemaker.session import Session # class to initialize sagemaker session
from time import gmtime, strftime, sleep # time-related functions

# Perform Prerequisites

In [None]:
# establish sagemaker session, resolve default bucket, execution role and region
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# single region-pinned boto3 session shared by every client created below
boto_session = boto3.Session(region_name=region)

# sagemaker service client (control plane: featuregroup listing, etc.)
sm = boto_session.client(service_name='sagemaker')

# client to make featurestore record calls
featurestore_runtime = boto_session.client(service_name="sagemaker-featurestore-runtime")

# create featurestore session on top of the shared boto3 session and clients
feature_store_session = sagemaker.Session(
    boto_session=boto_session,
    sagemaker_client=sm,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

# define prefixes for the safety data directory and featurestore
prefix_data = 'safety/data'
prefix_featurestore = 'safety-featurestore'

# print s3 locations
print('Data directory location:', f"s3://{bucket}/{prefix_data}")
print('FeatureStore directory location:', f"s3://{bucket}/{prefix_featurestore}\n")

# print current IAM role for notebook instance (resolved once, above)
print('Execution Role:', role)

Data directory location: s3://sagemaker-us-east-1-414754026690/safety/data
FeatureStore directory location: s3://sagemaker-us-east-1-414754026690/safety-featurestore

Execution Role: arn:aws:iam::414754026690:role/LabRole


# Data Preparation

## Query Catalog Data

In [None]:
# athena database and table that hold the catalog
database_name = 'safetydb'
table_name_csv = 'catalog_csv'

# s3 location athena uses as its temporary staging directory
s3_staging_dir = f"s3://{bucket}/athena/staging"

# open the athena connection
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

# build the select statement: rows with a .jpg image and a .txt label, capped at 25000
statement = (
    f"SELECT * FROM {database_name}.{table_name_csv}\n"
    "    WHERE img_filename like '%.jpg'\n"
    "    AND label_filename like '%.txt'\n"
    "    LIMIT 25000"
)

# show the statement for review before it is executed
print('SQL query SELECT statement:\n', statement)

SQL query SELECT statement:
 SELECT * FROM safetydb.catalog_csv
    WHERE img_filename like '%.jpg'
    AND label_filename like '%.txt'
    LIMIT 25000


In [None]:
# run the catalog statement over the athena connection and load the result into a dataframe
df_catalog_query = pd.read_sql(sql=statement, con=conn)

# preview the first ten rows
df_catalog_query.head(n=10)

  df_catalog_query = pd.read_sql(statement, conn)


Unnamed: 0,sample_id,img_filename,label_filename
0,1,000001.jpg,000001.txt
1,2,000002.jpg,000002.txt
2,3,000003.jpg,000003.txt
3,4,000004.jpg,000004.txt
4,5,000005.jpg,000005.txt
5,6,000006.jpg,000006.txt
6,7,000007.jpg,000007.txt
7,8,000008.jpg,000008.txt
8,9,000009.jpg,000009.txt
9,10,000010.jpg,000010.txt


## Combine with FeatureGroup Data

### Get Batch Records from FeatureGroups

In [None]:
# define featuregroup name patterns
image_feature_group_name_pattern = 'image-feature-group-'
label_feature_group_name_pattern = 'label-feature-group-'

# list matching featuregroups, newest first
# (the search must use the *_pattern variables: the resolved names are only assigned further down,
# so referencing them here raises NameError on a fresh kernel)
all_image_feature_groups = sm.list_feature_groups(NameContains=image_feature_group_name_pattern, SortBy='CreationTime', SortOrder='Descending')
all_label_feature_groups = sm.list_feature_groups(NameContains=label_feature_group_name_pattern, SortBy='CreationTime', SortOrder='Descending')

# fail with a clear message instead of an IndexError when nothing matches
if not all_image_feature_groups['FeatureGroupSummaries']:
    raise ValueError(f"No FeatureGroup found with a name containing '{image_feature_group_name_pattern}'")
if not all_label_feature_groups['FeatureGroupSummaries']:
    raise ValueError(f"No FeatureGroup found with a name containing '{label_feature_group_name_pattern}'")

# obtain latest version of featuregroup names, if multiples exist
image_feature_group_name = all_image_feature_groups['FeatureGroupSummaries'][0]['FeatureGroupName']
label_feature_group_name = all_label_feature_groups['FeatureGroupSummaries'][0]['FeatureGroupName']

# print featuregroup names
print('Image FeatureGroup Name:', image_feature_group_name)
print('Label FeatureGroup Name:', label_feature_group_name)

Image FeatureGroup Name: image-feature-group-03-04-13-19
Label FeatureGroup Name: label-feature-group-03-04-13-19


In [None]:
# specify record identifiers (i.e., sample ids) from previous catalog query
record_identifiers_value = df_catalog_query['sample_id'].values.astype(str).tolist()

# BatchGetRecord caps how many record identifiers one request may carry
# (100 per featuregroup according to the API reference -- NOTE(review): confirm the current limit),
# while the catalog query can return up to 25000 rows, so identifiers are sent in chunks
BATCH_GET_RECORD_CHUNK_SIZE = 100


def batch_get_all_records(feature_group_name, record_identifiers, chunk_size=BATCH_GET_RECORD_CHUNK_SIZE):
    """Fetch records for all identifiers from one featuregroup, chunk by chunk.

    Parameters:
        feature_group_name: name of the featuregroup to read from.
        record_identifiers: list of record identifier values as strings.
        chunk_size: maximum number of identifiers sent per batch_get_record call.

    Returns:
        dict with the keys 'Records', 'Errors' and 'UnprocessedIdentifiers', each holding
        the concatenation of the corresponding lists from every chunk response.
    """
    combined = {'Records': [], 'Errors': [], 'UnprocessedIdentifiers': []}
    for start in range(0, len(record_identifiers), chunk_size):
        response = featurestore_runtime.batch_get_record(
            Identifiers=[
                {
                    'FeatureGroupName': feature_group_name,
                    'RecordIdentifiersValueAsString': record_identifiers[start:start + chunk_size],
                }
            ]
        )
        for key in combined:
            combined[key].extend(response.get(key, []))
    return combined


# query image featuregroup by using record_id as primary key
df_image_feature_group = batch_get_all_records(image_feature_group_name, record_identifiers_value)

# query label featuregroup by using record_id as primary key
df_label_feature_group = batch_get_all_records(label_feature_group_name, record_identifiers_value)

### Display Image FeatureGroup Records

In [None]:
# pull the raw feature lists out of the image featuregroup response
image_records = [entry['Record'] for entry in df_image_feature_group['Records']]

# column names come from the first record, with eventtime left out
image_feature_names = [
    field['FeatureName'] for field in image_records[0] if field['FeatureName'] != 'EventTime'
]

# one row of string values per record, again without eventtime
image_rows = [
    [field['ValueAsString'] for field in fields if field['FeatureName'] != 'EventTime']
    for fields in image_records
]

# header row followed by the data rows
image_data = [image_feature_names, *image_rows]

# create images dataframe
df_images = pd.DataFrame(image_rows, columns=image_feature_names)

# display dataframe head
df_images.head()

Unnamed: 0,sample_id,img_format,img_mode,img_height,img_width
0,12,JPEG,RGB,640,640
1,6,JPEG,RGB,640,640
2,16,JPEG,RGB,640,640
3,2,JPEG,RGB,640,640
4,11,JPEG,RGB,640,640


### Display Label FeatureGroup Records

In [None]:
# pull the raw feature lists out of the label featuregroup response
label_records = [entry['Record'] for entry in df_label_feature_group['Records']]

# column names come from the first record, with eventtime left out
label_feature_names = [
    field['FeatureName'] for field in label_records[0] if field['FeatureName'] != 'EventTime'
]

# one row of string values per record, again without eventtime
label_rows = [
    [field['ValueAsString'] for field in fields if field['FeatureName'] != 'EventTime']
    for fields in label_records
]

# header row followed by the data rows
label_data = [label_feature_names, *label_rows]

# create labels dataframe
df_labels = pd.DataFrame(label_rows, columns=label_feature_names)

# display dataframe head
df_labels.head()

Unnamed: 0,sample_id,count_helmet,count_vest,count_head
0,12,4,0,0
1,6,0,0,5
2,16,1,0,0
3,11,13,0,0
4,2,2,1,0


### Join Records

In [None]:
# join catalog rows with image features, then with label features, keyed on sample_id
df_combined = (
    df_catalog_query
    .merge(df_images, on='sample_id')
    .merge(df_labels, on='sample_id')
)

# display resulting dataframe
df_combined.head()

Unnamed: 0,sample_id,img_filename,label_filename,img_format,img_mode,img_height,img_width,count_helmet,count_vest,count_head
0,1,000001.jpg,000001.txt,JPEG,RGB,640,640,2,0,0
1,2,000002.jpg,000002.txt,JPEG,RGB,640,640,2,1,0
2,3,000003.jpg,000003.txt,JPEG,RGB,640,640,4,0,0
3,4,000004.jpg,000004.txt,JPEG,RGB,640,640,0,0,5
4,5,000005.jpg,000005.txt,JPEG,RGB,640,640,2,0,0


## Split Data

In [None]:
# use a copy of dataframe, can manipulate if desired for experimentation
data = df_combined.copy()

# fixed seed so a re-run assigns every sample to the same split; without it each run reshuffles
# the samples, and because earlier s3 copies are not removed the same sample could end up in
# several split folders
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# data split in four sets - training (40%), validation (10%), test (10%), and batch inference (40%)
rand_split = split_rng.random(len(data))
train_list = rand_split < 0.4
val_list = (rand_split >= 0.4) & (rand_split < 0.5)
test_list = (rand_split >= 0.5) & (rand_split < 0.6)
batch_list = rand_split >= 0.6 # "production" data

# print data splits (boolean masks, so the sum is the number of selected samples)
print('Data Splits:')
print('------------')
print(f"Train :   {int(train_list.sum())} samples")
print(f"Val   :   {int(val_list.sum())} samples")
print(f"Test  :   {int(test_list.sum())} samples")
print(f"Batch :   {int(batch_list.sum())} samples")

Data Splits:
------------
Train :   6 samples
Val   :   1 samples
Test  :   2 samples
Batch :   11 samples


## Create Split Datasets in S3

In [None]:
# common s3 root of the safety data directory
s3_data_root = f"s3://{bucket}/{prefix_data}"

# source locations of the original images and labels
s3_images_source = f"{s3_data_root}/images/"
s3_labels_source = f"{s3_data_root}/labels/"
print('Images source directory location:', s3_images_source)
print('Labels source directory location:', s3_labels_source, '\n')

# destination locations that receive the per-split copies
s3_images_dest = f"{s3_data_root}/split/images/"
s3_labels_dest = f"{s3_data_root}/split/labels/"
print('Images split destination directory location:', s3_images_dest)
print('Labels split destination directory location:', s3_labels_dest)

Images source directory location: s3://sagemaker-us-east-1-414754026690/safety/data/images/
Labels source directory location: s3://sagemaker-us-east-1-414754026690/safety/data/labels/ 

Images split destination directory location: s3://sagemaker-us-east-1-414754026690/safety/data/split/images/
Labels split destination directory location: s3://sagemaker-us-east-1-414754026690/safety/data/split/labels/


In [None]:
# define function to copy files for respective data splits to corresponding s3 destinations
def split_dataset(split_name, split_list):
    """Copy the image and label file of every sample in one split to the split's s3 folders.

    Reads the notebook-level `data` dataframe and the `s3_images_source`, `s3_labels_source`,
    `s3_images_dest` and `s3_labels_dest` locations.

    Parameters:
        split_name: destination sub-folder; either 'train', 'val', 'test', or 'batch'.
        split_list: boolean mask over `data` selecting the samples of the split; either
            train_list, val_list, test_list, or batch_list.

    Raises:
        RuntimeError: after all copies were attempted, if at least one `aws s3 cp` failed.
    """
    # local import keeps the cell self-contained
    import subprocess

    failures = []

    def _s3_copy(source, dest):
        # arguments are passed as a list (no shell), so file names containing spaces or
        # shell metacharacters are copied literally instead of being re-parsed by a shell
        result = subprocess.run(['aws', 's3', 'cp', source, dest], capture_output=True, text=True)
        if result.stdout:
            print(result.stdout, end='')
        if result.returncode != 0:
            failures.append(f"{source} -> {dest}: {result.stderr.strip()}")

    # iterate through each sample in split
    for _, sample in data[split_list].iterrows():

        # copy image and label from source to the split destination
        _s3_copy(f"{s3_images_source}{sample['img_filename']}", f"{s3_images_dest}{split_name}/")
        _s3_copy(f"{s3_labels_source}{sample['label_filename']}", f"{s3_labels_dest}{split_name}/")

    # report failed copies instead of leaving an incomplete split unnoticed
    if failures:
        raise RuntimeError(
            f"{len(failures)} copy operation(s) failed for split '{split_name}'; first failure: {failures[0]}"
        )

In [None]:
# copy every split in turn (train, val, test, batch), announcing start and completion of each
split_plan = [
    ('train', train_list),
    ('val', val_list),
    ('test', test_list),
    ('batch', batch_list),
]

for current_split, current_mask in split_plan:
    banner = current_split.upper()
    print(f"Beginning {banner} data split copies.")
    split_dataset(split_name=current_split, split_list=current_mask)
    print(f"Completed {banner} data split copies.")

copy: s3://sagemaker-us-east-1-414754026690/safety/data/images/000002.jpg to s3://sagemaker-us-east-1-414754026690/safety/data/split/images/test/000002.jpg
copy: s3://sagemaker-us-east-1-414754026690/safety/data/labels/000002.txt to s3://sagemaker-us-east-1-414754026690/safety/data/split/labels/test/000002.txt


# Training Job and Model Creation

# TESTING BELOW...

In [None]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.1.9-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m507.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting opencv-python>=4.6.0 (from ultralytics)
  Using cached opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting torch>=1.8.0 (from ultralytics)
  Downloading torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchvision>=0.9.0 (from ultralytics)
  Downloading torchvision-0.17.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting py-cpuinfo (from ultralytics)
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Collecting thop>=0.1.1 (from ultralytics)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Collecting typing-extensions>=4.8.0 (from torch>=1.8.0->ultralytics)
  Using cached typing_extensions-4.9.0-py3-none-any.whl.metadata (3.0 kB)
Collecti

In [None]:
!pip install opencv-python-headless

Collecting opencv-python-headless
  Using cached opencv_python_headless-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Using cached opencv_python_headless-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.6 MB)
Installing collected packages: opencv-python-headless
Successfully installed opencv-python-headless-4.9.0.80
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Create Estimator

In [None]:
from sagemaker.pytorch import PyTorch
import cv2
from ultralytics import YOLO

# configure the pytorch training job
# - sagemaker_session must be a sagemaker.Session (`sess`); passing the boto3 client `sm` fails with
#   "AttributeError: 'SageMaker' object has no attribute 'local_mode'"
# - role is the execution role resolved in the prerequisites cell
pytorch_estimator = PyTorch(
    entry_point='train.py',
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    framework_version='1.8.0',
    py_version='py3',
    hyperparameters={'epochs': 2, 'batch-size': 2, 'learning-rate': 0.1},
    sagemaker_session=sess,
)

# start training with the split images prefix as input
# NOTE(review): both channels point at the same prefix, which contains the train/val/test/batch
# sub-folders -- confirm whether train.py expects per-split prefixes instead
pytorch_estimator.fit({'train': s3_images_dest,
                       'test': s3_images_dest})

AttributeError: 'SageMaker' object has no attribute 'local_mode'