# Import statements

In [1]:
# Notebook Instance Imports
import os
import sagemaker
from sagemaker.tensorflow import TensorFlow
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
from sagemaker.debugger import ProfilerConfig, FrameworkProfile
import time
import io
import json
import pandas as pd

# Default profiler configuration using FrameworkProfile's built-in settings.
# NOTE: `profiler_config` is reassigned in a later cell (with an explicit start
# time and duration) before the estimator is created, so that later value is the
# one the training job actually receives.
profiler_config = ProfilerConfig(framework_profile_params=FrameworkProfile())

# Build Docker Image

In [2]:
# Show the kernel's current working directory; the next cell changes into a
# folder relative to it.
!pwd

/home/ec2-user/SageMaker/SSRC_New_Model_Development/_achive/sagemaker_staging


In [3]:
# Move into the folder used as the Docker build context by the cells below.
# The path is relative, so this only works when the kernel's working directory
# is the parent of docker_test_folder (e.g. on a fresh kernel); running the
# cell a second time from inside the folder will not find it.
%cd docker_test_folder

/home/ec2-user/SageMaker/SSRC_New_Model_Development/_achive/sagemaker_staging/docker_test_folder


In [4]:
# Log Docker in to the ECR registry (account 763104351884, us-east-1) that hosts
# the base image named in the Dockerfile's FROM line, so the build below can pull it.
! aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [5]:
# Build the image from the Dockerfile in the current directory as a local check.
# NOTE: the publish cell below rebuilds the image under a different name
# (pc-tf-custom-container-test); the tag created here is not the one pushed to ECR.
! docker build -t tf-custom-container-test .

Sending build context to Docker daemon  259.6kB
Step 1/5 : FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.4.1-gpu-py37-cu110-ubuntu18.04
2.4.1-gpu-py37-cu110-ubuntu18.04: Pulling from tensorflow-training

[1B57c49d0f: Pulling fs layer 
[1B40447d26: Pulling fs layer 
[1B2f862619: Pulling fs layer 
[1B278deddf: Pulling fs layer 
[1B80049843: Pulling fs layer 
[1B556b2329: Pulling fs layer 
[1B1db7094c: Pulling fs layer 
[1B12740c87: Pulling fs layer 
[1Bc97a046a: Pulling fs layer 
[1B7978f146: Pulling fs layer 
[1Bf6d5a580: Pulling fs layer 
[1B91f7c7a3: Pulling fs layer 
[1B23967117: Pulling fs layer 
[1Bc868b245: Pulling fs layer 
[1Bb0566974: Pulling fs layer 
[1Bb55c8fe8: Pulling fs layer 
[1B51f37e08: Pulling fs layer 
[1Bc14aac6f: Pulling fs layer 
[1B8cfd9dd8: Pulling fs layer 
[1Ba1440a31: Pulling fs layer 
[1B43b56dd8: Pulling fs layer 
[1B109d2aaa: Pulling fs layer 
[1B7bbe34cb: Pulling fs layer 
[1B334e9365: Pulling fs layer 
[

# Publish Docker Container to ECR

In [6]:
%%sh

# Build the training image and publish it to this account's ECR registry.
# Assumes Docker is already logged in to the registry that hosts the base image
# (done in the earlier login cell).

# Stop at the first failing command so a failed build is never tagged or pushed.
set -e

# Specify an algorithm name
algorithm_name=pc-tf-custom-container-test

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-east-1 if none defined).
# `aws configure get` exits non-zero when the value is unset, hence the `|| true`.
region=$(aws configure get region || true)
region=${region:-us-east-1}

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
if ! aws ecr describe-repositories --region "${region}" --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --region "${region}" --repository-name "${algorithm_name}" > /dev/null
fi

# Log Docker in to this account's registry. `aws ecr get-login` was removed in
# AWS CLI v2; piping `get-login-password` into `docker login` works with both
# CLI versions and keeps the password out of the process arguments.
aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "${registry}"

# Build the docker image locally with the image name and then push it to ECR
# with the full name.
docker build -t "${algorithm_name}" .
docker tag "${algorithm_name}" "${fullname}"

docker push "${fullname}"

Login Succeeded
Sending build context to Docker daemon  259.6kB
Step 1/5 : FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.4.1-gpu-py37-cu110-ubuntu18.04
 ---> 8467bc1c5070
Step 2/5 : ENV PATH="/opt/ml/code:${PATH}"
 ---> Using cache
 ---> 424e611851bd
Step 3/5 : RUN pip3 install rasterio wandb earthpy tensorflow-addons onnx onnx2keras
 ---> Using cache
 ---> 166af6696485
Step 4/5 : COPY train_no_s3-Copy1.py /opt/ml/code/train.py
 ---> Using cache
 ---> f00830f3e7bf
Step 5/5 : ENV SAGEMAKER_PROGRAM train.py
 ---> Using cache
 ---> bb53fed99fe1
Successfully built bb53fed99fe1
Successfully tagged pc-tf-custom-container-test:latest
The push refers to repository [963659202518.dkr.ecr.us-east-1.amazonaws.com/pc-tf-custom-container-test]
d3fc32499159: Preparing
739585fc888f: Preparing
f14967dc4bde: Preparing
7220541afc5b: Preparing
164f5f71f1c9: Preparing
6fba17d615b8: Preparing
0c571e9ec35a: Preparing
832fc832c708: Preparing
a3906ac790ce: Preparing
d291b5a945df: Prep

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



In [7]:
import boto3

# Build the URI of the image pushed by the publish cell: the repository lives in
# the caller's own account, under the same repository name and `latest` tag.
account_id = boto3.client('sts').get_caller_identity().get('Account')
ecr_repository = 'pc-tf-custom-container-test'
tag = ':latest'

# `region_name` is None when no default region is configured, which would yield
# a URI containing the text "None". Fall back to us-east-1, the same default the
# publish script uses, so both cells always point at the same registry.
region = boto3.session.Session().region_name or 'us-east-1'

# AWS China regions use a different domain suffix.
uri_suffix = 'amazonaws.com'
if region in ['cn-north-1', 'cn-northwest-1']:
    uri_suffix = 'amazonaws.com.cn'

image_uri = '{}.dkr.ecr.{}.{}/{}'.format(account_id, region, uri_suffix, ecr_repository + tag)

image_uri
# This should return something like
# 111122223333.dkr.ecr.us-east-2.amazonaws.com/sagemaker-byoc-test:latest



'963659202518.dkr.ecr.us-east-1.amazonaws.com/pc-tf-custom-container-test:latest'

# Train Model on SageMaker

### Set hyperparameters

In [8]:
# S3 directory containing the training data
training_files = "s3://canopy-production-ml/chips/model2_s2cloudless/training_v2/null/"

inputs = training_files

# Never commit an API key to the notebook: read the Weights & Biases key from the
# WANDB_API_KEY environment variable instead (set it before starting the kernel,
# or with `%env WANDB_API_KEY=...` in a cell that is not saved with the notebook).
wandb_key = os.environ.get("WANDB_API_KEY", "")
if not wandb_key:
    print("WARNING: WANDB_API_KEY is not set; 'wandb_key' will be passed to the training job empty.")

hyperparameters = {
    "wandb_key": wandb_key, # Weights & Biases API key, taken from the environment
    "epochs": 100, # number of epochs
    "s3_chkpt_dir": "ckpt", # S3 directory where the checkpoints will be stored
    "batch_size": 32, # batch size
    "training_file": "label_files/DRC_labels_SAB_train_v1.csv", # CSV file that labels the training data
    "validation_file": "label_files/DRC_labels_SAB_val_v1.csv", # CSV file that labels the validation data
    'bands': "1 2 3 4 5 6 7 8 9 10 11 12 13", # raster bands used for the training, make sure to separate with spaces
    'augment': False, # whether or not to use data augmentation
    'numclasses': 2, # number of classes in the final model
    'callback': 'clr', # how to modify the learning rate; see the training.py file for more details
    'clr_initial':.00001, # initial learning rate to use for the circular learning rate algorithm
    'clr_max':.001, # max learning rate to use for the circular learning rate algorithm
    'clr_step':8, # how much to modify the learning rate each epoch in the circular learning rate algorithm
    'lr_reduce_min': .00001, # minimum learning rate
    'early_stop':False, # whether or not to use early stopping
    'job_name': 'SAB_newweights5_NBR_13bands_824', # name of the training job, this is arbitrary
    'model': 'resnet_sentinel' # which type of model to use, see training.py for more details
}
# additional hyperparameters can be found in the training.py file

print(inputs)
# Print a masked copy so the key never ends up in the saved cell output.
print({**hyperparameters, "wandb_key": "<redacted>" if wandb_key else ""})

s3://canopy-production-ml/chips/model2_s2cloudless/training_v2/null/
{'wandb_key': '<redacted>', 'epochs': 100, 's3_chkpt_dir': 'ckpt', 'batch_size': 32, 'training_file': 'label_files/DRC_labels_SAB_train_v1.csv', 'validation_file': 'label_files/DRC_labels_SAB_val_v1.csv', 'bands': '1 2 3 4 5 6 7 8 9 10 11 12 13', 'augment': False, 'numclasses': 2, 'callback': 'clr', 'clr_initial': 1e-05, 'clr_max': 0.001, 'clr_step': 8, 'lr_reduce_min': 1e-05, 'early_stop': False, 'job_name': 'SAB_newweights5_NBR_13bands_824', 'model': 'resnet_sentinel'}


In [9]:
# Define the profiler configuration handed to the estimator below: a
# time-based framework profile with duration=600, starting at the Unix time at
# which this cell is executed.
# NOTE(review): the start time is taken when this cell runs, not when training
# begins; the log under the fit() cell shows training starting several minutes
# after the job was submitted. Confirm the profiling window still overlaps the
# part of training you want to capture.
profiler_config = ProfilerConfig(
    framework_profile_params=FrameworkProfile(
        start_unix_time=int(time.time()),
        duration=600,
    ),
)

In [10]:
job_name = 'pc-tf-drc-sab-newweights5-nbr-13bands' # arbitrary

# Create the estimator for the custom image published to ECR above.
estimator = Estimator(
    # What to run and with which permissions
    image_uri=image_uri,
    role=get_execution_role(), # Passes to the container the AWS role that you are using on this notebook
    # NOTE(review): py_version is not among the generic Estimator's documented
    # arguments; presumably it has no effect with a custom image - confirm.
    py_version='py37',
    # Where to run it
    instance_type='ml.g4dn.4xlarge',
    instance_count=1,
    input_mode='File',
    # Job configuration
    base_job_name=job_name,
    hyperparameters=hyperparameters,
    profiler_config=profiler_config,
    # Outputs; the checkpoint prefix carries a UTC timestamp taken when this cell runs
    output_path='s3://canopy-production-ml-output',
    checkpoint_s3_uri=f's3://canopy-production-ml-output/ckpt/{job_name}-{time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())}',
    # Spot training; run and wait limits are both 60*60*24*3 (three days in seconds)
    use_spot_instances=True, # switch to False for faster training but lots more expense
    max_run=60*60*24*3,
    max_wait=60*60*24*3,
)

In [None]:
# Start the SageMaker training job, passing the S3 training-data prefix defined
# in the hyperparameters cell as the job's input.
estimator.fit(inputs)

2022-08-27 09:46:06 Starting - Starting the training job...
2022-08-27 09:46:30 Starting - Preparing the instances for trainingProfilerReport-1661593566: InProgress
............
2022-08-27 09:48:30 Downloading - Downloading input data........................
2022-08-27 09:52:35 Training - Training image download completed. Training in progress..[34m2022-08-27 09:52:38.170716: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.[0m
[34m2022-08-27 09:52:43,844 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2022-08-27 09:52:43,845 sagemaker-training-toolkit INFO     Failed to parse hyperparameter augment value False to Json.[0m
[34mReturning the value itself[0m
[34m2022-08-27 09:52:43,845 sagemaker-training-toolkit I