In [2]:
import os
import sys
import json

import boto3
from dotenv import load_dotenv
from sagemaker.utils import name_from_base
from sagemaker.tuner import ContinuousParameter

sys.path.append(os.path.join("..", ".."))
from utils.estimators import ImageClassificationEstimator, ImageClassificationHPTuner

In [3]:
load_dotenv(os.path.join("..", "..", "env"))

True

In [4]:
model_id = os.environ["IC_MODEL_ID"]
model_version = os.environ["IC_MODEL_VERSION"]

base_job_name = name_from_base(model_id.replace("tensorflow-", ""))
print(f"base_job_name: {base_job_name}")

base_job_name: ic-efficientnet-lite4-classification-2-2022-11-02-22-20-13-982


In [5]:
s3_input_path = os.path.join(os.environ["IC_S3_INPUT_PATH"])
s3_output_path = os.path.join(os.environ["IC_S3_OUTPUT_PATH"], model_id)

print(f"s3_input_path: {s3_input_path}")
print(f"s3_output_path: {s3_output_path}")

s3_input_path: s3://ava-cv-raw-photo-bucket/temp/leaves/images/
s3_output_path: s3://ava-cv-models/temp/leaves/tensorflow-ic-efficientnet-lite4-classification-2


In [6]:
hyperparameters = json.loads(os.environ["IC_HYPERPARAMETERS"])
hyperparameters

{'train_only_top_layer': 'True',
 'epochs': '50',
 'batch_size': '4',
 'optimizer': 'adam',
 'learning_rate': '0.001',
 'beta_1': '0.9',
 'beta_2': '0.999',
 'momentum': '0.9',
 'epsilon': '1e-07',
 'rho': '0.95',
 'initial_accumulator_value': '0.1',
 'reinitialize_top_layer': 'Auto',
 'early_stopping': 'True',
 'early_stopping_patience': '5',
 'early_stopping_min_delta': '0.0',
 'dropout_rate': '0.2',
 'regularizers_l2': '0.0001',
 'label_smoothing': '0.1',
 'image_resize_interpolation': 'bilinear',
 'augmentation': 'True',
 'augmentation_random_flip': 'horizontal_and_vertical',
 'augmentation_random_rotation': '0.2',
 'augmentation_random_zoom': '0.1',
 'binary_mode': 'False',
 'eval_metric': 'accuracy'}

In [7]:
hp_tuning = os.environ["IC_ENABLE_HP_TUNING"] == "True"
if hp_tuning:    
    # You can select from the hyperparameters supported by the model, and configure ranges of values to be searched for training the optimal model.
    # (https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-define-ranges.html)
    hyperparameter_ranges = {
        "learning_rate": ContinuousParameter(1e-5, 1e-1, scaling_type="Logarithmic"),
        "dropout_rate": ContinuousParameter(0.1, 0.5, scaling_type="Linear"),
        "regularizers_l2": ContinuousParameter(1e-5, 1e-1, scaling_type="Logarithmic"),
    }

In [8]:
estimator = ImageClassificationEstimator(
    model_id=model_id,
    model_version=model_version,
    hyperparameters=hyperparameters,
    instance_type=os.environ["IC_TRAINING_INSTANCE_TYPE"],
    instance_count=int(os.environ["IC_TRAINING_INSTANCE_COUNT"]),
    max_run=int(os.environ["IC_MAX_RUN"]),
    output_path=s3_output_path,
    base_job_name=base_job_name,
)
if hp_tuning:
    hp_tuner = ImageClassificationHPTuner(
        estimator=estimator,
        hyperparameter_ranges=hyperparameter_ranges,
        max_jobs=int(os.environ["IC_MAX_JOBS"]),
        max_parallel_jobs=int(os.environ["IC_MAX_PARALLEL_JOBS"]),
        base_job_name=base_job_name,
    )

In [9]:
%%time
if hp_tuning:
    # Launch a SageMaker Tuning job to search for the best hyperparameters
    hp_tuner.fit({"training": s3_input_path}, logs="None")
else:
    # Launch a SageMaker Training job by passing s3 path of the training data
    estimator.fit({"training": s3_input_path}, logs="None")


2022-11-02 22:20:14 Starting - Starting the training job..
2022-11-02 22:20:29 Starting - Preparing the instances for training............
2022-11-02 22:21:32 Downloading - Downloading input data.......
2022-11-02 22:22:12 Training - Downloading the training image...................................
2022-11-02 22:25:13 Training - Training image download completed. Training in progress...........................................
2022-11-02 22:28:49 Uploading - Uploading generated training model
2022-11-02 22:28:55 Failed - Training job failed


UnexpectedStatusException: Error for Training job ic-efficientnet-lite4-classification-2--2022-11-02-22-20-14-532: Failed. Reason: ClientError: Please use an instance type with more memory, or reduce the size of training data processed on an instance.

In [10]:
# Identify the previously trained model path based on the output location where artifacts are stored previously and the training job name.
if hp_tuning:  # If using amt, select the model for the best training job.
    sage_client = boto3.Session().client("sagemaker")
    tuning_job_result = sage_client.describe_hyper_parameter_tuning_job(
        HyperParameterTuningJobName=hp_tuner._estimator._current_job_name
    )
    last_training_job_name = tuning_job_result["BestTrainingJob"]["TrainingJobName"]
else:
    last_training_job_name = estimator._estimator._current_job_name

last_trained_model_path = f"{s3_output_path}/{last_training_job_name}/output/model.tar.gz"
print(f"Best model saved in:\n{last_trained_model_path}")

Best model saved in:
s3://ava-cv-models/temp/leaves/tensorflow-ic-efficientnet-lite4-classification-2/ic-efficientnet-lite4-classification-2--2022-11-02-22-20-14-532/output/model.tar.gz
