# Hyperparameter Tuning

## Set Up SageMaker

In [1]:
# %pip install -U sagemaker

In [2]:
# %pip install sagemaker-experiments

In [1]:
import boto3
import pandas as pd
import numpy as np
import time
import sagemaker
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup

In [2]:
%env AWS_PROFILE=aeroxye-sagemaker

env: AWS_PROFILE=aeroxye-sagemaker


In [3]:
!aws sts get-caller-identity

{
    "UserId": "AROAWC4YSIQL5OBFCNGEX:botocore-session-1686731443",
    "Account": "418542404631",
    "Arn": "arn:aws:sts::418542404631:assumed-role/SageMaker-UserRole/botocore-session-1686731443"
}


In [4]:
# Resolve the IAM role ARN that SageMaker jobs will run under.
# get_execution_role() is tried first; on ValueError (presumably raised when the
# notebook is not running inside a SageMaker-managed environment — confirm) the ARN
# is looked up by role name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='SageMaker-UserRole')['Role']['Arn']

# Region configured for the default boto3 session; reused for every client below.
region = boto3.Session().region_name
print(f'Current region: {region}')

# One boto3 session shared by the SageMaker SDK session and the low-level clients.
boto_session = boto3.Session(region_name=region)
sagemaker_session = sagemaker.Session(boto_session=boto_session)
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
# The return value is discarded and, not being the cell's last expression, is not displayed.
# NOTE(review): looks like a connectivity/permissions smoke test — confirm or remove.
sagemaker_client.list_feature_groups()

# Runtime client for reading records from the Feature Store (used by get_record later).
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

# A second SDK Session that additionally carries the Feature Store runtime client.
# NOTE(review): not referenced by any other visible cell.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

Couldn't call 'get_role' to get Role ARN from role name SageMaker-UserRole to get Role path.


Current region: ap-southeast-1


In [5]:
# Handle for the "users-feature-group" feature group, bound to the shared SageMaker session.
users_feature_group = FeatureGroup(
    name="users-feature-group",
    sagemaker_session=sagemaker_session,
)

# Fetch a single user's record through the Feature Store runtime client.
# The call is the cell's last expression, so its response is shown as the cell output.
record_identifier_value = "079b0ec9-cec6-42fb-9f00-7891c52a10fb"
featurestore_runtime.get_record(
    FeatureGroupName="users-feature-group",
    RecordIdentifierValueAsString=record_identifier_value,
)

{'ResponseMetadata': {'RequestId': 'a2acff11-031c-428e-a5a1-2b23964e6abb',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a2acff11-031c-428e-a5a1-2b23964e6abb',
   'content-type': 'application/json',
   'content-length': '1661',
   'date': 'Wed, 14 Jun 2023 08:30:47 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'id',
   'ValueAsString': '079b0ec9-cec6-42fb-9f00-7891c52a10fb'},
  {'FeatureName': 'has_other_cats', 'ValueAsString': '0'},
  {'FeatureName': 'personality', 'ValueAsString': 'all sweet'},
  {'FeatureName': 'gender', 'ValueAsString': 'no preference'},
  {'FeatureName': 'good_with_other_dogs', 'ValueAsString': '0'},
  {'FeatureName': 'employment', 'ValueAsString': 'working full time'},
  {'FeatureName': 'created_at', 'ValueAsString': '1970-01-01T00:00:00Z'},
  {'FeatureName': 'agree_to_fee', 'ValueAsString': '1'},
  {'FeatureName': 'is_first_cat', 'ValueAsString': '1'},
  {'FeatureName': 'good_with_kids', 'ValueAsString': '0'},
  {'FeatureName': 'att

## Set Up Data Source and Images

In [6]:
# TODO: add code to retrieve latest train/test split based on type of split
# Until then the split is pinned by hand: `object_name` is the prefix inside the
# petfinder6000-training bucket under which the three CSVs below are read.
# NOTE(review): `split_type` is not referenced by any visible cell, and "strat" is
# repeated literally inside `object_name` — the two must be kept in sync manually.
split_type = "strat"
object_name = "strat-2023-05-31-13-50-15-148"
train_uri = f"s3://petfinder6000-training/{object_name}/output/train/train.csv"
# `eval_uri` points at the validation split.
eval_uri = f"s3://petfinder6000-training/{object_name}/output/validation/validation.csv"
test_uri = f"s3://petfinder6000-training/{object_name}/output/test/test.csv"

In [7]:
def get_uri_instance(run_mode):
    """Return the ``(image_uri, instance_type)`` pair for a run mode.

    Args:
        run_mode: ``'LOCAL'`` selects the local image tag and the ``'local'``
            instance type. Any other value selects the ECR image and
            ``'ml.c5.xlarge'``; the comparison is an exact, case-sensitive match.

    Returns:
        tuple: ``(image_uri, instance_type)``, both strings.
    """
    if run_mode == 'LOCAL':
        # can pull remote container from ECR too
        return 'cornac-39', 'local'

    # Everything that is not exactly 'LOCAL' runs on the remote image/instance.
    return (
        '418542404631.dkr.ecr.ap-southeast-1.amazonaws.com/petfinder6000:cornac-39-v2',
        'ml.c5.xlarge',
    )

## Hyperparameter Tuning

In [8]:
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner,
)
from sagemaker.tensorflow import TensorFlow

# Experiment name used when the best result is logged in a later cell.
exp_name = 'testing'
# UTC timestamp suffix; `run_name` is passed as the tuning job name below and is
# reused as the experiment run name in a later cell.
run_name = 'tuning-bpr-' + time.strftime("%Y%m%d-%H-%M-%S", time.gmtime())

# setup hyperparameters
# Static values handed to the estimator. The tuner below is given ranges for the
# same four names (presumably overriding these per training job — confirm).
k = 50
max_iter = 200
learning_rate = 0.001
lambda_reg = 0.001

run_mode = 'REMOTE' # hyperparam tuning must be remote
image_uri, instance_type = get_uri_instance(run_mode)

# Estimator running ./train_bpr.py inside the custom image on a single instance.
estimator = TensorFlow(
    image_uri=image_uri,
    entry_point="./train_bpr.py",
    dependencies=['./metrics/harmonic_mean.py', './metrics/combined_eval_method.py', './metrics/serendipity_wrapper.py'],
    role=role,
    instance_count=1,
    instance_type=instance_type,
    source_dir="./",
    environment={"REGION": region},
    hyperparameters={
        "k": k,
        "max_iter": max_iter,
        "learning_rate": learning_rate,
        "lambda_reg": lambda_reg,
    },
)

# configure hyperparameter tuning
hyperparameter_ranges = {
    "k": IntegerParameter(10, 500),
    "max_iter": IntegerParameter(50, 200),
    "learning_rate": ContinuousParameter(0.001, 0.1),
    "lambda_reg": ContinuousParameter(0.001, 0.1),
}
objective_metric_name = "HarmonicMean"
objective_type = "Maximize"
# Each regex captures the number printed after "<MetricName>: " in the training log.
# The character class only accepts digits and dots (no sign, no exponent).
metric_definitions = [{"Name": "HarmonicMean", "Regex": "HarmonicMean: ([0-9\\.]+)"},
                      {"Name": "Serendipity", "Regex": "Serendipity: ([0-9\\.]+)"},
                      {"Name": "F1@10", "Regex": "F1@10: ([0-9\\.]+)"},
                      {"Name": "NDCG@-1", "Regex": "NDCG@-1: ([0-9\\.]+)"},
                      {"Name": "NCRR@-1", "Regex": "NCRR@-1: ([0-9\\.]+)"}]
tuner = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions,
    # Must be passed explicitly: otherwise the tuner falls back to its own default
    # and the `objective_type` setting above is silently ignored.
    objective_type=objective_type,
    max_jobs=3,
    max_parallel_jobs=3,
    strategy="Random",
)

# tune hyperparameter
# wait=False: the call does not block on the tuning job; a later cell polls its status.
tuner.fit(
    inputs={
        "train": train_uri,
        "eval": eval_uri,
        "test": test_uri
    },
    include_cls_metadata=False,
    job_name=run_name,
    wait=False
)

Using provided s3_resource


No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


In [10]:
# run this cell to check current status of hyperparameter tuning job
# `tuning_job_result` is a point-in-time snapshot of the job description; later cells
# read it, so re-run this cell first to refresh it.
print(f"Latest job name: {tuner.latest_tuning_job.job_name}")
tuning_job_result = sagemaker_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)

# Only prints a reminder when the job is not "Completed"; execution continues either way.
status = tuning_job_result["HyperParameterTuningJobStatus"]
if status != "Completed":
    print("Reminder: the tuning job has not been completed.")

# Number of training jobs counted under "Completed" in the job's status counters.
job_count = tuning_job_result["TrainingJobStatusCounters"]["Completed"]
print("%d training jobs have completed" % job_count)

# Objective as recorded on the tuning job itself (type and metric name).
# NOTE(review): `is_minimize` and `objective_name` are not read by any visible cell.
objective = tuning_job_result["HyperParameterTuningJobConfig"]["HyperParameterTuningJobObjective"]
is_minimize = objective["Type"] != "Maximize"
objective_name = objective["MetricName"]

Latest job name: tuning-bpr-20230614-08-30-51
3 training jobs have completed


In [11]:
from pprint import pprint

# Report the best training job so far, if the job description contains one.
# A missing or empty "BestTrainingJob" entry is treated as "nothing reported yet".
if not tuning_job_result.get("BestTrainingJob"):
    print("No training jobs have reported results yet.")
else:
    print("Best model found so far:")
    pprint(tuning_job_result["BestTrainingJob"])

Best model found so far:
{'CreationTime': datetime.datetime(2023, 6, 14, 16, 30, 56, tzinfo=tzlocal()),
 'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'HarmonicMean',
                                                 'Value': 0.01937107741832733},
 'ObjectiveStatus': 'Succeeded',
 'TrainingEndTime': datetime.datetime(2023, 6, 14, 16, 36, 24, tzinfo=tzlocal()),
 'TrainingJobArn': 'arn:aws:sagemaker:ap-southeast-1:418542404631:training-job/tuning-bpr-20230614-08-30-51-001-e5daedc1',
 'TrainingJobName': 'tuning-bpr-20230614-08-30-51-001-e5daedc1',
 'TrainingJobStatus': 'Completed',
 'TrainingStartTime': datetime.datetime(2023, 6, 14, 16, 32, 27, tzinfo=tzlocal()),
 'TunedHyperParameters': {'k': '212',
                          'lambda_reg': '0.012350714841053868',
                          'learning_rate': '0.0051821320815837926',
                          'max_iter': '106'}}


In [12]:
# Describe the best training job by name and print its final metric list.
# "BestTrainingJob" is indexed directly here, so this raises KeyError when the job
# description has no such entry — the case the previous cell reports as
# "No training jobs have reported results yet."
job_name = tuning_job_result["BestTrainingJob"]["TrainingJobName"]
jobd = sagemaker_client.describe_training_job(TrainingJobName=job_name)
metrics = jobd['FinalMetricDataList']
print(metrics)

[{'MetricName': 'HarmonicMean', 'Value': 0.01937107741832733, 'Timestamp': datetime.datetime(2023, 6, 14, 16, 36, 6, tzinfo=tzlocal())}, {'MetricName': 'Serendipity', 'Value': 0.08398120850324631, 'Timestamp': datetime.datetime(2023, 6, 14, 16, 36, 6, tzinfo=tzlocal())}, {'MetricName': 'F1@10', 'Value': 0.0272739939391613, 'Timestamp': datetime.datetime(2023, 6, 14, 16, 36, 6, tzinfo=tzlocal())}, {'MetricName': 'NDCG@-1', 'Value': 0.26288220286369324, 'Timestamp': datetime.datetime(2023, 6, 14, 16, 36, 6, tzinfo=tzlocal())}, {'MetricName': 'NCRR@-1', 'Value': 0.056838296353816986, 'Timestamp': datetime.datetime(2023, 6, 14, 16, 36, 6, tzinfo=tzlocal())}, {'MetricName': 'ObjectiveMetric', 'Value': 0.01937107741832733, 'Timestamp': datetime.datetime(2023, 6, 14, 16, 36, 6, tzinfo=tzlocal())}]


In [13]:
# update experiment with best parameters
from sagemaker.experiments.run import Run

# Record the best training job's tuned hyperparameters and final metrics on an
# experiment run named after the tuning job.
with Run(experiment_name=exp_name, run_name=run_name, sagemaker_session=sagemaker_session) as run:
    best_training_job = tuning_job_result["BestTrainingJob"]

    # Hyperparameters first, exactly as the tuning job reported them.
    params = best_training_job["TunedHyperParameters"]
    run.log_parameters(params)

    # Then every entry of the training job's final metric list, one metric per call.
    job_name = best_training_job["TrainingJobName"]
    jobd = sagemaker_client.describe_training_job(TrainingJobName=job_name)
    metrics = jobd['FinalMetricDataList']
    for m in metrics:
        run.log_metric(name=m.get('MetricName'), value=m.get('Value'))

### Register Trained Model in Model Registry

In [16]:
# register trained model in Model Registry
from sagemaker.tensorflow.model import TensorFlowModel

# Name of the Model Registry package group that receives the new model version.
model_package_group = "test"

# for hyperparam tuning
# NOTE(review): despite the variable name, `best_estimator()` presumably returns an
# estimator attached to the best training job rather than a Model — confirm.
model = tuner.best_estimator()

# Register with the same container image used for training, CSV in/out, and
# ml.c5.xlarge for both inference and batch transform. The version is created as
# "PendingManualApproval". In the recorded run this call returned a
# sagemaker.model.ModelPackage.
model.register(
    model_package_group_name=model_package_group,
    image_uri=image_uri,
    content_types=["text/csv"],
    inference_instances=["ml.c5.xlarge"],
    transform_instances=["ml.c5.xlarge"],
    response_types=["text/csv"],
    approval_status="PendingManualApproval",
)


2023-06-14 08:36:25 Starting - Preparing the instances for training
2023-06-14 08:36:25 Downloading - Downloading input data
2023-06-14 08:36:25 Training - Training image download completed. Training in progress.
2023-06-14 08:36:25 Uploading - Uploading generated training model
2023-06-14 08:36:25 Completed - Resource retained for reuse


<sagemaker.model.ModelPackage at 0x20237fb52b0>

### Delete experiments

In [45]:
from sagemaker.experiments.experiment import Experiment

# WARNING: destructive cleanup — this cell runs on "Run All".
# The target is "tensorflow-script-mode-experiment", which is NOT the `exp_name`
# ('testing') that the tuning cells log to.
experiment_name = "tensorflow-script-mode-experiment"
exp = Experiment.load(experiment_name=experiment_name, sagemaker_session=sagemaker_session)
# Show what is about to be removed.
print(exp)
# NOTE(review): `_delete_all` is underscore-prefixed (non-public by Python convention),
# so its behaviour and the "--force" action value may change between SDK versions —
# confirm against the installed sagemaker release.
exp._delete_all(action="--force")

Experiment(sagemaker_session=<sagemaker.session.Session object at 0x0000026ECD0BC670>,experiment_name='tensorflow-script-mode-experiment',experiment_arn='arn:aws:sagemaker:ap-southeast-1:418542404631:experiment/tensorflow-script-mode-experiment',display_name='tensorflow-script-mode-experiment',creation_time=datetime.datetime(2023, 6, 9, 22, 44, 58, 124000, tzinfo=tzlocal()),created_by={'UserProfileArn': 'arn:aws:sagemaker:ap-southeast-1:418542404631:user-profile/d-ni9jmwq8akiv/aeroxye', 'UserProfileName': 'aeroxye', 'DomainId': 'd-ni9jmwq8akiv'},last_modified_time=datetime.datetime(2023, 6, 10, 16, 30, 12, 866000, tzinfo=tzlocal()),last_modified_by={},response_metadata={'RequestId': 'd6609acb-9f9e-4700-97f8-235f2b7ffa68', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'd6609acb-9f9e-4700-97f8-235f2b7ffa68', 'content-type': 'application/x-amz-json-1.1', 'content-length': '472', 'date': 'Wed, 14 Jun 2023 08:22:31 GMT'}, 'RetryAttempts': 0})
