# Warm Start from a Completed Hyper-Parameter Tuning Job
Once the previous hyper-parameter tuning job completes, we can analyze the results and perform another round of optimization using `Warm Start`. 

In [1]:
import boto3
import sagemaker
import pandas as pd

# SageMaker SDK session, its default S3 bucket, and the execution role.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# A single boto3 session supplies both the region name and the low-level
# SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name='sagemaker', region_name=region)

# Specify the S3 Location of the Features

In [2]:
# Restore `scikit_processing_job_name` from IPython's %store; it must have been
# saved earlier with `%store scikit_processing_job_name`.
%store -r scikit_processing_job_name

In [3]:
# Sanity check: show the restored processing job name.
print(scikit_processing_job_name)

sagemaker-scikit-learn-2020-04-25-19-24-12-990


In [4]:
# Same value as above, printed with a descriptive label.
print('Previous Scikit Processing Job Name: {}'.format(scikit_processing_job_name))

Previous Scikit Processing Job Name: sagemaker-scikit-learn-2020-04-25-19-24-12-990


In [5]:
# Each split lives under "<processing job name>/output/bert-<split>" in the
# session's default bucket; build the key prefix and the full S3 URI per split.

# Train
prefix_train = '{}/output/bert-train'.format(scikit_processing_job_name)
train_s3_uri = 's3://{}/{}'.format(bucket, prefix_train)

# Validation
prefix_validation = '{}/output/bert-validation'.format(scikit_processing_job_name)
validation_s3_uri = 's3://{}/{}'.format(bucket, prefix_validation)

# Test
prefix_test = '{}/output/bert-test'.format(scikit_processing_job_name)
test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)

In [6]:
# Show the training-data S3 URI, then list the objects under it with the AWS CLI.
print(train_s3_uri)
!aws s3 ls $train_s3_uri/

s3://sagemaker-us-west-2-903253828154/sagemaker-scikit-learn-2020-04-25-19-24-12-990/output/bert-train
2020-04-25 19:30:15   50965015 part-algo-1-amazon_reviews_us_Digital_Software_v1_00.tfrecord
2020-04-25 19:31:00   71723377 part-algo-2-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord


In [7]:
# One S3 input per split, each using the 'ShardedByS3Key' distribution.
s3_input_train_data = sagemaker.s3_input(
    s3_data=train_s3_uri,
    distribution='ShardedByS3Key',
)
s3_input_validation_data = sagemaker.s3_input(
    s3_data=validation_s3_uri,
    distribution='ShardedByS3Key',
)
s3_input_test_data = sagemaker.s3_input(
    s3_data=test_s3_uri,
    distribution='ShardedByS3Key',
)

# Print the resolved channel configuration for each split (train, validation, test).
for channel_input in (s3_input_train_data, s3_input_validation_data, s3_input_test_data):
    print(channel_input.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-west-2-903253828154/sagemaker-scikit-learn-2020-04-25-19-24-12-990/output/bert-train', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-west-2-903253828154/sagemaker-scikit-learn-2020-04-25-19-24-12-990/output/bert-validation', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-west-2-903253828154/sagemaker-scikit-learn-2020-04-25-19-24-12-990/output/bert-test', 'S3DataDistributionType': 'ShardedByS3Key'}}}


In [8]:
# Print the training script; the estimator below uses it as its entry point
# (entry_point='tf_bert_reviews.py', source_dir='src').
!cat src/tf_bert_reviews.py

import time
import random
import pandas as pd
from glob import glob
import pprint
import argparse
import json
import subprocess
import sys
import os
import tensorflow as tf
#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==2.8.0'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.7.2'])
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline
from transformers.configuration_distilbert import DistilBertConfig

CLASSES = [1, 2, 3, 4, 5]

def select_data_and_label_from_record(record):
    x = {
        'input_ids': record['input_ids'],
        'input_mask': record['input_mask'],
        'segment_ids': record['segment_ids']
 

# Setup Hyper-Parameters

In [9]:
# Static values for this warm-start run. The tuner explores `epochs`,
# `learning_rate`, `train_batch_size` and `freeze_bert_layer` separately.

# Hyper-parameters handed to the training script via the estimator.
epsilon = 0.00000001
validation_batch_size = 128
test_batch_size = 128
train_steps_per_epoch = 1000
validation_steps = 1000
test_steps = 1000
max_seq_length = 128
use_xla = True
use_amp = True
run_validation = True
run_test = True
run_sample_predictions = True

# Defined here but not passed to the estimator below; `freeze_bert_layer` is
# one of the ranges explored by the tuner instead.
freeze_bert_layer = True

# Training infrastructure settings for the estimator.
train_instance_count = 1
train_instance_type = 'ml.p3.8xlarge'
train_volume_size = 1024
input_mode = 'Pipe'

# Setup Metrics

In [10]:
# Each entry names a metric and gives the regex used to extract its value;
# the single capture group matches a run of digits and dots.
metrics_definitions = [
    # Training metrics
    {
        'Name': 'train:loss',
        'Regex': 'loss: ([0-9\\.]+)',
    },
    {
        'Name': 'train:accuracy',
        'Regex': 'accuracy: ([0-9\\.]+)',
    },
    # Validation metrics
    {
        'Name': 'validation:loss',
        'Regex': 'val_loss: ([0-9\\.]+)',
    },
    {
        'Name': 'validation:accuracy',
        'Regex': 'val_accuracy: ([0-9\\.]+)',
    },
]

In [11]:
from sagemaker.tensorflow import TensorFlow

# TensorFlow estimator that the HyperparameterTuner below launches for each
# training job. The `hyperparameters` dict holds only the values that stay
# fixed; `epochs`, `learning_rate`, `train_batch_size` and `freeze_bert_layer`
# are supplied by the tuner through `hyperparameter_ranges`.
estimator = TensorFlow(entry_point='tf_bert_reviews.py',
                       source_dir='src',
                       role=role,
                       train_instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distribution strategy will fail the job due to no data available
                       train_instance_type=train_instance_type,
                       train_volume_size=train_volume_size,
                       py_version='py3',
                       framework_version='2.1.0',
                       hyperparameters={
                               'epsilon': epsilon,
                               'validation_batch_size': validation_batch_size,
                               'test_batch_size': test_batch_size,                                             
                               'train_steps_per_epoch': train_steps_per_epoch,
                               'validation_steps': validation_steps,
                               'test_steps': test_steps,
                               'use_xla': use_xla,
                               'use_amp': use_amp,                                             
                               'max_seq_length': max_seq_length,
                               'run_validation': run_validation,
                               'run_test': run_test,
                               'run_sample_predictions': run_sample_predictions},
                       input_mode=input_mode,
                       metric_definitions=metrics_definitions,
                       train_max_run=7200 # max run time in seconds: 2 hours * 60 minutes per hour * 60 seconds per minute
                      )

# Setup Warm Start Config
We configure `WarmStartConfig` using one or more previous hyper-parameter tuning jobs - called the `parent` jobs - as well as a `WarmStartType`.  Each parent job must have finished in one of the following terminal states: `Completed`, `Stopped`, or `Failed`.

`WarmStartType` is one of the following strategies:

* `IDENTICAL_DATA_AND_ALGORITHM` uses the same input data and algorithm as the parent tuning jobs, but allows a practitioner to explore more hyper-parameter range values.  Upon completion, a tuning job with this strategy will return an additional field, `OverallBestTrainingJob`, containing the best model candidate across this tuning job and its completed parent tuning jobs.
* `TRANSFER_LEARNING` allows you to transfer the knowledge from previous tuning jobs.  You can use a different input dataset and algorithm - as well as everything from the `IDENTICAL_DATA_AND_ALGORITHM` strategy.

_Note:  Recursive parent-child relationships are not supported._

In [12]:
# Restore the previous tuning job's name from IPython's %store; it is used
# below as the warm-start parent.
%store -r tuning_job_name

In [13]:
# Sanity check: show the restored (parent) tuning job name.
print(tuning_job_name)

tensorflow-training-200425-2332


In [14]:
# Same value as above, printed with a descriptive label.
print('Previous Tuning Job Name: {}'.format(tuning_job_name))

Previous Tuning Job Name: tensorflow-training-200425-2332


In [15]:
from sagemaker.tuner import WarmStartConfig
from sagemaker.tuner import WarmStartTypes

# Warm start from the previous tuning job, reusing the same data and algorithm.
# `parents` is given as a set holding the single parent job name.
parent_tuning_job_names = {tuning_job_name}

warm_start_config = WarmStartConfig(
    warm_start_type=WarmStartTypes.IDENTICAL_DATA_AND_ALGORITHM,
    parents=parent_tuning_job_names,
)

# Define the Hyper-Parameter Ranges to Explore for the Warm Start Tuning Job
While not necessary, we can choose to statically define any hyper-parameters that we are not choosing to explore in this WarmStart optimization run.


In [16]:
from sagemaker.tuner import IntegerParameter
from sagemaker.tuner import ContinuousParameter
from sagemaker.tuner import CategoricalParameter
from sagemaker.tuner import HyperparameterTuner
                                                
# Search space for the warm-start run, one named range per hyper-parameter.
epochs_range = IntegerParameter(8, 64, scaling_type='Logarithmic')
learning_rate_range = ContinuousParameter(0.00015, 0.00075, scaling_type='Linear')
train_batch_size_choices = CategoricalParameter([128, 512, 1024])
freeze_bert_layer_choices = CategoricalParameter([True, False])

hyperparameter_ranges = {
    'epochs': epochs_range,
    'learning_rate': learning_rate_range,
    'train_batch_size': train_batch_size_choices,
    'freeze_bert_layer': freeze_bert_layer_choices,
}

# Setup Hyper-Parameter Tuning Job with Warm Start Config

In [17]:
# The tuner maximizes validation accuracy, one of the metrics defined above.
objective_metric_name = 'validation:accuracy'

tuner = HyperparameterTuner(
    # What to run and what to optimize
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    objective_type='Maximize',
    metric_definitions=metrics_definitions,
    # Where and how to search
    hyperparameter_ranges=hyperparameter_ranges,
    strategy='Bayesian',
    early_stopping_type='Auto',
    # Budget: two training jobs in total, run one at a time
    max_jobs=2,
    max_parallel_jobs=1,
    # Seed the search from the parent tuning job
    warm_start_config=warm_start_config,
)

In [20]:
# Map each input channel name to its S3 input, then launch the tuning job.
data_channels = {
    'train': s3_input_train_data,
    'validation': s3_input_validation_data,
    'test': s3_input_test_data,
}

tuner.fit(data_channels, include_cls_metadata=False)

# If You See an Error, Please Wait for the Hyper-Parameter Tuning Job to Complete from the Previous Notebook

##  Check Tuning Job Status

Re-run this cell to track the status.

In [21]:
from pprint import pprint

# From here on `tuning_job_name` refers to the warm-start tuning job launched
# by `tuner.fit()` above, not to the parent job restored via %store.
tuning_job_name = tuner.latest_tuning_job.job_name

# Fetch the current description of the tuning job from the SageMaker API.
job_description = sm.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)

status = job_description['HyperParameterTuningJobStatus']

print('\n')
print(status)
print('\n')
pprint(job_description)

if status != 'Completed':
    job_count = job_description['TrainingJobStatusCounters']['Completed']
    # Fill in the placeholder; previously a literal "{}" was printed and
    # `job_count` was never used.
    print('Not yet complete, but {} jobs have completed.'.format(job_count))

    # Report the best candidate so far, if any training job has reported one.
    if job_description.get('BestTrainingJob', None):
        print("Best candidate:")
        pprint(job_description['BestTrainingJob']['TrainingJobName'])
        pprint(job_description['BestTrainingJob']['FinalHyperParameterTuningJobObjectiveMetric'])
    else:
        print("No training jobs have reported results yet.")



InProgress


{'CreationTime': datetime.datetime(2020, 4, 26, 0, 23, 53, 721000, tzinfo=tzlocal()),
 'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-west-2:903253828154:hyper-parameter-tuning-job/tensorflow-training-200426-0023',
 'HyperParameterTuningJobConfig': {'HyperParameterTuningJobObjective': {'MetricName': 'validation:accuracy',
                                                                        'Type': 'Maximize'},
                                   'ParameterRanges': {'CategoricalParameterRanges': [{'Name': 'train_batch_size',
                                                                                       'Values': ['"128"',
                                                                                                  '"512"',
                                                                                                  '"1024"']},
                                                                                      {'Name': 'freeze_bert_layer',
        

In [22]:
from IPython.core.display import display, HTML
    
# Build the console link for this region and tuning job, then render it as HTML.
tuning_job_console_link = '<b>Review <a href="https://console.aws.amazon.com/sagemaker/home?region={}#/hyper-tuning-jobs/{}">Hyper-Parameter Tuning Job</a></b>'.format(region, tuning_job_name)

display(HTML(tuning_job_console_link))

# Show the Tuning Job
### _Note:  This will fail at first.  Please wait about 15-30 seconds and re-run._

In [23]:
from sagemaker.analytics import HyperparameterTuningJobAnalytics

# Load the tuning job's per-training-job results into a DataFrame.
hp_results = HyperparameterTuningJobAnalytics(
    hyperparameter_tuning_job_name=tuning_job_name,
    sagemaker_session=sess,
)
df_results = hp_results.dataframe()

# Display the results ordered by final objective value, highest first.
df_results.sort_values(by='FinalObjectiveValue', ascending=False)

Unnamed: 0,FinalObjectiveValue,TrainingEndTime,TrainingJobName,TrainingJobStatus,TrainingStartTime,epochs,freeze_bert_layer,learning_rate,train_batch_size
0,,,tensorflow-training-200426-0023-001-374665c5,InProgress,,13.0,"""True""",0.000693,"""1024"""


## Show the Overall Best Candidate

In [24]:
# First row after sorting by final objective value, highest first.
df_results_sorted = df_results.sort_values(by='FinalObjectiveValue', ascending=False)
df_results_sorted.head(1)

Unnamed: 0,FinalObjectiveValue,TrainingEndTime,TrainingJobName,TrainingJobStatus,TrainingStartTime,epochs,freeze_bert_layer,learning_rate,train_batch_size
0,,,tensorflow-training-200426-0023-001-374665c5,InProgress,,13.0,"""True""",0.000693,"""1024"""


In [25]:
# Training job name of the top-ranked row. This is a one-row pandas Series
# (index + name), not a plain string.
best_candidate_row = df_results.sort_values(by='FinalObjectiveValue', ascending=False).head(1)
best_candidate_tuning_job_name = best_candidate_row['TrainingJobName']

In [26]:
# Prints the Series repr (index, value, name and dtype), not just the job name.
print(best_candidate_tuning_job_name)

0    tensorflow-training-200426-0023-001-374665c5
Name: TrainingJobName, dtype: object


In [27]:
# Persist the value with IPython's %store so it can be restored later with
# `%store -r best_candidate_tuning_job_name`. It is stored as a pandas Series.
%store best_candidate_tuning_job_name

Stored 'best_candidate_tuning_job_name' (Series)
