## Set up the environment

In [1]:
import time
import os
import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow
from sagemaker.analytics import ExperimentAnalytics

# A single boto3 session backs both the low-level SageMaker client and the
# high-level SageMaker SDK session used by the rest of the notebook.
boto3_session = boto3.Session()
sm_client = boto3_session.client('sagemaker')
sm_session = sagemaker.Session(
    boto_session=boto3_session,
    sagemaker_client=sm_client,
)

# Execution role that is later passed to the TensorFlow estimator.
role = get_execution_role()

In [2]:
import sys
# Install the SageMaker Experiments SDK with the interpreter backing this
# kernel (sys.executable); it must be installed before the smexperiments
# imports below can run.
!{sys.executable} -m pip install sagemaker-experiments

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow2_p36/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
# S3 bucket name and key prefix for this demo.
# NOTE(review): neither name is referenced by the cells visible here — the
# upload step does not pass a bucket — confirm whether they are still needed.
bucket = "ml-misc-sagemaker"
prefix = "sagemaker/script-mode"

## Download the CIFAR-10 dataset
Downloading the test and training data takes around 5 minutes.

In [6]:
!pip install wget
import wget # for TF2

#!python generate_cifar10_tfrecords_v1.x.py --data-dir data/
!mkdir data
!python generate_cifar10_tfrecords_v2.py --data-dir data/

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9681 sha256=fc814cb7566b7c08ca93b9fbb0f2e6f0b47012555b6cf9c5bfaea05d4e856b87
  Stored in directory: /home/ec2-user/.cache/pip/wheels/90/1d/93/c863ee832230df5cfc25ca497b3e88e0ee3ea9e44adc46ac62
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow2_p36/bin/python -m pip install --upgrade pip' command.[0m
mkdir: cannot create directory ‘data’: File exists
Download from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and extract.
data/
100% [..................................................] 170498071 / 170498071Generating data//train/train.tfrecords
Generating data//validation/validation.tfrecords
Generating data//eval/eval.tfrecords
Done!


## Run on SageMaker cloud

### Uploading the data to s3

In [7]:
# Upload the local data/ directory to S3 and keep the returned S3 URI; the
# tracking and training cells read it from `dataset_location`.
dataset_location = sm_session.upload_data(
    path='data',
    key_prefix='data/DEMO-cifar10-tf',
)
display(dataset_location)

INFO:sagemaker:Created S3 bucket: sagemaker-ap-southeast-2-719164424367


's3://sagemaker-ap-southeast-2-719164424367/data/DEMO-cifar10-tf'

Now let's track the parameters from the data pre-processing step.

In [8]:
# Parameters recorded for the pre-processing step.
preprocessing_params = {
    "datatype": 'tfrecords',
    "image_size": 32,
}

with Tracker.create(display_name="Preprocessing", sagemaker_boto_client=sm_client) as tracker:
    tracker.log_parameters(preprocessing_params)
    # Also record the S3 URI of the dataset uploaded earlier as an input.
    tracker.log_input(
        name="cifar10-dataset",
        media_type="s3/uri",
        value=dataset_location,
    )

SageMaker can get training metrics directly from the logs and send them to CloudWatch metrics.

In [9]:
# Metric definitions handed to the estimator: each entry names a metric and
# gives the regex used to pull its value out of the Keras progress line in
# the training log. The single capture group in each pattern is the value.
#
# Raw strings are used throughout so regex escapes such as \d and \. are
# written exactly as the regex engine sees them; a non-raw '\d' is an invalid
# Python escape sequence that only works by accident and triggers a warning.
keras_metric_definition = [
    {'Name': 'train:loss', 'Regex': r'.*loss: ([0-9\.]+) - acc: [0-9\.]+.*'},
    {'Name': 'train:accuracy', 'Regex': r'.*loss: [0-9\.]+ - acc: ([0-9\.]+).*'},
    {'Name': 'validation:accuracy', 'Regex': r'.*step - loss: [0-9\.]+ - acc: [0-9\.]+ - val_loss: [0-9\.]+ - val_acc: ([0-9\.]+).*'},
    {'Name': 'validation:loss', 'Regex': r'.*step - loss: [0-9\.]+ - acc: [0-9\.]+ - val_loss: ([0-9\.]+) - val_acc: [0-9\.]+.*'},
    # Captures the per-step duration figure (the number before "ms/step" or "us/step").
    {'Name': 'sec/steps', 'Regex': r'.* - \d+s (\d+)[mu]s/step - loss: [0-9\.]+ - acc: [0-9\.]+ - val_loss: [0-9\.]+ - val_acc: [0-9\.]+'}
]

### Step 1 - Set up the Experiment

Create an experiment to track all the model training iterations. Experiments are a great way to organize your data science work. You can create experiments to organize all your model development work for: [1] a business use case you are addressing, [2] a data science team that owns the experiment, or [3] a specific data science and ML project. Think of it as a “folder” for organizing your “files”.

In [10]:
# The name carries a second-resolution timestamp suffix taken at creation time.
experiment_name = f"cifar10-image-classification-{int(time.time())}"

cifar10_experiment = Experiment.create(
    experiment_name=experiment_name,
    description="Classification of images",
    sagemaker_boto_client=sm_client,
)
print(cifar10_experiment)

Experiment(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7f9305e19f28>,experiment_name='cifar10-image-classification-1596084065',description='Classification of images',tags=None,experiment_arn='arn:aws:sagemaker:ap-southeast-2:719164424367:experiment/cifar10-image-classification-1596084065',response_metadata={'RequestId': '455160b1-1f7e-4d69-87e1-39f3e33fbeba', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '455160b1-1f7e-4d69-87e1-39f3e33fbeba', 'content-type': 'application/x-amz-json-1.1', 'content-length': '116', 'date': 'Thu, 30 Jul 2020 04:41:05 GMT'}, 'RetryAttempts': 0})


### Step 2 - Track Experiment
### Now create a Trial for each training run to track its inputs, parameters, and metrics.
While training the CNN model on SageMaker, we will experiment with several values for optimization method in the model. We will create a Trial to track each training job run. We will also create a TrialComponent from the tracker we created before, and add to the Trial. This will enrich the Trial with the parameters we captured from the data pre-processing stage.

Note that the following code takes a while to execute. If you want to run the following training jobs asynchronously, you may need to increase your resource limit. Otherwise, you can run them sequentially.

In [11]:
# Tags passed to the estimator when the training jobs are created.
tags = [
    {'Key': 'product', 'Value': 'mlai'},
    {'Key': 'environment', 'Value': 'development'},
    {'Key': 'owner', 'Value': 'ML/AI team'},
]

In [12]:
# Spot-training settings forwarded to the estimator.
train_use_spot_instances = True
train_max_run = 3600

# A wait limit is only set when spot instances are requested.
if train_use_spot_instances:
    train_max_wait = 3600
else:
    train_max_wait = None

In [13]:
# Maps each optimizer name to the name of the trial created for it, so later
# cells can look a trial up by optimizer.
opt_method_trial_name_map = {}

# The S3 input channels are identical for every training job, so build them once.
remote_inputs = {
    'train': dataset_location + '/train',
    'validation': dataset_location + '/validation',
    'eval': dataset_location + '/eval',
}

# Wider sweep kept for reference: ['adam', 'sgd', 'rmsprop']
for opt_method in ['sgd', 'rmsprop']:
    # create trial
    trial_name = f"cifar10-training-job-with-{opt_method}-optimization-{int(time.time())}"
    cifar10_trial = Trial.create(
        trial_name=trial_name,
        experiment_name=cifar10_experiment.experiment_name,
        sagemaker_boto_client=sm_client,
    )
    opt_method_trial_name_map[opt_method] = trial_name

    # associate the preprocessing trial component with the current trial
    cifar10_trial.add_trial_component(tracker.trial_component)

    estimator = TensorFlow(base_job_name='cifar10-tf',
                           entry_point='cifar10_keras_main.py',
                           source_dir=os.path.join(os.getcwd(), 'source_dir'),
                           role=role,
                           framework_version='1.12.0',
                           py_version='py3',
                           hyperparameters={'epochs': 1, 'batch-size' : 256, 'optimizer' : opt_method},
                           train_instance_count=1, train_instance_type='ml.p3.2xlarge',
                           tags=tags,
                           train_use_spot_instances=train_use_spot_instances,
                           train_max_run=train_max_run,
                           train_max_wait=train_max_wait,
                           metric_definitions=keras_metric_definition)

    cifar10_training_job_name = "cifar-training-job-{}".format(int(time.time()))
    estimator.fit(
        remote_inputs,
        job_name=cifar10_training_job_name,
        experiment_config={
            "TrialName": cifar10_trial.trial_name,
            "TrialComponentDisplayName": "Training",
        },
        # Do not block on the job; the loop moves on to the next optimizer.
        wait=False,
    )
    # give it a while before dispatching the next training job; the trial and
    # job names embed a second-resolution timestamp, so this pause also keeps
    # consecutive names distinct
    time.sleep(2)

INFO:sagemaker:Creating training-job with name: cifar-training-job-1596084147
INFO:sagemaker:Creating training-job with name: cifar-training-job-1596084150


### Compare the model training runs for an experiment

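Now we will use the analytics capabilities of the SageMaker Python SDK to query and compare the training runs and identify the best model produced by our experiment. You can retrieve trial components by using a search expression. Because the training jobs above were started with `wait=False`, the metric statistics may be incomplete until the jobs have finished.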

### Some Simple Analyses

In [14]:
# Keep only the trial components whose display name is exactly "Training".
training_filter = {"Name": "DisplayName", "Operator": "Equals", "Value": "Training"}
search_expression = {"Filters": [training_filter]}

In [15]:
# Query the experiment's "Training" trial components, sorted by maximum
# validation accuracy in descending order, and restrict the reported columns
# to the two accuracy metrics and the optimizer parameter.
analytics_options = dict(
    sagemaker_session=sm_session,
    experiment_name=cifar10_experiment.experiment_name,
    search_expression=search_expression,
    sort_by="metrics.validation:accuracy.max",
    sort_order="Descending",
    metric_names=['train:accuracy', 'validation:accuracy'],
    parameter_names=['optimizer'],
)
trial_component_analytics = ExperimentAnalytics(**analytics_options)

In [16]:
# Render the query result as a DataFrame (one row per matching trial component).
trial_component_analytics.dataframe()

Unnamed: 0,TrialComponentName,DisplayName,SourceArn,optimizer,validation:accuracy - Min,validation:accuracy - Max,validation:accuracy - Avg,validation:accuracy - StdDev,validation:accuracy - Last,validation:accuracy - Count,...,eval - MediaType,eval - Value,train - MediaType,train - Value,validation - MediaType,validation - Value,SageMaker.DebugHookOutput - MediaType,SageMaker.DebugHookOutput - Value,SageMaker.ModelArtifact - MediaType,SageMaker.ModelArtifact - Value
0,cifar-training-job-1596084150-aws-training-job,Training,arn:aws:sagemaker:ap-southeast-2:719164424367:...,"""rmsprop""",0.0,0.0,0.0,0.0,0.3927,0,...,,s3://sagemaker-ap-southeast-2-719164424367/dat...,,s3://sagemaker-ap-southeast-2-719164424367/dat...,,s3://sagemaker-ap-southeast-2-719164424367/dat...,,s3://sagemaker-ap-southeast-2-719164424367/,,s3://sagemaker-ap-southeast-2-719164424367/cif...
1,cifar-training-job-1596084147-aws-training-job,Training,arn:aws:sagemaker:ap-southeast-2:719164424367:...,"""sgd""",0.0,0.0,0.0,0.0,0.251,0,...,,s3://sagemaker-ap-southeast-2-719164424367/dat...,,s3://sagemaker-ap-southeast-2-719164424367/dat...,,s3://sagemaker-ap-southeast-2-719164424367/dat...,,s3://sagemaker-ap-southeast-2-719164424367/,,s3://sagemaker-ap-southeast-2-719164424367/cif...


Next let's look at an example of tracing the lineage of a model by accessing the data tracked by SageMaker Experiments for a `cifar-training-job` trial

In [18]:
# Match every trial component whose parent trial is the one created for 'sgd'.
sgd_trial_filter = {
    "Name": "Parents.TrialName",
    "Operator": "Equals",
    "Value": opt_method_trial_name_map['sgd'],
}

# Oldest component first, so the rows follow the order in which they were created.
lineage_table = ExperimentAnalytics(
    sagemaker_session=sm_session,
    search_expression={"Filters": [sgd_trial_filter]},
    sort_by="CreationTime",
    sort_order="Ascending",
)
lineage_table.dataframe()

Unnamed: 0,TrialComponentName,DisplayName,datatype,image_size,cifar10-dataset - MediaType,cifar10-dataset - Value,SourceArn,SageMaker.ImageUri,SageMaker.InstanceCount,SageMaker.InstanceType,...,eval - MediaType,eval - Value,train - MediaType,train - Value,validation - MediaType,validation - Value,SageMaker.DebugHookOutput - MediaType,SageMaker.DebugHookOutput - Value,SageMaker.ModelArtifact - MediaType,SageMaker.ModelArtifact - Value
0,TrialComponent-2020-07-30-044058-awby,Preprocessing,tfrecords,32.0,s3/uri,s3://sagemaker-ap-southeast-2-719164424367/dat...,,,,,...,,,,,,,,,,
1,cifar-training-job-1596084147-aws-training-job,Training,,,,,arn:aws:sagemaker:ap-southeast-2:719164424367:...,520713654638.dkr.ecr.ap-southeast-2.amazonaws....,1.0,ml.p3.2xlarge,...,,s3://sagemaker-ap-southeast-2-719164424367/dat...,,s3://sagemaker-ap-southeast-2-719164424367/dat...,,s3://sagemaker-ap-southeast-2-719164424367/dat...,,s3://sagemaker-ap-southeast-2-719164424367/,,s3://sagemaker-ap-southeast-2-719164424367/cif...
