In [1]:
import sys
import time
from time import strftime
import subprocess
import os

import sagemaker
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker
from sagemaker.tensorflow import TensorFlow
from sagemaker.debugger import (ProfilerConfig, 
                                FrameworkProfile, 
                                DetailedProfilingConfig, 
                                DataloaderProfilingConfig, 
                                PythonProfilingConfig)

from sagemaker.debugger import TensorBoardOutputConfig

from sagemaker.debugger import Rule, DebuggerHookConfig, CollectionConfig, rule_configs



In [2]:
role = sagemaker.get_execution_role()
sm_sess = sagemaker.session.Session()

create_date = strftime("%Y-%m-%d-%H-%M-%S")
cars_experiment = Experiment.create(experiment_name = "CARS-{}".format(create_date),
                                    description = "Cars experiment",
                                    tags = [{'Key': 'my-experiments', 'Value': 'cars-{}'.format(create_date)}])

cars_trial = Trial.create(trial_name = "CARS-SIMPLE-{}".format(create_date),
                          experiment_name = cars_experiment.experiment_name,
                          tags = [{'Key': 'my-experiments', 'Value': 'cars-simple-{}'.format(create_date)}])

source_dir = "."
main_script = "train.py"

s3_bucket = "jbsnyder"
user_id = "jbsnyder"
s3_path = os.path.join('s3://{}/cars/outputs/{}'.format(s3_bucket, create_date))
job_name = '{}-cars-{}'.format(user_id, create_date)
output_path = os.path.join(s3_path, job_name)

account_call = "aws sts get-caller-identity --query Account --output text"
ecr_account = subprocess.check_output(account_call, shell=True).decode().strip()
ecr_repo = "jbsnyder"
algo_name = "tf_mrcnn_sagemaker"

s3_bucket = "jbsnyder" # name of your s3 bucket without s3://
docker_image = "{0}.dkr.ecr.us-west-2.amazonaws.com/{1}:{2}".format(ecr_account,
                                                                    ecr_repo,
                                                                    algo_name)

channels = {
    'all_data': 's3://{}/data/cars'.format(s3_bucket),
}

tags = [{'Key': 'my-experiments', 'Value': 'cars-1'}]

job_count = 0

In [5]:
experiment_config = {
        "ExperimentName": cars_experiment.experiment_name,
        "TrialName" : cars_trial.trial_name,
        "TrialComponentDisplayName" : "TrainingJob{}".format(job_count),
    }
    
job_count += 1
'''profiler_config=ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=FrameworkProfile(
        detailed_profiling_config=DetailedProfilingConfig(
            start_step=200, 
            num_steps=50
        ),
        dataloader_profiling_config=DataloaderProfilingConfig(
            start_step=50, 
            num_steps=10
        ),
    )
)'''

'''profiler_config=ProfilerConfig(
    framework_profile_params=FrameworkProfile()
)'''

profiler_config=ProfilerConfig(
    framework_profile_params=FrameworkProfile(
        start_step=1,
        num_steps=10,
    )
)

tensorboard_output_config = TensorBoardOutputConfig(
            s3_output_path = "s3://{}/cars/tensorboard/".format(s3_bucket)
    )

collection_configs=[CollectionConfig(
                    name="losses", parameters={
                    "train.save_interval": "100",
                    "eval.save_interval": "10" }
                    )]

job_name = "{}-{}".format(job_name, job_count)

region = "us-west-2"

image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/tensorflow-training:2.3.1-gpu-py37-cu110-ubuntu18.04"

estimator = TensorFlow(
                entry_point=main_script, 
                source_dir=source_dir, 
                role=role,
                image_uri=image_uri,
                instance_count=1,
                instance_type="ml.g4dn.xlarge",
                # output_path=output_path, 
                volume_size=50,
                # sagemaker_session=sm_sess,
                # tags=tags,
                profiler_config=profiler_config,
                #tensorboard_output_config=tensorboard_output_config,
                #debugger_hook_config = DebuggerHookConfig(
                #        collection_configs=collection_configs,
                #)
)

In [6]:
estimator.fit(channels, wait=False, job_name=job_name, experiment_config=experiment_config)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: jbsnyder-cars-2021-01-27-19-45-34-1-2
