In [1]:
from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow
from datetime import datetime
import os
import pprint

### Hyperparameters that override the default config

In [23]:
# Overrides for the default config. The launcher cells read the cluster
# entries (instance type/count, workers per host) from this dict and merge
# the whole dict into the hyperparameters passed to the estimator.
hyperparameters = dict(
    # training schedule, precision and learning-rate settings
    schedule='1x',
    fp16=True,
    base_learning_rate=15e-3,
    warmup_steps=500,
    warmup_init_lr_scale=3.0,
    # cluster layout and per-device batch size
    instance_type='ml.p3dn.24xlarge',
    instance_count=2,
    batch_size_per_device=2,
    num_workers_per_host=8,
    # model switches
    use_conv=True,
    use_rcnn_bn=False,
    ls=0.0,  # presumably label smoothing -- TODO confirm against the training script
)

### Distributed training configuration

In [24]:
# Single default config file; individual keys can be overridden (e.g. for HPO)
# through the hyperparameters dict.
# NOTE(review): model_cfg is not referenced in the visible cells, and the
# `configuration` dict hard-codes a different path
# ('configs/sagemaker_default_model_config.py') -- confirm which is intended.
model_cfg = "configs/sagemaker_default_config.py"

# Cluster layout for the distributed (MPI) job, read from the hyperparameters.
hvd_processes_per_host, hvd_instance_type, hvd_instance_count = (
    hyperparameters['num_workers_per_host'],
    hyperparameters['instance_type'],
    hyperparameters['instance_count'],
)

### SageMaker configuration

In [25]:
# IAM role is pinned to a fixed ARN; get_execution_role() (imported at the top
# of the notebook) is the alternative that was left disabled here.
role = 'arn:aws:iam::578276202366:role/service-role/AmazonSageMaker-ExecutionRole-20191220T135085'
user_id = 'jbsnyder'

# Timestamp (day-month-year-hour-minute) used to tag the S3 prefix and job name.
now = datetime.now()
time_str = now.strftime('%d-%m-%Y-%H-%M')

# Training image and the code that is shipped with the job.
image = '578276202366.dkr.ecr.us-east-1.amazonaws.com/jbsnyder:faster_rcnn'
source_dir = '/workspace/shared_workspace/deep-learning-models/models/vision/detection/'
main_script = 'tools/train_sagemaker.py'

# Instance type with the dots stripped (e.g. 'ml.p3.16xlarge' -> 'mlp316xlarge').
# NOTE(review): not referenced in the visible cells.
ec2_instance = ''.join(hvd_instance_type.split('.'))

# MPI launch settings handed to the estimator.
# Additional flags that are currently disabled (kept for reference):
#   -x HOROVOD_NUM_NCCL_STREAMS=2 -x NCCL_TREE_THRESHOLD=4294967296
#   -x NCCL_MIN_NRINGS=13 -x HOROVOD_CYCLE_TIME=0.5
#   -x HOROVOD_FUSION_THRESHOLD=67108864
distributions = dict(
    mpi=dict(
        enabled=True,
        processes_per_host=hvd_processes_per_host,
        custom_mpi_options="-x OMPI_MCA_btl_vader_single_copy_mechanism=none -x TF_CUDNN_USE_AUTOTUNE=0",
    ),
)

# Named S3 input locations; this mapping is what gets passed to estimator.fit().
channels = dict(
    coco='s3://jbsnyder-sagemaker/faster-rcnn/data/coco/',
    weights='s3://jbsnyder-sagemaker/faster-rcnn/data/weights/',
)

# S3 URIs always use '/' as their separator. os.path.join uses the separator of
# the host OS (a backslash on Windows), so the URIs are joined on '/' explicitly.
# Timestamped root prefix for everything this run writes.
s3_path = '/'.join(('s3://jbsnyder-sagemaker/faster-rcnn/scaling', time_str))

# Job name: <user_id>-1-2-scaling-<timestamp>.
job_name = '{}-1-2-scaling-{}'.format(user_id, time_str)

# Output location for this job: <s3_path>/output/<job_name>.
output_path = '/'.join((s3_path, 'output', job_name))

# Network identifiers (subnet and security group) for the job.
# NOTE(review): neither list is passed to the estimator in the visible cells --
# confirm whether that is intended.
subnets = ['subnet-58b35b04']
security_group_ids = ['sg-02a21bf8f59e59172']

# Everything the training script receives: the fixed job-level entries first,
# then every hyperparameter (a hyperparameter with the same key would win).
configuration = {
    'configuration': 'configs/sagemaker_default_model_config.py',
    's3_path': s3_path,
    'instance_name': job_name,
    **hyperparameters,
}

In [20]:
# Show the merged configuration for a visual check before the job is created.
print(pprint.pformat(configuration))

{'base_learning_rate': 0.015,
 'batch_size_per_device': 2,
 'configuration': 'configs/sagemaker_default_model_config.py',
 'fp16': True,
 'instance_count': 1,
 'instance_name': 'jbsnyder-1-2-scaling-15-05-2020-12-40',
 'instance_type': 'ml.p3.16xlarge',
 'ls': 0.0,
 'num_workers_per_host': 8,
 's3_path': 's3://jbsnyder-sagemaker/faster-rcnn/15-05-2020-12-40',
 'schedule': '1x',
 'use_conv': True,
 'use_rcnn_bn': False,
 'warmup_init_lr_scale': 3.0,
 'warmup_steps': 500}


In [21]:
# Build the estimator for the custom training image; the job itself is only
# submitted by the separate fit() cell.
# NOTE(review): image_name / train_instance_* / train_volume_size /
# distributions look like SageMaker Python SDK v1 parameter names -- confirm
# the SDK version this notebook is pinned to.
estimator = TensorFlow(
    # what to run
    entry_point=main_script,
    source_dir=source_dir,
    image_name=image,
    framework_version="2.1.0",
    py_version="py3",
    # who runs it, and on what hardware
    role=role,
    train_instance_type=hvd_instance_type,
    train_instance_count=hvd_instance_count,
    train_volume_size=200,
    distributions=distributions,
    # where results go, and what the training script receives
    output_path=output_path,
    hyperparameters=configuration,
)

In [22]:
# Submit the training job under the generated name; wait=False so the cell
# does not block on the job.
estimator.fit(channels, job_name=job_name, wait=False)