In [23]:
# Define IAM role
import boto3
import re
import sys
import os
import time
import numpy as np
import pandas as pd
from sagemaker import get_execution_role
from sagemaker.session import Session

# Look up the execution role through the SageMaker SDK; it is passed as `role=` to the estimators below.
role = get_execution_role()

In [27]:
import sagemaker
from time import gmtime, strftime

# SageMaker session shared by the estimators in this notebook.
sess = sagemaker.Session() # can use LocalSession() to run container locally

# S3 bucket that holds the training data and receives the training output.
bucket = 'privisaa-bucket-2' # sess.default_bucket()
region = "us-east-1"
# Account id of the caller (from STS); used to build the private ECR image URI.
account = sess.boto_session.client('sts').get_caller_identity()['Account']
prefix_input = 'detectron2-input'
# Key prefix for training output; the previous value was misspelled as 'detectron2-ouput'.
prefix_output = 'detectron2-output'

# Upload data for training

In [3]:
# Run the helper script that downloads COCO 2017 and uploads it to S3
# (arguments are presumably <bucket> <key prefix>; training later reads s3://{bucket}/coco).
! ./upload_coco2017_to_s3.sh {bucket} coco

Create stage directory: /home/ec2-user/SageMaker/coco-2017-2020-10-15-00-24-32
--2020-10-15 00:24:32--  http://images.cocodataset.org/zips/train2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.217.44.140
Connecting to images.cocodataset.org (images.cocodataset.org)|52.217.44.140|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19336861798 (18G) [application/zip]
Saving to: ‘/home/ec2-user/SageMaker/coco-2017-2020-10-15-00-24-32/train2017.zip’


2020-10-15 00:28:32 (77.0 MB/s) - ‘/home/ec2-user/SageMaker/coco-2017-2020-10-15-00-24-32/train2017.zip’ saved [19336861798/19336861798]

Extracting /home/ec2-user/SageMaker/coco-2017-2020-10-15-00-24-32/train2017.zip
--2020-10-15 00:30:14--  http://images.cocodataset.org/zips/val2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 54.231.33.163
Connecting to images.cocodataset.org (images.cocodataset.org)|54.231.33.163|:80... connected.
HTTP request sent, awaiting response... 200 

In [None]:
# NOTE(review): source and destination are the same S3 prefix, so this copies the data onto itself,
# and the bucket name is hard-coded instead of using {bucket} -- confirm the intended source/destination.
!aws s3 cp --recursive s3://privisaa-bucket-2/coco s3://privisaa-bucket-2/coco

## Push Docker image to registry

For this training, we'll extend the [SageMaker PyTorch Container](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/deep-learning-containers-images.html) with the Detectron2 dependencies, using the official [D2 Dockerfile](https://github.com/facebookresearch/detectron2/blob/master/docker/Dockerfile) as a baseline. See the Dockerfile below.

In [4]:
# Print the Dockerfile with syntax highlighting.
!pygmentize Dockerfile

[37m# Build an image of Detectron2 that can do [39;49;00m
[37m# distributing training and inference in Amazon Sagemaker[39;49;00m

[37m# using Sagemaker PyTorch container as base image[39;49;00m
[34mFROM[39;49;00m[33m 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-training:1.4.0-gpu-py36-cu101-ubuntu16.04[39;49;00m
LABEL [31mauthor[39;49;00m=[33m"vadimd@amazon.com"[39;49;00m

[37m############# Installing latest builds ############[39;49;00m

[37m# This is to fix issue: https://github.com/pytorch/vision/issues/1489[39;49;00m
[34mRUN[39;49;00m pip install --upgrade --force-reinstall torch torchvision cython

[37m############# D2 section ##############[39;49;00m

[37m# installing dependecies for D2 https://github.com/facebookresearch/detectron2/blob/master/docker/Dockerfile[39;49;00m
[34mRUN[39;49;00m pip install [33m'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'[39;49;00m
[34mRUN[39;49;00m pip install [33m'git+https://github.c

You'll need to build the container from this Dockerfile and push it to Amazon Elastic Container Registry (ECR) using the `build_and_push.sh` script. Before that, you need to log in to both the SageMaker ECR registry and your private ECR registry.

In [3]:
# Log in to the SageMaker ECR registry that hosts the Deep Learning Containers (source of the base image)
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin 763104351884.dkr.ecr.{region}.amazonaws.com
# Log in to your private ECR registry; {account} is the caller's account id resolved from STS,
# so the command no longer depends on a hard-coded account id
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account}.dkr.ecr.{region}.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


Now you are ready to push your D2 container to your private ECR.

In [4]:
# Build the image from the Dockerfile and push it to the private ECR
# (arguments are presumably <image name> <tag>; they match `container` and `tag` used for the image URI).
! ./build_and_push.sh d2-sm-coco distributed

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Sending build context to Docker daemon    154MB
Step 1/17 : FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.4.0-gpu-py36-cu101-ubuntu16.04
1.4.0-gpu-py36-cu101-ubuntu16.04: Pulling from pytorch-training

[1B7927d38a: Pulling fs layer 
[1Bac894db4: Pulling fs layer 
[1B2af6d627: Pulling fs layer 
[1B86211d23: Pulling fs layer 
[1B603ff777: Pulling fs layer 
[1B7165632f: Pulling fs layer 
[1B96e40dcf: Pulling fs layer 
[1B91ff3706: Pulling fs layer 
[1Bb0b1c69b: Pulling fs layer 
[1B0b70f92c: Pulling fs layer 
[1Be52996c6: Pulling fs layer 
[1B89cd9471: Pulling fs layer 
[9B603ff777: Waiting fs layer 
[9B7165632f: Waiting fs layer 
[1B241663de: Pulling fs layer 
[1B2ba0eec4: Pulling fs layer 
[11B6e40dcf: Waiting fs layer 
[8Be52996c6: Waiting fs layer 
[8B89cd9471: Waiting fs layer 
[12B0b1c69b: Waiting fs layer 
[6B2ba0eec4: Waiting fs layer 
[8B2416

# Train your model

Define the algorithm metrics that SageMaker will scrape from the training logs, persist, and render in the training job console.

In [4]:
# Name and tag of the training image in the private ECR registry.
container, tag = "d2-sm-coco", "distributed"  # your container name / tag

# Full image URI: <account>.dkr.ecr.<region>.amazonaws.com/<container>:<tag>
ecr_uri_template = '{}.dkr.ecr.{}.amazonaws.com/{}:{}'
image = ecr_uri_template.format(account, region, container, tag)

In [5]:
# Metric definitions handed to the estimators: each entry names a metric and gives the
# regex whose single capture group extracts the number following "<label>: " in a log line.
#
# Raw strings are used so that `\s` and `\.` reach the regex engine verbatim; the previous
# non-raw literals relied on the invalid escape sequence `\s`, which Python warns about.
# The resulting pattern text is unchanged.
metric_definitions = [
    {"Name": "total_loss", "Regex": r".*total_loss:\s([0-9\.]+)\s*"},
    {"Name": "loss_cls", "Regex": r".*loss_cls:\s([0-9\.]+)\s*"},
    {"Name": "loss_box_reg", "Regex": r".*loss_box_reg:\s([0-9\.]+)\s*"},
    {"Name": "loss_mask", "Regex": r".*loss_mask:\s([0-9\.]+)\s*"},
    {"Name": "loss_rpn_cls", "Regex": r".*loss_rpn_cls:\s([0-9\.]+)\s*"},
    {"Name": "loss_rpn_loc", "Regex": r".*loss_rpn_loc:\s([0-9\.]+)\s*"},
    {"Name": "overall_training_speed", "Regex": r".*Overall training speed:\s([0-9\.]+)\s*"},
    # NOTE(review): the character class only matches digits and dots, so a learning rate
    # logged in scientific notation (e.g. 2e-05) would be captured as "2" -- confirm log format.
    {"Name": "lr", "Regex": r".*lr:\s([0-9\.]+)\s*"},
    {"Name": "iter", "Regex": r".*iter:\s([0-9\.]+)\s*"},
]


In [None]:
# Install the pinned SageMaker Experiments SDK with the kernel's own interpreter
# (presumably the source of the `smexperiments` package imported further down).
!{sys.executable} -m pip install sagemaker-experiments==0.1.24

In [None]:

# Upload a local directory to S3 under the input prefix and keep the resulting S3 URI.
# `prefix` was never defined in this notebook (NameError); the input prefix is `prefix_input`.
# NOTE(review): path='mnist' looks like a leftover from an MNIST example -- confirm the local directory.
inputs = sess.upload_data(path='mnist', bucket=bucket, key_prefix=prefix_input)
print('input spec: {}'.format(inputs))

In [12]:
# `Tracker` and the SageMaker client `sm` are otherwise first created in the next cell, so
# on a fresh kernel this cell failed with NameError; bring both into scope here.
from smexperiments.tracker import Tracker

sm = boto3.Session().client('sagemaker')

# Record the preprocessing parameters as their own trial component. `tracker` is kept
# after the `with` block so its trial component can be attached to the training trial.
# NOTE(review): the mean/std values look carried over from an MNIST example -- confirm for COCO.
with Tracker.create(display_name="Preprocessing", sagemaker_boto_client=sm) as tracker:
    tracker.log_parameters({
        "normalization_mean": 0.1307,
        "normalization_std": 0.3081,
    })
    # we can log the s3 uri to the dataset we just uploaded
    # tracker.log_input(name="d2-dataset", media_type="s3/uri", value=inputs)

In [13]:
# create d2 experiment

from sagemaker.analytics import ExperimentAnalytics
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

# Boto3 session and SageMaker client shared by the experiment, trial and analytics objects.
b3sess = boto3.Session()
sm = b3sess.client('sagemaker')

# Create the experiment; the name carries the current Unix timestamp as a suffix.
experiment_name = f"d2-coco-{int(time.time())}"
d2_experiment = Experiment.create(
    sagemaker_boto_client=sm,
    experiment_name=experiment_name,
    description="Detectron2 training on COCO2017",
)
print(d2_experiment)


Experiment(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7f2562cfecf8>,experiment_name='d2-coco-1602890769',description='Detectron2 training on COCO2017',tags=None,experiment_arn='arn:aws:sagemaker:us-east-1:209419068016:experiment/d2-coco-1602890769',response_metadata={'RequestId': 'e577e674-9f07-4e6d-8f81-a204c789e96b', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'e577e674-9f07-4e6d-8f81-a204c789e96b', 'content-type': 'application/x-amz-json-1.1', 'content-length': '90', 'date': 'Fri, 16 Oct 2020 23:26:08 GMT'}, 'RetryAttempts': 0})


In [16]:
# Bookkeeping dict: index -> trial name (only index 0 is used here).
hidden_channel_trial_name_map = {}

# Trial component produced by the "Preprocessing" tracker.
preprocessing_trial_component = tracker.trial_component

# Create a trial inside the experiment; the name carries the current Unix timestamp.
trial_name = f"d2-training-job-{int(time.time())}"
d2_trial = Trial.create(
    sagemaker_boto_client=sm,
    experiment_name=d2_experiment.experiment_name,
    trial_name=trial_name,
)
hidden_channel_trial_name_map.update({0: trial_name})

# Associate the preprocessing trial component with the current trial.
d2_trial.add_trial_component(preprocessing_trial_component)

In [17]:
# Hyperparameters passed to the training job.
hyperparameters = {
    "config-file": "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml",
    # "local-config-file": "config.yaml",  # if you'd like to supply custom config file, please add it in container_training folder, and provide file name here
    "resume": "True",  # whether to re-use weights from pre-trained model
    "eval-only": "False",  # whether to perform only D2 model evaluation
    # opts are D2 model configuration as defined here: https://detectron2.readthedocs.io/modules/config.html#config-references
    # this is a way to override individual parameters in D2 configuration from Sagemaker API
    "opts": "SOLVER.MAX_ITER 2000",
}

sessLocal = sagemaker.LocalSession() # can use LocalSession()

# Train on two on-demand instances. The image URI comes from `image` (built from the caller's
# account, region, container name and tag) rather than a URI with a hard-coded account id.
d2 = sagemaker.estimator.Estimator(image,
                                   role=role,
                                   train_instance_count=2,
                                   train_instance_type='ml.p3.16xlarge',
#                                   train_instance_type="local_gpu", # use local_gpu for quick troubleshooting
                                   train_volume_size=100,
                                   output_path="s3://{}/{}".format(bucket, prefix_output),
                                   metric_definitions=metric_definitions,
                                   hyperparameters=hyperparameters,
                                   sagemaker_session=sess)

# Start the job without blocking and attach it to the experiment trial as "Training".
# NOTE(review): the job name says max-iter-20000 but `opts` sets SOLVER.MAX_ITER 2000 -- confirm
# which one is intended. The fixed job name also means re-running this cell reuses the same name.
d2.fit({'training': f"s3://{bucket}/coco"},
       job_name="2-nodes-max-iter-20000-v14",
       wait=False,
       experiment_config={
           "TrialName": d2_trial.trial_name,
           "TrialComponentDisplayName": "Training",
       })

INFO:sagemaker:Creating training-job with name: 2-nodes-max-iter-20000-v14


In [None]:
# NOTE(review): the SageMaker client's create_training_job is called here without any arguments;
# the API presumably requires at least a job name, role and resource configuration, so this looks
# like a leftover scratch cell (it has no execution count) -- confirm and complete or remove it.
sm.create_training_job()

## Training with Spot Instance

In [11]:
import uuid

# Spot-training switches: the two time limits are passed to the estimator as
# train_max_run / train_max_wait; the wait limit is only set when spot is enabled.
train_use_spot_instances = True
train_max_run = 21600
if train_use_spot_instances:
    train_max_wait = 30000
else:
    train_max_wait = None

# Random 8-character suffix so each run gets its own checkpoint prefix.
checkpoint_suffix = uuid.uuid4().hex[:8]
if train_use_spot_instances:
    checkpoint_s3_uri = 's3://{}/artifacts/d2-checkpoint-{}/'.format(bucket, checkpoint_suffix)
else:
    checkpoint_s3_uri = None

In [13]:
# Image URI of the training container. Previously a "d2-sm-coco-custom:latest" URI was built and
# then immediately overwritten by a URI with a hard-coded account id; the effective image
# (d2-sm-coco:distributed, the one built by build_and_push.sh) is now derived from the caller's account.
container = "d2-sm-coco" # your container name
tag = "distributed"
image = '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format(account, region, container, tag)

# Hyperparameters passed to the training job.
hyperparameters = {
    "config-file": "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml",
    # "local-config-file": "config.yaml",  # if you'd like to supply custom config file, please add it in container_training folder, and provide file name here
    "resume": "True",  # whether to re-use weights from pre-trained model
    "eval-only": "False",  # whether to perform only D2 model evaluation
    # opts are D2 model configuration as defined here: https://detectron2.readthedocs.io/modules/config.html#config-references
    # this is a way to override individual parameters in D2 configuration from Sagemaker API
    "opts": "SOLVER.MAX_ITER 20000",
}

# Same estimator as the on-demand run, plus the spot settings and the checkpoint location.
d2 = sagemaker.estimator.Estimator(image,
                                   role=role,
                                   train_instance_count=2,
                                   train_instance_type='ml.p3.8xlarge',
                                   train_volume_size=100,
                                   output_path="s3://{}/{}".format(bucket, prefix_output),
                                   metric_definitions=metric_definitions,
                                   hyperparameters=hyperparameters,
                                   sagemaker_session=sess,
                                   train_use_spot_instances=train_use_spot_instances,
                                   train_max_run=train_max_run,
                                   train_max_wait=train_max_wait,
                                   checkpoint_s3_uri=checkpoint_s3_uri)

# Start the job without blocking and attach it to the experiment trial as "Training".
# NOTE(review): the job name is fixed, so re-running this cell reuses the same name.
d2.fit({'training': f"s3://{bucket}/coco"},
       job_name="2-nodes-max-iter-20000-spot1",
       wait=False,
       experiment_config={
           "TrialName": d2_trial.trial_name,
           "TrialComponentDisplayName": "Training",
       })

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [30]:
# Select the trial components whose display name is "Training" (the name given to the
# training jobs through `experiment_config`).
search_expression = {
    "Filters": [
        {
            "Name": "DisplayName",
            "Operator": "Equals",
            "Value": "Training",
        }
    ],
}

# The previous sort key, metric names and parameter names (test:accuracy, hidden_channels,
# epochs, dropout, optimizer) are not produced by these training jobs. Use the metrics declared
# in `metric_definitions` and the hyperparameter names actually sent to the jobs, and rank
# by lowest total loss.
# NOTE(review): sort key follows the "metrics.<name>.<stat>" form of the previous value -- confirm "min" is accepted.
trial_component_analytics = ExperimentAnalytics(
    sagemaker_session=Session(b3sess, sm),
    experiment_name=d2_experiment.experiment_name,
    search_expression=search_expression,
    sort_by="metrics.total_loss.min",
    sort_order="Ascending",
    metric_names=[definition["Name"] for definition in metric_definitions],
    parameter_names=list(hyperparameters),
)


In [31]:
# Display the analytics result for the selected trial components (presumably a pandas DataFrame -- TODO confirm).
trial_component_analytics.dataframe()

In [18]:
# NOTE(review): mlio-py is not imported anywhere in the visible part of this notebook and the
# install is unpinned -- confirm it is still needed.
!conda install -c mlio -c conda-forge mlio-py -y

Collecting package metadata (current_repodata.json): done
Solving environment: / 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:
- 
  - defaults/linux-64::python-language-server==0.31.7=py36_0
  - defaults/noarch::numpydoc==0.9.2=py_0
  - defaults/noarch::sphinx==2.4.0=py_0
  - defaults/linux-64::spyder==4.0.1=py36_0
  - fastai/noarch::fastprogress==1.0.0=pyh39e3cac_0
  - defaults/noarch::s3fs==0.4.2=py_0
  - fastai/noarch::fastai==1.0.61=1
done


  current version: 4.8.3
  latest version: 4.8.5

Please update conda by running

    $ conda update -n base conda



## Package Plan ##

  environment location: /home/ec2-user/anaconda3/envs/pytorch_p36

  added / updated specs:
    - mlio-py


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    astroid-2.4.2              |   py36h9f0ad1d_1         297 KB  conda-forge
