# Incremental Training

SageMaker three built-in algorithms currently support incremental training: Object Detection Algorithm, Image Classification Algorithm, and Semantic Segmentation Algorithm.

Note that SageMaker object detection algorithm currently only support the re-training feature with the same network, which means the new training job must have the same base_network and num_classes as the previous training job.

In [1]:
%%time
import sagemaker
from sagemaker import get_execution_role

role = get_execution_role()
print(role)
sess = sagemaker.Session()

arn:aws:iam::476271697919:role/app-sagemaker-execution-role
CPU times: user 673 ms, sys: 84 ms, total: 757 ms
Wall time: 923 ms


In [2]:
bucket = 'beyoung-sagemaker' # custom bucket name.
# bucket = sess.default_bucket()
prefix = 'coco-object-detection-20200422'

In [3]:
from sagemaker.amazon.amazon_estimator import get_image_uri

training_image = get_image_uri(sess.boto_region_name, 'object-detection', repo_version="latest")
print (training_image)

433757028032.dkr.ecr.us-west-2.amazonaws.com/object-detection:latest


## Dataset and Channel Setup

In [4]:
train_channel = prefix + '/train'
validation_channel = prefix + '/validation'
train_annotation_channel = prefix + '/train_annotation'
validation_annotation_channel = prefix + '/validation_annotation'

In [5]:
s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)
s3_train_annotation = 's3://{}/{}'.format(bucket, train_annotation_channel)
s3_validation_annotation = 's3://{}/{}'.format(bucket, validation_annotation_channel)
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

### Add Ground Truth new labeling data

In [10]:
import boto3

labeling_job = 'demo-workshop-20200417'

client = boto3.client('sagemaker')
response = client.describe_labeling_job(
    LabelingJobName=labeling_job
)

total_labeled = response['LabelCounters']['TotalLabeled']
output_uri = response['LabelingJobOutput']['OutputDatasetS3Uri']

print('labeled: {}\noutput_uri: {}'.format(total_labeled, output_uri))

labeled: 25
output_uri: s3://beyoung-sm-groundtruth/output/demo-workshop-20200417/manifests/output/output.manifest


In [30]:
## Copy mainfest files
fp_manifest = './output.manifest'
!aws s3 cp {output_uri} {fp_manifest}

Completed 12.7 KiB/12.7 KiB (248.3 KiB/s) with 1 file(s) remainingdownload: s3://beyoung-sm-groundtruth/output/demo-workshop-20200417/manifests/output/output.manifest to ./output.manifest


## Mapping the labeling classes to COCO dataset

In [15]:
LabelCategoryConfigS3Uri = response['LabelCategoryConfigS3Uri']
LabelCategoryConfigS3Uri

's3://beyoung-sm-groundtruth/output/demo-workshop-20200417/annotation-tool/data.json'

In [20]:
fp_gt = './gt_label_config.json'

!aws s3 cp {LabelCategoryConfigS3Uri} ./{fp_gt}
!cat {fp_gt}

download: s3://beyoung-sm-groundtruth/output/demo-workshop-20200417/annotation-tool/data.json to ./gt_label_config.json
{"document-version":"2018-11-28","labels":[{"label":"bear"},{"label":"dog"},{"label":"cat"},{"label":"bird"}]}

In [23]:
import json
gt_class = []
with open(fp_gt) as f:
    js = json.load(f)
    labels = js['labels']
    for i in labels:
        gt_class.append(i['label'])
gt_class

['bear', 'dog', 'cat', 'bird']

In [24]:
object_categories = ['person', 'bicycle', 'car',  'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 
                     'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
                     'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
                     'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
                     'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
                     'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
                     'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable',
                     'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
                     'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
                     'toothbrush']

In [28]:
object_categories.index('dog')

16

In [29]:
def get_mapper_fn(cls_id):
    return object_categories.index(gt_class[cls_id])

get_mapper_fn(1)

16

In [38]:
!head -n 2 {fp_manifest}

{"source-ref":"s3://beyoung-sm-groundtruth/raw_data/sm_workshop/bear-01.jpg","demo-workshop-20200417":{"annotations":[{"class_id":0,"width":1021,"top":132,"height":1301,"left":143},{"class_id":0,"width":520,"top":519,"height":909,"left":976}],"image_size":[{"width":1500,"depth":3,"height":1434}]},"demo-workshop-20200417-metadata":{"job-name":"labeling-job/demo-workshop-20200417","class-map":{"0":"bear"},"human-annotated":"yes","objects":[{"confidence":0.09},{"confidence":0.09}],"creation-date":"2020-04-20T03:47:36.100370","type":"groundtruth/object-detection"}}
{"source-ref":"s3://beyoung-sm-groundtruth/raw_data/sm_workshop/bear-02.jpg","demo-workshop-20200417":{"annotations":[{"class_id":0,"width":540,"top":96,"height":370,"left":82}],"image_size":[{"width":693,"depth":3,"height":600}]},"demo-workshop-20200417-metadata":{"job-name":"labeling-job/demo-workshop-20200417","class-map":{"0":"bear"},"human-annotated":"yes","objects":[{"confidence":0.09}],"creation-date":"2020-04-20T03:47:3

In [40]:
# transfer class map
# "class-map":{"0":"bear","1":"dog","2":"cat","3":"bird"}

new_class_map = {}
for cls_name in gt_class:
    coco_cls_id = str(object_categories.index(cls_name))
    new_class_map[coco_cls_id] = cls_name

json.dumps(new_class_map)


'{"21": "bear", "16": "dog", "15": "cat", "14": "bird"}'

In [44]:
# Mapping the manifest file
fp_coco_manifest = './output_coco_class.manifest'
labeling_job = 'demo-workshop-20200417'

data = []
with open(fp_manifest) as f:
    for line in f:
        js = json.loads(line)
        annotations = js[labeling_job]['annotations']
        for notation in annotations:
            cls_id = notation['class_id']
            notation['class_id'] = get_mapper_fn(cls_id)
        metadata = js[labeling_job+'-metadata']
        metadata['class-map'] = new_class_map
        data.append(js)

In [46]:
json.dumps(data[0])

'{"source-ref": "s3://beyoung-sm-groundtruth/raw_data/sm_workshop/bear-01.jpg", "demo-workshop-20200417": {"annotations": [{"class_id": 21, "width": 1021, "top": 132, "height": 1301, "left": 143}, {"class_id": 21, "width": 520, "top": 519, "height": 909, "left": 976}], "image_size": [{"width": 1500, "depth": 3, "height": 1434}]}, "demo-workshop-20200417-metadata": {"job-name": "labeling-job/demo-workshop-20200417", "class-map": {"21": "bear", "16": "dog", "15": "cat", "14": "bird"}, "human-annotated": "yes", "objects": [{"confidence": 0.09}, {"confidence": 0.09}], "creation-date": "2020-04-20T03:47:36.100370", "type": "groundtruth/object-detection"}}'

In [49]:
fp_coco_manifest = './output_coco_class.manifest'

with open(fp_coco_manifest, "w") as f:
    for line in data:
        print(json.dumps(line), file=f)

In [50]:
# make sure the file format is correct
!tail -n 2 {fp_coco_manifest}

{"source-ref": "s3://beyoung-sm-groundtruth/raw_data/sm_workshop/dog-06.jpg", "demo-workshop-20200417": {"annotations": [{"class_id": 16, "width": 541, "top": 53, "height": 335, "left": 362}, {"class_id": 16, "width": 216, "top": 38, "height": 335, "left": 148}], "image_size": [{"width": 910, "depth": 3, "height": 432}]}, "demo-workshop-20200417-metadata": {"job-name": "labeling-job/demo-workshop-20200417", "class-map": {"21": "bear", "16": "dog", "15": "cat", "14": "bird"}, "human-annotated": "yes", "objects": [{"confidence": 0.09}, {"confidence": 0.09}], "creation-date": "2020-04-20T03:46:31.217570", "type": "groundtruth/object-detection"}}
{"source-ref": "s3://beyoung-sm-groundtruth/raw_data/sm_workshop/dog-07.jpg", "demo-workshop-20200417": {"annotations": [{"class_id": 16, "width": 602, "top": 358, "height": 687, "left": 499}], "image_size": [{"width": 1694, "depth": 3, "height": 1979}]}, "demo-workshop-20200417-metadata": {"job-name": "labeling-job/demo-workshop-20200417", "clas

In [122]:
fp_coco_train_manifest = "./coco_train.manifest"
fp_coco_validation_manifest = "./coco_validation.manifest"

dataset_size = len(data)
train_test_split_index = round(dataset_size*0.8)

train_data = data[:train_test_split_index]
validation_data = data[train_test_split_index:]

num_training_samples = 0
with open(fp_coco_train_manifest, 'w') as f:
    for line in train_data:
        print(json.dumps(line), file=f)
        num_training_samples += 1
    
with open(fp_coco_validation_manifest, 'w') as f:
    for line in validation_data:
        print(json.dumps(line), file=f)

In [131]:
!head -n 2 {fp_coco_validation_manifest}

{"source-ref": "s3://beyoung-sm-groundtruth/raw_data/sm_workshop/dog-03.jpg", "demo-workshop-20200417": {"annotations": [{"class_id": 16, "width": 1924, "top": 382, "height": 2526, "left": 1055}], "image_size": [{"width": 4042, "depth": 3, "height": 2921}]}, "demo-workshop-20200417-metadata": {"job-name": "labeling-job/demo-workshop-20200417", "class-map": {"21": "bear", "16": "dog", "15": "cat", "14": "bird"}, "human-annotated": "yes", "objects": [{"confidence": 0.09}], "creation-date": "2020-04-20T03:42:12.992614", "type": "groundtruth/object-detection"}}
{"source-ref": "s3://beyoung-sm-groundtruth/raw_data/sm_workshop/dog-04.jpg", "demo-workshop-20200417": {"annotations": [{"class_id": 16, "width": 174, "top": 10, "height": 386, "left": 75}], "image_size": [{"width": 268, "depth": 3, "height": 400}]}, "demo-workshop-20200417-metadata": {"job-name": "labeling-job/demo-workshop-20200417", "class-map": {"21": "bear", "16": "dog", "15": "cat", "14": "bird"}, "human-annotated": "yes", "

In [123]:
### upload coco manifest file to s3
train_manifest_channel = 'train_manifest'
s3_train_manifest = 's3://{}/{}/{}'.format(bucket, train_manifest_channel, fp_coco_train_manifest[2:])

validation_manifest_channel = 'validation_manifest'
s3_validation_manifest = 's3://{}/{}/{}'.format(bucket, validation_manifest_channel, fp_coco_validation_manifest[2:])

!aws s3 cp {fp_coco_train_manifest} {s3_train_manifest}
!aws s3 cp {fp_coco_validation_manifest} {s3_validation_manifest}

upload: ./coco_train.manifest to s3://beyoung-sagemaker/train_manifest/coco_train.manifest
upload: ./coco_validation.manifest to s3://beyoung-sagemaker/validation_manifest/coco_validation.manifest


In [124]:
print(num_training_samples)

20


## Prepare Incremental Training

In [81]:
# Use the output model from the previous job.

job_name = 'object-detection-2020-04-22-09-25-36-709'
jb_respone = client.describe_training_job(TrainingJobName=job_name)
model_s3uri = jb_respone['ModelArtifacts']['S3ModelArtifacts']
model_s3uri

's3://beyoung-sagemaker/coco-object-detection-20200422/output/object-detection-2020-04-22-09-25-36-709/output/model.tar.gz'

In [146]:
model_data = sagemaker.session.s3_input(model_s3uri, input_mode='File', distribution='FullyReplicated', 
                             content_type='application/x-sagemaker-model', s3_data_type='S3Prefix')

## Configure manifest training

Ground Truth output format is as the following:

```
{"source-ref": "s3://bucket_name/path_to_a_dataset_object.jpeg", "labeling-job-name": {"annotations":[{"class_id":"0",<bounding box dimensions>}],"image_size":[{<image size simensions>}]}
```

Be sure to pay close attention to the AttributeNames parameter in the training job request. The strings you specifuy in this field must correspond to those that are present in your augmented manifest.

In this case, we would define *attribute_names = ["source-ref", "labeling-job-name"]*. And, the input_mode should be *Pipe*.

In [147]:
# Create a train data channel with S3_data_type as 'AugmentedManifestFile' and attribute names.
manifest_attributes = ["source-ref", labeling_job]

train_data = sagemaker.session.s3_input(s3_train_manifest,
                                        input_mode='Pipe',
                                        distribution='FullyReplicated',
                                        content_type='application/x-recordio',
                                        record_wrapping='RecordIO',
                                        s3_data_type='AugmentedManifestFile',
                                        attribute_names= manifest_attributes)

In [148]:
validation_data = sagemaker.session.s3_input(s3_validation_manifest,
                                        input_mode='Pipe',
                                        distribution='FullyReplicated',
                                        content_type='application/x-recordio',
                                        record_wrapping='RecordIO',
                                        s3_data_type='AugmentedManifestFile',
                                        attribute_names= manifest_attributes)

### Important

SageMaker did not allow train and validation in different modes. Both are 'Pipe' or 'File' modes. But you can not 'Pipe' and 'File' for train and validation channel.

In [112]:
# validation_data = sagemaker.session.s3_input(s3_validation_data, distribution='FullyReplicated', 
#                              content_type='image/jpeg', s3_data_type='S3Prefix', input_mode= 'File')
# validation_annotation = sagemaker.session.s3_input(s3_validation_annotation, distribution='FullyReplicated', 
#                              content_type='image/jpeg', s3_data_type='S3Prefix', input_mode= 'File')

In [149]:
# In addition to two data channels, add a 'model' channel for the training.
new_data_channels = {'train': train_data, 'validation': validation_data, 'model': model_data}

### Here is the old model hyper parameters

```
od_model.set_hyperparameters(base_network='resnet-50',
                             use_pretrained_model=1,
                             num_classes=80,
                             mini_batch_size=16,
                             epochs=30,
                             learning_rate=0.001,
                             lr_scheduler_step='10',
                             lr_scheduler_factor=0.1,
                             optimizer='sgd',
                             momentum=0.9,
                             weight_decay=0.0005,
                             overlap_threshold=0.5,
                             nms_threshold=0.45,
                             image_shape=512,
                             label_width=600,
                             num_training_samples=4452)
```

In [150]:
new_od_model = sagemaker.estimator.Estimator(training_image,
                                             role, 
                                             train_instance_count=1, 
                                             train_instance_type='ml.p3.2xlarge',
                                             train_volume_size = 50,
                                             train_max_run = 360000,
                                             input_mode= 'Pipe',
                                             output_path=s3_output_location,
                                             sagemaker_session=sess)

In [151]:
# Because our training dataset is small (here is train:20, test:5)
# if start from 32, it will get the error The number of input images must be bigger or equal to the mini_batch_size

new_od_model.set_hyperparameters(base_network='resnet-50',
                                 num_classes=80,
                                 mini_batch_size=4,
                                 epochs=10,
                                 learning_rate=0.001,
                                 optimizer='rmsprop',
                                 momentum=0.9,
                                 overlap_threshold=0.4,
                                 nms_threshold=0.3,
                                 image_shape=512,
                                 label_width=600,
                                 num_training_samples=num_training_samples)

## Model fit to start Incremental Training

In [152]:
new_od_model.fit(inputs=new_data_channels, wait=False)

In [153]:
job_name = new_od_model.latest_training_job.job_name
job_name

'object-detection-2020-04-23-11-00-11-319'

In [154]:
import time

response = client.describe_training_job(TrainingJobName=job_name)
status = response['TrainingJobStatus']

while status == 'InProgress':
    time.sleep(30)
    response = client.describe_training_job(TrainingJobName=job_name)
    status = response['TrainingJobStatus']
    print(status)

InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed
