In [None]:
import sagemaker
from sagemaker import get_execution_role
from datetime import datetime
import os

# IAM role and SageMaker session used by every later cell.
role = get_execution_role()
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Timestamped run prefix (lego-YYMMDD-HHMM); reused below for the S3 key
# prefixes and as the hyperparameter tuning job name.
prefix = f"lego-{datetime.today():%y%m%d-%H%M}"

In [None]:
import boto3
import json
ssm = boto3.client('ssm')

# Kaggle credentials are stored in the SSM parameter named 'kaggleAPI'.
# expected format: {"username":"xx","key":"xxx"}
# WithDecryption=True is required when the parameter is a SecureString (the
# appropriate type for an API key); it is ignored for plain String parameters.
kaggle_parameter = ssm.get_parameter(
    Name='kaggleAPI',
    WithDecryption=True
)
kaggleAPI = json.loads(kaggle_parameter["Parameter"]["Value"])

In [None]:
!pip -q install kaggle

In [None]:
from os import environ
environ["KAGGLE_USERNAME"] = kaggleAPI["username"]
environ["KAGGLE_KEY"] = kaggleAPI["key"]

![ -z "lego-brick-images.zip" ] && rm lego-brick-images.zip

!kaggle datasets download --force joosthazelzet/lego-brick-images
!unzip -oq lego-brick-images.zip

## Pre-processing Training Data
Replace the transparent background of each training image with Gaussian noise.

In [None]:
from matplotlib import pyplot as plt
from math import ceil
from PIL import Image
import numpy as np
import os
from random import randrange

def replace_background(fg):
    """Composite an image over a freshly generated Gaussian-noise background.

    Args:
        fg: PIL image to use as the foreground. It is converted to RGBA so
            that its alpha channel can serve as the paste mask.

    Returns:
        A new RGBA PIL image with the same size as ``fg``; wherever ``fg`` is
        transparent the grey-scale noise shows through.
    """
    # The alpha band is used as the paste mask below, so make sure one exists.
    fg = fg.convert('RGBA')
    w, h = fg.size

    # NumPy arrays are shaped (rows, cols) == (height, width), whereas PIL
    # sizes are (width, height); using (w, h) here transposed the background
    # for non-square images.
    noise = np.random.normal(255. / 2, 255. / 10, (h, w))
    # Clamp to the 8-bit range and build an explicit grey-scale image instead
    # of relying on an implicit float-image conversion during paste().
    bg = Image.fromarray(np.clip(noise, 0, 255).astype(np.uint8))

    with_gaussian_background = Image.new('RGBA', (w, h), (0, 0, 0, 0))

    # Background and foreground both have the canvas size, so both are pasted
    # at the origin; the foreground's own alpha decides where noise remains.
    with_gaussian_background.paste(bg, (0, 0))
    with_gaussian_background.paste(fg, (0, 0), mask=fg)

    return with_gaussian_background

def loop_original_images(folder):
    """Replace the background of every training image of one class.

    Each file in ``LEGO brick images/train/<folder>`` is passed through
    ``replace_background`` and saved as PNG, under the same file name, in
    ``processed/train/<folder>``. Progress is reported with ``print``.

    Args:
        folder: Name of the class sub-folder to process.
    """
    source_folder = "LEGO brick images/train/{}".format(folder)
    dest_folder = "processed/train/{}".format(folder)

    # os.makedirs instead of a `!mkdir` shell escape: no shell quoting issues
    # with unusual folder names, and the function no longer needs IPython.
    os.makedirs(dest_folder, exist_ok=True)
    print("processing images from folder: {}".format(dest_folder))

    count = 0
    for name in os.listdir(source_folder):
        # The context manager closes the source file even if processing fails.
        with Image.open("{}/{}".format(source_folder, name), 'r') as image:
            processed = replace_background(image)
        processed.save(dest_folder + "/" + name, format="png")
        count = count + 1

    print("Images processed:{}".format(count))

In [None]:
# Pre-process the two brick classes used in this experiment.
for class_folder in ("3003 Brick 2x2", "3005 Brick 1x1"):
    loop_original_images(class_folder)

In [None]:
# Show random original/processed pairs side by side as a visual sanity check.
pre = 'LEGO brick images/train/3003 Brick 2x2/'
post = 'processed/train/3003 Brick 2x2/'

# Sample from the files that were actually written instead of assuming that
# 0001.png ... 0399.png exist.
processed_names = sorted(os.listdir(post))
n_samples = min(6, len(processed_names))
n_cols = 4  # two old/new pairs per row
# Only as many rows as the pairs need, so the figure is not left half empty.
n_rows = max(1, ceil(2 * n_samples / n_cols))

plt.figure(figsize=(15, 15))
for i in range(n_samples):
    image_name = processed_names[randrange(len(processed_names))]
    for offset, (label, folder) in enumerate((("old", pre), ("new", post))):
        plt.subplot(n_rows, n_cols, 2 * i + 1 + offset)
        plt.xticks([])
        plt.yticks([])
        # os.path.join avoids the doubled slash of `folder + "/" + name`.
        plt.imshow(Image.open(os.path.join(folder, image_name)))
        plt.xlabel("{}:{}".format(label, image_name))
plt.show()

In [None]:
!mkdir -p "processed/valid/3003 Brick 2x2/"
!cp -R "LEGO brick images/valid/3003 Brick 2x2/" "processed/valid/3003 Brick 2x2/"
!mkdir -p "processed/valid/3005 Brick 1x1/"
!cp -R "LEGO brick images/valid/3005 Brick 1x1/" "processed/valid/3005 Brick 1x1/"

In [None]:
!wget 'https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/im2rec.py'

In [None]:
# Root folders of the pre-processed images; each class is one sub-folder.
training_image_folder = "./processed/train"
validation_image_folder = "./processed/valid"

# generate .rec database
# im2rec is run twice per dataset: the invocation with --list is expected to
# write the image listing, and the one without it to pack the listed images
# into the RecordIO file (lego_train.rec / lego_test.rec are uploaded later).
# NOTE(review): --pack-label is documented for multi-dimensional labels -
# confirm it is needed for this single-label setup.
!python im2rec.py lego_train "$training_image_folder" --list --recursive --pass-through --pack-label 
!python im2rec.py lego_train "$training_image_folder" --recursive --pass-through --pack-label 
!python im2rec.py lego_test "$validation_image_folder" --list --recursive --pass-through --pack-label 
!python im2rec.py lego_test "$validation_image_folder" --recursive --pass-through --pack-label 

In [None]:
# Each channel gets its own key prefix below the run prefix.
train_channel = f'{prefix}/train'
validation_channel = f'{prefix}/validation'

# Upload the packed RecordIO files, one per channel.
for rec_file, channel in (('lego_train.rec', train_channel),
                          ('lego_test.rec', validation_channel)):
    sess.upload_data(path=rec_file, bucket=bucket, key_prefix=channel)

# S3 locations of the two channels and of the training output.
s3_train_data = f's3://{bucket}/{train_channel}'
s3_validation_data = f's3://{bucket}/{validation_channel}'
s3_output_location = f's3://{bucket}/{prefix}/output'

# Both channels are described with identical input settings.
channel_options = dict(
    distribution='FullyReplicated',
    content_type='application/x-recordio',
    s3_data_type='S3Prefix',
)
train_data = sagemaker.session.s3_input(s3_train_data, **channel_options)
validation_data = sagemaker.session.s3_input(s3_validation_data, **channel_options)

In [None]:
# Derive the dataset statistics that are passed to the algorithm as hyperparameters.
# Only visible sub-directories count as classes, so stray entries such as
# .ipynb_checkpoints or .DS_Store cannot inflate num_classes.
class_folders = sorted(
    entry for entry in os.listdir(training_image_folder)
    if not entry.startswith('.')
    and os.path.isdir(os.path.join(training_image_folder, entry))
)
num_classes = len(class_folders)

# Count only image files inside the class folders.
# NOTE(review): the extension list is meant to mirror im2rec's default --exts - confirm.
image_extensions = ('.jpeg', '.jpg', '.png')
num_training_samples = sum(
    1
    for class_folder in class_folders
    for _, _, files in os.walk(os.path.join(training_image_folder, class_folder))
    for file_name in files
    if file_name.lower().endswith(image_extensions)
)

print("num_classes:{}".format(num_classes))
print("num_training_samples:{}".format(num_training_samples))

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner 
from datetime import date

# Container image of the built-in image-classification algorithm for the
# session's region.
training_image = get_image_uri(
    sess.boto_region_name,
    'image-classification',
    repo_version="latest",
)

# Channel name -> input description, as handed to fit().
data_channels = dict(train=train_data, validation=validation_data)

In [None]:
def hyperparameter_tuner():
    """Run a random-search hyperparameter tuning job for the image classifier.

    Tunes learning rate, mini-batch size and optimizer to maximize
    ``validation:accuracy`` and blocks until the tuning job has finished.
    Relies on the notebook globals ``training_image``, ``role``, ``sess``,
    ``s3_output_location``, ``data_channels``, ``num_classes``,
    ``num_training_samples`` and ``prefix``.

    Returns:
        The result of ``HyperparameterTuner.best_training_job()`` for the
        finished tuning job.
    """
    from sagemaker import HyperparameterTuningJobAnalytics

    mini_batch_size_min = 16
    mini_batch_size_max = 64
    learning_rate_min = "0.0001"
    learning_rate_max = "1.0"
    optimizers = ['sgd', 'adam', 'rmsprop', 'nag']

    # maximum number of training jobs
    hpo_max_number_of_training_jobs = 50
    # maximum number of parallel training jobs
    hpo_max_number_of_parallel_jobs = 2
    hpo_objective_metric_name = 'validation:accuracy'

    hpo_hyperparameter_ranges = {
        'learning_rate': ContinuousParameter(learning_rate_min, learning_rate_max),
        'mini_batch_size': IntegerParameter(mini_batch_size_min, mini_batch_size_max),
        'optimizer': CategoricalParameter(optimizers)
    }
    ic = sagemaker.estimator.Estimator(
        training_image,
        role,
        train_instance_count=1,
        train_instance_type='ml.p3.8xlarge',
        input_mode='File',
        output_path=s3_output_location,
        sagemaker_session=sess
    )

    layers = 34  # [18, 34, 50, 101, 152, 200, 20, 32, 44, 56, 110]
    epochs = 1000

    # mini_batch_size and learning_rate are deliberately not fixed here:
    # they are supplied per trial by the tuner.
    ic.set_hyperparameters(
        num_layers=layers,
        num_classes=num_classes,
        num_training_samples=num_training_samples,
        image_shape="3,200,200",
        epochs=epochs,
        top_k=5,
        precision_dtype='float32',
        use_pretrained_model=0
    )

    tuner_es = HyperparameterTuner(ic,
                                   hpo_objective_metric_name,
                                   hpo_hyperparameter_ranges,
                                   objective_type='Maximize',
                                   max_jobs=hpo_max_number_of_training_jobs,
                                   max_parallel_jobs=hpo_max_number_of_parallel_jobs,
                                   early_stopping_type='Auto',
                                   strategy="Random"
                                   )

    # The run prefix doubles as the tuning job name. It is bound to a local
    # because the code below refers to `job_name`, which was previously never
    # defined and raised NameError right after the job had been submitted.
    job_name = prefix

    # run the hyperparameter tuning job
    tuner_es.fit(data_channels, job_name=job_name, include_cls_metadata=False)

    print('Hyperparameter Tuning job name: {}'.format(job_name))
    tuner_es.wait()

    # Show the five best trials; inside a function a bare expression is not
    # displayed, so the ranking is printed explicitly.
    tuner_metrics_es = HyperparameterTuningJobAnalytics(job_name)
    top_trials = tuner_metrics_es.dataframe().sort_values(['FinalObjectiveValue'], ascending=False).head(5)
    print(top_trials)

    best_training_job_name = tuner_es.best_training_job()
    return best_training_job_name

In [None]:
def single_hyperparameter_training():
    """Train the image classifier once with a fixed set of hyperparameters.

    Uses the notebook globals ``training_image``, ``role``, ``sess``,
    ``s3_output_location``, ``data_channels``, ``num_classes`` and
    ``num_training_samples``. Blocks, streaming logs, until the job ends.

    Returns:
        The name of the training job that was run.
    """
    estimator = sagemaker.estimator.Estimator(
        training_image,
        role,
        train_instance_count=1,
        train_instance_type='ml.p3.8xlarge',
        input_mode='File',
        output_path=s3_output_location,
        sagemaker_session=sess,
    )

    # num_layers choices: [18, 34, 50, 101, 152, 200, 20, 32, 44, 56, 110]
    fixed_hyperparameters = {
        'num_layers': 34,
        'num_classes': num_classes,
        'num_training_samples': num_training_samples,
        'image_shape': "3,200,200",
        'mini_batch_size': 64,
        'epochs': 100,
        'learning_rate': 0.0005,
        'top_k': 5,
        'precision_dtype': 'float32',
        'use_pretrained_model': 0,
    }
    estimator.set_hyperparameters(**fixed_hyperparameters)

    estimator.fit(inputs=data_channels, logs=True, wait=True)
    return estimator.latest_training_job.name

In [None]:
# Run the fixed-hyperparameter training and remember the job name for deployment.
training_job_name = single_hyperparameter_training()

In [None]:
# `ic` exists only inside the training functions, so referencing it here raised
# NameError on a fresh kernel. Build the analytics object from the job name
# returned by the training cell instead.
metrics = sagemaker.analytics.TrainingJobAnalytics(training_job_name, sagemaker_session=sess)

In [None]:
# Create an endpoint from the finished training job, served by a single
# ml.m4.xlarge instance and using the training container image as the
# deployment image. The call's return value is kept as `endpoint_name` and
# is what the predictor is constructed from.
endpoint_name = sess.endpoint_from_job(
    job_name=training_job_name,
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    deployment_image=training_image,
    role=role
)

In [None]:
# Alternative kept for reference (disabled): attach to the best training job
# found by the tuner and deploy it directly.
# attached_estimator = sagemaker.estimator.Estimator.attach(best_training_job_name)
# attached_estimator.deploy(initial_instance_count = 1,
#                           instance_type = 'ml.c5.4xlarge')

# Client used by the inference cell to call the endpoint created above.
predictor = sagemaker.predictor.RealTimePredictor(endpoint_name)

In [None]:
!mkdir test/
!unzip -o data/lego_photos.zip -d test/

In [None]:
import os
import json
import numpy as np
from PIL import Image
import io

# Send every photo in test/ to the endpoint and print the predicted class index.

# The content type is identical for all requests, so set it once up front.
predictor.content_type = 'application/x-image'

# Skip sub-directories (the archive may contain some) and use a stable order.
names = sorted(f for f in os.listdir('test') if os.path.isfile(os.path.join('test', f)))
for name in names:
    print("image: {}".format(name))
    with Image.open('test/' + name) as image:
        # thumbnail() only ever shrinks and keeps the aspect ratio, so calling
        # it unconditionally also handles tall images whose height - not
        # width - exceeds 200 px, which the former `w > 200` check missed.
        image.thumbnail((200, 200))
        imgByteArr = io.BytesIO()
        image.save(imgByteArr, format='PNG')
    payload = imgByteArr.getvalue()
    result = json.loads(predictor.predict(payload))
    # Position of the highest value in the decoded response.
    index = np.argmax(result)
    print("predicted index: {}".format(index))