### Training a Keras CNN on Fashion-MNIST

Fashion-MNIST is a Zalando dataset consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes. It's a drop-in replacement for MNIST.

https://github.com/zalandoresearch/fashion-mnist/

In this notebook, we'll train a simple CNN built with Keras, using the built-in TensorFlow and Apache MXNet containers provided by Amazon SageMaker.

In [None]:
from IPython.display import Image
# Show the sample sprite; the relative path is resolved from the kernel's working directory.
Image("fashion-mnist-sprite.png")

In [None]:
import sagemaker

# Session object used later in this notebook for the S3 uploads.
sess = sagemaker.Session()
# Execution role passed to the estimators below as `role=`.
role = sagemaker.get_execution_role()

## Download the Fashion-MNIST dataset

In [None]:
import os
import keras
import numpy as np
from keras.datasets import fashion_mnist

# Fetch both splits through Keras as (images, labels) pairs.
(x_train, y_train), (x_val, y_val) = fashion_mnist.load_data()

# Persist each split as ./data/<split>.npz under the 'image' and 'label' keys.
data_dir = "./data"
os.makedirs(data_dir, exist_ok=True)
for split_name, split_images, split_labels in (
    ("training", x_train, y_train),
    ("validation", x_val, y_val),
):
    np.savez(os.path.join(data_dir, split_name), image=split_images, label=split_labels)

## Upload Fashion-MNIST data to S3

In [None]:
# Common key prefix; each split is uploaded under its own sub-prefix.
prefix = 'keras-fashion-mnist'

# Keep the values returned by upload_data: they are passed to fit() as channel inputs.
training_input_path = sess.upload_data(
    'data/training.npz',
    key_prefix='{}/training'.format(prefix),
)
validation_input_path = sess.upload_data(
    'data/validation.npz',
    key_prefix='{}/validation'.format(prefix),
)

for uploaded_path in (training_input_path, validation_input_path):
    print(uploaded_path)

## Train outside of SageMaker (just like on your laptop)

In [None]:
# Set the SM_* environment variables by hand before running the script directly
# (presumably the ones mnist_keras_tf.py reads for its defaults - confirm in the script).
# 1 GPU on this machine
%env SM_NUM_GPUS=1
# Where to save the model
%env SM_MODEL_DIR=/tmp/model
# Where the training data is (the local data/ directory written earlier)
%env SM_CHANNEL_TRAINING=data
# Where the validation data is (same local directory; it holds both .npz files)
%env SM_CHANNEL_VALIDATION=data

# Run the training script as a plain Python process for a single epoch.
!python mnist_keras_tf.py --epochs 1

## Train on the notebook instance (aka 'local mode')

In [None]:
from sagemaker.tensorflow import TensorFlow

# Estimator for the 'local mode' run: 'local_gpu' instance type, one epoch only.
tf_estimator = TensorFlow(
    entry_point='mnist_keras_tf.py',
    role=role,
    framework_version='1.12',
    py_version='py3',
    script_mode=True,
    train_instance_type='local_gpu',
    train_instance_count=1,
    hyperparameters={'epochs': 1},
)

In [None]:
# Map each input channel name to the location uploaded earlier, then start training.
training_channels = {
    'training': training_input_path,
    'validation': validation_input_path,
}
tf_estimator.fit(training_channels)

## Configure the training job on a GPU instance

In [None]:
# Same training script, now configured for a single ml.p3.8xlarge instance.
# No hyperparameters are set here; the tuner configured below supplies them.
tf_estimator = TensorFlow(
    entry_point='mnist_keras_tf.py',
    role=role,
    framework_version='1.12',
    py_version='py3',
    script_mode=True,
    train_instance_type='ml.p3.8xlarge',
    train_instance_count=1,
)

So we could now do:

**tf_estimator.fit(...)**

**tf_estimator.deploy(...)**

But let's tune the model hyperparameters first!

## Configure Automatic Model Tuning

In [None]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# Search space, keyed by the hyperparameter name passed to the training script.
hyperparameter_ranges = {
    'epochs': IntegerParameter(10, 100),
    'learning-rate': ContinuousParameter(0.001, 0.1, scaling_type='Logarithmic'),
    'batch-size': IntegerParameter(256, 1024),
    'dense-layer': IntegerParameter(128, 1024),
    'dropout': ContinuousParameter(0.1, 0.5),
}

# Objective: maximize 'val_acc', extracted from the training logs with the regex below.
objective_metric_name = 'val_acc'
objective_type = 'Maximize'
metric_definitions = [
    {'Name': objective_metric_name, 'Regex': 'val_acc: ([0-9\\.]+)'},
]

# 20 training jobs in total, at most 2 running at the same time.
tuner = HyperparameterTuner(
    tf_estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=20,
    max_parallel_jobs=2,
)

In [None]:
# Launch the tuning job on the same two input channels used for training.
tuning_channels = {
    'training': training_input_path,
    'validation': validation_input_path,
}
tuner.fit(tuning_channels)

## Deploy the best model

In [None]:
import time

# A UTC timestamp suffix distinguishes endpoints created at different times.
deploy_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
tf_endpoint_name = 'keras-tf-fmnist-' + deploy_timestamp

# GPU-backed alternative, kept for the price comparison:
#tf_predictor = tf_estimator.deploy(initial_instance_count=1,
#                                   instance_type='ml.p2.xlarge')      # $1.361/hour in eu-west-1

# Deploy through the tuner onto a CPU instance with an attached accelerator.
tf_predictor = tuner.deploy(
    endpoint_name=tf_endpoint_name,
    initial_instance_count=1,
    instance_type='ml.c5.large',        # $0.134/hour in eu-west-1
    accelerator_type='ml.eia1.medium',  # $0.140/hour in eu-west-1
)

With Elastic Inference, we get comparable performance at an **80% discount**.

## Predict 

In [None]:
%matplotlib inline
import random
import matplotlib.pyplot as plt

# Take random samples from the validation dataset
num_samples = 10
indices = random.sample(range(x_val.shape[0] - 1), num_samples)
images = x_val[indices]/255
labels = y_val[indices]

# Display them
for i in range(num_samples):
    plt.subplot(1,num_samples,i+1)
    plt.imshow(images[i].reshape(28, 28), cmap='gray')
    plt.title(labels[i])
    plt.axis('off')

# Predict their most likely class
prediction = tf_predictor.predict(images.reshape(num_samples, 28, 28, 1))['predictions']
prediction = np.array(prediction)
predicted_label = prediction.argmax(axis=1)
print('Predicted labels are: {}'.format(predicted_label))

## Clean up

In [None]:
# Uncomment to delete the endpoint named by tf_endpoint_name once you are done with it.
#sess.delete_endpoint(endpoint_name=tf_endpoint_name)

# Now... how about deploying that model to AWS Fargate?

In [None]:
# Uses the Image helper imported from IPython.display in the first code cell.
Image("hobbs-and-shaw-slice.jpg")

## Find best training job

In [None]:
import boto3

sagemaker = boto3.Session().client(service_name='sagemaker') 

In [None]:
tuning_job_name = tuner.latest_tuning_job.job_name
tuning_job_result = sagemaker.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)
best_training_job_name = tuning_job_result['BestTrainingJob']['TrainingJobName']
print(best_training_job_name)

## Get model artefact and push it to Git repository

In [None]:
best_training_job = sagemaker.describe_training_job(TrainingJobName=best_training_job_name)
best_model = best_training_job['ModelArtifacts']['S3ModelArtifacts']
print(best_model)

%env best_model {best_model}

In [None]:
%%sh
# Stop at the first failure so that a failed download never leads to unpacking
# a stale model.tar.gz left over from an earlier run.
set -e
# Quote the S3 URI so it reaches the CLI as a single argument.
aws s3 cp "${best_model}" .
# Unpack the archive into the test-models working copy.
tar xvfz model.tar.gz -C test-models

In [None]:
%%sh
# Bail out if the directory is missing: otherwise the git commands below would
# run in, and push from, whatever directory this notebook is executed in.
cd test-models || exit 1
git add model
git commit -m 'New model'
git push

## Create cluster

In [None]:
%%sh

# One name shared by the ECS cluster and the ecs-cli configuration.
CLUSTER_NAME=fargate-demo

aws ecs create-cluster --cluster-name "$CLUSTER_NAME"
ecs-cli configure --region eu-west-1 --cluster "$CLUSTER_NAME"

In [None]:
%%sh

# List tasks whose desired status is RUNNING (none are started by this notebook yet).
ecs-cli ps --desired-status RUNNING

## Run inference task

In [None]:
%%sh

# Network settings for the task. The default IDs are specific to one AWS account
# and VPC; set SECURITY_GROUP_ID / SUBNET_ID in the environment (for example with
# %env in an earlier cell) to override them without editing this cell.
export SECURITY_GROUP_ID="${SECURITY_GROUP_ID:-sg-0010f9778dc2e6fb2}" # SSH access + Tensorflow Serving ports
export SUBNET_ID="${SUBNET_ID:-subnet-cbf5bdbc}"

# One-time registration of the task definition, left disabled:
# aws ecs register-task-definition --cli-input-json file://inference-fargate-tf112-sagemaker.json

# Start one Fargate task from revision 8 of the task definition, with a public IP.
aws ecs run-task --cluster fargate-demo --task-definition inference-fargate-tf:8 --count 1 \
    --launch-type FARGATE \
    --network-configuration "awsvpcConfiguration={subnets=[$SUBNET_ID], \
                            securityGroups=[$SECURITY_GROUP_ID], \
                            assignPublicIp=ENABLED}"

In [None]:
%%sh

# List RUNNING tasks again, this time after the run-task call above.
ecs-cli ps --desired-status RUNNING

## Predict

In [None]:
# Address of the inference task. NOTE(review): hard-coded; presumably copied from
# the `ecs-cli ps` listing and in need of updating for every new task - confirm.
inference_task_ip = '34.243.74.153'
# Predict endpoint on port 8501 of that host.
inference_url = 'http://{}:8501/v1/models/1:predict'.format(inference_task_ip)

In [None]:
# Draw a fresh random sample from the validation set. range(n) already stops at
# n - 1, so the full range is used; subtracting 1 from the bound would make the
# last validation image impossible to draw.
num_samples = 5
indices = random.sample(range(x_val.shape[0]), num_samples)
# Scale pixel values by 1/255 (assumes 8-bit pixel data - TODO confirm dtype).
images = x_val[indices]/255
labels = y_val[indices]

# Add a trailing channel axis: (num_samples, 28, 28, 1).
data = images.reshape(num_samples, 28, 28, 1)

In [None]:
import json, requests

headers = {"content-type": "application/json"}
# Serialize into a separate name: rebinding `data` to the JSON string would make
# this cell fail when run a second time, because a str has no .tolist().
payload = json.dumps({"signature_name": "serving_default", "instances": data.tolist()})

# Bound the wait, and surface HTTP errors here rather than as a KeyError on
# 'predictions' further down.
json_response = requests.post(inference_url, data=payload, headers=headers, timeout=30)
json_response.raise_for_status()

# One row of scores per image; argmax over axis 1 picks the highest-scoring class.
predictions = json.loads(json_response.text)['predictions']
print(predictions)
predictions = np.array(predictions).argmax(axis=1)

print("Labels     : ", labels)
print("Predictions: ", predictions)