# Train Model

Author: k.mcnamara
Date: 12/11/2020

This Notebook has cells for training, validating and deploying a model.
Calls scripts from model_training.

## Import depdendencies
kernel = conda_tensorflow2_p36

In [2]:
import shutil
import sagemaker
import os
import numpy as np
from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlowModel
from sagemaker.tensorflow import TensorFlow
from tensorflow.python.keras.preprocessing.image import load_img

In [3]:
sagemaker_session = sagemaker.Session()
role = get_execution_role()
region = sagemaker_session.boto_session.region_name

In [4]:
os.environ['SM_MODEL_DIR'] = "/opt/ml/model"
print(os.environ['SM_MODEL_DIR'])

/opt/ml/model


## S3 bucket URI for training and validation data

In [5]:
# Set S3 address to training data
bucket = "tickercardiology-echocv-sagemaker"
key = "model-data"
training_data_uri = "s3://{}/{}/train/".format(bucket, key)
# Set S3 address to validation data
validation_data_uri = "s3://{}/{}/test/".format(bucket, key)

## Display training script

In [None]:
!pygmentize 'keras_model_fn.py'

## Create TensorFlow2 estimator 

Training instance: ml.g4dn.xlarge
Base GPU enabled instance for training.

In [None]:
estimator2 = TensorFlow(entry_point='keras_model_fn.py',
                        role=role,
                        train_instance_count=1,
                        train_instance_type='ml.g4dn.xlarge', #$0.958
                        framework_version='2.1.0',
                        py_version='py36',
                        hyperparameters={
                                        'epochs': 15,
                                        'batch_size': 64,
                                        'learning_rate': 1e-5},
                        script_mode=True,
                        model_dir = os.environ['SM_MODEL_DIR'],
                        distributions={'parameter_server': {'enabled': True}})

bucket = "sagemaker-ap-southeast-2-611188727347"
key = "test1"
model_path = "s3://{}/{}/".format(bucket, key)

# Train Model
### model save path:

S3://{sagemaker-ap-southeast-2-611188727347}.{tensorflow-training-year-month-data-hour-minute-second-ms}/output/model.tar.gz

In [None]:
estimator2.fit({'training': training_data_uri, 'validation': validation_data_uri})

# Deploy trained model
## Load model from S3, if estimator trained in this notebook

instance: ml.m5.large

Increase instance size if analysing more data

In [None]:
predictor2 = estimator2.deploy(initial_instance_count=1, instance_type='ml.m5.large', endpoint_name='model1')

# OR
## Load model from S3, if estimator not trained in this notebook 

instance: ml.m5.large

Increase instance size if analysing more data

In [6]:
bucket = "sagemaker-ap-southeast-2-611188727347"
key = "tensorflow-training-2020-11-09-01-37-28-348"
model_path = "s3://{}/{}/output/model.tar.gz".format(bucket, key)

model = TensorFlowModel(model_data=model_path, 
                        role=role,
                        framework_version='2.1.0')

In [7]:
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m5.large')

update_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


-------------!

## Delete the endpoints to save resources

In [30]:
sagemaker.Session().delete_endpoint(predictor.endpoint)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [None]:
## Download the .pb and variables from S3
# !aws s3 cp s3://sagemaker-ap-southeast-2-611188727347/tensorflow-training-2020-11-06-03-49-07-502/model/1 temp/1 --recursive 
## Run this in terminal:    tar -C "$PWD" -czf model.tar.gz temp/
## Copy the tar.gz to the s3://kate//tensorflow-training-date/output/
# !aws s3 cp model.tar.gz s3://sagemaker-ap-southeast-2-611188727347/tensorflow-training-2020-11-06-03-49-07-502/output/ 