In [1]:
#!/usr/bin/python
import os 
import sagemaker
import subprocess


# Define instance configurations 
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
account = sess.boto_session.client('sts').get_caller_identity()['Account']
region = sess.boto_session.region_name

repo_name = 'ocr' # ECR repository
image_tag = 'prj_scsk' # ECR image tag
base_job_name = 'scsk-attention-ocr' # SageMaker training prefix
# dockerfile = os.path.abspath('./new_dockerfile')

%env dockerfile Dockerfile
%env account {account}
%env region {region}
%env repo_name {repo_name}
%env image_tag {image_tag}

# print("Account: {0}".format(account))
# print("Region: {0}".format(region))
# print("Repo name: {0}".format(repo_name))
# print("Image tag: {0}".format(image_tag))
# print("Base job name: {0}".format(base_job_name))
# print("Docker file: {0}".format(dockerfile))

env: dockerfile=Dockerfile
env: account=533155507761
env: region=us-west-2
env: repo_name=ocr
env: image_tag=prj_scsk


In [2]:
%%sh
# Make sure the ECR repository exists, then log the local Docker client in to ECR.
# Reads $repo_name, $region and $account from the environment (exported with %env
# in the setup cell).

# Create the repository only when the lookup fails. The region is passed explicitly
# so the repository lives in the same region that the login and push use.
if ! aws ecr describe-repositories --region "$region" --repository-names "$repo_name" > /dev/null 2>&1
then
    aws ecr create-repository --region "$region" --repository-name "$repo_name" > /dev/null
fi

# Preferred login: `get-login-password` feeds the token to docker over stdin, so the
# password never appears on a command line. `get-login` no longer exists in AWS CLI v2.
ecr_password=$(aws ecr get-login-password --region "$region" 2> /dev/null)
if [ -n "$ecr_password" ]
then
    printf '%s' "$ecr_password" | docker login --username AWS --password-stdin "${account}.dkr.ecr.${region}.amazonaws.com"
else
    # Fallback for older AWS CLI v1 releases that only ship `get-login`:
    # the command prints a complete `docker login ...` line, which is executed here.
    $(aws ecr get-login --region "$region" --no-include-email)
fi

Login Succeeded


https://docs.docker.com/engine/reference/commandline/login/#credentials-store



In [3]:
# # Build docker and push to ionstance
# subprocess.run("docker build -t {0} -f {1} . ".format(image_tag, dockerfile), shell=True)
# subprocess.run("docker tag {0} {1}.dkr.ecr.{2}.amazonaws.com/{3}:latest".format(image_tag, account, region, repo_name), shell=True)
# subprocess.run("docker push {0}.dkr.ecr.{1}.amazonaws.com/{2}:latest".format(account, region, repo_name), shell=True)

!docker build -t $image_tag -f $dockerfile .
!docker tag $image_tag $account.dkr.ecr.$region.amazonaws.com/$repo_name:latest
!docker images
!docker push $account.dkr.ecr.$region.amazonaws.com/$repo_name:latest

Sending build context to Docker daemon    896kB
Step 1/19 : FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
9.0-cudnn7-devel-ubuntu16.04: Pulling from nvidia/cuda

7927d38a: Already exists
ac894db4: Already exists
2af6d627: Already exists
86211d23: Already exists
10d14aae: Already exists
fee99264: Pulling fs layer
384ee2be: Pulling fs layer
dc8f51ef: Pulling fs layer
dec49181: Pulling fs layer
Digest: sha256:bb858657cc4b2571eec97596d556bd17e791e72dd7bb70fa61f11640fe0a2f84

In [4]:

# --- Training job configuration ---

# Instance type the training job runs on.
train_instance_type = 'ml.p3.2xlarge'
# train_instance_type = 'ml.p3.8xlarge'

# Disk space reserved for the input data, in GB.
storage_space = 200

# Upper bound on the training job's lifetime, expressed in seconds.
max_training_days = 3
seconds_per_day = 24 * 3600
train_max_run = max_training_days * seconds_per_day

# Training image: the `latest` tag of the ECR repository in this account/region.
registry_fields = (account, region, repo_name)
image_name = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(*registry_fields)

# S3 prefix holding the training data, wrapped as a SageMaker input channel.
s3_directory = 's3://scsk-data/ocr_data/data'
train_input_channel = sagemaker.session.s3_input(
    s3_directory,
    s3_data_type='S3Prefix',
    distribution='FullyReplicated',
)

# S3 location where the model output is saved.
output_path = 's3://scsk-data/ocr_data/output/'


In [None]:

# Gather every estimator setting in one mapping, build the estimator from it,
# then start training with the S3 data attached as the 'train' channel.
estimator_settings = dict(
    sagemaker_session=sess,
    role=role,
    image_name=image_name,
    base_job_name=base_job_name,
    input_mode='File',
    train_instance_count=1,
    train_instance_type=train_instance_type,
    train_volume_size=storage_space,
    train_max_run=train_max_run,
    output_path=output_path,
)
estimator = sagemaker.estimator.Estimator(**estimator_settings)

training_channels = {'train': train_input_channel}
estimator.fit(training_channels)

2019-08-13 02:04:57 Starting - Starting the training job...
2019-08-13 02:05:10 Starting - Launching requested ML instances......
2019-08-13 02:06:16 Starting - Preparing the instances for training......
2019-08-13 02:07:12 Downloading - Downloading input data......
2019-08-13 02:08:10 Training - Downloading the training image......
2019-08-13 02:09:16 Training - Training image download completed. Training in progress.
[31mInstalling requirements...[0m
[31mCollecting tensorflow-gpu==1.12.0 (from -r requirements.txt (line 1))
  Downloading https://files.pythonhosted.org/packages/55/7e/bec4d62e9dc95e828922c6cec38acd9461af8abe749f7c9def25ec4b2fdb/tensorflow_gpu-1.12.0-cp36-cp36m-manylinux1_x86_64.whl (281.7MB)[0m
[31mCollecting keras==2.2.4 (from -r requirements.txt (line 2))
  Downloading https://files.pythonhosted.org/packages/5e/10/aa32dad071ce52b5502266b5c659451cfd6ffcbf14e6c8c4f16c0ff5aaab/Keras-2.2.4-py2.py3-none-any.whl (312kB)[0m
[31mCollecting numpy (from -r requirements.t