In [1]:
!cat Dockerfile

FROM python:3.7-slim-buster

LABEL maintainer=cong

RUN apt-get -y update && rm -rf /var/lib/apt/lists/*

RUN pip install pandas && rm -rf /root/.cache
RUN mkdir /opt/program

ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE
ENV PATH="/opt/program:${PATH}"

COPY preprocessing.py /opt/program
WORKDIR /opt/program

In [7]:
%%sh

#!/usr/bin/env bash

# This script shows how to build the Docker image and push it to ECR to be ready for use
# by SageMaker.

# The argument to this script is the image name. This will be used as the image on the local
# machine and combined with the account and region to form the repository name for ECR.
image=sagemaker-preprocessing

if [ "$image" == "" ]
then
    echo "Usage: $0 <image-name>"
    exit 1
fi

chmod +x preprocessing.py

# Get the account number associated with the current IAM credentials
account=$(aws sts get-caller-identity --query Account --output text)

if [ $? -ne 0 ]
then
    exit 255
fi


# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}


fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:latest"

# If the repository doesn't exist in ECR, create it.

aws ecr describe-repositories --repository-names "${image}" > /dev/null 2>&1

if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name "${image}" > /dev/null
fi

# Get the login command from ECR and execute it directly
$(aws ecr get-login --region ${region} --no-include-email)

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build  -t ${image} .
docker tag ${image} ${fullname}

docker push ${fullname}

Login Succeeded

Step 1/10 : FROM python:3.7-slim-buster
 ---> d7ee20941226
Step 2/10 : LABEL maintainer=cong
 ---> Using cache
 ---> dffc6d3f6427
Step 3/10 : RUN apt-get -y update && rm -rf /var/lib/apt/lists/*
 ---> Using cache
 ---> 3796209ac6de
Step 4/10 : RUN pip install pandas && rm -rf /root/.cache
 ---> Using cache
 ---> fb04ab9aca5a
Step 5/10 : RUN mkdir /opt/program
 ---> Using cache
 ---> a413a5682731
Step 6/10 : ENV PYTHONUNBUFFERED=TRUE
 ---> Using cache
 ---> 163890f6ab60
Step 7/10 : ENV PYTHONDONTWRITEBYTECODE=TRUE
 ---> Using cache
 ---> 9bdfc6711de2
Step 8/10 : ENV PATH="/opt/program:${PATH}"
 ---> Using cache
 ---> b1a501d91e71
Step 9/10 : COPY preprocessing.py /opt/program
 ---> 66fd39a749d7
Step 10/10 : WORKDIR /opt/program
 ---> Running in cc08a04e551c
Removing intermediate container cc08a04e551c
 ---> 93f29c4121cf
Successfully built 93f29c4121cf
Successfully tagged sagemaker-preprocessing:latest
The push refers to repository [169274280855.dkr.ecr.us-east-2.amazon

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



### processing job

In [9]:
# S3 prefix
prefix = 'DEMO-processing-byo-memory'

# Define IAM role
import boto3

import pandas as pd
from sagemaker import get_execution_role

role = get_execution_role()
role

'arn:aws:iam::169274280855:role/SageMakerFullAccess'

In [10]:
import sagemaker as sage

sess = sage.Session()

In [11]:
WORK_DIRECTORY = 'data'

data_location = sess.upload_data(WORK_DIRECTORY, key_prefix=prefix)
data_location

's3://sagemaker-us-east-2-169274280855/DEMO-processing-byo-memory'

In [12]:
account = sess.boto_session.client('sts').get_caller_identity()['Account']
region = sess.boto_session.region_name
image = '{}.dkr.ecr.{}.amazonaws.com/sagemaker-preprocessing:latest'.format(account, region)
image

'169274280855.dkr.ecr.us-east-2.amazonaws.com/sagemaker-preprocessing:latest'

In [18]:
from sagemaker.processing import ScriptProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

script_processor = ScriptProcessor(image_uri=image,
                                    role=role,
                                    instance_count=1,
                                    command=["python3", "-v"],
                                    instance_type='ml.m5.xlarge')

In [20]:
script_processor.run(code='preprocessing.py',
                    inputs=[ProcessingInput(source='data/memory.dat',destination='/opt/ml/processing/input')],
                    outputs=[ProcessingOutput(source='/opt/ml/processing/output/train')])


Job Name:  sagemaker-preprocessing-2020-05-09-08-15-24-152
Inputs:  [{'InputName': 'input-1', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-169274280855/sagemaker-preprocessing-2020-05-09-08-15-24-152/input/input-1/memory.dat', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-169274280855/sagemaker-preprocessing-2020-05-09-08-15-24-152/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'S3Output': {'S3Uri': 's3://sagemaker-us-east-2-169274280855/sagemaker-preprocessing-2020-05-09-08-15-24-152/output/output-1', 'LocalPath': '/opt/ml/processing/output/train', 'S3UploadMode': 'EndOfJob'}}]
..................
[34mimport

In [21]:
region = boto3.session.Session().region_name

bucket = 'sagemaker-us-east-2-169274280855'
preprocessed_prefix = 'jc/aiops/data'
output_train_data = 's3://{}/{}/{}/'.format(bucket, preprocessed_prefix,'train')

In [26]:
%%time
#### all low level APIs
client = boto3.client('sagemaker')
_job_name = 'aiops-processing-job'
response = client.create_processing_job(
    ProcessingInputs=[
        {
            'InputName': 'string',
            'S3Input': {
                'S3Uri': data_location,
                'LocalPath': '/opt/ml/processing/input',
                'S3DataType': 'S3Prefix',
                'S3InputMode': 'File',
                'S3DataDistributionType': 'ShardedByS3Key',
                'S3CompressionType': 'None'
            }
        },
    ],
    ProcessingOutputConfig={
        'Outputs': [
            {
                'OutputName': 'train_data',
                'S3Output': {
                    'S3Uri': output_train_data,
                    'LocalPath': '/opt/ml/processing/output/train',
                    'S3UploadMode': 'EndOfJob'
                }
            }
        ]
    },
    ProcessingJobName=_job_name,
    ProcessingResources={
        'ClusterConfig': {
            'InstanceCount': 1,
            'InstanceType':'ml.c5.xlarge',
            'VolumeSizeInGB': 10
        }
    },
    StoppingCondition={
        'MaxRuntimeInSeconds': 1200
    },
    AppSpecification={
        'ImageUri': image,
        'ContainerEntrypoint': ['python3','preprocessing.py']
    },
    Environment={
        'mode': 'python'
    },
    RoleArn=role
)

CPU times: user 13.9 ms, sys: 182 µs, total: 14.1 ms
Wall time: 84.3 ms


In [30]:
import time
while 1:
    print(client.list_processing_jobs()['ProcessingJobSummaries'][0]['ProcessingJobStatus'])
    time.sleep(10)

InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed
Completed
Completed
Completed


KeyboardInterrupt: 