# 环境准备

In [10]:
%%time
import boto3
import re
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

# IAM role returned by get_execution_role(); later cells pass it as RoleArn / ExecutionRoleArn
role = get_execution_role()

# S3 bucket that receives the uploaded .rec files and the training output
bucket='auto-ai-ml-demo' # customize to your bucket

# Image URI of the built-in 'image-classification' algorithm for the current session's region;
# used as the TrainingImage of the training job
training_image = get_image_uri(boto3.Session().region_name, 'image-classification')

CPU times: user 673 ms, sys: 222 ms, total: 895 ms
Wall time: 6.69 s


# 数据预处理

In [8]:
# Download the two car image archives, unpack them under ./dataset and print the
# number of entries in each extracted folder
!mkdir -p dataset
# NOTE(review): this removes ./car_ims, while the archive below is extracted to dataset/car_ims — confirm which path was intended
!rm -rf car_ims
!wget -O dataset/car_ims.tgz http://imagenet.stanford.edu/internal/car196/car_ims.tgz
!tar -zxf dataset/car_ims.tgz -C dataset
!ls -1 dataset/car_ims | wc -l

# NOTE(review): dataset/bmw10_ims is not referenced by any of the later cells visible here — confirm it is still needed
!wget -O dataset/bmw10_release.tgz http://imagenet.stanford.edu/internal/car196/bmw10_release.tgz
!tar -zxf dataset/bmw10_release.tgz -C dataset
!ls -1 dataset/bmw10_ims | wc -l

In [7]:
# Make the helper script executable and run it on the extracted car images.
# The cell output shows .rec files being created from .lst lists — presumably via
# MXNet's im2rec.py; verify against invoke_im2rec.sh
!chmod +x invoke_im2rec.sh
!./invoke_im2rec.sh dataset/car_ims/

fatal: destination path 'incubator-mxnet' already exists and is not an empty directory.
Please run 'python ~/anaconda3/bin/EISetupValidator.py' if you experience issues using Amazon EI service. This script verifies that this instance is correctly configured to use Amazon EI service.
unzip:  cannot find or open *.zip, *.zip.zip or *.zip.ZIP.

No zipfiles found.
data_path: dataset/car_ims/
train_path: train/
val_path: validation/
. 0
Creating .rec file from /home/ec2-user/SageMaker/auto-image-classification/dataset/car_ims/data_train.lst in /home/ec2-user/SageMaker/auto-image-classification/dataset/car_ims
time: 0.2898116111755371  count: 0
time: 3.2048330307006836  count: 1000
time: 3.345341920852661  count: 2000
time: 3.420041561126709  count: 3000
time: 3.4450042247772217  count: 4000
time: 3.759990692138672  count: 5000
time: 3.546022653579712  count: 6000
time: 3.4200737476348877  count: 7000
time: 3.567103862762451  count: 8000
time: 3.417968988418579  count: 9000
time: 3.673059463

In [8]:
# Check the generated files data_train.rec and data_val.rec (listed with their sizes)
!ls -sh validation/
!ls -sh train/

total 72M
72M data_val.rec
total 286M
286M data_train.rec


# 将预处理后的数据上传到 S3

In [14]:
%%time
import os 
import boto3

     
def upload_to_s3(prefix, file):
    """Upload a local file to the notebook's S3 bucket.

    The object is written to the bucket named by the notebook-level ``bucket``
    variable, under the key ``prefix + file``.

    Args:
        prefix: Key prefix prepended verbatim to ``file`` (include the trailing '/').
        file: Local path of the file to upload; it also forms the tail of the object key.
    """
    s3 = boto3.resource('s3')
    key = prefix + file
    # Open the file in a context manager so the handle is closed even if the upload raises
    with open(file, "rb") as data:
        s3.Bucket(bucket).put_object(Key=key, Body=data)


# S3 layout for the car sample data: one key prefix per training channel
s3_train_key = "car_data_sample/train"
s3_validation_key = "car_data_sample/validation"

# Channel URIs (s3://<bucket>/<key>/) consumed later by the training job's InputDataConfig
s3_train, s3_validation = (
    's3://{}/{}/'.format(bucket, channel_key)
    for channel_key in (s3_train_key, s3_validation_key)
)

# Upload both RecordIO files; the resulting object keys are
# car_data_sample/train/data_train.rec and car_data_sample/validation/data_val.rec
upload_to_s3(prefix='car_data_sample/', file='train/data_train.rec')
upload_to_s3(prefix='car_data_sample/', file='validation/data_val.rec')

CPU times: user 927 ms, sys: 325 ms, total: 1.25 s
Wall time: 4.65 s


In [15]:
# List up to 20 objects under the car_data_sample prefix to confirm both .rec files were uploaded
!aws s3api list-objects-v2 --bucket auto-ai-ml-demo --prefix car_data_sample --max-items 20

{
    "Contents": [
        {
            "Key": "car_data_sample/train/data_train.rec",
            "LastModified": "2020-04-21T11:13:44.000Z",
            "ETag": "\"84ceb410a91816cb4eec37f48bd4c608\"",
            "Size": 298905388,
            "StorageClass": "STANDARD"
        },
        {
            "Key": "car_data_sample/validation/data_val.rec",
            "LastModified": "2020-04-21T11:13:47.000Z",
            "ETag": "\"159bca1f059617b14e0ae62b1d3cc98d\"",
            "Size": 74752152,
            "StorageClass": "STANDARD"
        }
    ]
}


# 使用迁移学习进行模型训练

## 配置模型训练的超参数
- Num_layers： 神经网络的层数，本例中可以选择 18、34、50、101、152 和 200。很多经典网络模型的名字中包含的数字就代表了层数，如 vgg16 中的 16 就代表了权重层的个数
- Image_shape： 输入图像的通道数，像素的长宽
- Num_training_samples： 训练样本的个数
- Num_classes: 训练样本图像分类的类别数，本例中为了简洁，只选取了三个类别（class）做范例
- mini_batch_size： 每个小批量（mini-batch）中包含的训练样本数
- epochs： 训练轮次
- learning_rate： 训练学习率
- use_pretrained_model： 是否使用预训练模型进行迁移学习。如设为 1，则使用在较大的开源数据集（如 ImageNet）上预训练得到的权重来初始化网络


In [16]:
# The algorithm supports multiple network depth (number of layers). They are 18, 34, 50, 101, 152 and 200
# For this training, we will use 18 layers
num_layers = 18
# we need to specify the input image shape for the training data
# (channel count followed by the pixel dimensions, as described in the notes above)
image_shape = "3,224,224"
# we also need to specify the number of training samples in the training set
# NOTE(review): the im2rec log earlier in this notebook counts past 9000 records for
# data_train.lst — confirm that 96 matches the data_train.rec that was uploaded
num_training_samples = 96
# specify the number of output classes
# NOTE(review): presumably only three car classes were kept for this demo — confirm
# against the labels actually packed into the .rec files
num_classes = 3
# batch size for training
mini_batch_size =  30
# number of epochs
epochs = 100
# learning rate
learning_rate = 0.01
# NOTE(review): top_k is assigned here but is not included in the HyperParameters
# dict that the training-job cell builds, so it currently has no effect
top_k=2
# Since we are using transfer learning, we set use_pretrained_model to 1 so that weights can be
# initialized with pre-trained weights
use_pretrained_model = 1

## SageMaker API 的创建
构建训练任务的请求参数，其中指定了训练的输入与输出位置，以及训练所用的计算实例配置

In [30]:
%%time
import time
import boto3
from time import gmtime, strftime


def _recordio_channel(channel_name, s3_uri):
    """Build one InputDataConfig entry: an uncompressed RecordIO channel read from an S3 prefix.

    The algorithm currently only supports the fully replicated mode, where the
    data is copied onto each training machine.
    """
    return {
        "ChannelName": channel_name,
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": s3_uri,
                "S3DataDistributionType": "FullyReplicated"
            }
        },
        "ContentType": "application/x-recordio",
        "CompressionType": "None"
    }


s3 = boto3.client('s3')

# Unique job name: fixed prefix followed by a UTC timestamp suffix
job_name_prefix = 'cars-imageclassification'
timestamp = time.strftime('-%Y-%m-%d-%H-%M-%S', time.gmtime())
job_name = job_name_prefix + timestamp

# Request payload for create_training_job
training_params = {
    # training docker image and how the input data is delivered to it
    "AlgorithmSpecification": {
        "TrainingImage": training_image,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    # model artifacts are written below <bucket>/<job_name_prefix>/output
    "OutputDataConfig": {
        "S3OutputPath": 's3://{}/{}/output'.format(bucket, job_name_prefix)
    },
    # one GPU instance with a 50 GB volume
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.p2.xlarge",
        "VolumeSizeInGB": 50
    },
    "TrainingJobName": job_name,
    # every numeric hyperparameter is passed as a string
    "HyperParameters": {
        "image_shape": image_shape,
        "num_layers": str(num_layers),
        "num_training_samples": str(num_training_samples),
        "num_classes": str(num_classes),
        "mini_batch_size": str(mini_batch_size),
        "epochs": str(epochs),
        "learning_rate": str(learning_rate),
        "use_pretrained_model": str(use_pretrained_model)
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 360000
    },
    # Training data should be inside a subdirectory called "train",
    # validation data inside a subdirectory called "validation"
    "InputDataConfig": [
        _recordio_channel("train", s3_train),
        _recordio_channel("validation", s3_validation)
    ]
}

print('Training job name: {}'.format(job_name))
for channel in training_params['InputDataConfig']:
    print('\nInput Data Location: {}'.format(channel['DataSource']['S3DataSource']))

Training job name: cars-imageclassification-2020-04-21-11-34-19

Input Data Location: {'S3DataType': 'S3Prefix', 'S3Uri': 's3://auto-ai-ml-demo/car_data_sample/train/', 'S3DataDistributionType': 'FullyReplicated'}

Input Data Location: {'S3DataType': 'S3Prefix', 'S3Uri': 's3://auto-ai-ml-demo/car_data_sample/validation/', 'S3DataDistributionType': 'FullyReplicated'}
CPU times: user 7.8 ms, sys: 0 ns, total: 7.8 ms
Wall time: 7.29 ms


## 调用 SageMaker API 来启动训练任务
只需调用一个简单的 API（sagemaker.create_training_job），并传入上一步配置好的参数，就可以直接启动训练。整个过程不需要搭建环境、部署基础设施，甚至不需要自己设计神经网络模型。

In [None]:
# create the Amazon SageMaker training job
sagemaker = boto3.client(service_name='sagemaker')
sagemaker.create_training_job(**training_params)

# confirm that the training job has started
status = sagemaker.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print('Training job current status: {}'.format(status))

try:
    # Block until the job completes or is stopped. The waiter raises when the job
    # ends in a failed state, and also when it gives up polling before the job ends.
    sagemaker.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=job_name)
    training_info = sagemaker.describe_training_job(TrainingJobName=job_name)
    status = training_info['TrainingJobStatus']
    print("Training job ended with status: " + status)
except Exception as wait_error:
    # Catch Exception instead of using a bare except so that a kernel interrupt
    # (KeyboardInterrupt) still stops the cell rather than landing in this handler.
    # The job was already created above, so report its actual status, not "failed to start".
    training_info = sagemaker.describe_training_job(TrainingJobName=job_name)
    status = training_info['TrainingJobStatus']
    print('Training did not complete successfully, current status: {}'.format(status))
    # 'FailureReason' is only present once the job has failed; fall back to the
    # waiter's own error (e.g. when polling timed out while the job is still running)
    message = training_info.get('FailureReason', str(wait_error))
    print('Training failed with the following error: {}'.format(message))

Training job current status: InProgress


In [34]:
# Query the training job again and print its current status; re-run this cell
# to refresh the status if the waiting cell was interrupted
training_info = sagemaker.describe_training_job(TrainingJobName=job_name)
status = training_info['TrainingJobStatus']
print("Training job ended with status: " + status)

Training job ended with status: Completed


# 模型部署
## 创建模型

In [36]:
%%time
import boto3
from time import gmtime, strftime

# SageMaker client used by this cell and by the endpoint-configuration cell
sage = boto3.Session().client(service_name='sagemaker') 

# NOTE: the literal prefix ends with '-' and the timestamp format starts with '-',
# so the generated model name contains a double hyphen (see the printed name).
# `time` is not imported in this cell; it relies on the `import time` of an earlier cell.
model_name="cars-imageclassification-" + time.strftime('-%Y-%m-%d-%H-%M-%S', time.gmtime())
print(model_name)
# Read the S3 location of the model artifact from the training job description
info = sage.describe_training_job(TrainingJobName=job_name)
model_data = info['ModelArtifacts']['S3ModelArtifacts']
print(model_data)

# Same image lookup as the one used for training_image
hosting_image = get_image_uri(boto3.Session().region_name, 'image-classification')

# Container definition: the hosting image plus the trained model artifact
primary_container = {
    'Image': hosting_image,
    'ModelDataUrl': model_data,
}

# Register the model under model_name, using the notebook's execution role
create_model_response = sage.create_model(
    ModelName = model_name,
    ExecutionRoleArn = role,
    PrimaryContainer = primary_container)

print(create_model_response['ModelArn'])


cars-imageclassification--2020-04-21-15-35-44
s3://auto-ai-ml-demo/cars-imageclassification/output/cars-imageclassification-2020-04-21-11-34-19/output/model.tar.gz
arn:aws:sagemaker:us-east-1:710299592439:model/cars-imageclassification--2020-04-21-15-35-44
CPU times: user 62.6 ms, sys: 4.79 ms, total: 67.4 ms
Wall time: 480 ms


## 配置推理Endpoint

In [37]:
from time import gmtime, strftime

# Endpoint configuration name: job prefix + '-epc-' + UTC timestamp.
# The timestamp format itself starts with '-', hence the double hyphen in the printed name.
timestamp = time.strftime('-%Y-%m-%d-%H-%M-%S', time.gmtime())
endpoint_config_name = job_name_prefix + '-epc-' + timestamp
# A single production variant named 'AllTraffic' that serves model_name on one ml.m4.xlarge instance
endpoint_config_response = sage.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    ProductionVariants=[{
        'InstanceType':'ml.m4.xlarge',
        'InitialInstanceCount':1,
        'ModelName':model_name,
        'VariantName':'AllTraffic'}])

print('Endpoint configuration name: {}'.format(endpoint_config_name))
print('Endpoint configuration arn:  {}'.format(endpoint_config_response['EndpointConfigArn']))

Endpoint configuration name: cars-imageclassification-epc--2020-04-21-15-38-34
Endpoint configuration arn:  arn:aws:sagemaker:us-east-1:710299592439:endpoint-config/cars-imageclassification-epc--2020-04-21-15-38-34


## 创建推理Endpoint

In [39]:
%%time
import time

# Endpoint name: job prefix + '-ep-' + UTC timestamp (the timestamp format starts with '-')
timestamp = time.strftime('-%Y-%m-%d-%H-%M-%S', time.gmtime())
endpoint_name = job_name_prefix + '-ep-' + timestamp
print('Endpoint name: {}'.format(endpoint_name))

# The endpoint is created from the endpoint configuration built in the previous cell
endpoint_params = {
    'EndpointName': endpoint_name,
    'EndpointConfigName': endpoint_config_name,
}
# NOTE: this call uses the `sagemaker` boto3 client created in the training cell,
# whereas the model and endpoint-configuration cells use the `sage` client
endpoint_response = sagemaker.create_endpoint(**endpoint_params)
print('EndpointArn = {}'.format(endpoint_response['EndpointArn']))

Endpoint name: cars-imageclassification-ep--2020-04-21-15-39-34
EndpointArn = arn:aws:sagemaker:us-east-1:710299592439:endpoint/cars-imageclassification-ep--2020-04-21-15-39-34
CPU times: user 14.9 ms, sys: 0 ns, total: 14.9 ms
Wall time: 257 ms


In [40]:
# get the status of the endpoint right after its creation was requested
response = sagemaker.describe_endpoint(EndpointName=endpoint_name)
status = response['EndpointStatus']
print('EndpointStatus = {}'.format(status))
# block on the 'endpoint_in_service' waiter until the endpoint is ready
# NOTE(review): no error handling here — presumably the waiter raises if the endpoint fails to deploy; confirm
sagemaker.get_waiter('endpoint_in_service').wait(EndpointName=endpoint_name)
# query the endpoint again and print the status it ended up in
endpoint_response = sagemaker.describe_endpoint(EndpointName=endpoint_name)
status = endpoint_response['EndpointStatus']
print('Endpoint creation ended with EndpointStatus = {}'.format(status))

EndpointStatus = Creating
Endpoint creation ended with EndpointStatus = InService
