In [1]:
import numpy as np
import pandas as pd

# Define IAM role
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

## Upload Data to S3

In [2]:
# Bucket name and object keys for the training / test RecordIO files.
bucket_name = 'pmitest-sagemaker'
training_file_key, test_file_key = (
    'movie/user_movie_train.recordio',
    'movie/user_movie_test.recordio',
)

# Full S3 URIs: where model artifacts are written, and where each data file lives.
s3_model_output_location = r's3://{0}/movie/model'.format(bucket_name)
s3_training_file_location, s3_test_file_location = (
    r's3://{0}/{1}'.format(bucket_name, object_key)
    for object_key in (training_file_key, test_file_key)
)

In [3]:
# Echo the three S3 URIs, one per line, to confirm they were built as expected.
print(s3_model_output_location,
      s3_training_file_location,
      s3_test_file_location,
      sep='\n')

s3://pmitest-sagemaker/movie/model
s3://pmitest-sagemaker/movie/user_movie_train.recordio
s3://pmitest-sagemaker/movie/user_movie_test.recordio


In [4]:
# Writing to and reading from S3 is straightforward with boto3.
# In S3 terminology a file is an "object" and its file name is the object's "key".
# Files stored in S3 are automatically replicated across 3 different availability zones
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload the local file `filename` to S3 as object `key` in `bucket`.

    The file is opened in binary mode and handed to ``upload_fileobj`` as a
    file object. Returns whatever ``upload_fileobj`` returns.
    """
    with open(filename, 'rb') as source_file:  # binary mode: bytes are uploaded as-is
        s3 = boto3.Session().resource('s3')
        target_object = s3.Bucket(bucket).Object(key)
        return target_object.upload_fileobj(source_file)

In [5]:
# Upload the local training file to its key in the bucket.
write_to_s3(filename=r'ml-latest-small/user_movie_train.recordio',
            bucket=bucket_name,
            key=training_file_key)

In [6]:
# Upload the local test file to its key in the bucket.
write_to_s3(filename=r'ml-latest-small/user_movie_test.recordio',
            bucket=bucket_name,
            key=test_file_key)

## Training Algorithm Docker Image
### AWS maintains a separate image for every region and algorithm

In [7]:
# Registry Path for algorithms provided by SageMaker
#  https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
#
# Maps AWS region name -> ECR image URI of the built-in Factorization Machines
# algorithm. The registry account ID differs per region AND per algorithm family,
# so an ID copied from another algorithm's table will not work here.
# eu-central-1 must use account 664544806723 for factorization-machines;
# 813361260812 is the eu-central-1 registry of a different algorithm family and
# leads to "Cannot pull algorithm container" when training starts.
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/factorization-machines:latest',
              'eu-central-1': '664544806723.dkr.ecr.eu-central-1.amazonaws.com/factorization-machines:latest'}

In [8]:
# IAM role returned by sagemaker.get_execution_role(); it is passed to the
# Estimator in the "Build Model" section.
role = get_execution_role()

In [9]:
# This role contains the permissions needed to train and deploy models.
# The SageMaker service is trusted to assume this role.
# NOTE(review): the printed ARN includes the AWS account ID — consider clearing
# this cell's output before sharing the notebook.
print(role)

arn:aws:iam::239979694630:role/service-role/AmazonSageMaker-ExecutionRole-20190206T180766


## Build Model

In [10]:
# SageMaker session object; handed to the Estimator through `sagemaker_session`.
sess = sagemaker.Session()

In [11]:
# Access the appropriate algorithm container image for the current region.
#  Specify how many instances to use for distributed training and what type of machine to use.
#  Finally, specify where the trained model artifacts need to be stored.
#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
#    Optionally, give a name to the training job using base_job_name.
# NOTE(review): `containers[...]` raises KeyError when the boto3 session's region
# is not one of the keys of `containers` (or is None).

estimator = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                       role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.m4.xlarge',
                                       output_path=s3_model_output_location,
                                       sagemaker_session=sess,
                                       base_job_name ='fm-movie-v2')

In [12]:
# Specify hyperparameters that are appropriate for the training algorithm.
# Sparse Matrix dimension: 100004, 9737
# feature_dim is set to the second dimension above (9737), i.e. the width of
# each sparse feature vector.
estimator.set_hyperparameters(feature_dim=9737,
                              num_factors=8,
                              predictor_type='regressor', 
                              mini_batch_size=1000,
                              epochs=100)

In [13]:
# Display the hyperparameters currently set on the estimator.
estimator.hyperparameters()

{'feature_dim': 9737,
 'num_factors': 8,
 'predictor_type': 'regressor',
 'mini_batch_size': 1000,
 'epochs': 100}

### Train the model

In [14]:
# Factorization Machines is trained here with the "train" and "test" channels
# (not the "train"/"validation" pair used by XGBoost).
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
estimator.fit({'train':s3_training_file_location, 'test': s3_test_file_location})

INFO:sagemaker:Creating training-job with name: fm-movie-v2-2019-02-07-22-19-00-629


2019-02-07 22:19:00 Starting - Starting the training job...
2019-02-07 22:19:06 Starting - Launching requested ML instances......
2019-02-07 22:20:30 Starting - Preparing the instances for training......
2019-02-07 22:21:23 Downloading - Downloading input data
2019-02-07 22:21:23 Training - Downloading the training image
2019-02-07 22:21:23 Failed - Training job failed
..

ValueError: Error for Training job fm-movie-v2-2019-02-07-22-19-00-629: Failed Reason: ClientError: Cannot pull algorithm container. Either the image does not exist or its permissions are incorrect.

## Deploy Model

In [18]:
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
# Deploy the estimator to an endpoint named 'fm-movie-v2' backed by one
# ml.m4.xlarge instance; the returned predictor is used for the requests below.
# NOTE(review): presumably this needs a successful estimator.fit() first — the
# training run recorded in this notebook failed, and the execution counts jump
# from 14 to 18, so confirm the notebook still works with Restart & Run All.
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m4.xlarge',
                             endpoint_name = 'fm-movie-v2')

INFO:sagemaker:Creating model with name: factorization-machines-2018-05-31-00-28-15-636
INFO:sagemaker:Creating endpoint with name fm-movie-v2


--------------------------------------------------------------------------!

## Run Predictions
### Dense and Sparse Formats
https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html

In [19]:

import json
from sagemaker.predictor import json_deserializer

# Width reported as the 'shape' of every sparse feature vector in a request.
dim_movie = 9737
def fm_sparse_serializer(data):
    """Serialize rows of active feature indices into a JSON sparse request body.

    Each row in `data` must provide ``tolist()`` (e.g. a NumPy array); the
    resulting list is used as the 'keys' of one instance, every key is given
    the value 1, and 'shape' is ``[dim_movie]``.

    Returns a JSON string of the form ``{"instances": [...]}`` with one
    instance per row, in input order.
    """
    payload = {
        'instances': [
            {'data': {'features': {'keys': active_keys,
                                   'shape': [dim_movie],
                                   'values': [1] * len(active_keys)}}}
            for active_keys in (row.tolist() for row in data)
        ]
    }
    return json.dumps(payload)

In [20]:
# Send requests as JSON built by fm_sparse_serializer and parse responses with
# the SDK's json_deserializer.
predictor.content_type = 'application/json'
predictor.serializer = fm_sparse_serializer
predictor.deserializer = json_deserializer

In [21]:
import numpy as np

In [22]:
# Preview the JSON request body produced for a single row of feature indices.
fm_sparse_serializer([np.array([341,1416])])

'{"instances": [{"data": {"features": {"keys": [341, 1416], "shape": [9737], "values": [1, 1]}}}]}'

In [23]:
# Rating Entry: ['5 341:1 1416:1', '2.5 209:1 2640:1','2.5 164:1 1346:1']
# Request a prediction for the first entry above (feature indices 341 and 1416).
predictor.predict([np.array([341,1416])])

{'predictions': [{'score': 4.107213020324707}]}

## Summary

1. Ensure the training, test and validation data are in an S3 bucket
2. Select Algorithm Container Registry Path - Path varies by region
3. Configure Estimator for training - Specify Algorithm container, instance count, instance type, model output location
4. Specify algorithm specific hyper parameters
5. Train model
6. Deploy model - Specify instance count, instance type and endpoint name
7. Run Predictions